From ebcc9301eacb99350ef7e1e2492e5d18f7986ef8 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 27 Jun 2017 10:02:24 +0300
Subject: [PATCH 001/104] btrfs: convert while loop to list_for_each_entry

No functional changes; just make the loop a bit more readable.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e8b9a269fdde..c665af0727e6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4629,7 +4629,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *info = trans->fs_info;
 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
-	struct list_head *cur;
+	struct btrfs_device *device;
 	struct map_lookup *map = NULL;
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
@@ -4703,22 +4703,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (!devices_info)
 		return -ENOMEM;
 
-	cur = fs_devices->alloc_list.next;
-
 	/*
 	 * in the first pass through the devices list, we gather information
 	 * about the available holes on each device.
 	 */
 	ndevs = 0;
-	while (cur != &fs_devices->alloc_list) {
-		struct btrfs_device *device;
+	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
 		u64 max_avail;
 		u64 dev_offset;
 
-		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
-
-		cur = cur->next;
-
 		if (!device->writeable) {
 			WARN(1, KERN_ERR
 			       "BTRFS: read-only device in alloc_list\n");

From e5600fd6fc7d1924ff77681128a364e1dad4435b Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 27 Jun 2017 10:02:25 +0300
Subject: [PATCH 002/104] btrfs: Use explicit round_down call rather than
 open-coding it

No functional changes, provided devs_increment is a power of two, which
round_down() requires.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c665af0727e6..b011753ce8bb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4762,7 +4762,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	     btrfs_cmp_device_info, NULL);
 
 	/* round down to number of usable stripes */
-	ndevs -= ndevs % devs_increment;
+	ndevs = round_down(ndevs, devs_increment);
 
 	if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
 		ret = -ENOSPC;

From f148ef4d3a8feb2e30017050e3534a4d0646b769 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 27 Jun 2017 10:02:26 +0300
Subject: [PATCH 003/104] btrfs: Be explicit about usage of min()

__btrfs_alloc_chunk contains code which boils down to:

    ndevs = min(ndevs, devs_max)

It's conditional upon devs_max not being 0. However, it cannot really be 0
since it's always set to either BTRFS_MAX_DEVS_SYS_CHUNK or
BTRFS_MAX_DEVS(fs_info->chunk_root). So eliminate the condition check and use
min explicitly. This has no functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b011753ce8bb..f93ac3d7e997 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4769,8 +4769,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		goto error;
 	}
 
-	if (devs_max && ndevs > devs_max)
-		ndevs = devs_max;
+	ndevs = min(ndevs, devs_max);
+
 	/*
 	 * the primary goal is to maximize the number of stripes, so use as many
 	 * devices as possible, even if the stripes are not maximum sized.

From 23d1f73788785a770fe6eb348fee4b26281d2064 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 28 Jun 2017 11:05:22 +0300
Subject: [PATCH 004/104] btrfs: remove unused sectorsize member

The sectorsize member of btrfs_block_group_cache is unused, so remove it.
This reduces the number of holes in the struct.

With patch:
/* size: 856, cachelines: 14, members: 40 */
/* sum members: 837, holes: 4, sum holes: 19 */
/* bit holes: 1, sum bit holes: 29 bits */
/* last cacheline: 24 bytes */

Without patch:
/* size: 864, cachelines: 14, members: 41 */
/* sum members: 841, holes: 5, sum holes: 23 */
/* bit holes: 1, sum bit holes: 29 bits */
/* last cacheline: 32 bytes */

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h                       | 1 -
 fs/btrfs/extent-tree.c                 | 1 -
 fs/btrfs/tests/btrfs-tests.c           | 1 -
 fs/btrfs/tests/free-space-tree-tests.c | 2 +-
 4 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3f3eb7b17cac..589491040950 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -558,7 +558,6 @@ struct btrfs_block_group_cache {
 	u64 bytes_super;
 	u64 flags;
 	u64 cache_generation;
-	u32 sectorsize;
 
 	/*
 	 * If the free space extent count exceeds this number, convert the block
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e3b0b4196d3d..6d04563585e6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9952,7 +9952,6 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 	cache->key.offset = size;
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 
-	cache->sectorsize = fs_info->sectorsize;
 	cache->fs_info = fs_info;
 	cache->full_stripe_len = btrfs_full_stripe_len(fs_info,
 						       &fs_info->mapping_tree,
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index b18ab8f327a5..d3f25376a0f8 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -211,7 +211,6 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
 	cache->key.objectid = 0;
 	cache->key.offset = length;
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-	cache->sectorsize = fs_info->sectorsize;
 	cache->full_stripe_len = fs_info->sectorsize;
 	cache->fs_info = fs_info;
 
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index b29954c01673..1458bb0ea124 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -81,7 +81,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
 					i++;
 				}
 				prev_bit = bit;
-				offset += cache->sectorsize;
+				offset += fs_info->sectorsize;
 			}
 		}
 		if (prev_bit == 1) {

From 1cbb1f454e5321e47fc1e6b233066c7ccc979d15 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 28 Jun 2017 21:56:53 -0600
Subject: [PATCH 005/104] btrfs: struct-funcs, constify readers

Most of the on-disk structures have reader helpers that take an
extent_buffer and a pointer used as an offset into the buffer.  These
helpers only read the buffer, so we should mark their arguments as const
and, in turn, allow consumers of these interfaces to mark the buffers
const as well.

No impact on code, but serves as documentation that a buffer is intended
not to be modified.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h        | 128 +++++++++++++++++++++-------------------
 fs/btrfs/extent_io.c    |  24 ++++----
 fs/btrfs/extent_io.h    |  19 +++---
 fs/btrfs/struct-funcs.c |   9 +--
 4 files changed, 91 insertions(+), 89 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 589491040950..34ed3dabd43a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1434,7 +1434,7 @@ do {                                                                   \
 #define BTRFS_INODE_ROOT_ITEM_INIT	(1 << 31)
 
 struct btrfs_map_token {
-	struct extent_buffer *eb;
+	const struct extent_buffer *eb;
 	char *kaddr;
 	unsigned long offset;
 };
@@ -1468,18 +1468,19 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
 			   sizeof(((type *)0)->member)))
 
 #define DECLARE_BTRFS_SETGET_BITS(bits)					\
-u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr,	\
-			       unsigned long off,			\
-                              struct btrfs_map_token *token);		\
-void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr,	\
+u##bits btrfs_get_token_##bits(const struct extent_buffer *eb,		\
+			       const void *ptr, unsigned long off,	\
+			       struct btrfs_map_token *token);		\
+void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr,	\
 			    unsigned long off, u##bits val,		\
 			    struct btrfs_map_token *token);		\
-static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
+static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb,	\
+				       const void *ptr,			\
 				       unsigned long off)		\
 {									\
 	return btrfs_get_token_##bits(eb, ptr, off, NULL);		\
 }									\
-static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\
 				    unsigned long off, u##bits val)	\
 {									\
        btrfs_set_token_##bits(eb, ptr, off, val, NULL);			\
@@ -1491,7 +1492,8 @@ DECLARE_BTRFS_SETGET_BITS(32)
 DECLARE_BTRFS_SETGET_BITS(64)
 
 #define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
-static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s)	\
+static inline u##bits btrfs_##name(const struct extent_buffer *eb,	\
+				   const type *s)			\
 {									\
 	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
 	return btrfs_get_##bits(eb, s, offsetof(type, member));		\
@@ -1502,7 +1504,8 @@ static inline void btrfs_set_##name(struct extent_buffer *eb, type *s,	\
 	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
 	btrfs_set_##bits(eb, s, offsetof(type, member), val);		\
 }									\
-static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
+static inline u##bits btrfs_token_##name(const struct extent_buffer *eb,\
+					 const type *s,			\
 					 struct btrfs_map_token *token)	\
 {									\
 	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
@@ -1517,9 +1520,9 @@ static inline void btrfs_set_token_##name(struct extent_buffer *eb,	\
 }
 
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
-static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
+static inline u##bits btrfs_##name(const struct extent_buffer *eb)	\
 {									\
-	type *p = page_address(eb->pages[0]);				\
+	const type *p = page_address(eb->pages[0]);			\
 	u##bits res = le##bits##_to_cpu(p->member);			\
 	return res;							\
 }									\
@@ -1531,7 +1534,7 @@ static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 }
 
 #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits)		\
-static inline u##bits btrfs_##name(type *s)				\
+static inline u##bits btrfs_##name(const type *s)			\
 {									\
 	return le##bits##_to_cpu(s->member);				\
 }									\
@@ -1856,7 +1859,7 @@ static inline unsigned long btrfs_node_key_ptr_offset(int nr)
 		sizeof(struct btrfs_key_ptr) * nr;
 }
 
-void btrfs_node_key(struct extent_buffer *eb,
+void btrfs_node_key(const struct extent_buffer *eb,
 		    struct btrfs_disk_key *disk_key, int nr);
 
 static inline void btrfs_set_node_key(struct extent_buffer *eb,
@@ -1885,28 +1888,28 @@ static inline struct btrfs_item *btrfs_item_nr(int nr)
 	return (struct btrfs_item *)btrfs_item_nr_offset(nr);
 }
 
-static inline u32 btrfs_item_end(struct extent_buffer *eb,
+static inline u32 btrfs_item_end(const struct extent_buffer *eb,
 				 struct btrfs_item *item)
 {
 	return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
 }
 
-static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
 {
 	return btrfs_item_end(eb, btrfs_item_nr(nr));
 }
 
-static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr)
 {
 	return btrfs_item_offset(eb, btrfs_item_nr(nr));
 }
 
-static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr)
 {
 	return btrfs_item_size(eb, btrfs_item_nr(nr));
 }
 
-static inline void btrfs_item_key(struct extent_buffer *eb,
+static inline void btrfs_item_key(const struct extent_buffer *eb,
 			   struct btrfs_disk_key *disk_key, int nr)
 {
 	struct btrfs_item *item = btrfs_item_nr(nr);
@@ -1942,8 +1945,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
 BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
 			 transid, 64);
 
-static inline void btrfs_dir_item_key(struct extent_buffer *eb,
-				      struct btrfs_dir_item *item,
+static inline void btrfs_dir_item_key(const struct extent_buffer *eb,
+				      const struct btrfs_dir_item *item,
 				      struct btrfs_disk_key *key)
 {
 	read_eb_member(eb, item, struct btrfs_dir_item, location, key);
@@ -1951,7 +1954,7 @@ static inline void btrfs_dir_item_key(struct extent_buffer *eb,
 
 static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
 					  struct btrfs_dir_item *item,
-					  struct btrfs_disk_key *key)
+					  const struct btrfs_disk_key *key)
 {
 	write_eb_member(eb, item, struct btrfs_dir_item, location, key);
 }
@@ -1963,8 +1966,8 @@ BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
 BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
 		   generation, 64);
 
-static inline void btrfs_free_space_key(struct extent_buffer *eb,
-					struct btrfs_free_space_header *h,
+static inline void btrfs_free_space_key(const struct extent_buffer *eb,
+					const struct btrfs_free_space_header *h,
 					struct btrfs_disk_key *key)
 {
 	read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
@@ -1972,7 +1975,7 @@ static inline void btrfs_free_space_key(struct extent_buffer *eb,
 
 static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
 					    struct btrfs_free_space_header *h,
-					    struct btrfs_disk_key *key)
+					    const struct btrfs_disk_key *key)
 {
 	write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
 }
@@ -1999,25 +2002,25 @@ static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
 	disk->objectid = cpu_to_le64(cpu->objectid);
 }
 
-static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
-				  struct btrfs_key *key, int nr)
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+					 struct btrfs_key *key, int nr)
 {
 	struct btrfs_disk_key disk_key;
 	btrfs_node_key(eb, &disk_key, nr);
 	btrfs_disk_key_to_cpu(key, &disk_key);
 }
 
-static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
-				  struct btrfs_key *key, int nr)
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+					 struct btrfs_key *key, int nr)
 {
 	struct btrfs_disk_key disk_key;
 	btrfs_item_key(eb, &disk_key, nr);
 	btrfs_disk_key_to_cpu(key, &disk_key);
 }
 
-static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
-				      struct btrfs_dir_item *item,
-				      struct btrfs_key *key)
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+					     const struct btrfs_dir_item *item,
+					     struct btrfs_key *key)
 {
 	struct btrfs_disk_key disk_key;
 	btrfs_dir_item_key(eb, item, &disk_key);
@@ -2049,7 +2052,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
 			 nritems, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
 
-static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
+static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag)
 {
 	return (btrfs_header_flags(eb) & flag) == flag;
 }
@@ -2068,7 +2071,7 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
 	return (flags & flag) == flag;
 }
 
-static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
+static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
 {
 	u64 flags = btrfs_header_flags(eb);
 	return flags >> BTRFS_BACKREF_REV_SHIFT;
@@ -2088,12 +2091,12 @@ static inline unsigned long btrfs_header_fsid(void)
 	return offsetof(struct btrfs_header, fsid);
 }
 
-static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb)
 {
 	return offsetof(struct btrfs_header, chunk_tree_uuid);
 }
 
-static inline int btrfs_is_leaf(struct extent_buffer *eb)
+static inline int btrfs_is_leaf(const struct extent_buffer *eb)
 {
 	return btrfs_header_level(eb) == 0;
 }
@@ -2127,12 +2130,12 @@ BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
 BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
 			 rtransid, 64);
 
-static inline bool btrfs_root_readonly(struct btrfs_root *root)
+static inline bool btrfs_root_readonly(const struct btrfs_root *root)
 {
 	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
 }
 
-static inline bool btrfs_root_dead(struct btrfs_root *root)
+static inline bool btrfs_root_dead(const struct btrfs_root *root)
 {
 	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
 }
@@ -2189,51 +2192,51 @@ BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
 /* struct btrfs_balance_item */
 BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
 
-static inline void btrfs_balance_data(struct extent_buffer *eb,
-				      struct btrfs_balance_item *bi,
+static inline void btrfs_balance_data(const struct extent_buffer *eb,
+				      const struct btrfs_balance_item *bi,
 				      struct btrfs_disk_balance_args *ba)
 {
 	read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
 }
 
 static inline void btrfs_set_balance_data(struct extent_buffer *eb,
-					  struct btrfs_balance_item *bi,
-					  struct btrfs_disk_balance_args *ba)
+				  struct btrfs_balance_item *bi,
+				  const struct btrfs_disk_balance_args *ba)
 {
 	write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
 }
 
-static inline void btrfs_balance_meta(struct extent_buffer *eb,
-				      struct btrfs_balance_item *bi,
+static inline void btrfs_balance_meta(const struct extent_buffer *eb,
+				      const struct btrfs_balance_item *bi,
 				      struct btrfs_disk_balance_args *ba)
 {
 	read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
 }
 
 static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
-					  struct btrfs_balance_item *bi,
-					  struct btrfs_disk_balance_args *ba)
+				  struct btrfs_balance_item *bi,
+				  const struct btrfs_disk_balance_args *ba)
 {
 	write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
 }
 
-static inline void btrfs_balance_sys(struct extent_buffer *eb,
-				     struct btrfs_balance_item *bi,
+static inline void btrfs_balance_sys(const struct extent_buffer *eb,
+				     const struct btrfs_balance_item *bi,
 				     struct btrfs_disk_balance_args *ba)
 {
 	read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
 }
 
 static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
-					 struct btrfs_balance_item *bi,
-					 struct btrfs_disk_balance_args *ba)
+				 struct btrfs_balance_item *bi,
+				 const struct btrfs_disk_balance_args *ba)
 {
 	write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
 }
 
 static inline void
 btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
-			       struct btrfs_disk_balance_args *disk)
+			       const struct btrfs_disk_balance_args *disk)
 {
 	memset(cpu, 0, sizeof(*cpu));
 
@@ -2253,7 +2256,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
 
 static inline void
 btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
-			       struct btrfs_balance_args *cpu)
+			       const struct btrfs_balance_args *cpu)
 {
 	memset(disk, 0, sizeof(*disk));
 
@@ -2321,7 +2324,7 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
 BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
 			 uuid_tree_generation, 64);
 
-static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
+static inline int btrfs_super_csum_size(const struct btrfs_super_block *s)
 {
 	u16 t = btrfs_super_csum_type(s);
 	/*
@@ -2336,8 +2339,8 @@ static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
  * this returns the address of the start of the last item,
  * which is the stop of the leaf data stack
  */
-static inline unsigned int leaf_data_end(struct btrfs_fs_info *fs_info,
-					 struct extent_buffer *leaf)
+static inline unsigned int leaf_data_end(const struct btrfs_fs_info *fs_info,
+					 const struct extent_buffer *leaf)
 {
 	u32 nr = btrfs_header_nritems(leaf);
 
@@ -2362,7 +2365,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
 			 struct btrfs_file_extent_item, compression, 8);
 
 static inline unsigned long
-btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
+btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e)
 {
 	return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
 }
@@ -2396,8 +2399,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
  * size of any extent headers.  If a file is compressed on disk, this is
  * the compressed size
  */
-static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
-						    struct btrfs_item *e)
+static inline u32 btrfs_file_extent_inline_item_len(
+						const struct extent_buffer *eb,
+						struct btrfs_item *e)
 {
 	return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
 }
@@ -2405,9 +2409,9 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
 /* this returns the number of file bytes represented by the inline item.
  * If an item is compressed, this is the uncompressed size
  */
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
-					       int slot,
-					       struct btrfs_file_extent_item *fi)
+static inline u32 btrfs_file_extent_inline_len(const struct extent_buffer *eb,
+					int slot,
+					const struct btrfs_file_extent_item *fi)
 {
 	struct btrfs_map_token token;
 
@@ -2429,8 +2433,8 @@ static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
 
 
 /* btrfs_dev_stats_item */
-static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
-					struct btrfs_dev_stats_item *ptr,
+static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
+					const struct btrfs_dev_stats_item *ptr,
 					int index)
 {
 	u64 val;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0aff9b278c19..d6f761b4fae0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5405,9 +5405,8 @@ unlock_exit:
 	return ret;
 }
 
-void read_extent_buffer(struct extent_buffer *eb, void *dstv,
-			unsigned long start,
-			unsigned long len)
+void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
+			unsigned long start, unsigned long len)
 {
 	size_t cur;
 	size_t offset;
@@ -5436,9 +5435,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 	}
 }
 
-int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
-			unsigned long start,
-			unsigned long len)
+int read_extent_buffer_to_user(const struct extent_buffer *eb,
+			       void __user *dstv,
+			       unsigned long start, unsigned long len)
 {
 	size_t cur;
 	size_t offset;
@@ -5478,10 +5477,10 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
  * return 1 if the item spans two pages.
  * return -EINVAL otherwise.
  */
-int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
-			       unsigned long min_len, char **map,
-			       unsigned long *map_start,
-			       unsigned long *map_len)
+int map_private_extent_buffer(const struct extent_buffer *eb,
+			      unsigned long start, unsigned long min_len,
+			      char **map, unsigned long *map_start,
+			      unsigned long *map_len)
 {
 	size_t offset = start & (PAGE_SIZE - 1);
 	char *kaddr;
@@ -5515,9 +5514,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	return 0;
 }
 
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
-			  unsigned long start,
-			  unsigned long len)
+int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
+			 unsigned long start, unsigned long len)
 {
 	size_t cur;
 	size_t offset;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4f030912f3ef..faffa28ba707 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -449,14 +449,13 @@ static inline void extent_buffer_get(struct extent_buffer *eb)
 	atomic_inc(&eb->refs);
 }
 
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
-			  unsigned long start,
-			  unsigned long len);
-void read_extent_buffer(struct extent_buffer *eb, void *dst,
+int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
+			 unsigned long start, unsigned long len);
+void read_extent_buffer(const struct extent_buffer *eb, void *dst,
 			unsigned long start,
 			unsigned long len);
-int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
-			       unsigned long start,
+int read_extent_buffer_to_user(const struct extent_buffer *eb,
+			       void __user *dst, unsigned long start,
 			       unsigned long len);
 void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src);
 void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
@@ -486,10 +485,10 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb);
 void clear_extent_buffer_uptodate(struct extent_buffer *eb);
 int extent_buffer_uptodate(struct extent_buffer *eb);
 int extent_buffer_under_io(struct extent_buffer *eb);
-int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
-		      unsigned long min_len, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len);
+int map_private_extent_buffer(const struct extent_buffer *eb,
+			      unsigned long offset, unsigned long min_len,
+			      char **map, unsigned long *map_start,
+			      unsigned long *map_len);
 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 875c757e73e2..5e2b92d83617 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -50,8 +50,8 @@ static inline void put_unaligned_le8(u8 val, void *p)
  */
 
 #define DEFINE_BTRFS_SETGET_BITS(bits)					\
-u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr,	\
-			       unsigned long off,			\
+u##bits btrfs_get_token_##bits(const struct extent_buffer *eb,		\
+			       const void *ptr, unsigned long off,	\
 			       struct btrfs_map_token *token)		\
 {									\
 	unsigned long part_offset = (unsigned long)ptr;			\
@@ -90,7 +90,8 @@ u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr,	\
 	return res;							\
 }									\
 void btrfs_set_token_##bits(struct extent_buffer *eb,			\
-			    void *ptr, unsigned long off, u##bits val,	\
+			    const void *ptr, unsigned long off,		\
+			    u##bits val,				\
 			    struct btrfs_map_token *token)		\
 {									\
 	unsigned long part_offset = (unsigned long)ptr;			\
@@ -133,7 +134,7 @@ DEFINE_BTRFS_SETGET_BITS(16)
 DEFINE_BTRFS_SETGET_BITS(32)
 DEFINE_BTRFS_SETGET_BITS(64)
 
-void btrfs_node_key(struct extent_buffer *eb,
+void btrfs_node_key(const struct extent_buffer *eb,
 		    struct btrfs_disk_key *disk_key, int nr)
 {
 	unsigned long ptr = btrfs_node_key_ptr_offset(nr);

From 9a35b63728ceb8602c111260044451dd64952500 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 28 Jun 2017 21:56:54 -0600
Subject: [PATCH 006/104] btrfs: constify tracepoint arguments

Tracepoint arguments are all read-only.  If we mark the arguments
as const, we're able to keep or convert those arguments to const
where appropriate.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/async-thread.c      |   6 +-
 fs/btrfs/async-thread.h      |   6 +-
 fs/btrfs/btrfs_inode.h       |   4 +-
 include/trace/events/btrfs.h | 242 ++++++++++++++++++-----------------
 4 files changed, 136 insertions(+), 122 deletions(-)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index ff0b0be92d61..e00c8a9fd5bb 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -75,18 +75,18 @@ void btrfs_##name(struct work_struct *arg)				\
 }
 
 struct btrfs_fs_info *
-btrfs_workqueue_owner(struct __btrfs_workqueue *wq)
+btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
 {
 	return wq->fs_info;
 }
 
 struct btrfs_fs_info *
-btrfs_work_owner(struct btrfs_work *work)
+btrfs_work_owner(const struct btrfs_work *work)
 {
 	return work->wq->fs_info;
 }
 
-bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq)
+bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
 {
 	/*
 	 * We could compare wq->normal->pending with num_online_cpus()
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f9597355c9d..fc957e00cef1 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -82,7 +82,7 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
 void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
 void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
 void btrfs_set_work_high_priority(struct btrfs_work *work);
-struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work);
-struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq);
-bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq);
+struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work);
+struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
 #endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d87ac27a5f2b..31c259c8462d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -207,7 +207,7 @@ struct btrfs_inode {
 
 extern unsigned char btrfs_filetype_table[];
 
-static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
+static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
 {
 	return container_of(inode, struct btrfs_inode, vfs_inode);
 }
@@ -231,7 +231,7 @@ static inline void btrfs_insert_inode_hash(struct inode *inode)
 	__insert_inode_hash(inode, h);
 }
 
-static inline u64 btrfs_ino(struct btrfs_inode *inode)
+static inline u64 btrfs_ino(const struct btrfs_inode *inode)
 {
 	u64 ino = inode->location.objectid;
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index cd99a3658156..42560feb9920 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -92,7 +92,7 @@ struct btrfs_qgroup;
 
 TRACE_EVENT(btrfs_transaction_commit,
 
-	TP_PROTO(struct btrfs_root *root),
+	TP_PROTO(const struct btrfs_root *root),
 
 	TP_ARGS(root),
 
@@ -113,7 +113,7 @@ TRACE_EVENT(btrfs_transaction_commit,
 
 DECLARE_EVENT_CLASS(btrfs__inode,
 
-	TP_PROTO(struct inode *inode),
+	TP_PROTO(const struct inode *inode),
 
 	TP_ARGS(inode),
 
@@ -151,21 +151,21 @@ DECLARE_EVENT_CLASS(btrfs__inode,
 
 DEFINE_EVENT(btrfs__inode, btrfs_inode_new,
 
-	TP_PROTO(struct inode *inode),
+	TP_PROTO(const struct inode *inode),
 
 	TP_ARGS(inode)
 );
 
 DEFINE_EVENT(btrfs__inode, btrfs_inode_request,
 
-	TP_PROTO(struct inode *inode),
+	TP_PROTO(const struct inode *inode),
 
 	TP_ARGS(inode)
 );
 
 DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
 
-	TP_PROTO(struct inode *inode),
+	TP_PROTO(const struct inode *inode),
 
 	TP_ARGS(inode)
 );
@@ -192,8 +192,8 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
 
 TRACE_EVENT_CONDITION(btrfs_get_extent,
 
-	TP_PROTO(struct btrfs_root *root, struct btrfs_inode *inode,
-		 struct extent_map *map),
+	TP_PROTO(const struct btrfs_root *root, const struct btrfs_inode *inode,
+		 const struct extent_map *map),
 
 	TP_ARGS(root, inode, map),
 
@@ -388,7 +388,8 @@ DEFINE_EVENT(
 
 DECLARE_EVENT_CLASS(btrfs__ordered_extent,
 
-	TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
+	TP_PROTO(const struct inode *inode,
+		 const struct btrfs_ordered_extent *ordered),
 
 	TP_ARGS(inode, ordered),
 
@@ -440,36 +441,40 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
 
 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add,
 
-	TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
+	TP_PROTO(const struct inode *inode,
+		 const struct btrfs_ordered_extent *ordered),
 
 	TP_ARGS(inode, ordered)
 );
 
 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove,
 
-	TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
+	TP_PROTO(const struct inode *inode,
+		 const struct btrfs_ordered_extent *ordered),
 
 	TP_ARGS(inode, ordered)
 );
 
 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start,
 
-	TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
+	TP_PROTO(const struct inode *inode,
+		 const struct btrfs_ordered_extent *ordered),
 
 	TP_ARGS(inode, ordered)
 );
 
 DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put,
 
-	TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered),
+	TP_PROTO(const struct inode *inode,
+		 const struct btrfs_ordered_extent *ordered),
 
 	TP_ARGS(inode, ordered)
 );
 
 DECLARE_EVENT_CLASS(btrfs__writepage,
 
-	TP_PROTO(struct page *page, struct inode *inode,
-		 struct writeback_control *wbc),
+	TP_PROTO(const struct page *page, const struct inode *inode,
+		 const struct writeback_control *wbc),
 
 	TP_ARGS(page, inode, wbc),
 
@@ -517,15 +522,15 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 
 DEFINE_EVENT(btrfs__writepage, __extent_writepage,
 
-	TP_PROTO(struct page *page, struct inode *inode,
-		 struct writeback_control *wbc),
+	TP_PROTO(const struct page *page, const struct inode *inode,
+		 const struct writeback_control *wbc),
 
 	TP_ARGS(page, inode, wbc)
 );
 
 TRACE_EVENT(btrfs_writepage_end_io_hook,
 
-	TP_PROTO(struct page *page, u64 start, u64 end, int uptodate),
+	TP_PROTO(const struct page *page, u64 start, u64 end, int uptodate),
 
 	TP_ARGS(page, start, end, uptodate),
 
@@ -558,7 +563,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook,
 
 TRACE_EVENT(btrfs_sync_file,
 
-	TP_PROTO(struct file *file, int datasync),
+	TP_PROTO(const struct file *file, int datasync),
 
 	TP_ARGS(file, datasync),
 
@@ -570,8 +575,8 @@ TRACE_EVENT(btrfs_sync_file,
 	),
 
 	TP_fast_assign(
-		struct dentry *dentry = file->f_path.dentry;
-		struct inode *inode = d_inode(dentry);
+		const struct dentry *dentry = file->f_path.dentry;
+		const struct inode *inode = d_inode(dentry);
 
 		TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb));
 		__entry->ino		= inode->i_ino;
@@ -589,7 +594,7 @@ TRACE_EVENT(btrfs_sync_file,
 
 TRACE_EVENT(btrfs_sync_fs,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, int wait),
+	TP_PROTO(const struct btrfs_fs_info *fs_info, int wait),
 
 	TP_ARGS(fs_info, wait),
 
@@ -606,8 +611,8 @@ TRACE_EVENT(btrfs_sync_fs,
 
 TRACE_EVENT(btrfs_add_block_group,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_block_group_cache *block_group, int create),
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_block_group_cache *block_group, int create),
 
 	TP_ARGS(fs_info, block_group, create),
 
@@ -654,9 +659,9 @@ TRACE_EVENT(btrfs_add_block_group,
 
 DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_delayed_ref_node *ref,
-		 struct btrfs_delayed_tree_ref *full_ref,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_delayed_ref_node *ref,
+		 const struct btrfs_delayed_tree_ref *full_ref,
 		 int action),
 
 	TP_ARGS(fs_info, ref, full_ref, action),
@@ -697,9 +702,9 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
 
 DEFINE_EVENT(btrfs_delayed_tree_ref,  add_delayed_tree_ref,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_delayed_ref_node *ref,
-		 struct btrfs_delayed_tree_ref *full_ref,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_delayed_ref_node *ref,
+		 const struct btrfs_delayed_tree_ref *full_ref,
 		 int action),
 
 	TP_ARGS(fs_info, ref, full_ref, action)
@@ -707,9 +712,9 @@ DEFINE_EVENT(btrfs_delayed_tree_ref,  add_delayed_tree_ref,
 
 DEFINE_EVENT(btrfs_delayed_tree_ref,  run_delayed_tree_ref,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_delayed_ref_node *ref,
-		 struct btrfs_delayed_tree_ref *full_ref,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_delayed_ref_node *ref,
+		 const struct btrfs_delayed_tree_ref *full_ref,
 		 int action),
 
 	TP_ARGS(fs_info, ref, full_ref, action)
@@ -717,9 +722,9 @@ DEFINE_EVENT(btrfs_delayed_tree_ref,  run_delayed_tree_ref,
 
 DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_delayed_ref_node *ref,
-		 struct btrfs_delayed_data_ref *full_ref,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_delayed_ref_node *ref,
+		 const struct btrfs_delayed_data_ref *full_ref,
 		 int action),
 
 	TP_ARGS(fs_info, ref, full_ref, action),
@@ -764,9 +769,9 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
 
 DEFINE_EVENT(btrfs_delayed_data_ref,  add_delayed_data_ref,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_delayed_ref_node *ref,
-		 struct btrfs_delayed_data_ref *full_ref,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_delayed_ref_node *ref,
+		 const struct btrfs_delayed_data_ref *full_ref,
 		 int action),
 
 	TP_ARGS(fs_info, ref, full_ref, action)
@@ -774,9 +779,9 @@ DEFINE_EVENT(btrfs_delayed_data_ref,  add_delayed_data_ref,
 
 DEFINE_EVENT(btrfs_delayed_data_ref,  run_delayed_data_ref,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_delayed_ref_node *ref,
-		 struct btrfs_delayed_data_ref *full_ref,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_delayed_ref_node *ref,
+		 const struct btrfs_delayed_data_ref *full_ref,
 		 int action),
 
 	TP_ARGS(fs_info, ref, full_ref, action)
@@ -784,9 +789,9 @@ DEFINE_EVENT(btrfs_delayed_data_ref,  run_delayed_data_ref,
 
 DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_delayed_ref_node *ref,
-		 struct btrfs_delayed_ref_head *head_ref,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_delayed_ref_node *ref,
+		 const struct btrfs_delayed_ref_head *head_ref,
 		 int action),
 
 	TP_ARGS(fs_info, ref, head_ref, action),
@@ -814,9 +819,9 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
 
 DEFINE_EVENT(btrfs_delayed_ref_head,  add_delayed_ref_head,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_delayed_ref_node *ref,
-		 struct btrfs_delayed_ref_head *head_ref,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_delayed_ref_node *ref,
+		 const struct btrfs_delayed_ref_head *head_ref,
 		 int action),
 
 	TP_ARGS(fs_info, ref, head_ref, action)
@@ -824,9 +829,9 @@ DEFINE_EVENT(btrfs_delayed_ref_head,  add_delayed_ref_head,
 
 DEFINE_EVENT(btrfs_delayed_ref_head,  run_delayed_ref_head,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_delayed_ref_node *ref,
-		 struct btrfs_delayed_ref_head *head_ref,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_delayed_ref_node *ref,
+		 const struct btrfs_delayed_ref_head *head_ref,
 		 int action),
 
 	TP_ARGS(fs_info, ref, head_ref, action)
@@ -846,8 +851,8 @@ DEFINE_EVENT(btrfs_delayed_ref_head,  run_delayed_ref_head,
 
 DECLARE_EVENT_CLASS(btrfs__chunk,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, struct map_lookup *map,
-		 u64 offset, u64 size),
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct map_lookup *map, u64 offset, u64 size),
 
 	TP_ARGS(fs_info, map, offset, size),
 
@@ -880,24 +885,24 @@ DECLARE_EVENT_CLASS(btrfs__chunk,
 
 DEFINE_EVENT(btrfs__chunk,  btrfs_chunk_alloc,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, struct map_lookup *map,
-		 u64 offset, u64 size),
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct map_lookup *map, u64 offset, u64 size),
 
 	TP_ARGS(fs_info, map, offset, size)
 );
 
 DEFINE_EVENT(btrfs__chunk,  btrfs_chunk_free,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, struct map_lookup *map,
-		 u64 offset, u64 size),
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct map_lookup *map, u64 offset, u64 size),
 
 	TP_ARGS(fs_info, map, offset, size)
 );
 
 TRACE_EVENT(btrfs_cow_block,
 
-	TP_PROTO(struct btrfs_root *root, struct extent_buffer *buf,
-		 struct extent_buffer *cow),
+	TP_PROTO(const struct btrfs_root *root, const struct extent_buffer *buf,
+		 const struct extent_buffer *cow),
 
 	TP_ARGS(root, buf, cow),
 
@@ -931,7 +936,7 @@ TRACE_EVENT(btrfs_cow_block,
 
 TRACE_EVENT(btrfs_space_reservation,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val,
+	TP_PROTO(const struct btrfs_fs_info *fs_info, char *type, u64 val,
 		 u64 bytes, int reserve),
 
 	TP_ARGS(fs_info, type, val, bytes, reserve),
@@ -963,7 +968,7 @@ TRACE_EVENT(btrfs_space_reservation,
 
 TRACE_EVENT(btrfs_trigger_flush,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, u64 flags, u64 bytes,
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes,
 		 int flush, char *reason),
 
 	TP_ARGS(fs_info, flags, bytes, flush, reason),
@@ -1004,7 +1009,7 @@ TRACE_EVENT(btrfs_trigger_flush,
 
 TRACE_EVENT(btrfs_flush_space,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes,
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes,
 		 u64 orig_bytes, int state, int ret),
 
 	TP_ARGS(fs_info, flags, num_bytes, orig_bytes, state, ret),
@@ -1039,7 +1044,7 @@ TRACE_EVENT(btrfs_flush_space,
 
 DECLARE_EVENT_CLASS(btrfs__reserved_extent,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, u64 start, u64 len),
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len),
 
 	TP_ARGS(fs_info, start, len),
 
@@ -1061,22 +1066,22 @@ DECLARE_EVENT_CLASS(btrfs__reserved_extent,
 
 DEFINE_EVENT(btrfs__reserved_extent,  btrfs_reserved_extent_alloc,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, u64 start, u64 len),
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len),
 
 	TP_ARGS(fs_info, start, len)
 );
 
 DEFINE_EVENT(btrfs__reserved_extent,  btrfs_reserved_extent_free,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, u64 start, u64 len),
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len),
 
 	TP_ARGS(fs_info, start, len)
 );
 
 TRACE_EVENT(find_free_extent,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 empty_size,
-		 u64 data),
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 num_bytes,
+		 u64 empty_size, u64 data),
 
 	TP_ARGS(fs_info, num_bytes, empty_size, data),
 
@@ -1101,8 +1106,8 @@ TRACE_EVENT(find_free_extent,
 
 DECLARE_EVENT_CLASS(btrfs__reserve_extent,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_block_group_cache *block_group, u64 start,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_block_group_cache *block_group, u64 start,
 		 u64 len),
 
 	TP_ARGS(fs_info, block_group, start, len),
@@ -1132,8 +1137,8 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent,
 
 DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_block_group_cache *block_group, u64 start,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_block_group_cache *block_group, u64 start,
 		 u64 len),
 
 	TP_ARGS(fs_info, block_group, start, len)
@@ -1141,8 +1146,8 @@ DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,
 
 DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_block_group_cache *block_group, u64 start,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_block_group_cache *block_group, u64 start,
 		 u64 len),
 
 	TP_ARGS(fs_info, block_group, start, len)
@@ -1150,7 +1155,7 @@ DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
 
 TRACE_EVENT(btrfs_find_cluster,
 
-	TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start,
+	TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start,
 		 u64 bytes, u64 empty_size, u64 min_bytes),
 
 	TP_ARGS(block_group, start, bytes, empty_size, min_bytes),
@@ -1183,7 +1188,7 @@ TRACE_EVENT(btrfs_find_cluster,
 
 TRACE_EVENT(btrfs_failed_cluster_setup,
 
-	TP_PROTO(struct btrfs_block_group_cache *block_group),
+	TP_PROTO(const struct btrfs_block_group_cache *block_group),
 
 	TP_ARGS(block_group),
 
@@ -1200,8 +1205,9 @@ TRACE_EVENT(btrfs_failed_cluster_setup,
 
 TRACE_EVENT(btrfs_setup_cluster,
 
-	TP_PROTO(struct btrfs_block_group_cache *block_group,
-		 struct btrfs_free_cluster *cluster, u64 size, int bitmap),
+	TP_PROTO(const struct btrfs_block_group_cache *block_group,
+		 const struct btrfs_free_cluster *cluster,
+		 u64 size, int bitmap),
 
 	TP_ARGS(block_group, cluster, size, bitmap),
 
@@ -1235,12 +1241,13 @@ TRACE_EVENT(btrfs_setup_cluster,
 struct extent_state;
 TRACE_EVENT(alloc_extent_state,
 
-	TP_PROTO(struct extent_state *state, gfp_t mask, unsigned long IP),
+	TP_PROTO(const struct extent_state *state,
+		 gfp_t mask, unsigned long IP),
 
 	TP_ARGS(state, mask, IP),
 
 	TP_STRUCT__entry(
-		__field(struct extent_state *, state)
+		__field(const struct extent_state *, state)
 		__field(gfp_t, mask)
 		__field(unsigned long, ip)
 	),
@@ -1252,17 +1259,17 @@ TRACE_EVENT(alloc_extent_state,
 	),
 
 	TP_printk("state=%p mask=%s caller=%pS", __entry->state,
-		  show_gfp_flags(__entry->mask), (void *)__entry->ip)
+		  show_gfp_flags(__entry->mask), (const void *)__entry->ip)
 );
 
 TRACE_EVENT(free_extent_state,
 
-	TP_PROTO(struct extent_state *state, unsigned long IP),
+	TP_PROTO(const struct extent_state *state, unsigned long IP),
 
 	TP_ARGS(state, IP),
 
 	TP_STRUCT__entry(
-		__field(struct extent_state *, state)
+		__field(const struct extent_state *, state)
 		__field(unsigned long, ip)
 	),
 
@@ -1272,22 +1279,22 @@ TRACE_EVENT(free_extent_state,
 	),
 
 	TP_printk("state=%p caller=%pS", __entry->state,
-		  (void *)__entry->ip)
+		  (const void *)__entry->ip)
 );
 
 DECLARE_EVENT_CLASS(btrfs__work,
 
-	TP_PROTO(struct btrfs_work *work),
+	TP_PROTO(const struct btrfs_work *work),
 
 	TP_ARGS(work),
 
 	TP_STRUCT__entry_btrfs(
-		__field(	void *,	work			)
-		__field(	void *, wq			)
-		__field(	void *,	func			)
-		__field(	void *,	ordered_func		)
-		__field(	void *,	ordered_free		)
-		__field(	void *,	normal_work		)
+		__field(	const void *,	work			)
+		__field(	const void *,	wq			)
+		__field(	const void *,	func			)
+		__field(	const void *,	ordered_func		)
+		__field(	const void *,	ordered_free		)
+		__field(	const void *,	normal_work		)
 	),
 
 	TP_fast_assign_btrfs(btrfs_work_owner(work),
@@ -1312,12 +1319,12 @@ DECLARE_EVENT_CLASS(btrfs__work,
  */
 DECLARE_EVENT_CLASS(btrfs__work__done,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, void *wtag),
+	TP_PROTO(const struct btrfs_fs_info *fs_info, const void *wtag),
 
 	TP_ARGS(fs_info, wtag),
 
 	TP_STRUCT__entry_btrfs(
-		__field(	void *,	wtag			)
+		__field(	const void *,	wtag			)
 	),
 
 	TP_fast_assign_btrfs(fs_info,
@@ -1329,40 +1336,41 @@ DECLARE_EVENT_CLASS(btrfs__work__done,
 
 DEFINE_EVENT(btrfs__work, btrfs_work_queued,
 
-	TP_PROTO(struct btrfs_work *work),
+	TP_PROTO(const struct btrfs_work *work),
 
 	TP_ARGS(work)
 );
 
 DEFINE_EVENT(btrfs__work, btrfs_work_sched,
 
-	TP_PROTO(struct btrfs_work *work),
+	TP_PROTO(const struct btrfs_work *work),
 
 	TP_ARGS(work)
 );
 
 DEFINE_EVENT(btrfs__work__done, btrfs_all_work_done,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, void *wtag),
+	TP_PROTO(const struct btrfs_fs_info *fs_info, const void *wtag),
 
 	TP_ARGS(fs_info, wtag)
 );
 
 DEFINE_EVENT(btrfs__work, btrfs_ordered_sched,
 
-	TP_PROTO(struct btrfs_work *work),
+	TP_PROTO(const struct btrfs_work *work),
 
 	TP_ARGS(work)
 );
 
 DECLARE_EVENT_CLASS(btrfs__workqueue,
 
-	TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high),
+	TP_PROTO(const struct __btrfs_workqueue *wq,
+		 const char *name, int high),
 
 	TP_ARGS(wq, name, high),
 
 	TP_STRUCT__entry_btrfs(
-		__field(	void *,	wq			)
+		__field(	const void *,	wq			)
 		__string(	name,	name			)
 		__field(	int ,	high			)
 	),
@@ -1381,19 +1389,20 @@ DECLARE_EVENT_CLASS(btrfs__workqueue,
 
 DEFINE_EVENT(btrfs__workqueue, btrfs_workqueue_alloc,
 
-	TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high),
+	TP_PROTO(const struct __btrfs_workqueue *wq,
+		 const char *name, int high),
 
 	TP_ARGS(wq, name, high)
 );
 
 DECLARE_EVENT_CLASS(btrfs__workqueue_done,
 
-	TP_PROTO(struct __btrfs_workqueue *wq),
+	TP_PROTO(const struct __btrfs_workqueue *wq),
 
 	TP_ARGS(wq),
 
 	TP_STRUCT__entry_btrfs(
-		__field(	void *,	wq			)
+		__field(	const void *,	wq		)
 	),
 
 	TP_fast_assign_btrfs(btrfs_workqueue_owner(wq),
@@ -1405,7 +1414,7 @@ DECLARE_EVENT_CLASS(btrfs__workqueue_done,
 
 DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
 
-	TP_PROTO(struct __btrfs_workqueue *wq),
+	TP_PROTO(const struct __btrfs_workqueue *wq),
 
 	TP_ARGS(wq)
 );
@@ -1417,7 +1426,8 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
 
 DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
 
-	TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
+	TP_PROTO(const struct inode *inode, u64 start, u64 len,
+		 u64 reserved, int op),
 
 	TP_ARGS(inode, start, len, reserved, op),
 
@@ -1449,21 +1459,24 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
 
 DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data,
 
-	TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
+	TP_PROTO(const struct inode *inode, u64 start, u64 len,
+		 u64 reserved, int op),
 
 	TP_ARGS(inode, start, len, reserved, op)
 );
 
 DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
 
-	TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
+	TP_PROTO(const struct inode *inode, u64 start, u64 len,
+		 u64 reserved, int op),
 
 	TP_ARGS(inode, start, len, reserved, op)
 );
 
 DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, u64 ref_root, u64 reserved),
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 u64 ref_root, u64 reserved),
 
 	TP_ARGS(fs_info, ref_root, reserved),
 
@@ -1483,14 +1496,15 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
 
 DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, u64 ref_root, u64 reserved),
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 u64 ref_root, u64 reserved),
 
 	TP_ARGS(fs_info, ref_root, reserved)
 );
 
 DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_qgroup_extent_record *rec),
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_qgroup_extent_record *rec),
 
 	TP_ARGS(fs_info, rec),
 
@@ -1511,23 +1525,23 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
 
 DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_qgroup_extent_record *rec),
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_qgroup_extent_record *rec),
 
 	TP_ARGS(fs_info, rec)
 );
 
 DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info,
-		 struct btrfs_qgroup_extent_record *rec),
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct btrfs_qgroup_extent_record *rec),
 
 	TP_ARGS(fs_info, rec)
 );
 
 TRACE_EVENT(btrfs_qgroup_account_extent,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, u64 bytenr,
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 bytenr,
 		 u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots),
 
 	TP_ARGS(fs_info, bytenr, num_bytes, nr_old_roots, nr_new_roots),
@@ -1556,7 +1570,7 @@ TRACE_EVENT(btrfs_qgroup_account_extent,
 
 TRACE_EVENT(qgroup_update_counters,
 
-	TP_PROTO(struct btrfs_fs_info *fs_info, u64 qgid,
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 qgid,
 		 u64 cur_old_count, u64 cur_new_count),
 
 	TP_ARGS(fs_info, qgid, cur_old_count, cur_new_count),

From 73980becaebfd4dd3c56f2ae64d4081af2a65b27 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 28 Jun 2017 21:56:55 -0600
Subject: [PATCH 007/104] btrfs: backref, constify some arguments

This constifies a few buffers used in the backref code.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f723c11bb763..9d6474ddf674 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -299,10 +299,11 @@ static int ref_tree_add(struct ref_root *ref_tree, u64 root_id, u64 object_id,
 	return 0;
 }
 
-static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
-				struct btrfs_file_extent_item *fi,
-				u64 extent_item_pos,
-				struct extent_inode_elem **eie)
+static int check_extent_in_eb(const struct btrfs_key *key,
+			      const struct extent_buffer *eb,
+			      const struct btrfs_file_extent_item *fi,
+			      u64 extent_item_pos,
+			      struct extent_inode_elem **eie)
 {
 	u64 offset = 0;
 	struct extent_inode_elem *e;
@@ -344,9 +345,9 @@ static void free_inode_elem_list(struct extent_inode_elem *eie)
 	}
 }
 
-static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
-				u64 extent_item_pos,
-				struct extent_inode_elem **eie)
+static int find_extent_in_eb(const struct extent_buffer *eb,
+			     u64 wanted_disk_byte, u64 extent_item_pos,
+			     struct extent_inode_elem **eie)
 {
 	u64 disk_byte;
 	struct btrfs_key key;
@@ -456,7 +457,7 @@ void btrfs_prelim_ref_exit(void)
  */
 
 static int __add_prelim_ref(struct list_head *head, u64 root_id,
-			    struct btrfs_key *key, int level,
+			    const struct btrfs_key *key, int level,
 			    u64 parent, u64 wanted_disk_byte, int count,
 			    gfp_t gfp_mask)
 {
@@ -1649,7 +1650,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_inode_extref *extref;
-	struct extent_buffer *leaf;
+	const struct extent_buffer *leaf;
 	unsigned long ptr;
 
 	key.objectid = inode_objectid;
@@ -1806,7 +1807,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	u64 flags;
 	u64 size = 0;
 	u32 item_size;
-	struct extent_buffer *eb;
+	const struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
 
@@ -1874,9 +1875,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
  * next ref. after the last ref was processed, 1 is returned.
  * returns <0 on error
  */
-static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
-				   struct btrfs_key *key,
-				   struct btrfs_extent_item *ei, u32 item_size,
+static int __get_extent_inline_ref(unsigned long *ptr,
+				   const struct extent_buffer *eb,
+				   const struct btrfs_key *key,
+				   const struct btrfs_extent_item *ei,
+				   u32 item_size,
 				   struct btrfs_extent_inline_ref **out_eiref,
 				   int *out_type)
 {

From 4dae077a83dd8944ed351b09a0651c1283f46185 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 28 Jun 2017 21:56:56 -0600
Subject: [PATCH 008/104] btrfs: backref, add unode_aux_to_inode_list helper

Replacing the double cast and ternary conditional with a helper makes
the code easier on the eyes.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 9d6474ddf674..4a7a4b032c2f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -682,6 +682,14 @@ out:
 	return ret;
 }
 
+static struct extent_inode_elem *
+unode_aux_to_inode_list(struct ulist_node *node)
+{
+	if (!node)
+		return NULL;
+	return (struct extent_inode_elem *)(uintptr_t)node->aux;
+}
+
 /*
  * resolve all indirect backrefs from the list
  */
@@ -736,8 +744,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&uiter);
 		node = ulist_next(parents, &uiter);
 		ref->parent = node ? node->val : 0;
-		ref->inode_list = node ?
-			(struct extent_inode_elem *)(uintptr_t)node->aux : NULL;
+		ref->inode_list = unode_aux_to_inode_list(node);
 
 		/* additional parents require new refs being added here */
 		while ((node = ulist_next(parents, &uiter))) {
@@ -749,8 +756,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
-			new_ref->inode_list = (struct extent_inode_elem *)
-							(uintptr_t)node->aux;
+			new_ref->inode_list = unode_aux_to_inode_list(node);
 			list_add(&new_ref->list, &ref->list);
 		}
 		ulist_reinit(parents);
@@ -1476,7 +1482,7 @@ static void free_leaf_list(struct ulist *blocks)
 	while ((node = ulist_next(blocks, &uiter))) {
 		if (!node->aux)
 			continue;
-		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
+		eie = unode_aux_to_inode_list(node);
 		free_inode_elem_list(eie);
 		node->aux = 0;
 	}

From e0c476b128e37daa37d630dd68da5681e9c16bab Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 28 Jun 2017 21:56:57 -0600
Subject: [PATCH 009/104] btrfs: backref, cleanup __ namespace abuse

We typically use __ to indicate a helper routine that shouldn't be
called directly without understanding the proper context required
to do so.  We use static functions to indicate that a function is
private to a particular C file.  The backref code uses static
functions and __ prefixes on nearly everything, which makes the code
difficult to read and establishes a pattern for future code that
shouldn't be followed.  This patch drops all the unnecessary prefixes.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 225 ++++++++++++++++++++++-----------------------
 1 file changed, 109 insertions(+), 116 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 4a7a4b032c2f..3725277f6e08 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -387,7 +387,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
 /*
  * this structure records all encountered refs on the way up to the root
  */
-struct __prelim_ref {
+struct prelim_ref {
 	struct list_head list;
 	u64 root_id;
 	struct btrfs_key key_for_search;
@@ -403,7 +403,7 @@ static struct kmem_cache *btrfs_prelim_ref_cache;
 int __init btrfs_prelim_ref_init(void)
 {
 	btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
-					sizeof(struct __prelim_ref),
+					sizeof(struct prelim_ref),
 					0,
 					SLAB_MEM_SPREAD,
 					NULL);
@@ -449,19 +449,17 @@ void btrfs_prelim_ref_exit(void)
  *
  * - column 1, 3: we've the parent -> done
  * - column 2:    we take the first key from the block to find the parent
- *                (see __add_missing_keys)
+ *                (see add_missing_keys)
  * - column 4:    we use the key to find the parent
  *
  * additional information that's available but not required to find the parent
  * block might help in merging entries to gain some speed.
  */
-
-static int __add_prelim_ref(struct list_head *head, u64 root_id,
-			    const struct btrfs_key *key, int level,
-			    u64 parent, u64 wanted_disk_byte, int count,
-			    gfp_t gfp_mask)
+static int add_prelim_ref(struct list_head *head, u64 root_id,
+			  const struct btrfs_key *key, int level, u64 parent,
+			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
 {
-	struct __prelim_ref *ref;
+	struct prelim_ref *ref;
 
 	if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
 		return 0;
@@ -510,7 +508,7 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 }
 
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
-			   struct ulist *parents, struct __prelim_ref *ref,
+			   struct ulist *parents, struct prelim_ref *ref,
 			   int level, u64 time_seq, const u64 *extent_item_pos,
 			   u64 total_refs)
 {
@@ -600,11 +598,10 @@ next:
  * resolve an indirect backref in the form (root_id, key, level)
  * to a logical address
  */
-static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
-				  struct btrfs_path *path, u64 time_seq,
-				  struct __prelim_ref *ref,
-				  struct ulist *parents,
-				  const u64 *extent_item_pos, u64 total_refs)
+static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+				struct btrfs_path *path, u64 time_seq,
+				struct prelim_ref *ref, struct ulist *parents,
+				const u64 *extent_item_pos, u64 total_refs)
 {
 	struct btrfs_root *root;
 	struct btrfs_key root_key;
@@ -693,17 +690,17 @@ unode_aux_to_inode_list(struct ulist_node *node)
 /*
  * resolve all indirect backrefs from the list
  */
-static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-				   struct btrfs_path *path, u64 time_seq,
-				   struct list_head *head,
-				   const u64 *extent_item_pos, u64 total_refs,
-				   u64 root_objectid)
+static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+				 struct btrfs_path *path, u64 time_seq,
+				 struct list_head *head,
+				 const u64 *extent_item_pos, u64 total_refs,
+				 u64 root_objectid)
 {
 	int err;
 	int ret = 0;
-	struct __prelim_ref *ref;
-	struct __prelim_ref *ref_safe;
-	struct __prelim_ref *new_ref;
+	struct prelim_ref *ref;
+	struct prelim_ref *ref_safe;
+	struct prelim_ref *new_ref;
 	struct ulist *parents;
 	struct ulist_node *node;
 	struct ulist_iterator uiter;
@@ -726,9 +723,9 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			ret = BACKREF_FOUND_SHARED;
 			goto out;
 		}
-		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
-					     parents, extent_item_pos,
-					     total_refs);
+		err = resolve_indirect_ref(fs_info, path, time_seq, ref,
+					   parents, extent_item_pos,
+					   total_refs);
 		/*
 		 * we can only tolerate ENOENT,otherwise,we should catch error
 		 * and return directly.
@@ -766,8 +763,8 @@ out:
 	return ret;
 }
 
-static inline int ref_for_same_block(struct __prelim_ref *ref1,
-				     struct __prelim_ref *ref2)
+static inline int ref_for_same_block(struct prelim_ref *ref1,
+				     struct prelim_ref *ref2)
 {
 	if (ref1->level != ref2->level)
 		return 0;
@@ -788,10 +785,10 @@ static inline int ref_for_same_block(struct __prelim_ref *ref1,
 /*
  * read tree blocks and add keys where required.
  */
-static int __add_missing_keys(struct btrfs_fs_info *fs_info,
-			      struct list_head *head)
+static int add_missing_keys(struct btrfs_fs_info *fs_info,
+			    struct list_head *head)
 {
-	struct __prelim_ref *ref;
+	struct prelim_ref *ref;
 	struct extent_buffer *eb;
 
 	list_for_each_entry(ref, head, list) {
@@ -821,20 +818,20 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 /*
  * merge backrefs and adjust counts accordingly
  *
- *    FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref
+ *    FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in add_prelim_ref
  *           then we can merge more here. Additionally, we could even add a key
  *           range for the blocks we looked into to merge even more (-> replace
  *           unresolved refs by those having a parent).
  */
-static void __merge_refs(struct list_head *head, enum merge_mode mode)
+static void merge_refs(struct list_head *head, enum merge_mode mode)
 {
-	struct __prelim_ref *pos1;
+	struct prelim_ref *pos1;
 
 	list_for_each_entry(pos1, head, list) {
-		struct __prelim_ref *pos2 = pos1, *tmp;
+		struct prelim_ref *pos2 = pos1, *tmp;
 
 		list_for_each_entry_safe_continue(pos2, tmp, head, list) {
-			struct __prelim_ref *ref1 = pos1, *ref2 = pos2;
+			struct prelim_ref *ref1 = pos1, *ref2 = pos2;
 			struct extent_inode_elem *eie;
 
 			if (!ref_for_same_block(ref1, ref2))
@@ -868,9 +865,9 @@ static void __merge_refs(struct list_head *head, enum merge_mode mode)
  * add all currently queued delayed refs from this head whose seq nr is
  * smaller or equal that seq to the list
  */
-static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			      struct list_head *prefs, u64 *total_refs,
-			      u64 inum)
+static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+			    struct list_head *prefs, u64 *total_refs,
+			    u64 inum)
 {
 	struct btrfs_delayed_ref_node *node;
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
@@ -907,19 +904,18 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = __add_prelim_ref(prefs, ref->root, &op_key,
-					       ref->level + 1, 0, node->bytenr,
-					       node->ref_mod * sgn, GFP_ATOMIC);
+			ret = add_prelim_ref(prefs, ref->root, &op_key,
+					     ref->level + 1, 0, node->bytenr,
+					     node->ref_mod * sgn, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_BLOCK_REF_KEY: {
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = __add_prelim_ref(prefs, 0, NULL,
-					       ref->level + 1, ref->parent,
-					       node->bytenr,
-					       node->ref_mod * sgn, GFP_ATOMIC);
+			ret = add_prelim_ref(prefs, 0, NULL, ref->level + 1,
+					     ref->parent, node->bytenr,
+					     node->ref_mod * sgn, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_EXTENT_DATA_REF_KEY: {
@@ -939,18 +935,18 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 				break;
 			}
 
-			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
-					       node->bytenr,
-					       node->ref_mod * sgn, GFP_ATOMIC);
+			ret = add_prelim_ref(prefs, ref->root, &key, 0, 0,
+					     node->bytenr, node->ref_mod * sgn,
+					     GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_delayed_data_ref *ref;
 
 			ref = btrfs_delayed_node_to_data_ref(node);
-			ret = __add_prelim_ref(prefs, 0, NULL, 0,
-					       ref->parent, node->bytenr,
-					       node->ref_mod * sgn, GFP_ATOMIC);
+			ret = add_prelim_ref(prefs, 0, NULL, 0, ref->parent,
+					     node->bytenr, node->ref_mod * sgn,
+					     GFP_ATOMIC);
 			break;
 		}
 		default:
@@ -966,10 +962,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 /*
  * add all inline backrefs for bytenr to the list
  */
-static int __add_inline_refs(struct btrfs_path *path, u64 bytenr,
-			     int *info_level, struct list_head *prefs,
-			     struct ref_root *ref_tree,
-			     u64 *total_refs, u64 inum)
+static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
+			   int *info_level, struct list_head *prefs,
+			   struct ref_root *ref_tree,
+			   u64 *total_refs, u64 inum)
 {
 	int ret = 0;
 	int slot;
@@ -1024,9 +1020,8 @@ static int __add_inline_refs(struct btrfs_path *path, u64 bytenr,
 
 		switch (type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, 0, NULL,
-						*info_level + 1, offset,
-						bytenr, 1, GFP_NOFS);
+			ret = add_prelim_ref(prefs, 0, NULL, *info_level + 1,
+					     offset, bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_shared_data_ref *sdref;
@@ -1034,8 +1029,8 @@ static int __add_inline_refs(struct btrfs_path *path, u64 bytenr,
 
 			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
-			ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
-					       bytenr, count, GFP_NOFS);
+			ret = add_prelim_ref(prefs, 0, NULL, 0, offset,
+					     bytenr, count, GFP_NOFS);
 			if (ref_tree) {
 				if (!ret)
 					ret = ref_tree_add(ref_tree, 0, 0, 0,
@@ -1046,9 +1041,9 @@ static int __add_inline_refs(struct btrfs_path *path, u64 bytenr,
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, offset, NULL,
-					       *info_level + 1, 0,
-					       bytenr, 1, GFP_NOFS);
+			ret = add_prelim_ref(prefs, offset, NULL,
+					     *info_level + 1, 0,
+					     bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -1068,8 +1063,8 @@ static int __add_inline_refs(struct btrfs_path *path, u64 bytenr,
 			}
 
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
-					       bytenr, count, GFP_NOFS);
+			ret = add_prelim_ref(prefs, root, &key, 0, 0,
+					     bytenr, count, GFP_NOFS);
 			if (ref_tree) {
 				if (!ret)
 					ret = ref_tree_add(ref_tree, root,
@@ -1095,10 +1090,10 @@ static int __add_inline_refs(struct btrfs_path *path, u64 bytenr,
 /*
  * add all non-inline backrefs for bytenr to the list
  */
-static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
-			    struct btrfs_path *path, u64 bytenr,
-			    int info_level, struct list_head *prefs,
-			    struct ref_root *ref_tree, u64 inum)
+static int add_keyed_refs(struct btrfs_fs_info *fs_info,
+			  struct btrfs_path *path, u64 bytenr,
+			  int info_level, struct list_head *prefs,
+			  struct ref_root *ref_tree, u64 inum)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -1128,9 +1123,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 
 		switch (key.type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, 0, NULL,
-						info_level + 1, key.offset,
-						bytenr, 1, GFP_NOFS);
+			ret = add_prelim_ref(prefs, 0, NULL, info_level + 1,
+					     key.offset, bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_shared_data_ref *sdref;
@@ -1139,8 +1133,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			sdref = btrfs_item_ptr(leaf, slot,
 					      struct btrfs_shared_data_ref);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
-			ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
-						bytenr, count, GFP_NOFS);
+			ret = add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+					     bytenr, count, GFP_NOFS);
 			if (ref_tree) {
 				if (!ret)
 					ret = ref_tree_add(ref_tree, 0, 0, 0,
@@ -1151,9 +1145,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, key.offset, NULL,
-					       info_level + 1, 0,
-					       bytenr, 1, GFP_NOFS);
+			ret = add_prelim_ref(prefs, key.offset, NULL,
+					     info_level + 1, 0,
+					     bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -1174,8 +1168,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			}
 
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
-					       bytenr, count, GFP_NOFS);
+			ret = add_prelim_ref(prefs, root, &key, 0, 0,
+					     bytenr, count, GFP_NOFS);
 			if (ref_tree) {
 				if (!ret)
 					ret = ref_tree_add(ref_tree, root,
@@ -1230,7 +1224,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	int ret;
 	struct list_head prefs_delayed;
 	struct list_head prefs;
-	struct __prelim_ref *ref;
+	struct prelim_ref *ref;
 	struct extent_inode_elem *eie = NULL;
 	struct ref_root *ref_tree = NULL;
 	u64 total_refs = 0;
@@ -1311,9 +1305,9 @@ again:
 				goto again;
 			}
 			spin_unlock(&delayed_refs->lock);
-			ret = __add_delayed_refs(head, time_seq,
-						 &prefs_delayed, &total_refs,
-						 inum);
+			ret = add_delayed_refs(head, time_seq,
+					       &prefs_delayed, &total_refs,
+					       inum);
 			mutex_unlock(&head->mutex);
 			if (ret)
 				goto out;
@@ -1363,15 +1357,13 @@ again:
 		if (key.objectid == bytenr &&
 		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
-			ret = __add_inline_refs(path, bytenr,
-						&info_level, &prefs,
-						ref_tree, &total_refs,
-						inum);
+			ret = add_inline_refs(path, bytenr, &info_level,
+					      &prefs, ref_tree, &total_refs,
+					      inum);
 			if (ret)
 				goto out;
-			ret = __add_keyed_refs(fs_info, path, bytenr,
-					       info_level, &prefs,
-					       ref_tree, inum);
+			ret = add_keyed_refs(fs_info, path, bytenr, info_level,
+					     &prefs, ref_tree, inum);
 			if (ret)
 				goto out;
 		}
@@ -1380,22 +1372,22 @@ again:
 
 	list_splice_init(&prefs_delayed, &prefs);
 
-	ret = __add_missing_keys(fs_info, &prefs);
+	ret = add_missing_keys(fs_info, &prefs);
 	if (ret)
 		goto out;
 
-	__merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
+	merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
 
-	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
-				      extent_item_pos, total_refs,
-				      root_objectid);
+	ret = resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+				    extent_item_pos, total_refs,
+				    root_objectid);
 	if (ret)
 		goto out;
 
-	__merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
+	merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
 
 	while (!list_empty(&prefs)) {
-		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		ref = list_first_entry(&prefs, struct prelim_ref, list);
 		WARN_ON(ref->count < 0);
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
 			if (root_objectid && ref->root_id != root_objectid) {
@@ -1457,12 +1449,12 @@ out:
 	btrfs_free_path(path);
 	ref_root_free(ref_tree);
 	while (!list_empty(&prefs)) {
-		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		ref = list_first_entry(&prefs, struct prelim_ref, list);
 		list_del(&ref->list);
 		kmem_cache_free(btrfs_prelim_ref_cache, ref);
 	}
 	while (!list_empty(&prefs_delayed)) {
-		ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+		ref = list_first_entry(&prefs_delayed, struct prelim_ref,
 				       list);
 		list_del(&ref->list);
 		kmem_cache_free(btrfs_prelim_ref_cache, ref);
@@ -1532,9 +1524,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
  *
  * returns 0 on success, < 0 on error.
  */
-static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
-				  struct btrfs_fs_info *fs_info, u64 bytenr,
-				  u64 time_seq, struct ulist **roots)
+static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info, u64 bytenr,
+				     u64 time_seq, struct ulist **roots)
 {
 	struct ulist *tmp;
 	struct ulist_node *node = NULL;
@@ -1578,7 +1570,8 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 
 	if (!trans)
 		down_read(&fs_info->commit_root_sem);
-	ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
+	ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
+					time_seq, roots);
 	if (!trans)
 		up_read(&fs_info->commit_root_sem);
 	return ret;
@@ -1877,17 +1870,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
  * helper function to iterate extent inline refs. ptr must point to a 0 value
  * for the first call and may be modified. it is used to track state.
  * if more refs exist, 0 is returned and the next call to
- * __get_extent_inline_ref must pass the modified ptr parameter to get the
+ * get_extent_inline_ref must pass the modified ptr parameter to get the
  * next ref. after the last ref was processed, 1 is returned.
  * returns <0 on error
  */
-static int __get_extent_inline_ref(unsigned long *ptr,
-				   const struct extent_buffer *eb,
-				   const struct btrfs_key *key,
-				   const struct btrfs_extent_item *ei,
-				   u32 item_size,
-				   struct btrfs_extent_inline_ref **out_eiref,
-				   int *out_type)
+static int get_extent_inline_ref(unsigned long *ptr,
+				 const struct extent_buffer *eb,
+				 const struct btrfs_key *key,
+				 const struct btrfs_extent_item *ei,
+				 u32 item_size,
+				 struct btrfs_extent_inline_ref **out_eiref,
+				 int *out_type)
 {
 	unsigned long end;
 	u64 flags;
@@ -1930,7 +1923,7 @@ static int __get_extent_inline_ref(unsigned long *ptr,
 /*
  * reads the tree block backref for an extent. tree level and root are returned
  * through out_level and out_root. ptr must point to a 0 value for the first
- * call and may be modified (see __get_extent_inline_ref comment).
+ * call and may be modified (see get_extent_inline_ref comment).
  * returns 0 if data was provided, 1 if there was no more data to provide or
  * <0 on error.
  */
@@ -1946,7 +1939,7 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 		return 1;
 
 	while (1) {
-		ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
+		ret = get_extent_inline_ref(ptr, eb, key, ei, item_size,
 					      &eiref, &type);
 		if (ret < 0)
 			return ret;
@@ -2043,8 +2036,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 
 	ULIST_ITER_INIT(&ref_uiter);
 	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
-		ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
-					     tree_mod_seq_elem.seq, &roots);
+		ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
+						tree_mod_seq_elem.seq, &roots);
 		if (ret)
 			break;
 		ULIST_ITER_INIT(&root_uiter);

From bb739cf08e8f32ea0b4a6d2ae22466488182c2fe Mon Sep 17 00:00:00 2001
From: Edmund Nadolski <enadolski@suse.com>
Date: Wed, 28 Jun 2017 21:56:58 -0600
Subject: [PATCH 010/104] btrfs: btrfs_check_shared should manage its own
 transaction

Commit afce772e87c3 ("btrfs: fix check_shared for fiemap ioctl") added
transaction semantics around calls to btrfs_check_shared() in order to
provide accurate accounting of delayed refs. The transaction management
should be done inside btrfs_check_shared(), so that callers do not need
to manage transactions individually.

Signed-off-by: Edmund Nadolski <enadolski@suse.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c   | 30 +++++++++++++++++++-----------
 fs/btrfs/backref.h   |  4 +---
 fs/btrfs/extent_io.c | 22 +++-------------------
 3 files changed, 23 insertions(+), 33 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 3725277f6e08..35cfa388dc0b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1580,20 +1580,21 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 /**
  * btrfs_check_shared - tell us whether an extent is shared
  *
- * @trans: optional trans handle
- *
  * btrfs_check_shared uses the backref walking code but will short
  * circuit as soon as it finds a root or inode that doesn't match the
  * one passed in. This provides a significant performance benefit for
  * callers (such as fiemap) which want to know whether the extent is
  * shared but do not need a ref count.
  *
+ * This attempts to join a transaction in order to account for delayed
+ * refs; if the join fails it falls back to searching the commit root.
+ *
  * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
  */
-int btrfs_check_shared(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info, u64 root_objectid,
-		       u64 inum, u64 bytenr)
+int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_trans_handle *trans;
 	struct ulist *tmp = NULL;
 	struct ulist *roots = NULL;
 	struct ulist_iterator uiter;
@@ -1609,14 +1610,18 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
-	if (trans)
-		btrfs_get_tree_mod_seq(fs_info, &elem);
-	else
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		trans = NULL;
 		down_read(&fs_info->commit_root_sem);
+	} else {
+		btrfs_get_tree_mod_seq(fs_info, &elem);
+	}
+
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
-					roots, NULL, root_objectid, inum, 1);
+					roots, NULL, root->objectid, inum, 1);
 		if (ret == BACKREF_FOUND_SHARED) {
 			/* this is the only condition under which we return 1 */
 			ret = 1;
@@ -1631,10 +1636,13 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
 		bytenr = node->val;
 		cond_resched();
 	}
-	if (trans)
+
+	if (trans) {
 		btrfs_put_tree_mod_seq(fs_info, &elem);
-	else
+		btrfs_end_transaction(trans);
+	} else {
 		up_read(&fs_info->commit_root_sem);
+	}
 	ulist_free(tmp);
 	ulist_free(roots);
 	return ret;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 9c41fbac3009..f9428aaaa77a 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -68,9 +68,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 			  u64 start_off, struct btrfs_path *path,
 			  struct btrfs_inode_extref **ret_extref,
 			  u64 *found_off);
-int btrfs_check_shared(struct btrfs_trans_handle *trans,
-		       struct btrfs_fs_info *fs_info, u64 root_objectid,
-		       u64 inum, u64 bytenr);
+int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr);
 
 int __init btrfs_prelim_ref_init(void);
 void btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d6f761b4fae0..7dd1b2dc7c68 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,7 +20,6 @@
 #include "locking.h"
 #include "rcu-string.h"
 #include "backref.h"
-#include "transaction.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -4606,24 +4605,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			flags |= (FIEMAP_EXTENT_DELALLOC |
 				  FIEMAP_EXTENT_UNKNOWN);
 		} else if (fieinfo->fi_extents_max) {
-			struct btrfs_trans_handle *trans;
-
 			u64 bytenr = em->block_start -
 				(em->start - em->orig_start);
 
 			disko = em->block_start + offset_in_extent;
 
-			/*
-			 * We need a trans handle to get delayed refs
-			 */
-			trans = btrfs_join_transaction(root);
-			/*
-			 * It's OK if we can't start a trans we can still check
-			 * from commit_root
-			 */
-			if (IS_ERR(trans))
-				trans = NULL;
-
 			/*
 			 * As btrfs supports shared space, this information
 			 * can be exported to userspace tools via
@@ -4631,11 +4617,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			 * then we're just getting a count and we can skip the
 			 * lookup stuff.
 			 */
-			ret = btrfs_check_shared(trans, root->fs_info,
-					root->objectid,
-					btrfs_ino(BTRFS_I(inode)), bytenr);
-			if (trans)
-				btrfs_end_transaction(trans);
+			ret = btrfs_check_shared(root,
+						 btrfs_ino(BTRFS_I(inode)),
+						 bytenr);
 			if (ret < 0)
 				goto out_free;
 			if (ret)

From f6954245d9e17902a66a1253d2a3afc05e335172 Mon Sep 17 00:00:00 2001
From: Edmund Nadolski <enadolski@suse.com>
Date: Wed, 28 Jun 2017 21:56:59 -0600
Subject: [PATCH 011/104] btrfs: remove ref_tree implementation from backref.c

Commit afce772e87c3 ("btrfs: fix check_shared for fiemap ioctl") added
the ref_tree code in backref.c to reduce backref searching for
shared extents under the FIEMAP ioctl. This code will not be
compatible with the upcoming rbtree changes for improved backref
searching, so this patch removes the ref_tree code.  The rbtree
changes will provide the equivalent functionality for FIEMAP.

The above commit also introduced transaction semantics around calls to
btrfs_check_shared() in order to accurately account for delayed refs.
This functionality needs to be retained, so a complete revert of the
above commit is not desirable. This patch therefore removes only the
ref_tree portion of that commit and keeps the transaction portion.

Signed-off-by: Edmund Nadolski <enadolski@suse.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 355 +--------------------------------------------
 1 file changed, 7 insertions(+), 348 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 35cfa388dc0b..6cac5ab8d5e0 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -40,265 +40,6 @@ struct extent_inode_elem {
 	struct extent_inode_elem *next;
 };
 
-/*
- * ref_root is used as the root of the ref tree that hold a collection
- * of unique references.
- */
-struct ref_root {
-	struct rb_root rb_root;
-
-	/*
-	 * The unique_refs represents the number of ref_nodes with a positive
-	 * count stored in the tree. Even if a ref_node (the count is greater
-	 * than one) is added, the unique_refs will only increase by one.
-	 */
-	unsigned int unique_refs;
-};
-
-/* ref_node is used to store a unique reference to the ref tree. */
-struct ref_node {
-	struct rb_node rb_node;
-
-	/* For NORMAL_REF, otherwise all these fields should be set to 0 */
-	u64 root_id;
-	u64 object_id;
-	u64 offset;
-
-	/* For SHARED_REF, otherwise parent field should be set to 0 */
-	u64 parent;
-
-	/* Ref to the ref_mod of btrfs_delayed_ref_node */
-	int ref_mod;
-};
-
-/* Dynamically allocate and initialize a ref_root */
-static struct ref_root *ref_root_alloc(void)
-{
-	struct ref_root *ref_tree;
-
-	ref_tree = kmalloc(sizeof(*ref_tree), GFP_NOFS);
-	if (!ref_tree)
-		return NULL;
-
-	ref_tree->rb_root = RB_ROOT;
-	ref_tree->unique_refs = 0;
-
-	return ref_tree;
-}
-
-/* Free all nodes in the ref tree, and reinit ref_root */
-static void ref_root_fini(struct ref_root *ref_tree)
-{
-	struct ref_node *node;
-	struct rb_node *next;
-
-	while ((next = rb_first(&ref_tree->rb_root)) != NULL) {
-		node = rb_entry(next, struct ref_node, rb_node);
-		rb_erase(next, &ref_tree->rb_root);
-		kfree(node);
-	}
-
-	ref_tree->rb_root = RB_ROOT;
-	ref_tree->unique_refs = 0;
-}
-
-static void ref_root_free(struct ref_root *ref_tree)
-{
-	if (!ref_tree)
-		return;
-
-	ref_root_fini(ref_tree);
-	kfree(ref_tree);
-}
-
-/*
- * Compare ref_node with (root_id, object_id, offset, parent)
- *
- * The function compares two ref_node a and b. It returns an integer less
- * than, equal to, or greater than zero , respectively, to be less than, to
- * equal, or be greater than b.
- */
-static int ref_node_cmp(struct ref_node *a, struct ref_node *b)
-{
-	if (a->root_id < b->root_id)
-		return -1;
-	else if (a->root_id > b->root_id)
-		return 1;
-
-	if (a->object_id < b->object_id)
-		return -1;
-	else if (a->object_id > b->object_id)
-		return 1;
-
-	if (a->offset < b->offset)
-		return -1;
-	else if (a->offset > b->offset)
-		return 1;
-
-	if (a->parent < b->parent)
-		return -1;
-	else if (a->parent > b->parent)
-		return 1;
-
-	return 0;
-}
-
-/*
- * Search ref_node with (root_id, object_id, offset, parent) in the tree
- *
- * if found, the pointer of the ref_node will be returned;
- * if not found, NULL will be returned and pos will point to the rb_node for
- * insert, pos_parent will point to pos'parent for insert;
-*/
-static struct ref_node *__ref_tree_search(struct ref_root *ref_tree,
-					  struct rb_node ***pos,
-					  struct rb_node **pos_parent,
-					  u64 root_id, u64 object_id,
-					  u64 offset, u64 parent)
-{
-	struct ref_node *cur = NULL;
-	struct ref_node entry;
-	int ret;
-
-	entry.root_id = root_id;
-	entry.object_id = object_id;
-	entry.offset = offset;
-	entry.parent = parent;
-
-	*pos = &ref_tree->rb_root.rb_node;
-
-	while (**pos) {
-		*pos_parent = **pos;
-		cur = rb_entry(*pos_parent, struct ref_node, rb_node);
-
-		ret = ref_node_cmp(cur, &entry);
-		if (ret > 0)
-			*pos = &(**pos)->rb_left;
-		else if (ret < 0)
-			*pos = &(**pos)->rb_right;
-		else
-			return cur;
-	}
-
-	return NULL;
-}
-
-/*
- * Insert a ref_node to the ref tree
- * @pos used for specifiy the position to insert
- * @pos_parent for specifiy pos's parent
- *
- * success, return 0;
- * ref_node already exists, return -EEXIST;
-*/
-static int ref_tree_insert(struct ref_root *ref_tree, struct rb_node **pos,
-			   struct rb_node *pos_parent, struct ref_node *ins)
-{
-	struct rb_node **p = NULL;
-	struct rb_node *parent = NULL;
-	struct ref_node *cur = NULL;
-
-	if (!pos) {
-		cur = __ref_tree_search(ref_tree, &p, &parent, ins->root_id,
-					ins->object_id, ins->offset,
-					ins->parent);
-		if (cur)
-			return -EEXIST;
-	} else {
-		p = pos;
-		parent = pos_parent;
-	}
-
-	rb_link_node(&ins->rb_node, parent, p);
-	rb_insert_color(&ins->rb_node, &ref_tree->rb_root);
-
-	return 0;
-}
-
-/* Erase and free ref_node, caller should update ref_root->unique_refs */
-static void ref_tree_remove(struct ref_root *ref_tree, struct ref_node *node)
-{
-	rb_erase(&node->rb_node, &ref_tree->rb_root);
-	kfree(node);
-}
-
-/*
- * Update ref_root->unique_refs
- *
- * Call __ref_tree_search
- *	1. if ref_node doesn't exist, ref_tree_insert this node, and update
- *	ref_root->unique_refs:
- *		if ref_node->ref_mod > 0, ref_root->unique_refs++;
- *		if ref_node->ref_mod < 0, do noting;
- *
- *	2. if ref_node is found, then get origin ref_node->ref_mod, and update
- *	ref_node->ref_mod.
- *		if ref_node->ref_mod is equal to 0,then call ref_tree_remove
- *
- *		according to origin_mod and new_mod, update ref_root->items
- *		+----------------+--------------+-------------+
- *		|		 |new_count <= 0|new_count > 0|
- *		+----------------+--------------+-------------+
- *		|origin_count < 0|       0      |      1      |
- *		+----------------+--------------+-------------+
- *		|origin_count > 0|      -1      |      0      |
- *		+----------------+--------------+-------------+
- *
- * In case of allocation failure, -ENOMEM is returned and the ref_tree stays
- * unaltered.
- * Success, return 0
- */
-static int ref_tree_add(struct ref_root *ref_tree, u64 root_id, u64 object_id,
-			u64 offset, u64 parent, int count)
-{
-	struct ref_node *node = NULL;
-	struct rb_node **pos = NULL;
-	struct rb_node *pos_parent = NULL;
-	int origin_count;
-	int ret;
-
-	if (!count)
-		return 0;
-
-	node = __ref_tree_search(ref_tree, &pos, &pos_parent, root_id,
-				 object_id, offset, parent);
-	if (node == NULL) {
-		node = kmalloc(sizeof(*node), GFP_NOFS);
-		if (!node)
-			return -ENOMEM;
-
-		node->root_id = root_id;
-		node->object_id = object_id;
-		node->offset = offset;
-		node->parent = parent;
-		node->ref_mod = count;
-
-		ret = ref_tree_insert(ref_tree, pos, pos_parent, node);
-		ASSERT(!ret);
-		if (ret) {
-			kfree(node);
-			return ret;
-		}
-
-		ref_tree->unique_refs += node->ref_mod > 0 ? 1 : 0;
-
-		return 0;
-	}
-
-	origin_count = node->ref_mod;
-	node->ref_mod += count;
-
-	if (node->ref_mod > 0)
-		ref_tree->unique_refs += origin_count > 0 ? 0 : 1;
-	else if (node->ref_mod <= 0)
-		ref_tree->unique_refs += origin_count > 0 ? -1 : 0;
-
-	if (!node->ref_mod)
-		ref_tree_remove(ref_tree, node);
-
-	return 0;
-}
-
 static int check_extent_in_eb(const struct btrfs_key *key,
 			      const struct extent_buffer *eb,
 			      const struct btrfs_file_extent_item *fi,
@@ -964,7 +705,6 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  */
 static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 			   int *info_level, struct list_head *prefs,
-			   struct ref_root *ref_tree,
 			   u64 *total_refs, u64 inum)
 {
 	int ret = 0;
@@ -1031,13 +771,6 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 			ret = add_prelim_ref(prefs, 0, NULL, 0, offset,
 					     bytenr, count, GFP_NOFS);
-			if (ref_tree) {
-				if (!ret)
-					ret = ref_tree_add(ref_tree, 0, 0, 0,
-							   bytenr, count);
-				if (!ret && ref_tree->unique_refs > 1)
-					ret = BACKREF_FOUND_SHARED;
-			}
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
@@ -1065,15 +798,6 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = add_prelim_ref(prefs, root, &key, 0, 0,
 					     bytenr, count, GFP_NOFS);
-			if (ref_tree) {
-				if (!ret)
-					ret = ref_tree_add(ref_tree, root,
-							   key.objectid,
-							   key.offset, 0,
-							   count);
-				if (!ret && ref_tree->unique_refs > 1)
-					ret = BACKREF_FOUND_SHARED;
-			}
 			break;
 		}
 		default:
@@ -1092,8 +816,7 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
  */
 static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			  struct btrfs_path *path, u64 bytenr,
-			  int info_level, struct list_head *prefs,
-			  struct ref_root *ref_tree, u64 inum)
+			  int info_level, struct list_head *prefs, u64 inum)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -1135,13 +858,6 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 			ret = add_prelim_ref(prefs, 0, NULL, 0, key.offset,
 					     bytenr, count, GFP_NOFS);
-			if (ref_tree) {
-				if (!ret)
-					ret = ref_tree_add(ref_tree, 0, 0, 0,
-							   bytenr, count);
-				if (!ret && ref_tree->unique_refs > 1)
-					ret = BACKREF_FOUND_SHARED;
-			}
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
@@ -1170,15 +886,6 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = add_prelim_ref(prefs, root, &key, 0, 0,
 					     bytenr, count, GFP_NOFS);
-			if (ref_tree) {
-				if (!ret)
-					ret = ref_tree_add(ref_tree, root,
-							   key.objectid,
-							   key.offset, 0,
-							   count);
-				if (!ret && ref_tree->unique_refs > 1)
-					ret = BACKREF_FOUND_SHARED;
-			}
 			break;
 		}
 		default:
@@ -1205,16 +912,13 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
  * commit root.
  * The special case is for qgroup to search roots in commit_transaction().
  *
- * If check_shared is set to 1, any extent has more than one ref item, will
- * be returned BACKREF_FOUND_SHARED immediately.
- *
  * FIXME some caching might speed things up
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
 			     u64 time_seq, struct ulist *refs,
 			     struct ulist *roots, const u64 *extent_item_pos,
-			     u64 root_objectid, u64 inum, int check_shared)
+			     u64 root_objectid, u64 inum)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -1226,7 +930,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct list_head prefs;
 	struct prelim_ref *ref;
 	struct extent_inode_elem *eie = NULL;
-	struct ref_root *ref_tree = NULL;
 	u64 total_refs = 0;
 
 	INIT_LIST_HEAD(&prefs);
@@ -1258,18 +961,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 again:
 	head = NULL;
 
-	if (check_shared) {
-		if (!ref_tree) {
-			ref_tree = ref_root_alloc();
-			if (!ref_tree) {
-				ret = -ENOMEM;
-				goto out;
-			}
-		} else {
-			ref_root_fini(ref_tree);
-		}
-	}
-
 	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -1314,36 +1005,6 @@ again:
 		} else {
 			spin_unlock(&delayed_refs->lock);
 		}
-
-		if (check_shared && !list_empty(&prefs_delayed)) {
-			/*
-			 * Add all delay_ref to the ref_tree and check if there
-			 * are multiple ref items added.
-			 */
-			list_for_each_entry(ref, &prefs_delayed, list) {
-				if (ref->key_for_search.type) {
-					ret = ref_tree_add(ref_tree,
-						ref->root_id,
-						ref->key_for_search.objectid,
-						ref->key_for_search.offset,
-						0, ref->count);
-					if (ret)
-						goto out;
-				} else {
-					ret = ref_tree_add(ref_tree, 0, 0, 0,
-						     ref->parent, ref->count);
-					if (ret)
-						goto out;
-				}
-
-			}
-
-			if (ref_tree->unique_refs > 1) {
-				ret = BACKREF_FOUND_SHARED;
-				goto out;
-			}
-
-		}
 	}
 
 	if (path->slots[0]) {
@@ -1358,12 +1019,11 @@ again:
 		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = add_inline_refs(path, bytenr, &info_level,
-					      &prefs, ref_tree, &total_refs,
-					      inum);
+					      &prefs, &total_refs, inum);
 			if (ret)
 				goto out;
 			ret = add_keyed_refs(fs_info, path, bytenr, info_level,
-					     &prefs, ref_tree, inum);
+					     &prefs, inum);
 			if (ret)
 				goto out;
 		}
@@ -1447,7 +1107,6 @@ again:
 
 out:
 	btrfs_free_path(path);
-	ref_root_free(ref_tree);
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct prelim_ref, list);
 		list_del(&ref->list);
@@ -1502,7 +1161,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
-				*leafs, NULL, extent_item_pos, 0, 0, 0);
+				*leafs, NULL, extent_item_pos, 0, 0);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1545,7 +1204,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
-					tmp, *roots, NULL, 0, 0, 0);
+					tmp, *roots, NULL, 0, 0);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
@@ -1621,7 +1280,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
-					roots, NULL, root->objectid, inum, 1);
+					roots, NULL, root->objectid, inum);
 		if (ret == BACKREF_FOUND_SHARED) {
 			/* this is the only condition under which we return 1 */
 			ret = 1;

From 86d5f994425252d8a40e2184c94a2682ae8ecfbf Mon Sep 17 00:00:00 2001
From: Edmund Nadolski <enadolski@suse.com>
Date: Wed, 12 Jul 2017 16:20:06 -0600
Subject: [PATCH 012/104] btrfs: convert preliminary reference tracking to use
 rbtrees

It's been known for a while that the use of multiple lists
that are periodically merged was an algorithmic problem within
btrfs.  There are several workloads that don't complete in any
reasonable amount of time (e.g. btrfs/130) and others that cause
soft lockups.

The solution is to use a set of rbtrees that do insertion merging
for both indirect and direct refs, with the former converting
refs into the latter.  The result is that a btrfs/130 workload that
used to take several hours now takes about half of that time. This
runtime still isn't acceptable and a future patch will address that
by moving the rbtrees higher in the stack so the lookups can be
shared across multiple calls to find_parent_nodes.

Signed-off-by: Edmund Nadolski <enadolski@suse.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 442 +++++++++++++++++++++++++++++----------------
 1 file changed, 285 insertions(+), 157 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6cac5ab8d5e0..baf907adede1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -26,11 +26,6 @@
 #include "delayed-ref.h"
 #include "locking.h"
 
-enum merge_mode {
-	MERGE_IDENTICAL_KEYS = 1,
-	MERGE_IDENTICAL_PARENTS,
-};
-
 /* Just an arbitrary number so we can be sure this happened */
 #define BACKREF_FOUND_SHARED 6
 
@@ -129,7 +124,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
  * this structure records all encountered refs on the way up to the root
  */
 struct prelim_ref {
-	struct list_head list;
+	struct rb_node rbnode;
 	u64 root_id;
 	struct btrfs_key key_for_search;
 	int level;
@@ -139,6 +134,18 @@ struct prelim_ref {
 	u64 wanted_disk_byte;
 };
 
+struct preftree {
+	struct rb_root root;
+};
+
+#define PREFTREE_INIT	{ .root = RB_ROOT }
+
+struct preftrees {
+	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
+	struct preftree indirect;  /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
+	struct preftree indirect_missing_keys;
+};
+
 static struct kmem_cache *btrfs_prelim_ref_cache;
 
 int __init btrfs_prelim_ref_init(void)
@@ -158,6 +165,108 @@ void btrfs_prelim_ref_exit(void)
 	kmem_cache_destroy(btrfs_prelim_ref_cache);
 }
 
+static void free_pref(struct prelim_ref *ref)
+{
+	kmem_cache_free(btrfs_prelim_ref_cache, ref);
+}
+
+/*
+ * Return 0 when both refs are for the same block (and can be merged).
+ * A -1 return indicates ref1 is a 'lower' block than ref2, while 1
+ * indicates a 'higher' block.
+ */
+static int prelim_ref_compare(struct prelim_ref *ref1,
+			      struct prelim_ref *ref2)
+{
+	if (ref1->level < ref2->level)
+		return -1;
+	if (ref1->level > ref2->level)
+		return 1;
+	if (ref1->root_id < ref2->root_id)
+		return -1;
+	if (ref1->root_id > ref2->root_id)
+		return 1;
+	if (ref1->key_for_search.type < ref2->key_for_search.type)
+		return -1;
+	if (ref1->key_for_search.type > ref2->key_for_search.type)
+		return 1;
+	if (ref1->key_for_search.objectid < ref2->key_for_search.objectid)
+		return -1;
+	if (ref1->key_for_search.objectid > ref2->key_for_search.objectid)
+		return 1;
+	if (ref1->key_for_search.offset < ref2->key_for_search.offset)
+		return -1;
+	if (ref1->key_for_search.offset > ref2->key_for_search.offset)
+		return 1;
+	if (ref1->parent < ref2->parent)
+		return -1;
+	if (ref1->parent > ref2->parent)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Add @newref to the @root rbtree, merging identical refs.
+ *
+ * Callers should assumed that newref has been freed after calling.
+ */
+static void prelim_ref_insert(struct preftree *preftree,
+			      struct prelim_ref *newref)
+{
+	struct rb_root *root;
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct prelim_ref *ref;
+	int result;
+
+	root = &preftree->root;
+	p = &root->rb_node;
+
+	while (*p) {
+		parent = *p;
+		ref = rb_entry(parent, struct prelim_ref, rbnode);
+		result = prelim_ref_compare(ref, newref);
+		if (result < 0) {
+			p = &(*p)->rb_left;
+		} else if (result > 0) {
+			p = &(*p)->rb_right;
+		} else {
+			/* Identical refs, merge them and free @newref */
+			struct extent_inode_elem *eie = ref->inode_list;
+
+			while (eie && eie->next)
+				eie = eie->next;
+
+			if (!eie)
+				ref->inode_list = newref->inode_list;
+			else
+				eie->next = newref->inode_list;
+			ref->count += newref->count;
+			free_pref(newref);
+			return;
+		}
+	}
+
+	rb_link_node(&newref->rbnode, parent, p);
+	rb_insert_color(&newref->rbnode, root);
+}
+
+/*
+ * Release the entire tree.  We don't care about internal consistency so
+ * just free everything and then reset the tree root.
+ */
+static void prelim_release(struct preftree *preftree)
+{
+	struct prelim_ref *ref, *next_ref;
+
+	rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
+					     rbnode)
+		free_pref(ref);
+
+	preftree->root = RB_ROOT;
+}
+
 /*
  * the rules for all callers of this function are:
  * - obtaining the parent is the goal
@@ -196,7 +305,7 @@ void btrfs_prelim_ref_exit(void)
  * additional information that's available but not required to find the parent
  * block might help in merging entries to gain some speed.
  */
-static int add_prelim_ref(struct list_head *head, u64 root_id,
+static int add_prelim_ref(struct preftree *preftree, u64 root_id,
 			  const struct btrfs_key *key, int level, u64 parent,
 			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
 {
@@ -243,11 +352,32 @@ static int add_prelim_ref(struct list_head *head, u64 root_id,
 	ref->count = count;
 	ref->parent = parent;
 	ref->wanted_disk_byte = wanted_disk_byte;
-	list_add_tail(&ref->list, head);
+	prelim_ref_insert(preftree, ref);
 
 	return 0;
 }
 
+/* direct refs use root == 0, key == NULL */
+static int add_direct_ref(struct preftrees *preftrees, int level, u64 parent,
+			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+{
+	return add_prelim_ref(&preftrees->direct, 0, NULL, level, parent,
+			      wanted_disk_byte, count, gfp_mask);
+}
+
+/* indirect refs use parent == 0 */
+static int add_indirect_ref(struct preftrees *preftrees, u64 root_id,
+			    const struct btrfs_key *key, int level,
+			    u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+{
+	struct preftree *tree = &preftrees->indirect;
+
+	if (!key)
+		tree = &preftrees->indirect_missing_keys;
+	return add_prelim_ref(tree, root_id, key, level, 0,
+			      wanted_disk_byte, count, gfp_mask);
+}
+
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			   struct ulist *parents, struct prelim_ref *ref,
 			   int level, u64 time_seq, const u64 *extent_item_pos,
@@ -429,38 +559,63 @@ unode_aux_to_inode_list(struct ulist_node *node)
 }
 
 /*
- * resolve all indirect backrefs from the list
+ * We maintain three separate rbtrees: one for direct refs, one for
+ * indirect refs which have a key, and one for indirect refs which do not
+ * have a key. Each tree does merge on insertion.
+ *
+ * Once all of the references are located, we iterate over the tree of
+ * indirect refs with missing keys. An appropriate key is located and
+ * the ref is moved onto the tree for indirect refs. After all missing
+ * keys are thus located, we iterate over the indirect ref tree, resolve
+ * each reference, and then insert the resolved reference onto the
+ * direct tree (merging there too).
+ *
+ * New backrefs (i.e., for parent nodes) are added to the appropriate
+ * rbtree as they are encountered. The new backrefs are subsequently
+ * resolved as above.
  */
 static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				 struct btrfs_path *path, u64 time_seq,
-				 struct list_head *head,
+				 struct preftrees *preftrees,
 				 const u64 *extent_item_pos, u64 total_refs,
 				 u64 root_objectid)
 {
 	int err;
 	int ret = 0;
-	struct prelim_ref *ref;
-	struct prelim_ref *ref_safe;
-	struct prelim_ref *new_ref;
 	struct ulist *parents;
 	struct ulist_node *node;
 	struct ulist_iterator uiter;
+	struct rb_node *rnode;
 
 	parents = ulist_alloc(GFP_NOFS);
 	if (!parents)
 		return -ENOMEM;
 
 	/*
-	 * _safe allows us to insert directly after the current item without
-	 * iterating over the newly inserted items.
-	 * we're also allowed to re-assign ref during iteration.
+	 * We could trade memory usage for performance here by iterating
+	 * the tree, allocating new refs for each insertion, and then
+	 * freeing the entire indirect tree when we're done.  In some test
+	 * cases, the tree can grow quite large (~200k objects).
 	 */
-	list_for_each_entry_safe(ref, ref_safe, head, list) {
-		if (ref->parent)	/* already direct */
-			continue;
-		if (ref->count == 0)
+	while ((rnode = rb_first(&preftrees->indirect.root))) {
+		struct prelim_ref *ref;
+
+		ref = rb_entry(rnode, struct prelim_ref, rbnode);
+		if (WARN(ref->parent,
+			 "BUG: direct ref found in indirect tree")) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		rb_erase(&ref->rbnode, &preftrees->indirect.root);
+
+		if (ref->count == 0) {
+			free_pref(ref);
 			continue;
+		}
+
 		if (root_objectid && ref->root_id != root_objectid) {
+			free_pref(ref);
 			ret = BACKREF_FOUND_SHARED;
 			goto out;
 		}
@@ -472,8 +627,10 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		 * and return directly.
 		 */
 		if (err == -ENOENT) {
+			prelim_ref_insert(&preftrees->direct, ref);
 			continue;
 		} else if (err) {
+			free_pref(ref);
 			ret = err;
 			goto out;
 		}
@@ -484,19 +641,26 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		ref->parent = node ? node->val : 0;
 		ref->inode_list = unode_aux_to_inode_list(node);
 
-		/* additional parents require new refs being added here */
+		/* Add a prelim_ref(s) for any other parent(s). */
 		while ((node = ulist_next(parents, &uiter))) {
+			struct prelim_ref *new_ref;
+
 			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
 						   GFP_NOFS);
 			if (!new_ref) {
+				free_pref(ref);
 				ret = -ENOMEM;
 				goto out;
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
 			new_ref->inode_list = unode_aux_to_inode_list(node);
-			list_add(&new_ref->list, &ref->list);
+			prelim_ref_insert(&preftrees->direct, new_ref);
 		}
+
+		/* Now it's a direct ref, put it in the the direct tree */
+		prelim_ref_insert(&preftrees->direct, ref);
+
 		ulist_reinit(parents);
 	}
 out:
@@ -504,44 +668,31 @@ out:
 	return ret;
 }
 
-static inline int ref_for_same_block(struct prelim_ref *ref1,
-				     struct prelim_ref *ref2)
-{
-	if (ref1->level != ref2->level)
-		return 0;
-	if (ref1->root_id != ref2->root_id)
-		return 0;
-	if (ref1->key_for_search.type != ref2->key_for_search.type)
-		return 0;
-	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
-		return 0;
-	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
-		return 0;
-	if (ref1->parent != ref2->parent)
-		return 0;
-
-	return 1;
-}
-
 /*
  * read tree blocks and add keys where required.
  */
 static int add_missing_keys(struct btrfs_fs_info *fs_info,
-			    struct list_head *head)
+			    struct preftrees *preftrees)
 {
 	struct prelim_ref *ref;
 	struct extent_buffer *eb;
+	struct preftree *tree = &preftrees->indirect_missing_keys;
+	struct rb_node *node;
 
-	list_for_each_entry(ref, head, list) {
-		if (ref->parent)
-			continue;
-		if (ref->key_for_search.type)
-			continue;
+	while ((node = rb_first(&tree->root))) {
+		ref = rb_entry(node, struct prelim_ref, rbnode);
+		rb_erase(node, &tree->root);
+
+		BUG_ON(ref->parent);	/* should not be a direct ref */
+		BUG_ON(ref->key_for_search.type);
 		BUG_ON(!ref->wanted_disk_byte);
+
 		eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0);
 		if (IS_ERR(eb)) {
+			free_pref(ref);
 			return PTR_ERR(eb);
 		} else if (!extent_buffer_uptodate(eb)) {
+			free_pref(ref);
 			free_extent_buffer(eb);
 			return -EIO;
 		}
@@ -552,73 +703,31 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
 		btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
+		prelim_ref_insert(&preftrees->indirect, ref);
 	}
 	return 0;
 }
 
-/*
- * merge backrefs and adjust counts accordingly
- *
- *    FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in add_prelim_ref
- *           then we can merge more here. Additionally, we could even add a key
- *           range for the blocks we looked into to merge even more (-> replace
- *           unresolved refs by those having a parent).
- */
-static void merge_refs(struct list_head *head, enum merge_mode mode)
-{
-	struct prelim_ref *pos1;
-
-	list_for_each_entry(pos1, head, list) {
-		struct prelim_ref *pos2 = pos1, *tmp;
-
-		list_for_each_entry_safe_continue(pos2, tmp, head, list) {
-			struct prelim_ref *ref1 = pos1, *ref2 = pos2;
-			struct extent_inode_elem *eie;
-
-			if (!ref_for_same_block(ref1, ref2))
-				continue;
-			if (mode == MERGE_IDENTICAL_KEYS) {
-				if (!ref1->parent && ref2->parent)
-					swap(ref1, ref2);
-			} else {
-				if (ref1->parent != ref2->parent)
-					continue;
-			}
-
-			eie = ref1->inode_list;
-			while (eie && eie->next)
-				eie = eie->next;
-			if (eie)
-				eie->next = ref2->inode_list;
-			else
-				ref1->inode_list = ref2->inode_list;
-			ref1->count += ref2->count;
-
-			list_del(&ref2->list);
-			kmem_cache_free(btrfs_prelim_ref_cache, ref2);
-			cond_resched();
-		}
-
-	}
-}
-
 /*
  * add all currently queued delayed refs from this head whose seq nr is
  * smaller or equal that seq to the list
  */
 static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			    struct list_head *prefs, u64 *total_refs,
+			    struct preftrees *preftrees, u64 *total_refs,
 			    u64 inum)
 {
 	struct btrfs_delayed_ref_node *node;
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct btrfs_key key;
-	struct btrfs_key op_key = {0};
+	struct btrfs_key tmp_op_key;
+	struct btrfs_key *op_key = NULL;
 	int sgn;
 	int ret = 0;
 
-	if (extent_op && extent_op->update_key)
-		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
+	if (extent_op && extent_op->update_key) {
+		btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
+		op_key = &tmp_op_key;
+	}
 
 	spin_lock(&head->lock);
 	list_for_each_entry(node, &head->ref_list, list) {
@@ -642,24 +751,30 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		*total_refs += (node->ref_mod * sgn);
 		switch (node->type) {
 		case BTRFS_TREE_BLOCK_REF_KEY: {
+			/* NORMAL INDIRECT METADATA backref */
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = add_prelim_ref(prefs, ref->root, &op_key,
-					     ref->level + 1, 0, node->bytenr,
-					     node->ref_mod * sgn, GFP_ATOMIC);
+			ret = add_indirect_ref(preftrees, ref->root, &tmp_op_key,
+					       ref->level + 1, node->bytenr,
+					       node->ref_mod * sgn,
+					       GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_BLOCK_REF_KEY: {
+			/* SHARED DIRECT METADATA backref */
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = add_prelim_ref(prefs, 0, NULL, ref->level + 1,
+
+			ret = add_direct_ref(preftrees, ref->level + 1,
 					     ref->parent, node->bytenr,
-					     node->ref_mod * sgn, GFP_ATOMIC);
+					     node->ref_mod * sgn,
+					     GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_EXTENT_DATA_REF_KEY: {
+			/* NORMAL INDIRECT DATA backref */
 			struct btrfs_delayed_data_ref *ref;
 			ref = btrfs_delayed_node_to_data_ref(node);
 
@@ -676,17 +791,21 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 				break;
 			}
 
-			ret = add_prelim_ref(prefs, ref->root, &key, 0, 0,
-					     node->bytenr, node->ref_mod * sgn,
-					     GFP_ATOMIC);
+			ret = add_indirect_ref(preftrees, ref->root, &key, 0,
+					       node->bytenr,
+					       node->ref_mod * sgn,
+					       GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
+			/* SHARED DIRECT FULL backref */
 			struct btrfs_delayed_data_ref *ref;
 
 			ref = btrfs_delayed_node_to_data_ref(node);
-			ret = add_prelim_ref(prefs, 0, NULL, 0, ref->parent,
-					     node->bytenr, node->ref_mod * sgn,
+
+			ret = add_direct_ref(preftrees, 0, ref->parent,
+					     node->bytenr,
+					     node->ref_mod * sgn,
 					     GFP_ATOMIC);
 			break;
 		}
@@ -704,7 +823,7 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  * add all inline backrefs for bytenr to the list
  */
 static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
-			   int *info_level, struct list_head *prefs,
+			   int *info_level, struct preftrees *preftrees,
 			   u64 *total_refs, u64 inum)
 {
 	int ret = 0;
@@ -760,8 +879,8 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 
 		switch (type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, 0, NULL, *info_level + 1,
-					     offset, bytenr, 1, GFP_NOFS);
+			ret = add_direct_ref(preftrees, *info_level + 1, offset,
+					     bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_shared_data_ref *sdref;
@@ -769,14 +888,15 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 
 			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
-			ret = add_prelim_ref(prefs, 0, NULL, 0, offset,
+
+			ret = add_direct_ref(preftrees, 0, offset,
 					     bytenr, count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, offset, NULL,
-					     *info_level + 1, 0,
-					     bytenr, 1, GFP_NOFS);
+			ret = add_indirect_ref(preftrees, offset, NULL,
+					       *info_level + 1, bytenr, 1,
+					       GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -796,8 +916,9 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 			}
 
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = add_prelim_ref(prefs, root, &key, 0, 0,
-					     bytenr, count, GFP_NOFS);
+
+			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
+					       count, GFP_NOFS);
 			break;
 		}
 		default:
@@ -816,7 +937,8 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
  */
 static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			  struct btrfs_path *path, u64 bytenr,
-			  int info_level, struct list_head *prefs, u64 inum)
+			  int info_level, struct preftrees *preftrees,
+			  u64 inum)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -846,26 +968,31 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 
 		switch (key.type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, 0, NULL, info_level + 1,
-					     key.offset, bytenr, 1, GFP_NOFS);
+			/* SHARED DIRECT METADATA backref */
+			ret = add_direct_ref(preftrees, info_level + 1,
+					     key.offset, bytenr, 1,
+					     GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
+			/* SHARED DIRECT FULL backref */
 			struct btrfs_shared_data_ref *sdref;
 			int count;
 
 			sdref = btrfs_item_ptr(leaf, slot,
 					      struct btrfs_shared_data_ref);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
-			ret = add_prelim_ref(prefs, 0, NULL, 0, key.offset,
-					     bytenr, count, GFP_NOFS);
+			ret = add_direct_ref(preftrees, 0, key.offset, bytenr,
+					     count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, key.offset, NULL,
-					     info_level + 1, 0,
-					     bytenr, 1, GFP_NOFS);
+			/* NORMAL INDIRECT METADATA backref */
+			ret = add_indirect_ref(preftrees, key.offset, NULL,
+					       info_level + 1, bytenr, 1,
+					       GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
+			/* NORMAL INDIRECT DATA backref */
 			struct btrfs_extent_data_ref *dref;
 			int count;
 			u64 root;
@@ -884,8 +1011,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			}
 
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = add_prelim_ref(prefs, root, &key, 0, 0,
-					     bytenr, count, GFP_NOFS);
+			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
+					       count, GFP_NOFS);
 			break;
 		}
 		default:
@@ -926,14 +1053,16 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *head;
 	int info_level = 0;
 	int ret;
-	struct list_head prefs_delayed;
-	struct list_head prefs;
 	struct prelim_ref *ref;
+	struct rb_node *node;
 	struct extent_inode_elem *eie = NULL;
+	/* total of both direct AND indirect refs! */
 	u64 total_refs = 0;
-
-	INIT_LIST_HEAD(&prefs);
-	INIT_LIST_HEAD(&prefs_delayed);
+	struct preftrees preftrees = {
+		.direct = PREFTREE_INIT,
+		.indirect = PREFTREE_INIT,
+		.indirect_missing_keys = PREFTREE_INIT
+	};
 
 	key.objectid = bytenr;
 	key.offset = (u64)-1;
@@ -996,9 +1125,8 @@ again:
 				goto again;
 			}
 			spin_unlock(&delayed_refs->lock);
-			ret = add_delayed_refs(head, time_seq,
-					       &prefs_delayed, &total_refs,
-					       inum);
+			ret = add_delayed_refs(head, time_seq, &preftrees,
+					       &total_refs, inum);
 			mutex_unlock(&head->mutex);
 			if (ret)
 				goto out;
@@ -1019,35 +1147,43 @@ again:
 		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = add_inline_refs(path, bytenr, &info_level,
-					      &prefs, &total_refs, inum);
+					      &preftrees, &total_refs, inum);
 			if (ret)
 				goto out;
 			ret = add_keyed_refs(fs_info, path, bytenr, info_level,
-					     &prefs, inum);
+					     &preftrees, inum);
 			if (ret)
 				goto out;
 		}
 	}
+
 	btrfs_release_path(path);
 
-	list_splice_init(&prefs_delayed, &prefs);
-
-	ret = add_missing_keys(fs_info, &prefs);
+	ret = add_missing_keys(fs_info, &preftrees);
 	if (ret)
 		goto out;
 
-	merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
+	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
 
-	ret = resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
 				    extent_item_pos, total_refs,
 				    root_objectid);
 	if (ret)
 		goto out;
 
-	merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
+	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
 
-	while (!list_empty(&prefs)) {
-		ref = list_first_entry(&prefs, struct prelim_ref, list);
+	/*
+	 * This walks the tree of merged and resolved refs. Tree blocks are
+	 * read in as needed. Unique entries are added to the ulist, and
+	 * the list of found roots is updated.
+	 *
+	 * We release the entire tree in one go before returning.
+	 */
+	node = rb_first(&preftrees.direct.root);
+	while (node) {
+		ref = rb_entry(node, struct prelim_ref, rbnode);
+		node = rb_next(&ref->rbnode);
 		WARN_ON(ref->count < 0);
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
 			if (root_objectid && ref->root_id != root_objectid) {
@@ -1101,23 +1237,15 @@ again:
 			}
 			eie = NULL;
 		}
-		list_del(&ref->list);
-		kmem_cache_free(btrfs_prelim_ref_cache, ref);
 	}
 
 out:
 	btrfs_free_path(path);
-	while (!list_empty(&prefs)) {
-		ref = list_first_entry(&prefs, struct prelim_ref, list);
-		list_del(&ref->list);
-		kmem_cache_free(btrfs_prelim_ref_cache, ref);
-	}
-	while (!list_empty(&prefs_delayed)) {
-		ref = list_first_entry(&prefs_delayed, struct prelim_ref,
-				       list);
-		list_del(&ref->list);
-		kmem_cache_free(btrfs_prelim_ref_cache, ref);
-	}
+
+	prelim_release(&preftrees.direct);
+	prelim_release(&preftrees.indirect);
+	prelim_release(&preftrees.indirect_missing_keys);
+
 	if (ret < 0)
 		free_inode_elem_list(eie);
 	return ret;

From 6c336b212bef66e507897c78551b3bb4e613a857 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 12 Jul 2017 16:20:07 -0600
Subject: [PATCH 013/104] btrfs: add a node counter to each of the rbtrees

This patch adds counters to each of the rbtrees so that we can tell
how large they are growing for a given workload.  These counters
will be exported by tracepoints in the next patch.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index baf907adede1..297f33850425 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -136,9 +136,10 @@ struct prelim_ref {
 
 struct preftree {
 	struct rb_root root;
+	unsigned int count;
 };
 
-#define PREFTREE_INIT	{ .root = RB_ROOT }
+#define PREFTREE_INIT	{ .root = RB_ROOT, .count = 0 }
 
 struct preftrees {
 	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
@@ -248,6 +249,7 @@ static void prelim_ref_insert(struct preftree *preftree,
 		}
 	}
 
+	preftree->count++;
 	rb_link_node(&newref->rbnode, parent, p);
 	rb_insert_color(&newref->rbnode, root);
 }
@@ -265,6 +267,7 @@ static void prelim_release(struct preftree *preftree)
 		free_pref(ref);
 
 	preftree->root = RB_ROOT;
+	preftree->count = 0;
 }
 
 /*
@@ -608,6 +611,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		}
 
 		rb_erase(&ref->rbnode, &preftrees->indirect.root);
+		preftrees->indirect.count--;
 
 		if (ref->count == 0) {
 			free_pref(ref);

From 00142756e1f8015d2f8ce96532d156689db7e448 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 12 Jul 2017 16:20:08 -0600
Subject: [PATCH 014/104] btrfs: backref, add tracepoints for prelim_ref
 insertion and merging

This patch adds tracepoint events for prelim_ref insertion and
merging.  For each event, the ref being inserted or merged and the
count of tree nodes are reported.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c           | 118 ++++++++++++++++++-----------------
 fs/btrfs/backref.h           |  12 ++++
 fs/btrfs/super.c             |   1 +
 include/trace/events/btrfs.h |  58 +++++++++++++++++
 4 files changed, 131 insertions(+), 58 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 297f33850425..4cda81964dd4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -18,6 +18,7 @@
 
 #include <linux/mm.h>
 #include <linux/rbtree.h>
+#include <trace/events/btrfs.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
@@ -120,20 +121,6 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
 	return 0;
 }
 
-/*
- * this structure records all encountered refs on the way up to the root
- */
-struct prelim_ref {
-	struct rb_node rbnode;
-	u64 root_id;
-	struct btrfs_key key_for_search;
-	int level;
-	int count;
-	struct extent_inode_elem *inode_list;
-	u64 parent;
-	u64 wanted_disk_byte;
-};
-
 struct preftree {
 	struct rb_root root;
 	unsigned int count;
@@ -212,7 +199,8 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
  *
  * Callers should assumed that newref has been freed after calling.
  */
-static void prelim_ref_insert(struct preftree *preftree,
+static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
+			      struct preftree *preftree,
 			      struct prelim_ref *newref)
 {
 	struct rb_root *root;
@@ -243,6 +231,8 @@ static void prelim_ref_insert(struct preftree *preftree,
 				ref->inode_list = newref->inode_list;
 			else
 				eie->next = newref->inode_list;
+			trace_btrfs_prelim_ref_merge(fs_info, ref, newref,
+						     preftree->count);
 			ref->count += newref->count;
 			free_pref(newref);
 			return;
@@ -250,6 +240,7 @@ static void prelim_ref_insert(struct preftree *preftree,
 	}
 
 	preftree->count++;
+	trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
 	rb_link_node(&newref->rbnode, parent, p);
 	rb_insert_color(&newref->rbnode, root);
 }
@@ -308,7 +299,8 @@ static void prelim_release(struct preftree *preftree)
  * additional information that's available but not required to find the parent
  * block might help in merging entries to gain some speed.
  */
-static int add_prelim_ref(struct preftree *preftree, u64 root_id,
+static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
+			  struct preftree *preftree, u64 root_id,
 			  const struct btrfs_key *key, int level, u64 parent,
 			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
 {
@@ -355,21 +347,23 @@ static int add_prelim_ref(struct preftree *preftree, u64 root_id,
 	ref->count = count;
 	ref->parent = parent;
 	ref->wanted_disk_byte = wanted_disk_byte;
-	prelim_ref_insert(preftree, ref);
+	prelim_ref_insert(fs_info, preftree, ref);
 
 	return 0;
 }
 
 /* direct refs use root == 0, key == NULL */
-static int add_direct_ref(struct preftrees *preftrees, int level, u64 parent,
+static int add_direct_ref(const struct btrfs_fs_info *fs_info,
+			  struct preftrees *preftrees, int level, u64 parent,
 			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
 {
-	return add_prelim_ref(&preftrees->direct, 0, NULL, level, parent,
-			      wanted_disk_byte, count, gfp_mask);
+	return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level,
+			      parent, wanted_disk_byte, count, gfp_mask);
 }
 
 /* indirect refs use parent == 0 */
-static int add_indirect_ref(struct preftrees *preftrees, u64 root_id,
+static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
+			    struct preftrees *preftrees, u64 root_id,
 			    const struct btrfs_key *key, int level,
 			    u64 wanted_disk_byte, int count, gfp_t gfp_mask)
 {
@@ -377,7 +371,7 @@ static int add_indirect_ref(struct preftrees *preftrees, u64 root_id,
 
 	if (!key)
 		tree = &preftrees->indirect_missing_keys;
-	return add_prelim_ref(tree, root_id, key, level, 0,
+	return add_prelim_ref(fs_info, tree, root_id, key, level, 0,
 			      wanted_disk_byte, count, gfp_mask);
 }
 
@@ -631,7 +625,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		 * and return directly.
 		 */
 		if (err == -ENOENT) {
-			prelim_ref_insert(&preftrees->direct, ref);
+			prelim_ref_insert(fs_info, &preftrees->direct, ref);
 			continue;
 		} else if (err) {
 			free_pref(ref);
@@ -659,11 +653,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
 			new_ref->inode_list = unode_aux_to_inode_list(node);
-			prelim_ref_insert(&preftrees->direct, new_ref);
+			prelim_ref_insert(fs_info, &preftrees->direct, new_ref);
 		}
 
 		/* Now it's a direct ref, put it in the the direct tree */
-		prelim_ref_insert(&preftrees->direct, ref);
+		prelim_ref_insert(fs_info, &preftrees->direct, ref);
 
 		ulist_reinit(parents);
 	}
@@ -707,7 +701,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
 		btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
-		prelim_ref_insert(&preftrees->indirect, ref);
+		prelim_ref_insert(fs_info, &preftrees->indirect, ref);
 	}
 	return 0;
 }
@@ -716,7 +710,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
  * add all currently queued delayed refs from this head whose seq nr is
  * smaller or equal that seq to the list
  */
-static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
+			    struct btrfs_delayed_ref_head *head, u64 seq,
 			    struct preftrees *preftrees, u64 *total_refs,
 			    u64 inum)
 {
@@ -759,8 +754,9 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = add_indirect_ref(preftrees, ref->root, &tmp_op_key,
-					       ref->level + 1, node->bytenr,
+			ret = add_indirect_ref(fs_info, preftrees, ref->root,
+					       &tmp_op_key, ref->level + 1,
+					       node->bytenr,
 					       node->ref_mod * sgn,
 					       GFP_ATOMIC);
 			break;
@@ -771,9 +767,9 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
 
-			ret = add_direct_ref(preftrees, ref->level + 1,
-					     ref->parent, node->bytenr,
-					     node->ref_mod * sgn,
+			ret = add_direct_ref(fs_info, preftrees,
+					     ref->level + 1, ref->parent,
+					     node->bytenr, node->ref_mod * sgn,
 					     GFP_ATOMIC);
 			break;
 		}
@@ -795,8 +791,8 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 				break;
 			}
 
-			ret = add_indirect_ref(preftrees, ref->root, &key, 0,
-					       node->bytenr,
+			ret = add_indirect_ref(fs_info, preftrees, ref->root,
+					       &key, 0, node->bytenr,
 					       node->ref_mod * sgn,
 					       GFP_ATOMIC);
 			break;
@@ -807,8 +803,8 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 
 			ref = btrfs_delayed_node_to_data_ref(node);
 
-			ret = add_direct_ref(preftrees, 0, ref->parent,
-					     node->bytenr,
+			ret = add_direct_ref(fs_info, preftrees, 0,
+					     ref->parent, node->bytenr,
 					     node->ref_mod * sgn,
 					     GFP_ATOMIC);
 			break;
@@ -826,7 +822,8 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 /*
  * add all inline backrefs for bytenr to the list
  */
-static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
+static int add_inline_refs(const struct btrfs_fs_info *fs_info,
+			   struct btrfs_path *path, u64 bytenr,
 			   int *info_level, struct preftrees *preftrees,
 			   u64 *total_refs, u64 inum)
 {
@@ -883,7 +880,8 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 
 		switch (type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = add_direct_ref(preftrees, *info_level + 1, offset,
+			ret = add_direct_ref(fs_info, preftrees,
+					     *info_level + 1, offset,
 					     bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
@@ -893,14 +891,14 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 
-			ret = add_direct_ref(preftrees, 0, offset,
+			ret = add_direct_ref(fs_info, preftrees, 0, offset,
 					     bytenr, count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = add_indirect_ref(preftrees, offset, NULL,
-					       *info_level + 1, bytenr, 1,
-					       GFP_NOFS);
+			ret = add_indirect_ref(fs_info, preftrees, offset,
+					       NULL, *info_level + 1,
+					       bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -921,8 +919,9 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 
 			root = btrfs_extent_data_ref_root(leaf, dref);
 
-			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
-					       count, GFP_NOFS);
+			ret = add_indirect_ref(fs_info, preftrees, root,
+					       &key, 0, bytenr, count,
+					       GFP_NOFS);
 			break;
 		}
 		default:
@@ -973,9 +972,9 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 		switch (key.type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
 			/* SHARED DIRECT METADATA backref */
-			ret = add_direct_ref(preftrees, info_level + 1,
-					     key.offset, bytenr, 1,
-					     GFP_NOFS);
+			ret = add_direct_ref(fs_info, preftrees,
+					     info_level + 1, key.offset,
+					     bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			/* SHARED DIRECT FULL backref */
@@ -985,15 +984,16 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			sdref = btrfs_item_ptr(leaf, slot,
 					      struct btrfs_shared_data_ref);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
-			ret = add_direct_ref(preftrees, 0, key.offset, bytenr,
-					     count, GFP_NOFS);
+			ret = add_direct_ref(fs_info, preftrees, 0,
+					     key.offset, bytenr, count,
+					     GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
 			/* NORMAL INDIRECT METADATA backref */
-			ret = add_indirect_ref(preftrees, key.offset, NULL,
-					       info_level + 1, bytenr, 1,
-					       GFP_NOFS);
+			ret = add_indirect_ref(fs_info, preftrees, key.offset,
+					       NULL, info_level + 1, bytenr,
+					       1, GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			/* NORMAL INDIRECT DATA backref */
@@ -1015,8 +1015,9 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			}
 
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
-					       count, GFP_NOFS);
+			ret = add_indirect_ref(fs_info, preftrees, root,
+					       &key, 0, bytenr, count,
+					       GFP_NOFS);
 			break;
 		}
 		default:
@@ -1129,8 +1130,8 @@ again:
 				goto again;
 			}
 			spin_unlock(&delayed_refs->lock);
-			ret = add_delayed_refs(head, time_seq, &preftrees,
-					       &total_refs, inum);
+			ret = add_delayed_refs(fs_info, head, time_seq,
+					       &preftrees, &total_refs, inum);
 			mutex_unlock(&head->mutex);
 			if (ret)
 				goto out;
@@ -1150,8 +1151,9 @@ again:
 		if (key.objectid == bytenr &&
 		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
-			ret = add_inline_refs(path, bytenr, &info_level,
-					      &preftrees, &total_refs, inum);
+			ret = add_inline_refs(fs_info, path, bytenr,
+					      &info_level, &preftrees,
+					      &total_refs, inum);
 			if (ret)
 				goto out;
 			ret = add_keyed_refs(fs_info, path, bytenr, info_level,
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index f9428aaaa77a..e410335841aa 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -72,4 +72,16 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr);
 
 int __init btrfs_prelim_ref_init(void);
 void btrfs_prelim_ref_exit(void);
+
+struct prelim_ref {
+	struct rb_node rbnode;
+	u64 root_id;
+	struct btrfs_key key_for_search;
+	int level;
+	int count;
+	struct extent_inode_elem *inode_list;
+	u64 parent;
+	u64 wanted_disk_byte;
+};
+
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 12540b6104b5..58650f2e0f17 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,6 +61,7 @@
 #include "tests/btrfs-tests.h"
 
 #include "qgroup.h"
+#include "backref.h"
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 42560feb9920..90d25085762f 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -26,6 +26,7 @@ struct btrfs_work;
 struct __btrfs_workqueue;
 struct btrfs_qgroup_extent_record;
 struct btrfs_qgroup;
+struct prelim_ref;
 
 #define show_ref_type(type)						\
 	__print_symbolic(type,						\
@@ -1636,6 +1637,63 @@ TRACE_EVENT(qgroup_meta_reserve,
 		show_root_type(__entry->refroot), __entry->diff)
 );
 
+DECLARE_EVENT_CLASS(btrfs__prelim_ref,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct prelim_ref *oldref,
+		 const struct prelim_ref *newref, u64 tree_size),
+	TP_ARGS(fs_info, oldref, newref, tree_size),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,  root_id		)
+		__field(	u64,  objectid		)
+		__field(	 u8,  type		)
+		__field(	u64,  offset		)
+		__field(	int,  level		)
+		__field(	int,  old_count		)
+		__field(	u64,  parent		)
+		__field(	u64,  bytenr		)
+		__field(	int,  mod_count		)
+		__field(	u64,  tree_size		)
+	),
+
+	TP_fast_assign_btrfs(fs_info,
+		__entry->root_id	= oldref->root_id;
+		__entry->objectid	= oldref->key_for_search.objectid;
+		__entry->type		= oldref->key_for_search.type;
+		__entry->offset		= oldref->key_for_search.offset;
+		__entry->level		= oldref->level;
+		__entry->old_count	= oldref->count;
+		__entry->parent		= oldref->parent;
+		__entry->bytenr		= oldref->wanted_disk_byte;
+		__entry->mod_count	= newref ? newref->count : 0;
+		__entry->tree_size	= tree_size;
+	),
+
+	TP_printk_btrfs("root_id=%llu key=[%llu,%u,%llu] level=%d count=[%d+%d=%d] parent=%llu wanted_disk_byte=%llu nodes=%llu",
+			(unsigned long long)__entry->root_id,
+			(unsigned long long)__entry->objectid, __entry->type,
+			(unsigned long long)__entry->offset, __entry->level,
+			__entry->old_count, __entry->mod_count,
+			__entry->old_count + __entry->mod_count,
+			(unsigned long long)__entry->parent,
+			(unsigned long long)__entry->bytenr,
+			(unsigned long long)__entry->tree_size)
+);
+
+DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_merge,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct prelim_ref *oldref,
+		 const struct prelim_ref *newref, u64 tree_size),
+	TP_ARGS(fs_info, oldref, newref, tree_size)
+);
+
+DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert,
+	TP_PROTO(const struct btrfs_fs_info *fs_info,
+		 const struct prelim_ref *oldref,
+		 const struct prelim_ref *newref, u64 tree_size),
+	TP_ARGS(fs_info, oldref, newref, tree_size)
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */

From 9dd14fd6964e6db02346d5f472f915029728b8cf Mon Sep 17 00:00:00 2001
From: Edmund Nadolski <enadolski@suse.com>
Date: Wed, 12 Jul 2017 16:20:09 -0600
Subject: [PATCH 015/104] btrfs: add cond_resched() calls when resolving
 backrefs

Since backref resolution is CPU-intensive, the cond_resched calls
should help alleviate soft lockup occurrences.

Signed-off-by: Edmund Nadolski <enadolski@suse.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 4cda81964dd4..9593102bdc2c 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -660,6 +660,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		prelim_ref_insert(fs_info, &preftrees->direct, ref);
 
 		ulist_reinit(parents);
+		cond_resched();
 	}
 out:
 	ulist_free(parents);
@@ -702,6 +703,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 		btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
 		prelim_ref_insert(fs_info, &preftrees->indirect, ref);
+		cond_resched();
 	}
 	return 0;
 }
@@ -1243,6 +1245,7 @@ again:
 			}
 			eie = NULL;
 		}
+		cond_resched();
 	}
 
 out:

From 3ec4d3238ab1655ae3f696c412fb3244cd3b58de Mon Sep 17 00:00:00 2001
From: Edmund Nadolski <enadolski@suse.com>
Date: Wed, 12 Jul 2017 16:20:10 -0600
Subject: [PATCH 016/104] btrfs: allow backref search checks for shared extents

When called with a struct share_check, find_parent_nodes()
will detect a shared extent and immediately return with
BACKREF_FOUND_SHARED.

Signed-off-by: Edmund Nadolski <enadolski@suse.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 164 +++++++++++++++++++++++++++++++--------------
 1 file changed, 115 insertions(+), 49 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 9593102bdc2c..2a983a640069 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -134,6 +134,25 @@ struct preftrees {
 	struct preftree indirect_missing_keys;
 };
 
+/*
+ * Checks for a shared extent during backref search.
+ *
+ * The share_count tracks prelim_refs (direct and indirect) having a
+ * ref->count >0:
+ *  - incremented when a ref->count transitions to >0
+ *  - decremented when a ref->count transitions to <1
+ */
+struct share_check {
+	u64 root_objectid;
+	u64 inum;
+	int share_count;
+};
+
+static inline int extent_is_shared(struct share_check *sc)
+{
+	return (sc && sc->share_count > 1) ? BACKREF_FOUND_SHARED : 0;
+}
+
 static struct kmem_cache *btrfs_prelim_ref_cache;
 
 int __init btrfs_prelim_ref_init(void)
@@ -194,14 +213,26 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
 	return 0;
 }
 
+void update_share_count(struct share_check *sc, int oldcount, int newcount)
+{
+	if ((!sc) || (oldcount == 0 && newcount < 1))
+		return;
+
+	if (oldcount > 0 && newcount < 1)
+		sc->share_count--;
+	else if (oldcount < 1 && newcount > 0)
+		sc->share_count++;
+}
+
 /*
  * Add @newref to the @root rbtree, merging identical refs.
  *
- * Callers should assumed that newref has been freed after calling.
+ * Callers should assume that newref has been freed after calling.
  */
 static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
 			      struct preftree *preftree,
-			      struct prelim_ref *newref)
+			      struct prelim_ref *newref,
+			      struct share_check *sc)
 {
 	struct rb_root *root;
 	struct rb_node **p;
@@ -233,12 +264,20 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
 				eie->next = newref->inode_list;
 			trace_btrfs_prelim_ref_merge(fs_info, ref, newref,
 						     preftree->count);
+			/*
+			 * A delayed ref can have newref->count < 0.
+			 * The ref->count is updated to follow any
+			 * BTRFS_[ADD|DROP]_DELAYED_REF actions.
+			 */
+			update_share_count(sc, ref->count,
+					   ref->count + newref->count);
 			ref->count += newref->count;
 			free_pref(newref);
 			return;
 		}
 	}
 
+	update_share_count(sc, 0, newref->count);
 	preftree->count++;
 	trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
 	rb_link_node(&newref->rbnode, parent, p);
@@ -302,7 +341,8 @@ static void prelim_release(struct preftree *preftree)
 static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
 			  struct preftree *preftree, u64 root_id,
 			  const struct btrfs_key *key, int level, u64 parent,
-			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+			  u64 wanted_disk_byte, int count,
+			  struct share_check *sc, gfp_t gfp_mask)
 {
 	struct prelim_ref *ref;
 
@@ -347,32 +387,33 @@ static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
 	ref->count = count;
 	ref->parent = parent;
 	ref->wanted_disk_byte = wanted_disk_byte;
-	prelim_ref_insert(fs_info, preftree, ref);
-
-	return 0;
+	prelim_ref_insert(fs_info, preftree, ref, sc);
+	return extent_is_shared(sc);
 }
 
 /* direct refs use root == 0, key == NULL */
 static int add_direct_ref(const struct btrfs_fs_info *fs_info,
 			  struct preftrees *preftrees, int level, u64 parent,
-			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+			  u64 wanted_disk_byte, int count,
+			  struct share_check *sc, gfp_t gfp_mask)
 {
 	return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level,
-			      parent, wanted_disk_byte, count, gfp_mask);
+			      parent, wanted_disk_byte, count, sc, gfp_mask);
 }
 
 /* indirect refs use parent == 0 */
 static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
 			    struct preftrees *preftrees, u64 root_id,
 			    const struct btrfs_key *key, int level,
-			    u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+			    u64 wanted_disk_byte, int count,
+			    struct share_check *sc, gfp_t gfp_mask)
 {
 	struct preftree *tree = &preftrees->indirect;
 
 	if (!key)
 		tree = &preftrees->indirect_missing_keys;
 	return add_prelim_ref(fs_info, tree, root_id, key, level, 0,
-			      wanted_disk_byte, count, gfp_mask);
+			      wanted_disk_byte, count, sc, gfp_mask);
 }
 
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
@@ -575,7 +616,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				 struct btrfs_path *path, u64 time_seq,
 				 struct preftrees *preftrees,
 				 const u64 *extent_item_pos, u64 total_refs,
-				 u64 root_objectid)
+				 struct share_check *sc)
 {
 	int err;
 	int ret = 0;
@@ -612,7 +653,8 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			continue;
 		}
 
-		if (root_objectid && ref->root_id != root_objectid) {
+		if (sc && sc->root_objectid &&
+		    ref->root_id != sc->root_objectid) {
 			free_pref(ref);
 			ret = BACKREF_FOUND_SHARED;
 			goto out;
@@ -625,7 +667,8 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		 * and return directly.
 		 */
 		if (err == -ENOENT) {
-			prelim_ref_insert(fs_info, &preftrees->direct, ref);
+			prelim_ref_insert(fs_info, &preftrees->direct, ref,
+					  NULL);
 			continue;
 		} else if (err) {
 			free_pref(ref);
@@ -653,11 +696,15 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
 			new_ref->inode_list = unode_aux_to_inode_list(node);
-			prelim_ref_insert(fs_info, &preftrees->direct, new_ref);
+			prelim_ref_insert(fs_info, &preftrees->direct,
+					  new_ref, NULL);
 		}
 
-		/* Now it's a direct ref, put it in the the direct tree */
-		prelim_ref_insert(fs_info, &preftrees->direct, ref);
+		/*
+		 * Now it's a direct ref, put it in the the direct tree. We must
+		 * do this last because the ref could be merged/freed here.
+		 */
+		prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL);
 
 		ulist_reinit(parents);
 		cond_resched();
@@ -702,7 +749,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
 		btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
-		prelim_ref_insert(fs_info, &preftrees->indirect, ref);
+		prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
 		cond_resched();
 	}
 	return 0;
@@ -715,7 +762,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_head *head, u64 seq,
 			    struct preftrees *preftrees, u64 *total_refs,
-			    u64 inum)
+			    struct share_check *sc)
 {
 	struct btrfs_delayed_ref_node *node;
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
@@ -760,7 +807,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 					       &tmp_op_key, ref->level + 1,
 					       node->bytenr,
 					       node->ref_mod * sgn,
-					       GFP_ATOMIC);
+					       sc, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_BLOCK_REF_KEY: {
@@ -772,7 +819,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			ret = add_direct_ref(fs_info, preftrees,
 					     ref->level + 1, ref->parent,
 					     node->bytenr, node->ref_mod * sgn,
-					     GFP_ATOMIC);
+					     sc, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_EXTENT_DATA_REF_KEY: {
@@ -788,15 +835,15 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			 * Found a inum that doesn't match our known inum, we
 			 * know it's shared.
 			 */
-			if (inum && ref->objectid != inum) {
+			if (sc && sc->inum && ref->objectid != sc->inum) {
 				ret = BACKREF_FOUND_SHARED;
-				break;
+				goto out;
 			}
 
 			ret = add_indirect_ref(fs_info, preftrees, ref->root,
 					       &key, 0, node->bytenr,
 					       node->ref_mod * sgn,
-					       GFP_ATOMIC);
+					       sc, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
@@ -808,26 +855,35 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			ret = add_direct_ref(fs_info, preftrees, 0,
 					     ref->parent, node->bytenr,
 					     node->ref_mod * sgn,
-					     GFP_ATOMIC);
+					     sc, GFP_ATOMIC);
 			break;
 		}
 		default:
 			WARN_ON(1);
 		}
-		if (ret)
+		/*
+		 * We must ignore BACKREF_FOUND_SHARED until all delayed
+		 * refs have been checked.
+		 */
+		if (ret && (ret != BACKREF_FOUND_SHARED))
 			break;
 	}
+	if (!ret)
+		ret = extent_is_shared(sc);
+out:
 	spin_unlock(&head->lock);
 	return ret;
 }
 
 /*
  * add all inline backrefs for bytenr to the list
+ *
+ * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
  */
 static int add_inline_refs(const struct btrfs_fs_info *fs_info,
 			   struct btrfs_path *path, u64 bytenr,
 			   int *info_level, struct preftrees *preftrees,
-			   u64 *total_refs, u64 inum)
+			   u64 *total_refs, struct share_check *sc)
 {
 	int ret = 0;
 	int slot;
@@ -884,7 +940,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
 		case BTRFS_SHARED_BLOCK_REF_KEY:
 			ret = add_direct_ref(fs_info, preftrees,
 					     *info_level + 1, offset,
-					     bytenr, 1, GFP_NOFS);
+					     bytenr, 1, NULL, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_shared_data_ref *sdref;
@@ -894,13 +950,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 
 			ret = add_direct_ref(fs_info, preftrees, 0, offset,
-					     bytenr, count, GFP_NOFS);
+					     bytenr, count, sc, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
 			ret = add_indirect_ref(fs_info, preftrees, offset,
 					       NULL, *info_level + 1,
-					       bytenr, 1, GFP_NOFS);
+					       bytenr, 1, NULL, GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -914,7 +970,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 
-			if (inum && key.objectid != inum) {
+			if (sc && sc->inum && key.objectid != sc->inum) {
 				ret = BACKREF_FOUND_SHARED;
 				break;
 			}
@@ -923,7 +979,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
 
 			ret = add_indirect_ref(fs_info, preftrees, root,
 					       &key, 0, bytenr, count,
-					       GFP_NOFS);
+					       sc, GFP_NOFS);
 			break;
 		}
 		default:
@@ -939,11 +995,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
 
 /*
  * add all non-inline backrefs for bytenr to the list
+ *
+ * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
  */
 static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			  struct btrfs_path *path, u64 bytenr,
 			  int info_level, struct preftrees *preftrees,
-			  u64 inum)
+			  struct share_check *sc)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -976,7 +1034,7 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			/* SHARED DIRECT METADATA backref */
 			ret = add_direct_ref(fs_info, preftrees,
 					     info_level + 1, key.offset,
-					     bytenr, 1, GFP_NOFS);
+					     bytenr, 1, NULL, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			/* SHARED DIRECT FULL backref */
@@ -988,14 +1046,14 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 			ret = add_direct_ref(fs_info, preftrees, 0,
 					     key.offset, bytenr, count,
-					     GFP_NOFS);
+					     sc, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
 			/* NORMAL INDIRECT METADATA backref */
 			ret = add_indirect_ref(fs_info, preftrees, key.offset,
 					       NULL, info_level + 1, bytenr,
-					       1, GFP_NOFS);
+					       1, NULL, GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			/* NORMAL INDIRECT DATA backref */
@@ -1011,7 +1069,7 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 
-			if (inum && key.objectid != inum) {
+			if (sc && sc->inum && key.objectid != sc->inum) {
 				ret = BACKREF_FOUND_SHARED;
 				break;
 			}
@@ -1019,7 +1077,7 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = add_indirect_ref(fs_info, preftrees, root,
 					       &key, 0, bytenr, count,
-					       GFP_NOFS);
+					       sc, GFP_NOFS);
 			break;
 		}
 		default:
@@ -1039,20 +1097,23 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
  * indirect refs to their parent bytenr.
  * When roots are found, they're added to the roots list
  *
- * NOTE: This can return values > 0
- *
  * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
  * much like trans == NULL case, the difference only lies in it will not
  * commit root.
  * The special case is for qgroup to search roots in commit_transaction().
  *
+ * @sc - if !NULL, then immediately return BACKREF_FOUND_SHARED when a
+ * shared extent is detected.
+ *
+ * Otherwise this returns 0 for success and <0 for an error.
+ *
  * FIXME some caching might speed things up
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
 			     u64 time_seq, struct ulist *refs,
 			     struct ulist *roots, const u64 *extent_item_pos,
-			     u64 root_objectid, u64 inum)
+			     struct share_check *sc)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -1133,7 +1194,7 @@ again:
 			}
 			spin_unlock(&delayed_refs->lock);
 			ret = add_delayed_refs(fs_info, head, time_seq,
-					       &preftrees, &total_refs, inum);
+					       &preftrees, &total_refs, sc);
 			mutex_unlock(&head->mutex);
 			if (ret)
 				goto out;
@@ -1155,11 +1216,11 @@ again:
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = add_inline_refs(fs_info, path, bytenr,
 					      &info_level, &preftrees,
-					      &total_refs, inum);
+					      &total_refs, sc);
 			if (ret)
 				goto out;
 			ret = add_keyed_refs(fs_info, path, bytenr, info_level,
-					     &preftrees, inum);
+					     &preftrees, sc);
 			if (ret)
 				goto out;
 		}
@@ -1174,8 +1235,7 @@ again:
 	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
 
 	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
-				    extent_item_pos, total_refs,
-				    root_objectid);
+				    extent_item_pos, total_refs, sc);
 	if (ret)
 		goto out;
 
@@ -1194,7 +1254,8 @@ again:
 		node = rb_next(&ref->rbnode);
 		WARN_ON(ref->count < 0);
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
-			if (root_objectid && ref->root_id != root_objectid) {
+			if (sc && sc->root_objectid &&
+			    ref->root_id != sc->root_objectid) {
 				ret = BACKREF_FOUND_SHARED;
 				goto out;
 			}
@@ -1298,7 +1359,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
-				*leafs, NULL, extent_item_pos, 0, 0);
+				*leafs, NULL, extent_item_pos, NULL);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1341,7 +1402,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
-					tmp, *roots, NULL, 0, 0);
+					tmp, *roots, NULL, NULL);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
@@ -1397,6 +1458,11 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
 	struct ulist_node *node;
 	struct seq_list elem = SEQ_LIST_INIT(elem);
 	int ret = 0;
+	struct share_check shared = {
+		.root_objectid = root->objectid,
+		.inum = inum,
+		.share_count = 0,
+	};
 
 	tmp = ulist_alloc(GFP_NOFS);
 	roots = ulist_alloc(GFP_NOFS);
@@ -1417,7 +1483,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
-					roots, NULL, root->objectid, inum);
+					roots, NULL, &shared);
 		if (ret == BACKREF_FOUND_SHARED) {
 			/* this is the only condition under which we return 1 */
 			ret = 1;

From 01747e92a996cc2f2965c28fde485da932836ef8 Mon Sep 17 00:00:00 2001
From: Edmund Nadolski <enadolski@suse.com>
Date: Wed, 12 Jul 2017 16:20:11 -0600
Subject: [PATCH 017/104] btrfs: clean up extraneous computations in
 add_delayed_refs

Repeating the same computation in multiple places is not
necessary.

Signed-off-by: Edmund Nadolski <enadolski@suse.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2a983a640069..6bae986bfcfb 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -769,7 +769,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 	struct btrfs_key key;
 	struct btrfs_key tmp_op_key;
 	struct btrfs_key *op_key = NULL;
-	int sgn;
+	int count;
 	int ret = 0;
 
 	if (extent_op && extent_op->update_key) {
@@ -788,15 +788,15 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			WARN_ON(1);
 			continue;
 		case BTRFS_ADD_DELAYED_REF:
-			sgn = 1;
+			count = node->ref_mod;
 			break;
 		case BTRFS_DROP_DELAYED_REF:
-			sgn = -1;
+			count = node->ref_mod * -1;
 			break;
 		default:
 			BUG_ON(1);
 		}
-		*total_refs += (node->ref_mod * sgn);
+		*total_refs += count;
 		switch (node->type) {
 		case BTRFS_TREE_BLOCK_REF_KEY: {
 			/* NORMAL INDIRECT METADATA backref */
@@ -805,9 +805,8 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			ref = btrfs_delayed_node_to_tree_ref(node);
 			ret = add_indirect_ref(fs_info, preftrees, ref->root,
 					       &tmp_op_key, ref->level + 1,
-					       node->bytenr,
-					       node->ref_mod * sgn,
-					       sc, GFP_ATOMIC);
+					       node->bytenr, count, sc,
+					       GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_BLOCK_REF_KEY: {
@@ -816,9 +815,8 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
 
-			ret = add_direct_ref(fs_info, preftrees,
-					     ref->level + 1, ref->parent,
-					     node->bytenr, node->ref_mod * sgn,
+			ret = add_direct_ref(fs_info, preftrees, ref->level + 1,
+					     ref->parent, node->bytenr, count,
 					     sc, GFP_ATOMIC);
 			break;
 		}
@@ -841,9 +839,8 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			}
 
 			ret = add_indirect_ref(fs_info, preftrees, ref->root,
-					       &key, 0, node->bytenr,
-					       node->ref_mod * sgn,
-					       sc, GFP_ATOMIC);
+					       &key, 0, node->bytenr, count, sc,
+					       GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
@@ -852,10 +849,9 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 
 			ref = btrfs_delayed_node_to_data_ref(node);
 
-			ret = add_direct_ref(fs_info, preftrees, 0,
-					     ref->parent, node->bytenr,
-					     node->ref_mod * sgn,
-					     sc, GFP_ATOMIC);
+			ret = add_direct_ref(fs_info, preftrees, 0, ref->parent,
+					     node->bytenr, count, sc,
+					     GFP_ATOMIC);
 			break;
 		}
 		default:

From 6c6b5a39c4bf3dbd8cf629c9f5450e983c19dbb9 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <asarai@suse.de>
Date: Tue, 4 Jul 2017 21:49:06 +1000
Subject: [PATCH 018/104] btrfs: resume qgroup rescan on rw remount

Several distributions mount the "proper root" as ro during initrd and
then remount it as rw before pivot_root(2). Thus, if a rescan had been
aborted by a previous shutdown, the rescan would never be resumed.

This issue would manifest itself as several btrfs ioctl(2)s causing the
entire machine to hang when btrfs_qgroup_wait_for_completion was hit
(due to the fs_info->qgroup_rescan_running flag being set but the rescan
itself not being resumed). Notably, Docker's btrfs storage driver makes
regular use of BTRFS_QUOTA_CTL_DISABLE and BTRFS_IOC_QUOTA_RESCAN_WAIT
(causing this problem to be manifested on boot for some machines).

Cc: <stable@vger.kernel.org> # v3.11+
Cc: Jeff Mahoney <jeffm@suse.com>
Fixes: b382a324b60f ("Btrfs: fix qgroup rescan resume on mount")
Signed-off-by: Aleksa Sarai <asarai@suse.de>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Tested-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 58650f2e0f17..2351794fbc44 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1815,6 +1815,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 			goto restore;
 		}
 
+		btrfs_qgroup_rescan_resume(fs_info);
+
 		if (!fs_info->uuid_root) {
 			btrfs_info(fs_info, "creating UUID tree");
 			ret = btrfs_create_uuid_tree(fs_info);

From 7bdd6277e0dc2beb4f5db5ea4ff7670ecf0b5879 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 11 Jul 2017 13:25:13 +0300
Subject: [PATCH 019/104] btrfs: Remove redundant argument of flush_space

All callers of flush_space pass the same number for orig/num_bytes
arguments. Let's remove one of the numbers and also modify the trace
point to show only a single number - bytes requested.

Seems that last point where the two parameters were treated differently
is before the ticketed enospc rework.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c       | 16 +++++++---------
 include/trace/events/btrfs.h | 13 +++++--------
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6d04563585e6..288b38ae8791 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4906,7 +4906,7 @@ struct reserve_ticket {
 
 static int flush_space(struct btrfs_fs_info *fs_info,
 		       struct btrfs_space_info *space_info, u64 num_bytes,
-		       u64 orig_bytes, int state)
+		       int state)
 {
 	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_trans_handle *trans;
@@ -4931,7 +4931,7 @@ static int flush_space(struct btrfs_fs_info *fs_info,
 		break;
 	case FLUSH_DELALLOC:
 	case FLUSH_DELALLOC_WAIT:
-		shrink_delalloc(fs_info, num_bytes * 2, orig_bytes,
+		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
 				state == FLUSH_DELALLOC_WAIT);
 		break;
 	case ALLOC_CHUNK:
@@ -4949,15 +4949,15 @@ static int flush_space(struct btrfs_fs_info *fs_info,
 		break;
 	case COMMIT_TRANS:
 		ret = may_commit_transaction(fs_info, space_info,
-					     orig_bytes, 0);
+					     num_bytes, 0);
 		break;
 	default:
 		ret = -ENOSPC;
 		break;
 	}
 
-	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes,
-				orig_bytes, state, ret);
+	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
+				ret);
 	return ret;
 }
 
@@ -5063,8 +5063,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		struct reserve_ticket *ticket;
 		int ret;
 
-		ret = flush_space(fs_info, space_info, to_reclaim, to_reclaim,
-				  flush_state);
+		ret = flush_space(fs_info, space_info, to_reclaim, flush_state);
 		spin_lock(&space_info->lock);
 		if (list_empty(&space_info->tickets)) {
 			space_info->flush = 0;
@@ -5120,8 +5119,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 	spin_unlock(&space_info->lock);
 
 	do {
-		flush_space(fs_info, space_info, to_reclaim, to_reclaim,
-			    flush_state);
+		flush_space(fs_info, space_info, to_reclaim, flush_state);
 		flush_state++;
 		spin_lock(&space_info->lock);
 		if (ticket->bytes == 0) {
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 90d25085762f..1e4908dcd065 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1011,15 +1011,14 @@ TRACE_EVENT(btrfs_trigger_flush,
 TRACE_EVENT(btrfs_flush_space,
 
 	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes,
-		 u64 orig_bytes, int state, int ret),
+		 int state, int ret),
 
-	TP_ARGS(fs_info, flags, num_bytes, orig_bytes, state, ret),
+	TP_ARGS(fs_info, flags, num_bytes, state, ret),
 
 	TP_STRUCT__entry(
 		__array(	u8,	fsid,	BTRFS_UUID_SIZE	)
 		__field(	u64,	flags			)
 		__field(	u64,	num_bytes		)
-		__field(	u64,	orig_bytes		)
 		__field(	int,	state			)
 		__field(	int,	ret			)
 	),
@@ -1028,19 +1027,17 @@ TRACE_EVENT(btrfs_flush_space,
 		memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
 		__entry->flags		=	flags;
 		__entry->num_bytes	=	num_bytes;
-		__entry->orig_bytes	=	orig_bytes;
 		__entry->state		=	state;
 		__entry->ret		=	ret;
 	),
 
-	TP_printk("%pU: state=%d(%s) flags=%llu(%s) num_bytes=%llu "
-		  "orig_bytes=%llu ret=%d", __entry->fsid, __entry->state,
+	TP_printk("%pU: state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d",
+		  __entry->fsid, __entry->state,
 		  show_flush_state(__entry->state),
 		  (unsigned long long)__entry->flags,
 		  __print_flags((unsigned long)__entry->flags, "|",
 				BTRFS_GROUP_FLAGS),
-		  (unsigned long long)__entry->num_bytes,
-		  (unsigned long long)__entry->orig_bytes, __entry->ret)
+		  (unsigned long long)__entry->num_bytes, __entry->ret)
 );
 
 DECLARE_EVENT_CLASS(btrfs__reserved_extent,

From 1174cade8182b4136c8a162342bf7e8eba7200de Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 11 Jul 2017 13:47:50 +0300
Subject: [PATCH 020/104] btrfs: Remove redundant checks from
 btrfs_alloc_data_chunk_ondemand

Many commits ago the data space_info in alloc_data_chunk_ondemand used to be
acquired from the inode. At that point commit
33b4d47f5e24 ("Btrfs: deal with NULL space info") got introduced to deal with
spurious cases where the space info could be null, following a rebalance.
Nowadays, however, the space info is referenced directly from the btrfs_fs_info
struct which is initialised at filesystem mount time. This makes the null
checks redundant, so remove them.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 288b38ae8791..4fe93c436302 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4199,9 +4199,9 @@ static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
 
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
 {
-	struct btrfs_space_info *data_sinfo;
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
 	u64 used;
 	int ret = 0;
 	int need_commit = 2;
@@ -4215,10 +4215,6 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
 		ASSERT(current->journal_info);
 	}
 
-	data_sinfo = fs_info->data_sinfo;
-	if (!data_sinfo)
-		goto alloc;
-
 again:
 	/* make sure we have enough space to handle the data first */
 	spin_lock(&data_sinfo->lock);
@@ -4236,7 +4232,7 @@ again:
 
 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
 			spin_unlock(&data_sinfo->lock);
-alloc:
+
 			alloc_target = btrfs_data_alloc_profile(fs_info);
 			/*
 			 * It is ugly that we don't call nolock join
@@ -4264,9 +4260,6 @@ alloc:
 				}
 			}
 
-			if (!data_sinfo)
-				data_sinfo = fs_info->data_sinfo;
-
 			goto again;
 		}
 

From 69f03f137a1cb1a5215bc9e64597bfdfcdf22fde Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 11 Jul 2017 16:55:51 +0300
Subject: [PATCH 021/104] btrfs: Prevent possible ERR_PTR() dereference

In btrfs_full_stripe_len/btrfs_is_parity_mirror we have similar code which
gets the chunk map for a particular range via get_chunk_map. However,
get_chunk_map can return an ERR_PTR value and while the 2 callers do catch
this with a WARN_ON they then proceed to indiscriminately dereference the
extent map. This of course leads to a crash. Fix the offenders by making the
dereference conditional on IS_ERR.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f93ac3d7e997..59f82939e634 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5173,12 +5173,13 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 	unsigned long len = fs_info->sectorsize;
 
 	em = get_chunk_map(fs_info, logical, len);
-	WARN_ON(IS_ERR(em));
 
-	map = em->map_lookup;
-	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-		len = map->stripe_len * nr_data_stripes(map);
-	free_extent_map(em);
+	if (!WARN_ON(IS_ERR(em))) {
+		map = em->map_lookup;
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+			len = map->stripe_len * nr_data_stripes(map);
+		free_extent_map(em);
+	}
 	return len;
 }
 
@@ -5190,12 +5191,13 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
 	int ret = 0;
 
 	em = get_chunk_map(fs_info, logical, len);
-	WARN_ON(IS_ERR(em));
 
-	map = em->map_lookup;
-	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-		ret = 1;
-	free_extent_map(em);
+	if(!WARN_ON(IS_ERR(em))) {
+		map = em->map_lookup;
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+			ret = 1;
+		free_extent_map(em);
+	}
 	return ret;
 }
 

From 0d1e0bead62a1778c45f22439cf067e63068faea Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 11 Jul 2017 14:43:16 -0600
Subject: [PATCH 022/104] Btrfs: report errors when checksum is not found

When btrfs fails the checksum check, it'll fill the whole page with
"1".

However, if %csum_expected is 0 (which means there is no checksum), then
for some unknown reason, we just pretend that the read is correct. This
leaves userspace confused: the read is reported as successful, yet it
returns a page whose entire content is "1".

This can happen due to a bug in btrfs-convert.

This fixes it by always returning errors if checksum doesn't match.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95c212037095..97970602c3d5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3159,8 +3159,6 @@ zeroit:
 	memset(kaddr + pgoff, 1, len);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr);
-	if (csum_expected == 0)
-		return 0;
 	return -EIO;
 }
 

From 21634a19f6467674ef67fba9714c835a1c0a1e67 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Thu, 9 Mar 2017 09:34:36 +0800
Subject: [PATCH 023/104] btrfs: Introduce a function to check if all chunks are
 OK for degraded rw mount

Introduce a new function, btrfs_check_rw_degradable(), to check if all
chunks in btrfs are OK for degraded rw mount.

It provides the new basis for accurate btrfs mount/remount and even
runtime degraded mount checks, rather than the old one-size-fits-all
method.

Btrfs currently uses num_tolerated_disk_barrier_failures to do global
check for tolerated missing device.

Although the one-size-fits-all solution is quite safe, it's too strict
if data and metadata have different duplication levels.

For example, if one uses single data and RAID1 metadata on 2 disks, it
means any missing device will make the fs unable to be mounted
degraded.

But in fact, sometimes all single chunks may be on the existing
device, and in that case we should allow it to be mounted rw degraded.

Such case can be easily reproduced using the following script:
 # mkfs.btrfs -f -m raid1 -d single /dev/sdb /dev/sdc
 # wipefs -f /dev/sdc
 # mount /dev/sdb -o degraded,rw

If using btrfs-debug-tree to check /dev/sdb, one should find that the
data chunk is only in sdb, so in fact it should allow degraded mount.

This patchset will introduce a new per-chunk degradable check for
btrfs, allow above case to succeed, and it's quite small anyway.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ copied text from cover letter with more details about the problem being
  solved ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  2 ++
 2 files changed, 60 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 59f82939e634..1bc822b5a4c0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6813,6 +6813,64 @@ out_short_read:
 	return -EIO;
 }
 
+/*
+ * Check if all chunks in the fs are OK for read-write degraded mount
+ *
+ * Return true if all chunks meet the minimal RW mount requirements.
+ * Return false if any chunk doesn't meet the minimal RW mount requirements.
+ */
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct extent_map *em;
+	u64 next_start = 0;
+	bool ret = true;
+
+	read_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
+	read_unlock(&map_tree->map_tree.lock);
+	/* No chunk at all? Return false anyway */
+	if (!em) {
+		ret = false;
+		goto out;
+	}
+	while (em) {
+		struct map_lookup *map;
+		int missing = 0;
+		int max_tolerated;
+		int i;
+
+		map = em->map_lookup;
+		max_tolerated =
+			btrfs_get_num_tolerated_disk_barrier_failures(
+					map->type);
+		for (i = 0; i < map->num_stripes; i++) {
+			struct btrfs_device *dev = map->stripes[i].dev;
+
+			if (!dev || !dev->bdev || dev->missing ||
+			    dev->last_flush_error)
+				missing++;
+		}
+		if (missing > max_tolerated) {
+			btrfs_warn(fs_info,
+	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
+				   em->start, missing, max_tolerated);
+			free_extent_map(em);
+			ret = false;
+			goto out;
+		}
+		next_start = extent_map_end(em);
+		free_extent_map(em);
+
+		read_lock(&map_tree->map_tree.lock);
+		em = lookup_extent_mapping(&map_tree->map_tree, next_start,
+					   (u64)(-1) - next_start);
+		read_unlock(&map_tree->map_tree.lock);
+	}
+out:
+	return ret;
+}
+
 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root = fs_info->chunk_root;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6f45fd60d15a..5824cdc0b3fc 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -543,4 +543,6 @@ struct list_head *btrfs_get_fs_uuids(void);
 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
 
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info);
+
 #endif

From 4330e183c9537df20952d4a9ee142c536fb8ae54 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Thu, 9 Mar 2017 09:34:37 +0800
Subject: [PATCH 024/104] btrfs: Do chunk level check for degraded rw mount

Now use the btrfs_check_rw_degradable() to check if we can mount in the
degraded mode.

With this patch, we can mount in the following case:
 # mkfs.btrfs -f -m raid1 -d single /dev/sdb /dev/sdc
 # wipefs -a /dev/sdc
 # mount /dev/sdb /mnt/btrfs -o degraded
 As the single data chunk is only on sdb, it's OK to mount as
 degraded, as missing one device is OK for RAID1.

But still fail in the following case as expected:
 # mkfs.btrfs -f -m raid1 -d single /dev/sdb /dev/sdc
 # wipefs -a /dev/sdb
 # mount /dev/sdc /mnt/btrfs -o degraded
 As the data chunk is only on sdb, it's not OK to mount it as
 degraded.

Reported-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Reported-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 080e2ebb8aa0..78fc7f5ab773 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3035,15 +3035,10 @@ retry_root_backup:
 		btrfs_err(fs_info, "failed to read block groups: %d", ret);
 		goto fail_sysfs;
 	}
-	fs_info->num_tolerated_disk_barrier_failures =
-		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
-	if (fs_info->fs_devices->missing_devices >
-	     fs_info->num_tolerated_disk_barrier_failures &&
-	    !(sb->s_flags & MS_RDONLY)) {
+
+	if (!(sb->s_flags & MS_RDONLY) && !btrfs_check_rw_degradable(fs_info)) {
 		btrfs_warn(fs_info,
-"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed",
-			fs_info->fs_devices->missing_devices,
-			fs_info->num_tolerated_disk_barrier_failures);
+		"writeable mount is not allowed due to too many missing devices");
 		goto fail_sysfs;
 	}
 

From b382cfe889da5d39aeceb79e261f862c3a53eafb Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Thu, 9 Mar 2017 09:34:38 +0800
Subject: [PATCH 025/104] btrfs: Do chunk level check for degraded remount

Just the same for mount time check, use btrfs_check_rw_degradable() to
check if we are OK to be remounted rw.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2351794fbc44..e4c268c250f5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1781,8 +1781,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 			goto restore;
 		}
 
-		if (fs_info->fs_devices->missing_devices >
-		     fs_info->num_tolerated_disk_barrier_failures) {
+		if (!btrfs_check_rw_degradable(fs_info)) {
 			btrfs_warn(fs_info,
 				"too many missing devices, writeable remount is not allowed");
 			ret = -EACCES;

From d10b82fe29cf3730ab7c1a487f0f62f10af5d301 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Tue, 27 Jun 2017 17:28:40 +0800
Subject: [PATCH 026/104] btrfs: Allow barrier_all_devices to do chunk level
 device check

The last user of num_tolerated_disk_barrier_failures is
barrier_all_devices().
But it can be easily changed to the new per-chunk degradable check
framework.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 78fc7f5ab773..552098e1b65b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3519,20 +3519,10 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device)
 	return bio->bi_status;
 }
 
-static int check_barrier_error(struct btrfs_fs_devices *fsdevs)
+static int check_barrier_error(struct btrfs_fs_info *fs_info)
 {
-	int dev_flush_error = 0;
-	struct btrfs_device *dev;
-
-	list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) {
-		if (!dev->bdev || dev->last_flush_error)
-			dev_flush_error++;
-	}
-
-	if (dev_flush_error >
-	    fsdevs->fs_info->num_tolerated_disk_barrier_failures)
+	if (!btrfs_check_rw_degradable(fs_info))
 		return -EIO;
-
 	return 0;
 }
 
@@ -3587,7 +3577,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 		 * to arrive at the volume status. So error checking
 		 * is being pushed to a separate loop.
 		 */
-		return check_barrier_error(info->fs_devices);
+		return check_barrier_error(info);
 	}
 	return 0;
 }

From bc3cce2378b9eeb420873b17664493e68fd07e8c Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Thu, 9 Mar 2017 09:34:41 +0800
Subject: [PATCH 027/104] btrfs: Cleanup num_tolerated_disk_barrier_failures

As we use per-chunk degradable check, the global
num_tolerated_disk_barrier_failures is of no use.

We can now remove it.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h   |  2 --
 fs/btrfs/disk-io.c | 54 ----------------------------------------------
 fs/btrfs/disk-io.h |  2 --
 fs/btrfs/volumes.c | 17 ---------------
 4 files changed, 75 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 34ed3dabd43a..06033b5f1657 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1071,8 +1071,6 @@ struct btrfs_fs_info {
 	/* next backup root to be overwritten */
 	int backup_root_index;
 
-	int num_tolerated_disk_barrier_failures;
-
 	/* device replace state */
 	struct btrfs_dev_replace dev_replace;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 552098e1b65b..2f366044d891 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3611,60 +3611,6 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
 	return min_tolerated;
 }
 
-int btrfs_calc_num_tolerated_disk_barrier_failures(
-	struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_ioctl_space_info space;
-	struct btrfs_space_info *sinfo;
-	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
-		       BTRFS_BLOCK_GROUP_SYSTEM,
-		       BTRFS_BLOCK_GROUP_METADATA,
-		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
-	int i;
-	int c;
-	int num_tolerated_disk_barrier_failures =
-		(int)fs_info->fs_devices->num_devices;
-
-	for (i = 0; i < ARRAY_SIZE(types); i++) {
-		struct btrfs_space_info *tmp;
-
-		sinfo = NULL;
-		rcu_read_lock();
-		list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
-			if (tmp->flags == types[i]) {
-				sinfo = tmp;
-				break;
-			}
-		}
-		rcu_read_unlock();
-
-		if (!sinfo)
-			continue;
-
-		down_read(&sinfo->groups_sem);
-		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
-			u64 flags;
-
-			if (list_empty(&sinfo->block_groups[c]))
-				continue;
-
-			btrfs_get_block_group_info(&sinfo->block_groups[c],
-						   &space);
-			if (space.total_bytes == 0 || space.used_bytes == 0)
-				continue;
-			flags = space.flags;
-
-			num_tolerated_disk_barrier_failures = min(
-				num_tolerated_disk_barrier_failures,
-				btrfs_get_num_tolerated_disk_barrier_failures(
-					flags));
-		}
-		up_read(&sinfo->groups_sem);
-	}
-
-	return num_tolerated_disk_barrier_failures;
-}
-
 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 {
 	struct list_head *head;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 0a634d3ffc16..b5d0c4cef819 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -142,8 +142,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 int btree_lock_page_hook(struct page *page, void *data,
 				void (*flush_fn)(void *));
 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
-int btrfs_calc_num_tolerated_disk_barrier_failures(
-	struct btrfs_fs_info *fs_info);
 int __init btrfs_end_io_wq_init(void);
 void btrfs_end_io_wq_exit(void);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1bc822b5a4c0..2272ac48a8a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1987,9 +1987,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 		free_fs_devices(cur_devices);
 	}
 
-	fs_info->num_tolerated_disk_barrier_failures =
-		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
-
 out:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -2487,8 +2484,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 				   "sysfs: failed to create fsid for sprout");
 	}
 
-	fs_info->num_tolerated_disk_barrier_failures =
-		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
 	ret = btrfs_commit_transaction(trans);
 
 	if (seeding_dev) {
@@ -3898,13 +3893,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 			   meta_target, data_target);
 	}
 
-	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-		fs_info->num_tolerated_disk_barrier_failures = min(
-			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
-			btrfs_get_num_tolerated_disk_barrier_failures(
-				bctl->sys.target));
-	}
-
 	ret = insert_balance_item(fs_info, bctl);
 	if (ret && ret != -EEXIST)
 		goto out;
@@ -3927,11 +3915,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	mutex_lock(&fs_info->balance_mutex);
 	atomic_dec(&fs_info->balance_running);
 
-	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-		fs_info->num_tolerated_disk_barrier_failures =
-			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
-	}
-
 	if (bargs) {
 		memset(bargs, 0, sizeof(*bargs));
 		update_ioctl_balance_args(fs_info, 0, bargs);

From c55024514852930a6a31b6794aad44a56b117d9d Mon Sep 17 00:00:00 2001
From: Qu Wenruo <quwenruo@cn.fujitsu.com>
Date: Thu, 9 Mar 2017 09:34:42 +0800
Subject: [PATCH 028/104] btrfs: Enhance message when a device is missing
 during mount

For a missing device, btrfs will just refuse to mount with an almost
meaningless kernel message like:

 BTRFS info (device vdb6): disk space caching is enabled
 BTRFS info (device vdb6): has skinny extents
 BTRFS error (device vdb6): failed to read the system array: -5
 BTRFS error (device vdb6): open_ctree failed

This patch will print a new message about the missing device:

 BTRFS info (device vdb6): disk space caching is enabled
 BTRFS info (device vdb6): has skinny extents
 BTRFS warning (device vdb6): devid 2 uuid 80470722-cad2-4b90-b7c3-fee294552f1b is missing
 BTRFS error (device vdb6): failed to read the system array: -5
 BTRFS error (device vdb6): open_ctree failed

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 24 +++++++++++++++++-------
 fs/btrfs/volumes.h |  2 ++
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2272ac48a8a9..da997eabde3b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6492,6 +6492,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 		if (!map->stripes[i].dev &&
 		    !btrfs_test_opt(fs_info, DEGRADED)) {
 			free_extent_map(em);
+			btrfs_report_missing_device(fs_info, devid, uuid);
 			return -EIO;
 		}
 		if (!map->stripes[i].dev) {
@@ -6502,8 +6503,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 				free_extent_map(em);
 				return -EIO;
 			}
-			btrfs_warn(fs_info, "devid %llu uuid %pU is missing",
-				   devid, uuid);
+			btrfs_report_missing_device(fs_info, devid, uuid);
 		}
 		map->stripes[i].dev->in_fs_metadata = 1;
 	}
@@ -6620,17 +6620,21 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
 
 	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
 	if (!device) {
-		if (!btrfs_test_opt(fs_info, DEGRADED))
+		if (!btrfs_test_opt(fs_info, DEGRADED)) {
+			btrfs_report_missing_device(fs_info, devid, dev_uuid);
 			return -EIO;
+		}
 
 		device = add_missing_dev(fs_devices, devid, dev_uuid);
 		if (!device)
 			return -ENOMEM;
-		btrfs_warn(fs_info, "devid %llu uuid %pU missing",
-				devid, dev_uuid);
+		btrfs_report_missing_device(fs_info, devid, dev_uuid);
 	} else {
-		if (!device->bdev && !btrfs_test_opt(fs_info, DEGRADED))
-			return -EIO;
+		if (!device->bdev) {
+			btrfs_report_missing_device(fs_info, devid, dev_uuid);
+			if (!btrfs_test_opt(fs_info, DEGRADED))
+				return -EIO;
+		}
 
 		if(!device->bdev && !device->missing) {
 			/*
@@ -6796,6 +6800,12 @@ out_short_read:
 	return -EIO;
 }
 
+void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid,
+				 u8 *uuid)
+{
+	btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid);
+}
+
 /*
  * Check if all chunks in the fs are OK for read-write degraded mount
  *
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5824cdc0b3fc..e906377ed329 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -544,5 +544,7 @@ void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
 
 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info);
+void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid,
+				 u8 *uuid);
 
 #endif

From b6e6bca51e531989de11af7967bd32bd973d96b9 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 12 Jul 2017 09:42:19 +0300
Subject: [PATCH 029/104] btrfs: qgroups: Fix BUG_ON condition in tree level
 check

The current code was erroneously checking for
root_level > BTRFS_MAX_LEVEL. If we had a root_level of 8 then the check
won't trigger and we could potentially hit a buffer overflow. The
correct check should be root_level >= BTRFS_MAX_LEVEL .

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Qu Wenruo <quwenruo.btrfs@gmx.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/qgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4ce351efe281..acb48983be26 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1603,7 +1603,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 	struct extent_buffer *eb = root_eb;
 	struct btrfs_path *path = NULL;
 
-	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+	BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
 	BUG_ON(root_eb == NULL);
 
 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))

From 913e153572218c911125414d4ca1f8531f20c120 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 13 Jul 2017 15:32:18 +0200
Subject: [PATCH 030/104] btrfs: drop newlines from strings when using btrfs_*
 helpers

The helpers append "\n" so we can keep the actual strings shorter. The
extra newline will print an empty line.  Some messages have been
slightly modified to be more consistent with the rest (lowercase first
letter).

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c      | 2 +-
 fs/btrfs/free-space-cache.c | 2 +-
 fs/btrfs/inode.c            | 2 +-
 fs/btrfs/qgroup.c           | 2 +-
 fs/btrfs/scrub.c            | 3 +--
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4fe93c436302..fd2fee398c83 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6832,7 +6832,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		if (ret) {
 			const char *errstr = btrfs_decode_error(ret);
 			btrfs_warn(fs_info,
-				   "Discard failed while removing blockgroup: errno=%d %s\n",
+			   "discard failed while removing blockgroup: errno=%d %s",
 				   ret, errstr);
 		}
 	}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index c5e6180cdb8c..cdc9f4015ec3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -709,7 +709,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 
 	if (!BTRFS_I(inode)->generation) {
 		btrfs_info(fs_info,
-			   "The free space cache file (%llu) is invalid. skip it\n",
+			   "the free space cache file (%llu) is invalid, skip it",
 			   offset);
 		return 0;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 97970602c3d5..3bf7bae36e56 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8017,7 +8017,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 	bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
 
 	btrfs_debug(BTRFS_I(inode)->root->fs_info,
-		    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+		    "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
 		    read_mode, failrec->this_mirror, failrec->in_validation);
 
 	ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index acb48983be26..ddc37c537058 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2646,7 +2646,7 @@ out:
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		btrfs_err(fs_info,
-			  "fail to start transaction for status update: %d\n",
+			  "fail to start transaction for status update: %d",
 			  err);
 		goto done;
 	}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6f1e4c984b94..de53c521a50f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3869,8 +3869,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			ro_set = 0;
 		} else {
 			btrfs_warn(fs_info,
-				   "failed setting block group ro, ret=%d\n",
-				   ret);
+				   "failed setting block group ro: %d", ret);
 			btrfs_put_block_group(cache);
 			break;
 		}

From 26b28dce50091ae36ebb0bf9cb814a43861f0641 Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Thu, 29 Jun 2017 10:57:26 -0700
Subject: [PATCH 031/104] btrfs: Keep one more workspace around

find_workspace() allocates up to num_online_cpus() + 1 workspaces.
free_workspace() will only keep num_online_cpus() workspaces. When
(de)compressing we will allocate num_online_cpus() + 1 workspaces, then
free one, and repeat. Instead, we can just keep num_online_cpus() + 1
workspaces around, and never have to allocate/free another workspace in the
common case.

I tested on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM. I mounted a
BtrFS partition with -o compress-force={lzo,zlib,zstd} and logged whenever
a workspace was allocated or freed. Then I copied vmlinux (527 MB) to the
partition. Before the patch, during the copy it would allocate and free 5-6
workspaces. After, it only allocated the initial 3. This held true for lzo,
zlib, and zstd. The time it took to execute cp vmlinux /mnt/btrfs && sync
dropped from 1.70s to 1.44s with lzo compression, and from 2.04s to 1.80s
for zstd compression.

Signed-off-by: Nick Terrell <terrelln@fb.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d2ef9ac2a630..3896bd0175ec 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -825,7 +825,7 @@ static void free_workspace(int type, struct list_head *workspace)
 	int *free_ws			= &btrfs_comp_ws[idx].free_ws;
 
 	spin_lock(ws_lock);
-	if (*free_ws < num_online_cpus()) {
+	if (*free_ws <= num_online_cpus()) {
 		list_add(workspace, idle_ws);
 		(*free_ws)++;
 		spin_unlock(ws_lock);

From 19aee8dea31fbaa5139968d20e36e1d78cbf30a0 Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Tue, 18 Jul 2017 17:37:05 +0800
Subject: [PATCH 032/104] btrfs: btrfs_inherit_iflags() can be static

btrfs_new_inode() is the only consumer, so move it from ioctl.c to
inode.c and make it static.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h |  1 -
 fs/btrfs/inode.c | 31 +++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.c | 31 -------------------------------
 3 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 06033b5f1657..6510f246f71e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3230,7 +3230,6 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 int btrfs_ioctl_get_supported_features(void __user *arg);
 void btrfs_update_iflags(struct inode *inode);
-void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 int btrfs_is_empty_uuid(u8 *uuid);
 int btrfs_defrag_file(struct inode *inode, struct file *file,
 		      struct btrfs_ioctl_defrag_range_args *range,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3bf7bae36e56..3926e9cbf0d9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6183,6 +6183,37 @@ static int btrfs_insert_inode_locked(struct inode *inode)
 		   btrfs_find_actor, &args);
 }
 
+/*
+ * Inherit flags from the parent inode.
+ *
+ * Currently only the compression flags and the cow flags are inherited.
+ */
+static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
+{
+	unsigned int flags;
+
+	if (!dir)
+		return;
+
+	flags = BTRFS_I(dir)->flags;
+
+	if (flags & BTRFS_INODE_NOCOMPRESS) {
+		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+	} else if (flags & BTRFS_INODE_COMPRESS) {
+		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+	}
+
+	if (flags & BTRFS_INODE_NODATACOW) {
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+		if (S_ISREG(inode->i_mode))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+	}
+
+	btrfs_update_iflags(inode);
+}
+
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct inode *dir,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fa1b78cf25f6..335e252a99f5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -156,37 +156,6 @@ void btrfs_update_iflags(struct inode *inode)
 		      new_fl);
 }
 
-/*
- * Inherit flags from the parent inode.
- *
- * Currently only the compression flags and the cow flags are inherited.
- */
-void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
-{
-	unsigned int flags;
-
-	if (!dir)
-		return;
-
-	flags = BTRFS_I(dir)->flags;
-
-	if (flags & BTRFS_INODE_NOCOMPRESS) {
-		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
-		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
-	} else if (flags & BTRFS_INODE_COMPRESS) {
-		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
-		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
-	}
-
-	if (flags & BTRFS_INODE_NODATACOW) {
-		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
-		if (S_ISREG(inode->i_mode))
-			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-	}
-
-	btrfs_update_iflags(inode);
-}
-
 static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
 {
 	struct btrfs_inode *ip = BTRFS_I(file_inode(file));

From 47f08b96993831f4c51ed7cb07a86a97d4138d3f Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 18 Jul 2017 15:39:08 +0300
Subject: [PATCH 033/104] btrfs: Use explicit round_down macro in btrfs resize
 ioctl handler

No functional changes, just make the code more self-explanatory.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 335e252a99f5..3e744dd17ac5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1569,8 +1569,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		goto out_free;
 	}
 
-	new_size = div_u64(new_size, fs_info->sectorsize);
-	new_size *= fs_info->sectorsize;
+	new_size = round_down(new_size, fs_info->sectorsize);
 
 	btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
 			  rcu_str_deref(device->name), new_size);

From 500ceed807c4af359be7d08be2e17487ba00e14e Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 14 Jul 2017 09:55:41 +0300
Subject: [PATCH 034/104] btrfs: Remove find_raid56_stripe_len

find_raid56_stripe_len statically returns SZ_64K which equals BTRFS_STRIPE_LEN.
Its sole caller is __btrfs_alloc_chunk and it assigns the return value to a
variable which is already set to BTRFS_STRIPE_LEN. So remove the function
invocation altogether and remove the function itself. Also remove the variable
since it's only aliasing BTRFS_STRIPE_LEN and use the define directly. Use
the occasion to simplify the rounding down of stripe_size now that the value
we want to align it to is a power of 2.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Qu Wenruo <quwenruo.btrfs@gmx.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index da997eabde3b..be3c34733d82 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4584,12 +4584,6 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 	return 0;
 }
 
-static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
-{
-	/* TODO allow them to set a preferred stripe size */
-	return SZ_64K;
-}
-
 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 {
 	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
@@ -4632,7 +4626,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 max_chunk_size;
 	u64 stripe_size;
 	u64 num_bytes;
-	u64 raid_stripe_len = BTRFS_STRIPE_LEN;
 	int ndevs;
 	int i;
 	int j;
@@ -4767,16 +4760,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 */
 	data_stripes = num_stripes / ncopies;
 
-	if (type & BTRFS_BLOCK_GROUP_RAID5) {
-		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
-							 info->stripesize);
+	if (type & BTRFS_BLOCK_GROUP_RAID5)
 		data_stripes = num_stripes - 1;
-	}
-	if (type & BTRFS_BLOCK_GROUP_RAID6) {
-		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
-							 info->stripesize);
+
+	if (type & BTRFS_BLOCK_GROUP_RAID6)
 		data_stripes = num_stripes - 2;
-	}
 
 	/*
 	 * Use the number of data stripes to figure out how big this chunk
@@ -4801,8 +4789,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripe_size = div_u64(stripe_size, dev_stripes);
 
 	/* align to BTRFS_STRIPE_LEN */
-	stripe_size = div64_u64(stripe_size, raid_stripe_len);
-	stripe_size *= raid_stripe_len;
+	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
@@ -4820,9 +4807,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		}
 	}
 	map->sector_size = info->sectorsize;
-	map->stripe_len = raid_stripe_len;
-	map->io_align = raid_stripe_len;
-	map->io_width = raid_stripe_len;
+	map->stripe_len = BTRFS_STRIPE_LEN;
+	map->io_align = BTRFS_STRIPE_LEN;
+	map->io_width = BTRFS_STRIPE_LEN;
 	map->type = type;
 	map->sub_stripes = sub_stripes;
 

From 110840bb629a974b1dafafce2d50a891496171b6 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 19 Jul 2017 10:47:57 +0300
Subject: [PATCH 035/104] btrfs: Remove unused variables

clear_super - usage was removed in commit cea67ab92d3d ("btrfs: clean
the old superblocks before freeing the device") but that change forgot
to remove the actual variable.

max_key - commit 6174d3cb43aa ("Btrfs: remove unused max_key arg from
btrfs_search_forward") removed the max_key parameter but it forgot to
remove references from callers.

stripe_len - this one was added by e06cd3dd7cea ("Btrfs: add validadtion
checks for chunk loading") but even then it wasn't used.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index be3c34733d82..325ea062dc6b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1872,7 +1872,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	struct btrfs_fs_devices *cur_devices;
 	u64 num_devices;
 	int ret = 0;
-	bool clear_super = false;
 
 	mutex_lock(&uuid_mutex);
 
@@ -1908,7 +1907,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 		list_del_init(&device->dev_alloc_list);
 		device->fs_devices->rw_devices--;
 		mutex_unlock(&fs_info->chunk_mutex);
-		clear_super = true;
 	}
 
 	mutex_unlock(&uuid_mutex);
@@ -4110,7 +4108,6 @@ static int btrfs_uuid_scan_kthread(void *data)
 	struct btrfs_fs_info *fs_info = data;
 	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_key key;
-	struct btrfs_key max_key;
 	struct btrfs_path *path = NULL;
 	int ret = 0;
 	struct extent_buffer *eb;
@@ -4129,10 +4126,6 @@ static int btrfs_uuid_scan_kthread(void *data)
 	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = 0;
 
-	max_key.objectid = (u64)-1;
-	max_key.type = BTRFS_ROOT_ITEM_KEY;
-	max_key.offset = (u64)-1;
-
 	while (1) {
 		ret = btrfs_search_forward(root, &key, path, 0);
 		if (ret) {
@@ -6415,7 +6408,6 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 	struct extent_map *em;
 	u64 logical;
 	u64 length;
-	u64 stripe_len;
 	u64 devid;
 	u8 uuid[BTRFS_UUID_SIZE];
 	int num_stripes;
@@ -6424,7 +6416,6 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 
 	logical = key->offset;
 	length = btrfs_chunk_length(leaf, chunk);
-	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 
 	ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);

From e4ff5fb5dc3742b126a2f4d1f18706509812d084 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 19 Jul 2017 10:48:42 +0300
Subject: [PATCH 036/104] btrfs: Remove unused parameters from volume.c
 functions

This also adjusts the respective callers in other files. Those were
found with -Wunused-parameter.

btrfs_full_stripe_len's mapping_tree - introduced by 53b381b3abeb
("Btrfs: RAID5 and RAID6") but it was never really used even in that
commit

btrfs_is_parity_mirror's mirror_num - same as above

chunk_drange_filter's chunk_offset - introduced by 94e60d5a5c4b ("Btrfs:
devid subset filter") and never used.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 4 +---
 fs/btrfs/extent_io.c   | 2 +-
 fs/btrfs/volumes.c     | 7 ++-----
 fs/btrfs/volumes.h     | 3 +--
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fd2fee398c83..08e620c43842 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9944,9 +9944,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 
 	cache->fs_info = fs_info;
-	cache->full_stripe_len = btrfs_full_stripe_len(fs_info,
-						       &fs_info->mapping_tree,
-						       start);
+	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
 	set_free_space_tree_thresholds(cache);
 
 	atomic_set(&cache->count, 1);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7dd1b2dc7c68..a7bebba4f9fc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1997,7 +1997,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	 * read repair operation.
 	 */
 	btrfs_bio_counter_inc_blocked(fs_info);
-	if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) {
+	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
 		/*
 		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
 		 * to update all raid stripes, but here we just want to correct
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 325ea062dc6b..877224b66a12 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3305,7 +3305,6 @@ static int chunk_devid_filter(struct extent_buffer *leaf,
 /* [pstart, pend) */
 static int chunk_drange_filter(struct extent_buffer *leaf,
 			       struct btrfs_chunk *chunk,
-			       u64 chunk_offset,
 			       struct btrfs_balance_args *bargs)
 {
 	struct btrfs_stripe *stripe;
@@ -3432,7 +3431,7 @@ static int should_balance_chunk(struct btrfs_fs_info *fs_info,
 
 	/* drange filter, makes sense only with devid filter */
 	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
-	    chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+	    chunk_drange_filter(leaf, chunk, bargs)) {
 		return 0;
 	}
 
@@ -5128,7 +5127,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 }
 
 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
-				    struct btrfs_mapping_tree *map_tree,
 				    u64 logical)
 {
 	struct extent_map *em;
@@ -5146,8 +5144,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 	return len;
 }
 
-int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
-			   u64 logical, u64 len, int mirror_num)
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index e906377ed329..181b365cab0c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -481,9 +481,8 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
 					      struct btrfs_device *tgtdev);
 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
-			   u64 logical, u64 len, int mirror_num);
+			   u64 logical, u64 len);
 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
-				    struct btrfs_mapping_tree *map_tree,
 				    u64 logical);
 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info,

From e3f3ad126829779a5ad0e7846cd52adf7cc180a9 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 13 Jul 2017 14:11:07 +0300
Subject: [PATCH 037/104] btrfs: Remove never reached error handling code in
 __add_reloc_root

One of the error handling paths in __add_reloc_root contains btrfs_panic()
followed by some other code. As the name implies what it does is print
some error message and call BUG, so naturally what follows afterwards is
not invoked. So remove this extra code.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/relocation.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 65661d1aae4e..1a532bb72eab 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1308,8 +1308,6 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
 		btrfs_panic(fs_info, -EEXIST,
 			    "Duplicate root found for start=%llu while inserting into relocation tree",
 			    node->bytenr);
-		kfree(node);
-		return -EEXIST;
 	}
 
 	list_add_tail(&root->root_list, &rc->reloc_roots);

From 3ec836211183eee87609f832a949dfe711af2b5a Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 22 Jun 2017 02:26:54 +0200
Subject: [PATCH 038/104] btrfs: use GFP_KERNEL in mount and remount

We don't need to restrict the allocation flags in btrfs_mount or
_remount. No big filesystem locks are held (possibly s_umount but that
does not count here).

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e4c268c250f5..64981fc8e39e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -426,7 +426,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 	 * strsep changes the string, duplicate it because parse_options
 	 * gets called twice
 	 */
-	options = kstrdup(options, GFP_NOFS);
+	options = kstrdup(options, GFP_KERNEL);
 	if (!options)
 		return -ENOMEM;
 
@@ -950,7 +950,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
 	}
 	path->leave_spinning = 1;
 
-	name = kmalloc(PATH_MAX, GFP_NOFS);
+	name = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!name) {
 		ret = -ENOMEM;
 		goto err;
@@ -1336,10 +1336,11 @@ static char *setup_root_args(char *args)
 	char *buf, *dst, *sep;
 
 	if (!args)
-		return kstrdup("subvolid=0", GFP_NOFS);
+		return kstrdup("subvolid=0", GFP_KERNEL);
 
 	/* The worst case is that we add ",subvolid=0" to the end. */
-	buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS);
+	buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1,
+			GFP_KERNEL);
 	if (!buf)
 		return NULL;
 
@@ -1568,7 +1569,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	 * it for searching for existing supers, so this lets us do that and
 	 * then open_ctree will properly initialize everything later.
 	 */
-	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
+	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
 	if (!fs_info) {
 		error = -ENOMEM;
 		goto error_sec_opts;
@@ -1576,8 +1577,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 	fs_info->fs_devices = fs_devices;
 
-	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
-	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
 	security_init_mnt_opts(&fs_info->security_opts);
 	if (!fs_info->super_copy || !fs_info->super_for_commit) {
 		error = -ENOMEM;

From 63e727ecd238be2281156f2f0d5d7ad593d14ee8 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 22 Jun 2017 03:13:02 +0200
Subject: [PATCH 039/104] btrfs: use GFP_KERNEL in btrfs_defrag_file

We can safely use GFP_KERNEL, the function is called from two contexts:

- ioctl handler, called directly, no locks taken
- cleaner thread, running all queued defrag work, outside of any locks

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3e744dd17ac5..4d1b07b81549 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1277,7 +1277,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	 * context
 	 */
 	if (!file) {
-		ra = kzalloc(sizeof(*ra), GFP_NOFS);
+		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
 		if (!ra)
 			return -ENOMEM;
 		file_ra_state_init(ra, inode->i_mapping);
@@ -1285,8 +1285,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		ra = &file->f_ra;
 	}
 
-	pages = kmalloc_array(max_cluster, sizeof(struct page *),
-			GFP_NOFS);
+	pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
 	if (!pages) {
 		ret = -ENOMEM;
 		goto out_ra;

From 0a52d108089f337ce20d74bf53f2e574c8ee43f2 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 22 Jun 2017 03:22:58 +0200
Subject: [PATCH 040/104] btrfs: defrag: make readahead state allocation
 failure non-fatal

All sorts of readahead errors are not considered fatal. We can continue
defragmentation without it, with some potential slow down, which will
last only for the current inode.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4d1b07b81549..496e873b14de 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1273,14 +1273,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		extent_thresh = SZ_256K;
 
 	/*
-	 * if we were not given a file, allocate a readahead
-	 * context
+	 * If we were not given a file, allocate a readahead context. As
+	 * readahead is just an optimization, defrag will work without it so
+	 * we don't error out.
 	 */
 	if (!file) {
 		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
-		if (!ra)
-			return -ENOMEM;
-		file_ra_state_init(ra, inode->i_mapping);
+		if (ra)
+			file_ra_state_init(ra, inode->i_mapping);
 	} else {
 		ra = &file->f_ra;
 	}
@@ -1363,8 +1363,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 		if (i + cluster > ra_index) {
 			ra_index = max(i, ra_index);
-			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
-				       cluster);
+			if (ra)
+				btrfs_force_ra(inode->i_mapping, ra, file,
+						ra_index, cluster);
 			ra_index += cluster;
 		}
 

From 74b595fe672a4c4a40fe87c727304e2df3485f13 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 14 Jun 2017 16:24:56 +0200
Subject: [PATCH 041/104] btrfs: factor reading progress out of
 btrfs_dev_replace_status

We'll want to read the percentage value from dev_replace elsewhere, move
the logic to a separate helper.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c | 46 +++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index bee3edeea7a3..31d01eefecd0 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -639,11 +639,39 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
 	write_unlock(&em_tree->lock);
 }
 
+/*
+ * Read progress of device replace status according to the state and last
+ * stored position. The value format is the same as for
+ * btrfs_dev_replace::progress_1000
+ */
+static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	u64 ret = 0;
+
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		ret = 0;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+		ret = 1000;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		ret = div64_u64(dev_replace->cursor_left,
+				div_u64(btrfs_device_get_total_bytes(
+						dev_replace->srcdev), 1000));
+		break;
+	}
+
+	return ret;
+}
+
 void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 			      struct btrfs_ioctl_dev_replace_args *args)
 {
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-	struct btrfs_device *srcdev;
 
 	btrfs_dev_replace_lock(dev_replace, 0);
 	/* even if !dev_replace_is_valid, the values are good enough for
@@ -656,21 +684,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 		atomic64_read(&dev_replace->num_write_errors);
 	args->status.num_uncorrectable_read_errors =
 		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
-	switch (dev_replace->replace_state) {
-	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
-	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
-		args->status.progress_1000 = 0;
-		break;
-	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
-		args->status.progress_1000 = 1000;
-		break;
-	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
-	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
-		srcdev = dev_replace->srcdev;
-		args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
-			div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
-		break;
-	}
+	args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
 	btrfs_dev_replace_unlock(dev_replace, 0);
 }
 

From f1b8a1e8c00a12000e5683f7312f2b1d877717ae Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 14 Jun 2017 16:28:42 +0200
Subject: [PATCH 042/104] btrfs: simplify btrfs_dev_replace_kthread

This function prints an informative message and then continues
dev-replace. The message contains a progress percentage which is read
from the status. The status is allocated dynamically, about 2600 bytes,
just to read the single value. That's an overkill. We'll use the new
helper and drop the allocation.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 31d01eefecd0..7a93a3e1a847 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -809,25 +809,19 @@ static int btrfs_dev_replace_kthread(void *data)
 {
 	struct btrfs_fs_info *fs_info = data;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-	struct btrfs_ioctl_dev_replace_args *status_args;
 	u64 progress;
 
-	status_args = kzalloc(sizeof(*status_args), GFP_KERNEL);
-	if (status_args) {
-		btrfs_dev_replace_status(fs_info, status_args);
-		progress = status_args->status.progress_1000;
-		kfree(status_args);
-		progress = div_u64(progress, 10);
-		btrfs_info_in_rcu(fs_info,
-			"continuing dev_replace from %s (devid %llu) to %s @%u%%",
-			dev_replace->srcdev->missing ? "<missing disk>" :
-			rcu_str_deref(dev_replace->srcdev->name),
-			dev_replace->srcdev->devid,
-			dev_replace->tgtdev ?
-			rcu_str_deref(dev_replace->tgtdev->name) :
-			"<missing target disk>",
-			(unsigned int)progress);
-	}
+	progress = btrfs_dev_replace_progress(fs_info);
+	progress = div_u64(progress, 10);
+	btrfs_info_in_rcu(fs_info,
+		"continuing dev_replace from %s (devid %llu) to %s @%u%%",
+		dev_replace->srcdev->missing ? "<missing disk>"
+			: rcu_str_deref(dev_replace->srcdev->name),
+		dev_replace->srcdev->devid,
+		dev_replace->tgtdev ? rcu_str_deref(dev_replace->tgtdev->name)
+			: "<missing target disk>",
+		(unsigned int)progress);
+
 	btrfs_dev_replace_continue_on_mount(fs_info);
 	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 

From a4f78750ef1882e59bb4f947e216cf61ef2d67d2 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 29 Jun 2017 18:37:49 +0200
Subject: [PATCH 043/104] btrfs: get fs_info from eb in btrfs_print_leaf,
 remove argument

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c       | 14 +++++++-------
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/extent-tree.c |  6 +++---
 fs/btrfs/print-tree.c  |  6 ++++--
 fs/btrfs/print-tree.h  |  2 +-
 fs/btrfs/root-tree.c   |  2 +-
 6 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3f4daa9d6e2c..6d49db7d86be 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4650,7 +4650,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
-		btrfs_print_leaf(fs_info, leaf);
+		btrfs_print_leaf(leaf);
 		BUG();
 	}
 }
@@ -4679,7 +4679,7 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 	data_end = leaf_data_end(fs_info, leaf);
 
 	if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
-		btrfs_print_leaf(fs_info, leaf);
+		btrfs_print_leaf(leaf);
 		BUG();
 	}
 	slot = path->slots[0];
@@ -4687,7 +4687,7 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 
 	BUG_ON(slot < 0);
 	if (slot >= nritems) {
-		btrfs_print_leaf(fs_info, leaf);
+		btrfs_print_leaf(leaf);
 		btrfs_crit(fs_info, "slot %d too large, nritems %d",
 			   slot, nritems);
 		BUG_ON(1);
@@ -4718,7 +4718,7 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
-		btrfs_print_leaf(fs_info, leaf);
+		btrfs_print_leaf(leaf);
 		BUG();
 	}
 }
@@ -4757,7 +4757,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 	data_end = leaf_data_end(fs_info, leaf);
 
 	if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
-		btrfs_print_leaf(fs_info, leaf);
+		btrfs_print_leaf(leaf);
 		btrfs_crit(fs_info, "not enough freespace need %u have %d",
 			   total_size, btrfs_leaf_free_space(fs_info, leaf));
 		BUG();
@@ -4767,7 +4767,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
 
 		if (old_data < data_end) {
-			btrfs_print_leaf(fs_info, leaf);
+			btrfs_print_leaf(leaf);
 			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
 				   slot, old_data, data_end);
 			BUG_ON(1);
@@ -4811,7 +4811,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
-		btrfs_print_leaf(fs_info, leaf);
+		btrfs_print_leaf(leaf);
 		BUG();
 	}
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2f366044d891..a42ae7676759 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3984,7 +3984,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 					 fs_info->dirty_metadata_batch);
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
-		btrfs_print_leaf(fs_info, buf);
+		btrfs_print_leaf(buf);
 		ASSERT(0);
 	}
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 08e620c43842..9f0563dbbd5f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6960,7 +6960,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 					  "umm, got %d back from search, was looking for %llu",
 					  ret, bytenr);
 				if (ret > 0)
-					btrfs_print_leaf(info, path->nodes[0]);
+					btrfs_print_leaf(path->nodes[0]);
 			}
 			if (ret < 0) {
 				btrfs_abort_transaction(trans, ret);
@@ -6969,7 +6969,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			extent_slot = path->slots[0];
 		}
 	} else if (WARN_ON(ret == -ENOENT)) {
-		btrfs_print_leaf(info, path->nodes[0]);
+		btrfs_print_leaf(path->nodes[0]);
 		btrfs_err(info,
 			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
 			bytenr, parent, root_objectid, owner_objectid,
@@ -7006,7 +7006,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			btrfs_err(info,
 				  "umm, got %d back from search, was looking for %llu",
 				ret, bytenr);
-			btrfs_print_leaf(info, path->nodes[0]);
+			btrfs_print_leaf(path->nodes[0]);
 		}
 		if (ret < 0) {
 			btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fcae61e175f3..9c3911f4a100 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -161,8 +161,9 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
 	}
 }
 
-void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l)
+void btrfs_print_leaf(struct extent_buffer *l)
 {
+	struct btrfs_fs_info *fs_info;
 	int i;
 	u32 type, nr;
 	struct btrfs_item *item;
@@ -180,6 +181,7 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l)
 	if (!l)
 		return;
 
+	fs_info = l->fs_info;
 	nr = btrfs_header_nritems(l);
 
 	btrfs_info(fs_info, "leaf %llu total ptrs %d free space %d",
@@ -329,7 +331,7 @@ void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c)
 	nr = btrfs_header_nritems(c);
 	level = btrfs_header_level(c);
 	if (level == 0) {
-		btrfs_print_leaf(fs_info, c);
+		btrfs_print_leaf(c);
 		return;
 	}
 	btrfs_info(fs_info,
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 4f2e0ea0e95a..689a52ee0cd1 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -18,6 +18,6 @@
 
 #ifndef __PRINT_TREE_
 #define __PRINT_TREE_
-void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l);
+void btrfs_print_leaf(struct extent_buffer *l);
 void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c);
 #endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 460db0cb2d07..5b488af6f25e 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -151,7 +151,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	if (ret != 0) {
-		btrfs_print_leaf(fs_info, path->nodes[0]);
+		btrfs_print_leaf(path->nodes[0]);
 		btrfs_crit(fs_info, "unable to update root key %llu %u %llu",
 			   key->objectid, key->type, key->offset);
 		BUG_ON(1);

From abe60ba45c145db2d90609138755fbb3dbc47d00 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 29 Jun 2017 18:37:49 +0200
Subject: [PATCH 044/104] btrfs: get fs_info from eb in btrfs_print_tree,
 remove argument

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/print-tree.c | 6 ++++--
 fs/btrfs/print-tree.h | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9c3911f4a100..6e7a8c40dcd9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -320,14 +320,16 @@ void btrfs_print_leaf(struct extent_buffer *l)
 	}
 }
 
-void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c)
+void btrfs_print_tree(struct extent_buffer *c)
 {
+	struct btrfs_fs_info *fs_info;
 	int i; u32 nr;
 	struct btrfs_key key;
 	int level;
 
 	if (!c)
 		return;
+	fs_info = c->fs_info;
 	nr = btrfs_header_nritems(c);
 	level = btrfs_header_level(c);
 	if (level == 0) {
@@ -361,7 +363,7 @@ void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c)
 		if (btrfs_header_level(next) !=
 		       level - 1)
 			BUG();
-		btrfs_print_tree(fs_info, next);
+		btrfs_print_tree(next);
 		free_extent_buffer(next);
 	}
 }
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 689a52ee0cd1..3afd508ed8c5 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -19,5 +19,5 @@
 #ifndef __PRINT_TREE_
 #define __PRINT_TREE_
 void btrfs_print_leaf(struct extent_buffer *l);
-void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c);
+void btrfs_print_tree(struct extent_buffer *c);
 #endif

From f1c77c55cda19445a490036bce497ed3af9696d7 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 6 Jun 2017 19:03:49 +0200
Subject: [PATCH 045/104] btrfs: cleanup types storing REQ_*

Unify types of local variables and parameters that store various
REQ_* values to unsigned int.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 14 ++++++++------
 fs/btrfs/inode.c     |  2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a7bebba4f9fc..339f39caa0b7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2756,7 +2756,8 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page,
 
 }
 
-static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
+static int submit_extent_page(unsigned int op, unsigned int op_flags,
+			      struct extent_io_tree *tree,
 			      struct writeback_control *wbc,
 			      struct page *page, sector_t sector,
 			      size_t size, unsigned long offset,
@@ -2877,7 +2878,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 			 get_extent_t *get_extent,
 			 struct extent_map **em_cached,
 			 struct bio **bio, int mirror_num,
-			 unsigned long *bio_flags, int read_flags,
+			 unsigned long *bio_flags, unsigned int read_flags,
 			 u64 *prev_em_start)
 {
 	struct inode *inode = page->mapping->host;
@@ -3163,7 +3164,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
 				   get_extent_t *get_extent,
 				   struct bio **bio, int mirror_num,
-				   unsigned long *bio_flags, int read_flags)
+				   unsigned long *bio_flags,
+				   unsigned int read_flags)
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_ordered_extent *ordered;
@@ -3310,7 +3312,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 				 struct extent_page_data *epd,
 				 loff_t i_size,
 				 unsigned long nr_written,
-				 int write_flags, int *nr_ret)
+				 unsigned int write_flags, int *nr_ret)
 {
 	struct extent_io_tree *tree = epd->tree;
 	u64 start = page_offset(page);
@@ -3464,7 +3466,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	size_t pg_offset = 0;
 	loff_t i_size = i_size_read(inode);
 	unsigned long end_index = i_size >> PAGE_SHIFT;
-	int write_flags = 0;
+	unsigned int write_flags = 0;
 	unsigned long nr_written = 0;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
@@ -3714,7 +3716,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 	unsigned long i, num_pages;
 	unsigned long bio_flags = 0;
 	unsigned long start, end;
-	int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META;
+	unsigned int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META;
 	int ret = 0;
 
 	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3926e9cbf0d9..9349a13b3d72 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8019,7 +8019,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
 	struct bio *bio;
 	int isector;
-	int read_mode = 0;
+	unsigned int read_mode = 0;
 	int segs;
 	int ret;
 

From 4b81ba48c66b1f927449585aa2a13bad8d91fcb4 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 6 Jun 2017 19:14:26 +0200
Subject: [PATCH 046/104] btrfs: merge REQ_OP and REQ_ flags to one parameter
 in submit_extent_page

The function submit_extent_page has 15(!) parameters right now, op and
op_flags are effectively one value stored to bio::bi_opf, no need to
pass them separately. So it's 14 parameters now.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 339f39caa0b7..ea4947c97505 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2756,8 +2756,10 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page,
 
 }
 
-static int submit_extent_page(unsigned int op, unsigned int op_flags,
-			      struct extent_io_tree *tree,
+/*
+ * @opf:	bio REQ_OP_* and REQ_* flags as one value
+ */
+static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
 			      struct writeback_control *wbc,
 			      struct page *page, sector_t sector,
 			      size_t size, unsigned long offset,
@@ -2804,7 +2806,7 @@ static int submit_extent_page(unsigned int op, unsigned int op_flags,
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
 	bio->bi_write_hint = page->mapping->host->i_write_hint;
-	bio_set_op_attrs(bio, op, op_flags);
+	bio->bi_opf = opf;
 	if (wbc) {
 		wbc_init_bio(wbc, bio);
 		wbc_account_io(wbc, page, page_size);
@@ -3059,7 +3061,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 			continue;
 		}
 
-		ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL,
+		ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
 					 page, sector, disk_io_size, pg_offset,
 					 bdev, bio,
 					 end_bio_extent_readpage, mirror_num,
@@ -3428,7 +3430,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 			       page->index, cur, end);
 		}
 
-		ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
+		ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
 					 page, sector, iosize, pg_offset,
 					 bdev, &epd->bio,
 					 end_bio_extent_writepage,
@@ -3746,7 +3748,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 
 		clear_page_dirty_for_io(p);
 		set_page_writeback(p);
-		ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
+		ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
 					 p, offset >> 9, PAGE_SIZE, 0, bdev,
 					 &epd->bio,
 					 end_bio_extent_buffer_writepage,

From 2dfeca9bfb0cdd9b18bdb8d222a5e5f1479c5ca2 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 14 Jun 2017 02:48:07 +0200
Subject: [PATCH 047/104] btrfs: merge alloc_device helpers

There are two helpers called in chain from one location, we can merge the
functionality.

Originally, alloc_fs_devices could fill the device uuid randomly if we
didn't give the uuid buffer. This happens for seed devices but the
fsid is generated in btrfs_prepare_sprout, so we can remove it.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 36 +++++++++++-------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 877224b66a12..ccedaa940f2e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -152,7 +152,15 @@ struct list_head *btrfs_get_fs_uuids(void)
 	return &fs_uuids;
 }
 
-static struct btrfs_fs_devices *__alloc_fs_devices(void)
+/*
+ * alloc_fs_devices - allocate struct btrfs_fs_devices
+ * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
+ *
+ * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
+ * The returned struct is not linked onto any lists and can be destroyed with
+ * kfree() right away.
+ */
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
 {
 	struct btrfs_fs_devices *fs_devs;
 
@@ -166,31 +174,8 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
 	INIT_LIST_HEAD(&fs_devs->resized_devices);
 	INIT_LIST_HEAD(&fs_devs->alloc_list);
 	INIT_LIST_HEAD(&fs_devs->list);
-
-	return fs_devs;
-}
-
-/**
- * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid:	a pointer to UUID for this FS.  If NULL a new UUID is
- *		generated.
- *
- * Return: a pointer to a new &struct btrfs_fs_devices on success;
- * ERR_PTR() on error.  Returned struct is not linked onto any lists and
- * can be destroyed with kfree() right away.
- */
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
-{
-	struct btrfs_fs_devices *fs_devs;
-
-	fs_devs = __alloc_fs_devices();
-	if (IS_ERR(fs_devs))
-		return fs_devs;
-
 	if (fsid)
 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
-	else
-		generate_random_uuid(fs_devs->fsid);
 
 	return fs_devs;
 }
@@ -2197,7 +2182,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
 	if (!fs_devices->seeding)
 		return -EINVAL;
 
-	seed_devices = __alloc_fs_devices();
+	seed_devices = alloc_fs_devices(NULL);
 	if (IS_ERR(seed_devices))
 		return PTR_ERR(seed_devices);
 
@@ -6522,6 +6507,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
 	int ret;
 
 	BUG_ON(!mutex_is_locked(&uuid_mutex));
+	ASSERT(fsid);
 
 	fs_devices = fs_info->fs_devices->seed;
 	while (fs_devices) {

From 35c70103a528603afb538ffeafe40c7b8251b0fe Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 15 Jun 2017 19:51:51 +0200
Subject: [PATCH 048/104] btrfs: refactor find_device helper

Polish the helper:
* drop underscores, no special meaning here
* pass fs_devices, as this is what the API implements
* drop noinline, no apparent reason for such simple helper
* constify uuid
* add comment

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ccedaa940f2e..3a43a9cc6183 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -254,9 +254,17 @@ static struct btrfs_device *__alloc_device(void)
 	return dev;
 }
 
-static noinline struct btrfs_device *__find_device(struct list_head *head,
-						   u64 devid, u8 *uuid)
+/*
+ * Find a device specified by @devid or @uuid in the list of @fs_devices, or
+ * return NULL.
+ *
+ * If devid and uuid are both specified, the match must be exact, otherwise
+ * only devid is used.
+ */
+static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
+		u64 devid, const u8 *uuid)
 {
+	struct list_head *head = &fs_devices->devices;
 	struct btrfs_device *dev;
 
 	list_for_each_entry(dev, head, dev_list) {
@@ -621,8 +629,8 @@ static noinline int device_list_add(const char *path,
 
 		device = NULL;
 	} else {
-		device = __find_device(&fs_devices->devices, devid,
-				       disk_super->dev_item.uuid);
+		device = find_device(fs_devices, devid,
+				disk_super->dev_item.uuid);
 	}
 
 	if (!device) {
@@ -6236,8 +6244,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 	while (cur_devices) {
 		if (!fsid ||
 		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
-			device = __find_device(&cur_devices->devices,
-					       devid, uuid);
+			device = find_device(cur_devices, devid, uuid);
 			if (device)
 				return device;
 		}

From abbb3b8ebfec577390873d33076d56bc6e758939 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 16 Jun 2017 00:50:33 +0200
Subject: [PATCH 049/104] btrfs: split write_dev_supers to two functions

There are two independent parts, one that writes the superblocks and
another that waits for completion. No functional changes, but cleanups,
reformatting and comment updates.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 133 +++++++++++++++++++++++++--------------------
 1 file changed, 75 insertions(+), 58 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a42ae7676759..b72711ad7121 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3373,19 +3373,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 }
 
 /*
- * this should be called twice, once with wait == 0 and
- * once with wait == 1.  When wait == 0 is done, all the buffer heads
- * we write are pinned.
+ * Write superblock @sb to the @device. Do not wait for completion, all the
+ * buffer heads we write are pinned.
  *
- * They are released when wait == 1 is done.
- * max_mirrors must be the same for both runs, and it indicates how
- * many supers on this one device should be written.
+ * Write @max_mirrors copies of the superblock, where 0 means default that fit
+ * the expected device size at commit time. Note that max_mirrors must be
+ * same for write and wait phases.
  *
- * max_mirrors == 0 means to write them all.
+ * Return number of errors when buffer head is not found or submission fails.
  */
 static int write_dev_supers(struct btrfs_device *device,
-			    struct btrfs_super_block *sb,
-			    int wait, int max_mirrors)
+			    struct btrfs_super_block *sb, int max_mirrors)
 {
 	struct buffer_head *bh;
 	int i;
@@ -3403,58 +3401,34 @@ static int write_dev_supers(struct btrfs_device *device,
 		    device->commit_total_bytes)
 			break;
 
-		if (wait) {
-			bh = __find_get_block(device->bdev, bytenr / 4096,
-					      BTRFS_SUPER_INFO_SIZE);
-			if (!bh) {
-				errors++;
-				continue;
-			}
-			wait_on_buffer(bh);
-			if (!buffer_uptodate(bh))
-				errors++;
+		btrfs_set_super_bytenr(sb, bytenr);
 
-			/* drop our reference */
-			brelse(bh);
+		crc = ~(u32)0;
+		crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc,
+				      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+		btrfs_csum_final(crc, sb->csum);
 
-			/* drop the reference from the wait == 0 run */
-			brelse(bh);
+		/* One reference for us, and we leave it for the caller */
+		bh = __getblk(device->bdev, bytenr / 4096,
+			      BTRFS_SUPER_INFO_SIZE);
+		if (!bh) {
+			btrfs_err(device->fs_info,
+			    "couldn't get super buffer head for bytenr %llu",
+			    bytenr);
+			errors++;
 			continue;
-		} else {
-			btrfs_set_super_bytenr(sb, bytenr);
-
-			crc = ~(u32)0;
-			crc = btrfs_csum_data((const char *)sb +
-					      BTRFS_CSUM_SIZE, crc,
-					      BTRFS_SUPER_INFO_SIZE -
-					      BTRFS_CSUM_SIZE);
-			btrfs_csum_final(crc, sb->csum);
-
-			/*
-			 * one reference for us, and we leave it for the
-			 * caller
-			 */
-			bh = __getblk(device->bdev, bytenr / 4096,
-				      BTRFS_SUPER_INFO_SIZE);
-			if (!bh) {
-				btrfs_err(device->fs_info,
-				    "couldn't get super buffer head for bytenr %llu",
-				    bytenr);
-				errors++;
-				continue;
-			}
-
-			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
-
-			/* one reference for submit_bh */
-			get_bh(bh);
-
-			set_buffer_uptodate(bh);
-			lock_buffer(bh);
-			bh->b_end_io = btrfs_end_buffer_write_sync;
-			bh->b_private = device;
 		}
 
+		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+
+		/* one reference for submit_bh */
+		get_bh(bh);
+
+		set_buffer_uptodate(bh);
+		lock_buffer(bh);
+		bh->b_end_io = btrfs_end_buffer_write_sync;
+		bh->b_private = device;
+
 		/*
 		 * we fua the first super.  The others we allow
 		 * to go down lazy.
@@ -3471,6 +3445,49 @@ static int write_dev_supers(struct btrfs_device *device,
 	return errors < i ? 0 : -1;
 }
 
+/*
+ * Wait for write completion of superblocks done by write_dev_supers,
+ * @max_mirrors same for write and wait phases.
+ *
+ * Return number of errors when buffer head is not found or not marked up to
+ * date.
+ */
+static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
+{
+	struct buffer_head *bh;
+	int i;
+	int errors = 0;
+	u64 bytenr;
+
+	if (max_mirrors == 0)
+		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+
+	for (i = 0; i < max_mirrors; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+		    device->commit_total_bytes)
+			break;
+
+		bh = __find_get_block(device->bdev, bytenr / 4096,
+				      BTRFS_SUPER_INFO_SIZE);
+		if (!bh) {
+			errors++;
+			continue;
+		}
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			errors++;
+
+		/* drop our reference */
+		brelse(bh);
+
+		/* drop the reference from the writing run */
+		brelse(bh);
+	}
+
+	return errors < i ? 0 : -1;
+}
+
 /*
  * endio for the write_dev_flush, this will wake anyone waiting
  * for the barrier when it is done
@@ -3668,7 +3685,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 		flags = btrfs_super_flags(sb);
 		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
 
-		ret = write_dev_supers(dev, sb, 0, max_mirrors);
+		ret = write_dev_supers(dev, sb, max_mirrors);
 		if (ret)
 			total_errors++;
 	}
@@ -3691,7 +3708,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
-		ret = write_dev_supers(dev, sb, 1, max_mirrors);
+		ret = wait_dev_supers(dev, max_mirrors);
 		if (ret)
 			total_errors++;
 	}

From 9f6d251033bc5bd0115c23fae8b6aac5f6263c2c Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 16 Jun 2017 01:48:05 +0200
Subject: [PATCH 050/104] btrfs: use named constant for bdev blocksize

Superblock is read and written using buffer heads, we need to set the
bdev blocksize. The magic constant has been hardcoded in several places,
so replace it with a named constant.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/check-integrity.c |  6 +++---
 fs/btrfs/disk-io.c         | 11 ++++++-----
 fs/btrfs/disk-io.h         |  8 ++++++++
 fs/btrfs/volumes.c         |  6 +++---
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 11d37c94ce05..7d65d98d2790 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -795,12 +795,12 @@ static int btrfsic_process_superblock_dev_mirror(
 	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
 	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
 		return -1;
-	bh = __bread(superblock_bdev, dev_bytenr / 4096,
+	bh = __bread(superblock_bdev, dev_bytenr / BTRFS_BDEV_BLOCKSIZE,
 		     BTRFS_SUPER_INFO_SIZE);
 	if (NULL == bh)
 		return -1;
 	super_tmp = (struct btrfs_super_block *)
-	    (bh->b_data + (dev_bytenr & 4095));
+	    (bh->b_data + (dev_bytenr & (BTRFS_BDEV_BLOCKSIZE - 1)));
 
 	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
 	    btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
@@ -2758,7 +2758,7 @@ int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh)
 	    (op == REQ_OP_WRITE) && bh->b_size > 0) {
 		u64 dev_bytenr;
 
-		dev_bytenr = 4096 * bh->b_blocknr;
+		dev_bytenr = BTRFS_BDEV_BLOCKSIZE * bh->b_blocknr;
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			pr_info("submit_bh(op=0x%x,0x%x, blocknr=%llu (bytenr %llu), size=%zu, data=%p, bdev=%p)\n",
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b72711ad7121..9b1f4ef54438 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2694,8 +2694,8 @@ int open_ctree(struct super_block *sb,
 	btrfs_init_balance(fs_info);
 	btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
 
-	sb->s_blocksize = 4096;
-	sb->s_blocksize_bits = blksize_bits(4096);
+	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
+	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
 
 	btrfs_init_btree_inode(fs_info);
 
@@ -3316,7 +3316,7 @@ int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
 	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
 		return -EINVAL;
 
-	bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
+	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
 	/*
 	 * If we fail to read from the underlying devices, as of now
 	 * the best option we have is to mark it EIO.
@@ -3409,7 +3409,7 @@ static int write_dev_supers(struct btrfs_device *device,
 		btrfs_csum_final(crc, sb->csum);
 
 		/* One reference for us, and we leave it for the caller */
-		bh = __getblk(device->bdev, bytenr / 4096,
+		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
 			      BTRFS_SUPER_INFO_SIZE);
 		if (!bh) {
 			btrfs_err(device->fs_info,
@@ -3468,7 +3468,8 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 		    device->commit_total_bytes)
 			break;
 
-		bh = __find_get_block(device->bdev, bytenr / 4096,
+		bh = __find_get_block(device->bdev,
+				      bytenr / BTRFS_BDEV_BLOCKSIZE,
 				      BTRFS_SUPER_INFO_SIZE);
 		if (!bh) {
 			errors++;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b5d0c4cef819..7f7c35d6347a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,14 @@
 #define BTRFS_SUPER_MIRROR_MAX	 3
 #define BTRFS_SUPER_MIRROR_SHIFT 12
 
+/*
+ * Fixed blocksize for all devices, applies to specific ways of reading
+ * metadata like superblock. Must meet the set_blocksize requirements.
+ *
+ * Do not change.
+ */
+#define BTRFS_BDEV_BLOCKSIZE	(4096)
+
 enum btrfs_wq_endio_type {
 	BTRFS_WQ_ENDIO_DATA = 0,
 	BTRFS_WQ_ENDIO_METADATA = 1,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3a43a9cc6183..12bd04a4104f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -303,7 +303,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 
 	if (flush)
 		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
-	ret = set_blocksize(*bdev, 4096);
+	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
 	if (ret) {
 		blkdev_put(*bdev, flags);
 		goto error;
@@ -2395,7 +2395,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	device->is_tgtdev_for_dev_replace = 0;
 	device->mode = FMODE_EXCL;
 	device->dev_stats_valid = 1;
-	set_blocksize(device->bdev, 4096);
+	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 
 	if (seeding_dev) {
 		sb->s_flags &= ~MS_RDONLY;
@@ -2598,7 +2598,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	device->is_tgtdev_for_dev_replace = 1;
 	device->mode = FMODE_EXCL;
 	device->dev_stats_valid = 1;
-	set_blocksize(device->bdev, 4096);
+	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 	device->fs_devices = fs_info->fs_devices;
 	list_add(&device->dev_list, &fs_info->fs_devices->devices);
 	fs_info->fs_devices->num_devices++;

From 3558d4f88ec8db65ccea5cfe5a421ee1e8cbb12b Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 26 Jul 2017 11:26:28 +0300
Subject: [PATCH 051/104] btrfs: Deprecate userspace transaction ioctls

Userspace transactions were introduced in commit 6bf13c0cc833 ("Btrfs:
transaction ioctls") to provide semantics that Ceph's object store
required. However, things have changed significantly since then, to the
point where btrfs is no longer suitable as a backend for ceph and in
fact it's actively advised against such usages. Considering this, there
doesn't seem to be a widespread, legit use case of userspace
transaction. They also clutter the file->private pointer.

So to end the agony let's nuke the userspace transaction ioctls. As a
first step let's give time for people to voice their objection by just
WARN()ing when the userspace transaction is used.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ move the warning past perm checks, keep the has-been-printed state;
  we're ok with just one warning over all filesystems ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 496e873b14de..4cfc3d4c0a37 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3967,11 +3967,22 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret;
+	static bool warned = false;
 
 	ret = -EPERM;
 	if (!capable(CAP_SYS_ADMIN))
 		goto out;
 
+	if (!warned) {
+		btrfs_warn(fs_info,
+			"Userspace transaction mechanism is considered "
+			"deprecated and slated to be removed in 4.17. "
+			"If you have a valid use case please "
+			"speak up on the mailing list");
+		WARN_ON(1);
+		warned = true;
+	}
+
 	ret = -EINPROGRESS;
 	if (file->private_data)
 		goto out;

From e38ae7a0868b02e7b183f18a4b75c8b2b68ce258 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 25 Jul 2017 17:48:28 +0300
Subject: [PATCH 052/104] btrfs: Make flush_space return void

The return value of flush_space used to have significance in the
early days when the code was first introduced and before the ticketed
enospc rework. Since the latter got introduced the return value lost any
significance whatsoever to its callers. So let's remove it. While at it
also remove the unused ticket variable in
btrfs_async_reclaim_metadata_space. It was used in the initial version
of the ticketed ENOSPC work, however Wang Xiaoguang detected a problem
with this and fixed it in ce129655c9d9 ("btrfs: introduce tickets_id to
determine whether asynchronous metadata reclaim work makes progress").

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add comment ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f0563dbbd5f..42251c2eb7d2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4897,7 +4897,12 @@ struct reserve_ticket {
 	wait_queue_head_t wait;
 };
 
-static int flush_space(struct btrfs_fs_info *fs_info,
+/*
+ * Try to flush some data based on policy set by @state. This is only advisory
+ * and may fail for various reasons. The caller is supposed to examine the
+ * state of @space_info to detect the outcome.
+ */
+static void flush_space(struct btrfs_fs_info *fs_info,
 		       struct btrfs_space_info *space_info, u64 num_bytes,
 		       int state)
 {
@@ -4951,7 +4956,7 @@ static int flush_space(struct btrfs_fs_info *fs_info,
 
 	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
 				ret);
-	return ret;
+	return;
 }
 
 static inline u64
@@ -5053,10 +5058,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 
 	flush_state = FLUSH_DELAYED_ITEMS_NR;
 	do {
-		struct reserve_ticket *ticket;
-		int ret;
-
-		ret = flush_space(fs_info, space_info, to_reclaim, flush_state);
+		flush_space(fs_info, space_info, to_reclaim, flush_state);
 		spin_lock(&space_info->lock);
 		if (list_empty(&space_info->tickets)) {
 			space_info->flush = 0;
@@ -5066,8 +5068,6 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
 							      space_info,
 							      false);
-		ticket = list_first_entry(&space_info->tickets,
-					  struct reserve_ticket, list);
 		if (last_tickets_id == space_info->tickets_id) {
 			flush_state++;
 		} else {

From ea14b57fd1954fa3193e025224bbbeab7415c490 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 22 Jun 2017 02:19:11 +0200
Subject: [PATCH 053/104] btrfs: fix spelling of snapshotting

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h       |  6 +++---
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/extent-tree.c | 22 +++++++++++-----------
 fs/btrfs/file.c        | 10 +++++-----
 fs/btrfs/inode.c       | 22 +++++++++++-----------
 fs/btrfs/ioctl.c       | 10 +++++-----
 6 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6510f246f71e..874d814da371 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1258,7 +1258,7 @@ struct btrfs_root {
 	 */
 	int send_in_progress;
 	struct btrfs_subvolume_writers *subv_writers;
-	atomic_t will_be_snapshoted;
+	atomic_t will_be_snapshotted;
 
 	/* For qgroup metadata space reserve */
 	atomic64_t qgroup_meta_rsv;
@@ -2773,8 +2773,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
-int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
-void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
 void check_system_chunk(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info, const u64 type);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9b1f4ef54438..57a857142cda 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1343,7 +1343,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	atomic_set(&root->log_batch, 0);
 	atomic_set(&root->orphan_inodes, 0);
 	refcount_set(&root->refs, 1);
-	atomic_set(&root->will_be_snapshoted, 0);
+	atomic_set(&root->will_be_snapshotted, 0);
 	atomic64_set(&root->qgroup_meta_rsv, 0);
 	root->log_transid = 0;
 	root->log_transid_committed = -1;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 42251c2eb7d2..7d7abc0d47b9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10989,14 +10989,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 }
 
 /*
- * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * btrfs_{start,end}_write_no_snapshotting() are similar to
  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
  * data into the page cache through nocow before the subvolume is snapshoted,
  * but flush the data into disk after the snapshot creation, or to prevent
- * operations while snapshoting is ongoing and that cause the snapshot to be
+ * operations while snapshotting is ongoing and that cause the snapshot to be
  * inconsistent (writes followed by expanding truncates for example).
  */
-void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
+void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
 {
 	percpu_counter_dec(&root->subv_writers->counter);
 	/*
@@ -11007,9 +11007,9 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 		wake_up(&root->subv_writers->wait);
 }
 
-int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
+int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
 {
-	if (atomic_read(&root->will_be_snapshoted))
+	if (atomic_read(&root->will_be_snapshotted))
 		return 0;
 
 	percpu_counter_inc(&root->subv_writers->counter);
@@ -11017,14 +11017,14 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
 	 * Make sure counter is updated before we check for snapshot creation.
 	 */
 	smp_mb();
-	if (atomic_read(&root->will_be_snapshoted)) {
-		btrfs_end_write_no_snapshoting(root);
+	if (atomic_read(&root->will_be_snapshotted)) {
+		btrfs_end_write_no_snapshotting(root);
 		return 0;
 	}
 	return 1;
 }
 
-static int wait_snapshoting_atomic_t(atomic_t *a)
+static int wait_snapshotting_atomic_t(atomic_t *a)
 {
 	schedule();
 	return 0;
@@ -11035,11 +11035,11 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
 	while (true) {
 		int ret;
 
-		ret = btrfs_start_write_no_snapshoting(root);
+		ret = btrfs_start_write_no_snapshotting(root);
 		if (ret)
 			break;
-		wait_on_atomic_t(&root->will_be_snapshoted,
-				 wait_snapshoting_atomic_t,
+		wait_on_atomic_t(&root->will_be_snapshotted,
+				 wait_snapshotting_atomic_t,
 				 TASK_UNINTERRUPTIBLE);
 	}
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9e75d8a39aac..58818cf7f82d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1536,7 +1536,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
 	u64 num_bytes;
 	int ret;
 
-	ret = btrfs_start_write_no_snapshoting(root);
+	ret = btrfs_start_write_no_snapshotting(root);
 	if (!ret)
 		return -ENOSPC;
 
@@ -1561,7 +1561,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
 			NULL, NULL, NULL);
 	if (ret <= 0) {
 		ret = 0;
-		btrfs_end_write_no_snapshoting(root);
+		btrfs_end_write_no_snapshotting(root);
 	} else {
 		*write_bytes = min_t(size_t, *write_bytes ,
 				     num_bytes - pos + lockstart);
@@ -1664,7 +1664,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 						data_reserved, pos,
 						write_bytes);
 			else
-				btrfs_end_write_no_snapshoting(root);
+				btrfs_end_write_no_snapshotting(root);
 			break;
 		}
 
@@ -1767,7 +1767,7 @@ again:
 
 		release_bytes = 0;
 		if (only_release_metadata)
-			btrfs_end_write_no_snapshoting(root);
+			btrfs_end_write_no_snapshotting(root);
 
 		if (only_release_metadata && copied > 0) {
 			lockstart = round_down(pos,
@@ -1797,7 +1797,7 @@ again:
 
 	if (release_bytes) {
 		if (only_release_metadata) {
-			btrfs_end_write_no_snapshoting(root);
+			btrfs_end_write_no_snapshotting(root);
 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
 					release_bytes);
 		} else {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9349a13b3d72..9ad9dda871ca 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1381,7 +1381,7 @@ next_slot:
 			 * we fall into common COW way.
 			 */
 			if (!nolock) {
-				err = btrfs_start_write_no_snapshoting(root);
+				err = btrfs_start_write_no_snapshotting(root);
 				if (!err)
 					goto out_check;
 			}
@@ -1393,12 +1393,12 @@ next_slot:
 			if (csum_exist_in_range(fs_info, disk_bytenr,
 						num_bytes)) {
 				if (!nolock)
-					btrfs_end_write_no_snapshoting(root);
+					btrfs_end_write_no_snapshotting(root);
 				goto out_check;
 			}
 			if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
 				if (!nolock)
-					btrfs_end_write_no_snapshoting(root);
+					btrfs_end_write_no_snapshotting(root);
 				goto out_check;
 			}
 			nocow = 1;
@@ -1415,7 +1415,7 @@ out_check:
 		if (extent_end <= start) {
 			path->slots[0]++;
 			if (!nolock && nocow)
-				btrfs_end_write_no_snapshoting(root);
+				btrfs_end_write_no_snapshotting(root);
 			if (nocow)
 				btrfs_dec_nocow_writers(fs_info, disk_bytenr);
 			goto next_slot;
@@ -1438,7 +1438,7 @@ out_check:
 					     NULL);
 			if (ret) {
 				if (!nolock && nocow)
-					btrfs_end_write_no_snapshoting(root);
+					btrfs_end_write_no_snapshotting(root);
 				if (nocow)
 					btrfs_dec_nocow_writers(fs_info,
 								disk_bytenr);
@@ -1459,7 +1459,7 @@ out_check:
 					  BTRFS_ORDERED_PREALLOC);
 			if (IS_ERR(em)) {
 				if (!nolock && nocow)
-					btrfs_end_write_no_snapshoting(root);
+					btrfs_end_write_no_snapshotting(root);
 				if (nocow)
 					btrfs_dec_nocow_writers(fs_info,
 								disk_bytenr);
@@ -1499,7 +1499,7 @@ out_check:
 					     PAGE_UNLOCK | PAGE_SET_PRIVATE2);
 
 		if (!nolock && nocow)
-			btrfs_end_write_no_snapshoting(root);
+			btrfs_end_write_no_snapshotting(root);
 		cur_offset = extent_end;
 
 		/*
@@ -5053,7 +5053,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 
 	if (newsize > oldsize) {
 		/*
-		 * Don't do an expanding truncate while snapshoting is ongoing.
+		 * Don't do an expanding truncate while snapshotting is ongoing.
 		 * This is to ensure the snapshot captures a fully consistent
 		 * state of this file - if the snapshot captures this expanding
 		 * truncation, it must capture all writes that happened before
@@ -5062,13 +5062,13 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		btrfs_wait_for_snapshot_creation(root);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
 		if (ret) {
-			btrfs_end_write_no_snapshoting(root);
+			btrfs_end_write_no_snapshotting(root);
 			return ret;
 		}
 
 		trans = btrfs_start_transaction(root, 1);
 		if (IS_ERR(trans)) {
-			btrfs_end_write_no_snapshoting(root);
+			btrfs_end_write_no_snapshotting(root);
 			return PTR_ERR(trans);
 		}
 
@@ -5076,7 +5076,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
 		pagecache_isize_extended(inode, oldsize, newsize);
 		ret = btrfs_update_inode(trans, root, inode);
-		btrfs_end_write_no_snapshoting(root);
+		btrfs_end_write_no_snapshotting(root);
 		btrfs_end_transaction(trans);
 	} else {
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4cfc3d4c0a37..7d144a676d95 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -607,7 +607,7 @@ fail_free:
 	return ret;
 }
 
-static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
+static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root)
 {
 	s64 writers;
 	DEFINE_WAIT(wait);
@@ -650,9 +650,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 		goto free_pending;
 	}
 
-	atomic_inc(&root->will_be_snapshoted);
+	atomic_inc(&root->will_be_snapshotted);
 	smp_mb__after_atomic();
-	btrfs_wait_for_no_snapshoting_writes(root);
+	btrfs_wait_for_no_snapshotting_writes(root);
 
 	ret = btrfs_start_delalloc_inodes(root, 0);
 	if (ret)
@@ -723,8 +723,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 fail:
 	btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
 dec_and_free:
-	if (atomic_dec_and_test(&root->will_be_snapshoted))
-		wake_up_atomic_t(&root->will_be_snapshoted);
+	if (atomic_dec_and_test(&root->will_be_snapshotted))
+		wake_up_atomic_t(&root->will_be_snapshotted);
 free_pending:
 	kfree(pending_snapshot->root_item);
 	btrfs_free_path(pending_snapshot->path);

From 35dc313046f3f1b95178a307e646e612b288e628 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 22 Jun 2017 03:28:55 +0200
Subject: [PATCH 054/104] btrfs: drop ancient page flag mappings

There's no PageFsMisc. Added by patch 4881ee5a2e995 in 2008, the flag is
not present in current kernels.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 874d814da371..95a50b88c31e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3136,13 +3136,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 			      u64 *orig_start, u64 *orig_block_len,
 			      u64 *ram_bytes);
 
-/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
-#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
-#define ClearPageChecked ClearPageFsMisc
-#define SetPageChecked SetPageFsMisc
-#define PageChecked PageFsMisc
-#endif
-
 /* This forces readahead on a given range of bytes in an inode */
 static inline void btrfs_force_ra(struct address_space *mapping,
 				  struct file_ra_state *ra, struct file *file,

From d3c0bab5632337f6d3841fbe0dc238a743cb4472 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 22 Jun 2017 03:35:28 +0200
Subject: [PATCH 055/104] btrfs: remove trivial wrapper btrfs_force_ra

It's a simple call to page_cache_sync_readahead, with the same arguments
in the same order.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h | 8 --------
 fs/btrfs/ioctl.c | 4 ++--
 fs/btrfs/send.c  | 2 +-
 3 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 95a50b88c31e..b9d5d95bc583 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3136,14 +3136,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 			      u64 *orig_start, u64 *orig_block_len,
 			      u64 *ram_bytes);
 
-/* This forces readahead on a given range of bytes in an inode */
-static inline void btrfs_force_ra(struct address_space *mapping,
-				  struct file_ra_state *ra, struct file *file,
-				  pgoff_t offset, unsigned long req_size)
-{
-	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
-}
-
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7d144a676d95..8189cb328d4a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1364,8 +1364,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		if (i + cluster > ra_index) {
 			ra_index = max(i, ra_index);
 			if (ra)
-				btrfs_force_ra(inode->i_mapping, ra, file,
-						ra_index, cluster);
+				page_cache_sync_readahead(inode->i_mapping, ra,
+						file, ra_index, cluster);
 			ra_index += cluster;
 		}
 
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b082210df9c8..59fb1ed6ca20 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4733,7 +4733,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
 	/* initial readahead */
 	memset(&sctx->ra, 0, sizeof(struct file_ra_state));
 	file_ra_state_init(&sctx->ra, inode->i_mapping);
-	btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
+	page_cache_sync_readahead(inode->i_mapping, &sctx->ra, NULL, index,
 		       last_index - index + 1);
 
 	while (index <= last_index) {

From 4958aa6821c7c4e9682806bc4fea92250d020829 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 22 Jun 2017 03:59:40 +0200
Subject: [PATCH 056/104] btrfs: drop chunk locks at the end of close_ctree

The pinned chunks might be left over so we clean them up, but at this
point of close_ctree there's no one to race with, so the locking can be
removed.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 57a857142cda..c2d3699a70bd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3944,7 +3944,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 	__btrfs_free_block_rsv(root->orphan_block_rsv);
 	root->orphan_block_rsv = NULL;
 
-	mutex_lock(&fs_info->chunk_mutex);
 	while (!list_empty(&fs_info->pinned_chunks)) {
 		struct extent_map *em;
 
@@ -3953,7 +3952,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 		list_del_init(&em->list);
 		free_extent_map(em);
 	}
-	mutex_unlock(&fs_info->chunk_mutex);
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,

From 9c17f6cda1f5debb186a2af1dad4178f63cfbace Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 19 Jul 2017 19:26:45 +0200
Subject: [PATCH 057/104] btrfs: account that we're waiting for DIO read

Correctly account for IO when waiting for a submitted DIO read, the case
when we're retrying.  This is only for accounting purposes and should
not change other behaviour.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9ad9dda871ca..467b9477dac4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8133,7 +8133,7 @@ next_block_or_try_again:
 			goto next;
 		}
 
-		wait_for_completion(&done.done);
+		wait_for_completion_io(&done.done);
 
 		if (!done.uptodate) {
 			/* We might have another mirror, so try again */
@@ -8247,7 +8247,7 @@ try_again:
 			goto next;
 		}
 
-		wait_for_completion(&done.done);
+		wait_for_completion_io(&done.done);
 
 		if (!done.uptodate) {
 			/* We might have another mirror, so try again */

From 131ce4367a8f37c6609148117a051d86cd55a5d9 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 19 Jul 2017 19:30:41 +0200
Subject: [PATCH 058/104] btrfs: account that we're waiting for IO in
 scrub_submit_raid56_bio_wait

Correctly account for IO when waiting for a submitted bio in scrub. This
is only for accounting purposes and should not change other behaviour.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index de53c521a50f..f49b94ab3d2a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1704,7 +1704,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 	if (ret)
 		return ret;
 
-	wait_for_completion(&done.event);
+	wait_for_completion_io(&done.event);
 	if (done.status)
 		return -EIO;
 

From c2fcdcdf36bba08c5d2fbf4f17c2d8a944bfd4df Mon Sep 17 00:00:00 2001
From: Timofey Titovets <nefelim4ag@gmail.com>
Date: Mon, 17 Jul 2017 16:52:58 +0300
Subject: [PATCH 059/104] Btrfs: add skeleton code for compression heuristic

Add skeleton code for compression heuristics. Now it iterates over all
the pages, but in the end always says "yes, compress please", ie it does
not change the current behaviour.

In the future we're going to add various heuristics to analyze the data.
This patch can be used as a baseline for measuring their effectiveness
and performance.

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ enhanced changelog, modified comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 33 +++++++++++++++++++++++++++++++++
 fs/btrfs/compression.h |  2 ++
 fs/btrfs/inode.c       |  8 ++++----
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 3896bd0175ec..883ecc58fd0d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1047,3 +1047,36 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
 
 	return 1;
 }
+
+/*
+ * Compression heuristic.
+ *
+ * For now is's a naive and optimistic 'return true', we'll extend the logic to
+ * quickly (compared to direct compression) detect data characteristics
+ * (compressible/uncompressible) to avoid wasting CPU time on uncompressible
+ * data.
+ *
+ * The following types of analysis can be performed:
+ * - detect mostly zero data
+ * - detect data with low "byte set" size (text, etc)
+ * - detect data with low/high "core byte" set
+ *
+ * Return non-zero if the compression should be done, 0 otherwise.
+ */
+int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
+{
+	u64 index = start >> PAGE_SHIFT;
+	u64 end_index = end >> PAGE_SHIFT;
+	struct page *page;
+	int ret = 1;
+
+	while (index <= end_index) {
+		page = find_get_page(inode->i_mapping, index);
+		kmap(page);
+		kunmap(page);
+		put_page(page);
+		index++;
+	}
+
+	return ret;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 87f6d3332163..8508ba6b9aef 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -129,4 +129,6 @@ struct btrfs_compress_op {
 extern const struct btrfs_compress_op btrfs_zlib_compress;
 extern const struct btrfs_compress_op btrfs_lzo_compress;
 
+int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 467b9477dac4..0bd008b9e0d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -392,7 +392,7 @@ static noinline int add_async_extent(struct async_cow *cow,
 	return 0;
 }
 
-static inline int inode_need_compress(struct inode *inode)
+static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 
@@ -405,7 +405,7 @@ static inline int inode_need_compress(struct inode *inode)
 	if (btrfs_test_opt(fs_info, COMPRESS) ||
 	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
 	    BTRFS_I(inode)->force_compress)
-		return 1;
+		return btrfs_compress_heuristic(inode, start, end);
 	return 0;
 }
 
@@ -503,7 +503,7 @@ again:
 	 * inode has not been flagged as nocompress.  This flag can
 	 * change at any time if we discover bad compression ratios.
 	 */
-	if (inode_need_compress(inode)) {
+	if (inode_need_compress(inode, start, end)) {
 		WARN_ON(pages);
 		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 		if (!pages) {
@@ -1576,7 +1576,7 @@ static int run_delalloc_range(void *private_data, struct page *locked_page,
 	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
-	} else if (!inode_need_compress(inode)) {
+	} else if (!inode_need_compress(inode, start, end)) {
 		ret = cow_file_range(inode, locked_page, start, end, end,
 				      page_started, nr_written, 1, NULL);
 	} else {

From b52aa8c93e1fec97fcd87345b13f532f0dee8239 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 17 Jul 2017 19:17:20 +0200
Subject: [PATCH 060/104] btrfs: rename variable holding per-inode compression
 type

This is preparatory for separating inode compression requested by defrag
and set via properties. This will fix a usability bug when defrag will
reset compression type to NONE. If the file has compression set via
property, it will not apply anymore (until next mount or reset through
command line).

We're going to fix that by adding another variable just for the defrag
call and won't touch the property. The defrag will have higher priority
when deciding whether to compress the data.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/btrfs_inode.h |  4 ++--
 fs/btrfs/inode.c       | 10 +++++-----
 fs/btrfs/ioctl.c       |  4 ++--
 fs/btrfs/props.c       |  6 +++---
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 31c259c8462d..fa118036a84e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -179,9 +179,9 @@ struct btrfs_inode {
 	unsigned reserved_extents;
 
 	/*
-	 * always compress this one file
+	 * Cached values of inode properties
 	 */
-	unsigned force_compress;
+	unsigned prop_compress;		/* per-file compression algorithm */
 
 	struct btrfs_delayed_node *delayed_node;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0bd008b9e0d1..c60a6d692bc1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -404,7 +404,7 @@ static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
 		return 0;
 	if (btrfs_test_opt(fs_info, COMPRESS) ||
 	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
-	    BTRFS_I(inode)->force_compress)
+	    BTRFS_I(inode)->prop_compress)
 		return btrfs_compress_heuristic(inode, start, end);
 	return 0;
 }
@@ -511,8 +511,8 @@ again:
 			goto cont;
 		}
 
-		if (BTRFS_I(inode)->force_compress)
-			compress_type = BTRFS_I(inode)->force_compress;
+		if (BTRFS_I(inode)->prop_compress)
+			compress_type = BTRFS_I(inode)->prop_compress;
 
 		/*
 		 * we need to call clear_page_dirty_for_io on each
@@ -645,7 +645,7 @@ cont:
 
 		/* flag the file so we don't compress in the future */
 		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
-		    !(BTRFS_I(inode)->force_compress)) {
+		    !(BTRFS_I(inode)->prop_compress)) {
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
 		}
 	}
@@ -9433,7 +9433,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->reserved_extents = 0;
 
 	ei->runtime_flags = 0;
-	ei->force_compress = BTRFS_COMPRESS_NONE;
+	ei->prop_compress = BTRFS_COMPRESS_NONE;
 
 	ei->delayed_node = NULL;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8189cb328d4a..e1c15a2ac785 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1371,7 +1371,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 		inode_lock(inode);
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
-			BTRFS_I(inode)->force_compress = compress_type;
+			BTRFS_I(inode)->prop_compress = compress_type;
 		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
 		if (ret < 0) {
 			inode_unlock(inode);
@@ -1442,7 +1442,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 out_ra:
 	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
 		inode_lock(inode);
-		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+		BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
 		inode_unlock(inode);
 	}
 	if (!file)
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 4b23ae5d0e5c..916f5cf9b292 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -403,7 +403,7 @@ static int prop_compression_apply(struct inode *inode,
 	if (len == 0) {
 		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
 		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
-		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+		BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
 
 		return 0;
 	}
@@ -417,14 +417,14 @@ static int prop_compression_apply(struct inode *inode,
 
 	BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
 	BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
-	BTRFS_I(inode)->force_compress = type;
+	BTRFS_I(inode)->prop_compress = type;
 
 	return 0;
 }
 
 static const char *prop_compression_extract(struct inode *inode)
 {
-	switch (BTRFS_I(inode)->force_compress) {
+	switch (BTRFS_I(inode)->prop_compress) {
 	case BTRFS_COMPRESS_ZLIB:
 		return "zlib";
 	case BTRFS_COMPRESS_LZO:

From eec63c65dcbeb14b59c95159eb225b1fc2310806 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 17 Jul 2017 19:41:31 +0200
Subject: [PATCH 061/104] btrfs: separate defrag and property compression

Add new value for compression to distinguish between defrag and
property. Previously, a single variable was used and this caused clashes
when the per-file 'compression' was set and a defrag -c was called.

The property-compression is loaded when the file is opened; defrag will
overwrite the same variable and reset it to 0 (i.e. NONE) when the file
defragmentation is finished. That's considered a usability bug.

Now we won't touch the property value, use the defrag-compression. The
precedence of defrag is higher than for property (and whole-filesystem).

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/btrfs_inode.h | 5 +++++
 fs/btrfs/inode.c       | 8 +++++++-
 fs/btrfs/ioctl.c       | 4 ++--
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index fa118036a84e..eccadb5f62a5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -182,6 +182,11 @@ struct btrfs_inode {
 	 * Cached values of inode properties
 	 */
 	unsigned prop_compress;		/* per-file compression algorithm */
+	/*
+	 * Force compression on the file using the defrag ioctl, could be
+	 * different from prop_compress and takes precedence if set
+	 */
+	unsigned defrag_compress;
 
 	struct btrfs_delayed_node *delayed_node;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c60a6d692bc1..1e8eb5e8551c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -402,6 +402,9 @@ static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
 	/* bad compression ratios */
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
 		return 0;
+	/* defrag ioctl */
+	if (BTRFS_I(inode)->defrag_compress)
+		return 1;
 	if (btrfs_test_opt(fs_info, COMPRESS) ||
 	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
 	    BTRFS_I(inode)->prop_compress)
@@ -511,7 +514,9 @@ again:
 			goto cont;
 		}
 
-		if (BTRFS_I(inode)->prop_compress)
+		if (BTRFS_I(inode)->defrag_compress)
+			compress_type = BTRFS_I(inode)->defrag_compress;
+		else if (BTRFS_I(inode)->prop_compress)
 			compress_type = BTRFS_I(inode)->prop_compress;
 
 		/*
@@ -9434,6 +9439,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
 	ei->runtime_flags = 0;
 	ei->prop_compress = BTRFS_COMPRESS_NONE;
+	ei->defrag_compress = BTRFS_COMPRESS_NONE;
 
 	ei->delayed_node = NULL;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e1c15a2ac785..1d6603dc1d59 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1371,7 +1371,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 		inode_lock(inode);
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
-			BTRFS_I(inode)->prop_compress = compress_type;
+			BTRFS_I(inode)->defrag_compress = compress_type;
 		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
 		if (ret < 0) {
 			inode_unlock(inode);
@@ -1442,7 +1442,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 out_ra:
 	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
 		inode_lock(inode);
-		BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
 		inode_unlock(inode);
 	}
 	if (!file)

From 1e2ef46d89ee41c0c5387563491493029c77cdd2 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 17 Jul 2017 20:01:59 +0200
Subject: [PATCH 062/104] btrfs: defrag: cleanup checking for compression
 status

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1d6603dc1d59..7a3c56c17e00 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1255,6 +1255,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	unsigned long cluster = max_cluster;
 	u64 new_align = ~((u64)SZ_128K - 1);
 	struct page **pages = NULL;
+	bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
 
 	if (isize == 0)
 		return 0;
@@ -1262,7 +1263,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	if (range->start >= isize)
 		return -EINVAL;
 
-	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+	if (do_compress) {
 		if (range->compress_type > BTRFS_COMPRESS_TYPES)
 			return -EINVAL;
 		if (range->compress_type)
@@ -1341,8 +1342,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 		if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
 					 extent_thresh, &last_len, &skip,
-					 &defrag_end, range->flags &
-					 BTRFS_DEFRAG_RANGE_COMPRESS)) {
+					 &defrag_end, do_compress)){
 			unsigned long next;
 			/*
 			 * the should_defrag function tells us how much to skip
@@ -1370,7 +1370,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		}
 
 		inode_lock(inode);
-		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
+		if (do_compress)
 			BTRFS_I(inode)->defrag_compress = compress_type;
 		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
 		if (ret < 0) {
@@ -1418,7 +1418,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			filemap_flush(inode->i_mapping);
 	}
 
-	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+	if (do_compress) {
 		/* the filemap_flush will queue IO into the worker threads, but
 		 * we have to make sure the IO is actually started and that
 		 * ordered extents get created before we return
@@ -1440,7 +1440,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	ret = defrag_count;
 
 out_ra:
-	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+	if (do_compress) {
 		inode_lock(inode);
 		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
 		inode_unlock(inode);

From 1e20d1c45fd37fb085dee716e1f207959e8e5660 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 17 Jul 2017 20:42:03 +0200
Subject: [PATCH 063/104] btrfs: allow defrag compress to override NOCOMPRESS
 attribute

Currently, the BTRFS_INODE_NOCOMPRESS will prevent any compression on a
given file, except when the mount is force-compress. As users have
reported on IRC, this will also prevent compression when requested by
defrag (btrfs fi defrag -c file).

The nocompress flag is set automatically by filesystem when the ratios
are bad and the user would have to manually drop the bit in order to
make defrag -c work. This is not good from the usability perspective.

This patch will raise priority for the defrag -c over nocompress, ie.
any file with NOCOMPRESS bit set will get defragmented. The bit will
remain untouched.

Alternate option was to also drop the nocompress bit and keep the
decision logic as is, but I think this is not the right solution.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e8eb5e8551c..a17a61e2ff9d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -399,12 +399,12 @@ static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
 	/* force compress */
 	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
 		return 1;
-	/* bad compression ratios */
-	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
-		return 0;
 	/* defrag ioctl */
 	if (BTRFS_I(inode)->defrag_compress)
 		return 1;
+	/* bad compression ratios */
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+		return 0;
 	if (btrfs_test_opt(fs_info, COMPRESS) ||
 	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
 	    BTRFS_I(inode)->prop_compress)

From a7164fa4e055daf6368cb68ed946aa5a362a1a75 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 17 Jul 2017 18:11:10 +0200
Subject: [PATCH 064/104] btrfs: prepare for extensions in compression options

This is a minimal patch intended to be backported to older kernels.
We're going to extend the string specifying the compression method and
this would fail on kernels before that change (the string is compared
exactly).

Relax the string matching only to the prefix, i.e. ignoring anything that
goes after "zlib" or "lzo", regardless of the format extension we decide
to use. This applies to the mount options and properties.

That way, patched old kernels could be booted on systems already
utilizing the new compression spec.

Applicable since commit 63541927c8d11, v3.14.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/props.c | 4 ++--
 fs/btrfs/super.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 916f5cf9b292..09c0266f248d 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -408,9 +408,9 @@ static int prop_compression_apply(struct inode *inode,
 		return 0;
 	}
 
-	if (!strncmp("lzo", value, len))
+	if (!strncmp("lzo", value, 3))
 		type = BTRFS_COMPRESS_LZO;
-	else if (!strncmp("zlib", value, len))
+	else if (!strncmp("zlib", value, 4))
 		type = BTRFS_COMPRESS_ZLIB;
 	else
 		return -EINVAL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 64981fc8e39e..8a9bcad3b06a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -499,14 +499,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 				btrfs_test_opt(info, FORCE_COMPRESS);
 			if (token == Opt_compress ||
 			    token == Opt_compress_force ||
-			    strcmp(args[0].from, "zlib") == 0) {
+			    strncmp(args[0].from, "zlib", 4) == 0) {
 				compress_type = "zlib";
 				info->compress_type = BTRFS_COMPRESS_ZLIB;
 				btrfs_set_opt(info->mount_opt, COMPRESS);
 				btrfs_clear_opt(info->mount_opt, NODATACOW);
 				btrfs_clear_opt(info->mount_opt, NODATASUM);
 				no_compress = 0;
-			} else if (strcmp(args[0].from, "lzo") == 0) {
+			} else if (strncmp(args[0].from, "lzo", 3) == 0) {
 				compress_type = "lzo";
 				info->compress_type = BTRFS_COMPRESS_LZO;
 				btrfs_set_opt(info->mount_opt, COMPRESS);

From f44d2287d2879f50b921e909b2377d6dcba3e251 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Thu, 22 Jun 2017 09:51:47 -0400
Subject: [PATCH 065/104] btrfs: account for pinned bytes in should_alloc_chunk

In a heavy write scenario, we can end up with a large number of pinned bytes.
This can translate into (very) premature ENOSPC because pinned bytes
must be accounted for when allowing a reservation but aren't accounted for
when deciding whether to create a new chunk.

This patch adds the accounting to should_alloc_chunk so that we can
create the chunk.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7d7abc0d47b9..b929fe201981 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4419,7 +4419,7 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
-	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
+	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned;
 	u64 thresh;
 
 	if (force == CHUNK_ALLOC_FORCE)

From 8d8aafeea23e2d641460d7e6231361f0322ac058 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 22 Jun 2017 09:51:48 -0400
Subject: [PATCH 066/104] btrfs: Simplify math in should_alloc chunk

Currently should_alloc_chunk uses ->total_bytes - ->bytes_readonly to
signify the total amount of bytes in this space info. However, given
Jeff's patch which adds bytes_pinned and bytes_may_use to the calculation
of num_allocated it becomes a lot more clear to just eliminate num_bytes
altogether and add the bytes_readonly to the amount of used space. That
way we don't change the results of the following statements. In the
process also start using btrfs_space_info_used.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b929fe201981..c74d24c1bbc9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4418,8 +4418,7 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
 			      struct btrfs_space_info *sinfo, int force)
 {
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
-	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned;
+	u64 bytes_used = btrfs_space_info_used(sinfo, false);
 	u64 thresh;
 
 	if (force == CHUNK_ALLOC_FORCE)
@@ -4431,7 +4430,7 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
 	 * global_rsv, it doesn't change except when the transaction commits.
 	 */
 	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
-		num_allocated += calc_global_rsv_need_space(global_rsv);
+		bytes_used += calc_global_rsv_need_space(global_rsv);
 
 	/*
 	 * in limited mode, we want to have some free space up to
@@ -4441,11 +4440,11 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
 		thresh = btrfs_super_total_bytes(fs_info->super_copy);
 		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
 
-		if (num_bytes - num_allocated < thresh)
+		if (sinfo->total_bytes - bytes_used < thresh)
 			return 1;
 	}
 
-	if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
+	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
 		return 0;
 	return 1;
 }

From 23b5ec74943f44378b68c0edd8e210a86318ea5e Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Mon, 24 Jul 2017 15:14:25 -0400
Subject: [PATCH 067/104] btrfs: fix readdir deadlock with pagefault

Readdir does dir_emit while under the btree lock.  dir_emit can trigger
a page fault, which means we can deadlock.  Fix this by allocating a
buffer on opening a directory, copying the readdir entries into this
buffer, and doing dir_emit from outside of the tree lock.

Thread A
readdir  <holding tree lock>
  dir_emit
    <page fault>
      down_read(mmap_sem)

Thread B
mmap write
  down_write(mmap_sem)
    page_mkwrite
      wait_ordered_extents

Process C
finish_ordered_extent
  insert_reserved_file_extent
   try to lock leaf <hang>

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ copy the deadlock scenario to changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h |   5 +++
 fs/btrfs/file.c  |   9 +++-
 fs/btrfs/inode.c | 108 +++++++++++++++++++++++++++++++++++------------
 fs/btrfs/ioctl.c |  22 +++++++---
 4 files changed, 110 insertions(+), 34 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9d5d95bc583..a3db2b1738aa 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1264,6 +1264,11 @@ struct btrfs_root {
 	atomic64_t qgroup_meta_rsv;
 };
 
+struct btrfs_file_private {
+	struct btrfs_trans_handle *trans;
+	void *filldir_buf;
+};
+
 static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
 {
 	return btrfs_sb(inode->i_sb)->sectorsize;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 58818cf7f82d..74fd7756cff3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1990,8 +1990,15 @@ out:
 
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
-	if (filp->private_data)
+	struct btrfs_file_private *private = filp->private_data;
+
+	if (private && private->trans)
 		btrfs_ioctl_trans_end(filp);
+	if (private && private->filldir_buf)
+		kfree(private->filldir_buf);
+	kfree(private);
+	filp->private_data = NULL;
+
 	/*
 	 * ordered_data_close is set by settattr when we are about to truncate
 	 * a file from a non-zero size to a zero size.  This tries to
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a17a61e2ff9d..fa4b2563dfd7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5876,25 +5876,74 @@ unsigned char btrfs_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
+/*
+ * All this infrastructure exists because dir_emit can fault, and we are holding
+ * the tree lock when doing readdir.  For now just allocate a buffer and copy
+ * our information into that, and then dir_emit from the buffer.  This is
+ * similar to what NFS does, only we don't keep the buffer around in pagecache
+ * because I'm afraid I'll mess that up.  Long term we need to make filldir do
+ * copy_to_user_inatomic so we don't have to worry about page faulting under the
+ * tree lock.
+ */
+static int btrfs_opendir(struct inode *inode, struct file *file)
+{
+	struct btrfs_file_private *private;
+
+	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
+	if (!private)
+		return -ENOMEM;
+	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!private->filldir_buf) {
+		kfree(private);
+		return -ENOMEM;
+	}
+	file->private_data = private;
+	return 0;
+}
+
+struct dir_entry {
+	u64 ino;
+	u64 offset;
+	unsigned type;
+	int name_len;
+};
+
+static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
+{
+	while (entries--) {
+		struct dir_entry *entry = addr;
+		char *name = (char *)(entry + 1);
+
+		ctx->pos = entry->offset;
+		if (!dir_emit(ctx, name, entry->name_len, entry->ino,
+			      entry->type))
+			return 1;
+		addr += sizeof(struct dir_entry) + entry->name_len;
+		ctx->pos++;
+	}
+	return 0;
+}
+
 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_private *private = file->private_data;
 	struct btrfs_dir_item *di;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_path *path;
+	void *addr;
 	struct list_head ins_list;
 	struct list_head del_list;
 	int ret;
 	struct extent_buffer *leaf;
 	int slot;
-	unsigned char d_type;
-	int over = 0;
-	char tmp_name[32];
 	char *name_ptr;
 	int name_len;
+	int entries = 0;
+	int total_len = 0;
 	bool put = false;
 	struct btrfs_key location;
 
@@ -5905,12 +5954,14 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 	if (!path)
 		return -ENOMEM;
 
+	addr = private->filldir_buf;
 	path->reada = READA_FORWARD;
 
 	INIT_LIST_HEAD(&ins_list);
 	INIT_LIST_HEAD(&del_list);
 	put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
 
+again:
 	key.type = BTRFS_DIR_INDEX_KEY;
 	key.offset = ctx->pos;
 	key.objectid = btrfs_ino(BTRFS_I(inode));
@@ -5920,6 +5971,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 		goto err;
 
 	while (1) {
+		struct dir_entry *entry;
+
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
@@ -5941,41 +5994,43 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 			goto next;
 		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
 			goto next;
-
-		ctx->pos = found_key.offset;
-
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		if (verify_dir_item(fs_info, leaf, slot, di))
 			goto next;
 
 		name_len = btrfs_dir_name_len(leaf, di);
-		if (name_len <= sizeof(tmp_name)) {
-			name_ptr = tmp_name;
-		} else {
-			name_ptr = kmalloc(name_len, GFP_KERNEL);
-			if (!name_ptr) {
-				ret = -ENOMEM;
-				goto err;
-			}
+		if ((total_len + sizeof(struct dir_entry) + name_len) >=
+		    PAGE_SIZE) {
+			btrfs_release_path(path);
+			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
+			if (ret)
+				goto nopos;
+			addr = private->filldir_buf;
+			entries = 0;
+			total_len = 0;
+			goto again;
 		}
+
+		entry = addr;
+		entry->name_len = name_len;
+		name_ptr = (char *)(entry + 1);
 		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
 				   name_len);
-
-		d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+		entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
 		btrfs_dir_item_key_to_cpu(leaf, di, &location);
-
-		over = !dir_emit(ctx, name_ptr, name_len, location.objectid,
-				 d_type);
-
-		if (name_ptr != tmp_name)
-			kfree(name_ptr);
-
-		if (over)
-			goto nopos;
-		ctx->pos++;
+		entry->ino = location.objectid;
+		entry->offset = found_key.offset;
+		entries++;
+		addr += sizeof(struct dir_entry) + name_len;
+		total_len += sizeof(struct dir_entry) + name_len;
 next:
 		path->slots[0]++;
 	}
+	btrfs_release_path(path);
+
+	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
+	if (ret)
+		goto nopos;
 
 	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
 	if (ret)
@@ -10779,6 +10834,7 @@ static const struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.iterate_shared	= btrfs_real_readdir,
+	.open		= btrfs_opendir,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_compat_ioctl,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7a3c56c17e00..b21558bb1294 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3966,6 +3966,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_file_private *private;
 	int ret;
 	static bool warned = false;
 
@@ -3984,8 +3985,16 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	}
 
 	ret = -EINPROGRESS;
-	if (file->private_data)
+	private = file->private_data;
+	if (private && private->trans)
 		goto out;
+	if (!private) {
+		private = kzalloc(sizeof(struct btrfs_file_private),
+				  GFP_KERNEL);
+		if (!private)
+			return -ENOMEM;
+		file->private_data = private;
+	}
 
 	ret = -EROFS;
 	if (btrfs_root_readonly(root))
@@ -4002,7 +4011,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	if (IS_ERR(trans))
 		goto out_drop;
 
-	file->private_data = trans;
+	private->trans = trans;
 	return 0;
 
 out_drop:
@@ -4257,14 +4266,13 @@ long btrfs_ioctl_trans_end(struct file *file)
 {
 	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_file_private *private = file->private_data;
 
-	trans = file->private_data;
-	if (!trans)
+	if (!private || !private->trans)
 		return -EINVAL;
-	file->private_data = NULL;
 
-	btrfs_end_transaction(trans);
+	btrfs_end_transaction(private->trans);
+	private->trans = NULL;
 
 	atomic_dec(&root->fs_info->open_ioctl_trans);
 

From 42e9cc46fbe86f83278c14f5d88f353d037258d2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Mon, 24 Jul 2017 15:14:26 -0400
Subject: [PATCH 068/104] btrfs: increase ctx->pos for delayed dir index

Our dir_context->pos is supposed to hold the next position we're
supposed to look at.  If we successfully insert a delayed dir index, we
could end up with a duplicate entry because we don't increase ctx->pos
after doing the dir_emit.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 8ae409b5a61d..19e4ad2f3f2e 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1727,6 +1727,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 
 		if (over)
 			return 1;
+		ctx->pos++;
 	}
 	return 0;
 }

From 44880fdc91bc4f6730e37f2cb6025b35c70b312d Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Sat, 29 Jul 2017 17:50:09 +0800
Subject: [PATCH 069/104] btrfs: use appropriate define for the fsid

Though BTRFS_FSID_SIZE and BTRFS_UUID_SIZE are of the same size, we
should use the matching constant for the fsid buffer.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/check-integrity.c |  2 +-
 fs/btrfs/disk-io.c         |  6 +++---
 fs/btrfs/scrub.c           |  2 +-
 fs/btrfs/volumes.c         | 16 ++++++++--------
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7d65d98d2790..7b9aabc2cb78 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1732,7 +1732,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
 	num_pages = state->metablock_size >> PAGE_SHIFT;
 	h = (struct btrfs_header *)datav[0];
 
-	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+	if (memcmp(h->fsid, fs_info->fsid, BTRFS_FSID_SIZE))
 		return 1;
 
 	for (i = 0; i < num_pages; i++) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c2d3699a70bd..3715478fffed 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -529,7 +529,7 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
 				 struct extent_buffer *eb)
 {
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
-	u8 fsid[BTRFS_UUID_SIZE];
+	u8 fsid[BTRFS_FSID_SIZE];
 	int ret = 1;
 
 	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
@@ -3681,7 +3681,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
 		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
-		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
+		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE);
 
 		flags = btrfs_super_flags(sb);
 		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
@@ -4120,7 +4120,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
 		ret = -EINVAL;
 	}
 
-	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
 		btrfs_err(fs_info,
 			   "dev_item UUID does not match fsid: %pU != %pU",
 			   fs_info->fsid, sb->dev_item.fsid);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f49b94ab3d2a..20c1ba19e665 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1769,7 +1769,7 @@ static inline int scrub_check_fsid(u8 fsid[],
 	struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
 	int ret;
 
-	ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
+	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	return !ret;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 12bd04a4104f..ed82edf987f4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1719,7 +1719,7 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
 	ptr = btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 	ptr = btrfs_device_fsid(dev_item);
-	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_UUID_SIZE);
+	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
@@ -2249,7 +2249,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
 	struct btrfs_dev_item *dev_item;
 	struct btrfs_device *device;
 	struct btrfs_key key;
-	u8 fs_uuid[BTRFS_UUID_SIZE];
+	u8 fs_uuid[BTRFS_FSID_SIZE];
 	u8 dev_uuid[BTRFS_UUID_SIZE];
 	u64 devid;
 	int ret;
@@ -2292,7 +2292,7 @@ next_slot:
 		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
 				   BTRFS_UUID_SIZE);
 		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
-				   BTRFS_UUID_SIZE);
+				   BTRFS_FSID_SIZE);
 		device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
 		BUG_ON(!device); /* Logic error */
 
@@ -6243,7 +6243,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 	cur_devices = fs_info->fs_devices;
 	while (cur_devices) {
 		if (!fsid ||
-		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+		    !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
 			device = find_device(cur_devices, devid, uuid);
 			if (device)
 				return device;
@@ -6518,7 +6518,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
 
 	fs_devices = fs_info->fs_devices->seed;
 	while (fs_devices) {
-		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
+		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
 			return fs_devices;
 
 		fs_devices = fs_devices->seed;
@@ -6571,16 +6571,16 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
-	u8 fs_uuid[BTRFS_UUID_SIZE];
+	u8 fs_uuid[BTRFS_FSID_SIZE];
 	u8 dev_uuid[BTRFS_UUID_SIZE];
 
 	devid = btrfs_device_id(leaf, dev_item);
 	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
 			   BTRFS_UUID_SIZE);
 	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
-			   BTRFS_UUID_SIZE);
+			   BTRFS_FSID_SIZE);
 
-	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_UUID_SIZE)) {
+	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
 		fs_devices = open_seed_devices(fs_info, fs_uuid);
 		if (IS_ERR(fs_devices))
 			return PTR_ERR(fs_devices);

From b94417eaa5f5a20d58a99328a401c0b5a812ec7d Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Sun, 13 Aug 2017 11:58:30 +0800
Subject: [PATCH 070/104] btrfs: use BTRFS_FSID_SIZE for fsid

We have a define for the FSID size, so use it.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/trace/events/btrfs.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 1e4908dcd065..dc1d0df91e0b 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -74,11 +74,11 @@ struct prelim_ref;
 	{ BTRFS_BLOCK_GROUP_RAID5,	"RAID5"},	\
 	{ BTRFS_BLOCK_GROUP_RAID6,	"RAID6"}
 
-#define BTRFS_UUID_SIZE 16
-#define TP_STRUCT__entry_fsid __array(u8, fsid, BTRFS_UUID_SIZE)
+#define BTRFS_FSID_SIZE 16
+#define TP_STRUCT__entry_fsid __array(u8, fsid, BTRFS_FSID_SIZE)
 
 #define TP_fast_assign_fsid(fs_info)					\
-	memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE)
+	memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE)
 
 #define TP_STRUCT__entry_btrfs(args...)					\
 	TP_STRUCT__entry(						\
@@ -618,7 +618,7 @@ TRACE_EVENT(btrfs_add_block_group,
 	TP_ARGS(fs_info, block_group, create),
 
 	TP_STRUCT__entry(
-		__array(	u8,	fsid,	BTRFS_UUID_SIZE	)
+		__array(	u8,	fsid,	BTRFS_FSID_SIZE	)
 		__field(	u64,	offset			)
 		__field(	u64,	size			)
 		__field(	u64,	flags			)
@@ -628,7 +628,7 @@ TRACE_EVENT(btrfs_add_block_group,
 	),
 
 	TP_fast_assign(
-		memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
+		memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE);
 		__entry->offset		= block_group->key.objectid;
 		__entry->size		= block_group->key.offset;
 		__entry->flags		= block_group->flags;
@@ -975,7 +975,7 @@ TRACE_EVENT(btrfs_trigger_flush,
 	TP_ARGS(fs_info, flags, bytes, flush, reason),
 
 	TP_STRUCT__entry(
-		__array(	u8,	fsid,	BTRFS_UUID_SIZE	)
+		__array(	u8,	fsid,	BTRFS_FSID_SIZE	)
 		__field(	u64,	flags			)
 		__field(	u64,	bytes			)
 		__field(	int,	flush			)
@@ -983,7 +983,7 @@ TRACE_EVENT(btrfs_trigger_flush,
 	),
 
 	TP_fast_assign(
-		memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
+		memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE);
 		__entry->flags	= flags;
 		__entry->bytes	= bytes;
 		__entry->flush	= flush;
@@ -1016,7 +1016,7 @@ TRACE_EVENT(btrfs_flush_space,
 	TP_ARGS(fs_info, flags, num_bytes, state, ret),
 
 	TP_STRUCT__entry(
-		__array(	u8,	fsid,	BTRFS_UUID_SIZE	)
+		__array(	u8,	fsid,	BTRFS_FSID_SIZE	)
 		__field(	u64,	flags			)
 		__field(	u64,	num_bytes		)
 		__field(	int,	state			)
@@ -1024,7 +1024,7 @@ TRACE_EVENT(btrfs_flush_space,
 	),
 
 	TP_fast_assign(
-		memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
+		memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE);
 		__entry->flags		=	flags;
 		__entry->num_bytes	=	num_bytes;
 		__entry->state		=	state;

From dc2f29212a2648164b054016dc5b948bf0fc92d5 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Sun, 13 Aug 2017 12:02:41 +0800
Subject: [PATCH 071/104] btrfs: remove unused BTRFS_COMPRESS_LAST

We aren't using this define, so remove it.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 8508ba6b9aef..3b1b0ac15fdc 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -100,7 +100,6 @@ enum btrfs_compression_type {
 	BTRFS_COMPRESS_ZLIB  = 1,
 	BTRFS_COMPRESS_LZO   = 2,
 	BTRFS_COMPRESS_TYPES = 2,
-	BTRFS_COMPRESS_LAST  = 3,
 };
 
 struct btrfs_compress_op {

From 92ac58ec99db0a9ad7337ce85f0ad98a90b88805 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 17 Aug 2017 10:52:28 +0300
Subject: [PATCH 072/104] btrfs: Remove never-reached WARN_ON

We have a WARN_ON(!var) inside an if branch which is executed (among
others) only when var is true.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c74d24c1bbc9..eff674bfd162 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6745,7 +6745,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
 		if (!readonly && return_free_space &&
 		    global_rsv->space_info == space_info) {
 			u64 to_add = len;
-			WARN_ON(!return_free_space);
+
 			spin_lock(&global_rsv->lock);
 			if (!global_rsv->full) {
 				to_add = min(len, global_rsv->size -

From db7c942ce84cabbf740a1e5654d58b9201a4a5f0 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 16 Aug 2017 18:41:44 +0300
Subject: [PATCH 073/104] btrfs: Remove unused sectorsize variable from struct
 map_lookup

This variable was added in 1abe9b8a138c ("Btrfs: add initial tracepointi
support for btrfs"), yet it never really got used, only assigned to. So
let's remove it.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 2 --
 fs/btrfs/volumes.h | 1 -
 2 files changed, 3 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ed82edf987f4..18231194835e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4791,7 +4791,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 						   j * stripe_size;
 		}
 	}
-	map->sector_size = info->sectorsize;
 	map->stripe_len = BTRFS_STRIPE_LEN;
 	map->io_align = BTRFS_STRIPE_LEN;
 	map->io_width = BTRFS_STRIPE_LEN;
@@ -6443,7 +6442,6 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 	map->num_stripes = num_stripes;
 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
 	map->io_align = btrfs_chunk_io_align(leaf, chunk);
-	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
 	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 	map->type = btrfs_chunk_type(leaf, chunk);
 	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 181b365cab0c..eebe2f871dc7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -353,7 +353,6 @@ struct map_lookup {
 	int io_align;
 	int io_width;
 	u64 stripe_len;
-	int sector_size;
 	int num_stripes;
 	int sub_stripes;
 	struct btrfs_bio_stripe stripes[];

From 2d77ab3cfbf26ced6d445bfbffe8f7b9c72c4008 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 16 Aug 2017 18:15:23 +0300
Subject: [PATCH 074/104] btrfs: expose internal free space tree routine only
 if sanity tests are enabled

The internal free space tree management routines are always exposed for
testing purposes. Make them dependent on SANITY_TESTS being on so that
they are exposed only when they really have to.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-tree.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 54ffced3bce8..ba3787df43c3 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -44,7 +44,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info,
 				u64 start, u64 size);
 
-/* Exposed for testing. */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 struct btrfs_free_space_info *
 search_free_space_info(struct btrfs_trans_handle *trans,
 		       struct btrfs_fs_info *fs_info,
@@ -68,5 +68,6 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 				  struct btrfs_path *path);
 int free_space_test_bit(struct btrfs_block_group_cache *block_group,
 			struct btrfs_path *path, u64 offset);
+#endif
 
 #endif

From 938e1c77f8c9d345dab7c44ea5c0515fdad1ee8c Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 15 Aug 2017 08:51:02 +0100
Subject: [PATCH 075/104] btrfs: remove redundant check on ret being non-zero

The error return variable ret is initialized to zero and then is
checked to see if it is non-zero in the if-block that follows it.
It is therefore impossible for ret to be non-zero after the if-block
hence the check is redundant and can be removed.

Detected by CoverityScan, CID#1021040 ("Logically dead code")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3a11ae63676e..f05fcc67efa6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1143,8 +1143,6 @@ again:
 				goto again;
 			}
 			kfree(victim_name);
-			if (ret)
-				return ret;
 next:
 			cur_offset += victim_name_len + sizeof(*extref);
 		}

From 4a4b964f42fa5a70d0023d2f1d44a2764bd144f4 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 27 Jul 2017 19:52:55 +0100
Subject: [PATCH 076/104] Btrfs: avoid unnecessarily locking inode when
 clearing a range

If the range being cleared was not marked for defrag and we are not
about to clear the range from the defrag status, we don't need to
lock and unlock the inode.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Chris Mason <clm@fb.com>
Reviewed-by: Wang Shilong <wangshilong1991@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fa4b2563dfd7..6747292d437e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1801,10 +1801,11 @@ static void btrfs_clear_bit_hook(void *private_data,
 	u64 len = state->end + 1 - state->start;
 	u32 num_extents = count_max_extents(len);
 
-	spin_lock(&inode->lock);
-	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
+	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
+		spin_lock(&inode->lock);
 		inode->defrag_bytes -= len;
-	spin_unlock(&inode->lock);
+		spin_unlock(&inode->lock);
+	}
 
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore

From 6399fb5a0b69a9cc73aa84767954366d867b145d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 28 Jul 2017 15:22:36 +0100
Subject: [PATCH 077/104] Btrfs: fix assertion failure during fsync in no-holes
 mode

When logging an inode in full mode that has an inline compressed extent
that represents a range with a size matching the sector size (currently
the same as the page size), has a trailing hole and the no-holes feature
is enabled, we end up failing an assertion leading to a trace like the
following:

[141812.031528] assertion failed: len == i_size, file: fs/btrfs/tree-log.c, line: 4453
[141812.033069] ------------[ cut here ]------------
[141812.034330] kernel BUG at fs/btrfs/ctree.h:3452!
[141812.035137] invalid opcode: 0000 [#1] PREEMPT SMP
[141812.035932] Modules linked in: btrfs dm_thin_pool dm_persistent_data dm_bio_prison dm_bufio dm_flakey dm_mod dax ppdev evdev ghash_clmulni_intel pcbc aesni_intel aes_x86_64 tpm_tis psmouse crypto_simd parport_pc sg pcspkr tpm_tis_core cryptd parport serio_raw glue_helper tpm i2c_piix4 i2c_core button sunrpc loop autofs4 ext4 crc16 jbd2 mbcache raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sd_mod ata_generic virtio_scsi ata_piix floppy crc32c_intel libata scsi_mod virtio_pci virtio_ring e1000 virtio [last unloaded: btrfs]
[141812.036790] CPU: 3 PID: 845 Comm: fdm-stress Tainted: G    B   W       4.12.3-btrfs-next-52+ #1
[141812.036790] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
[141812.036790] task: ffff8801e6694180 task.stack: ffffc90009004000
[141812.036790] RIP: 0010:assfail.constprop.18+0x1c/0x1e [btrfs]
[141812.036790] RSP: 0018:ffffc90009007bc0 EFLAGS: 00010282
[141812.036790] RAX: 0000000000000046 RBX: ffff88017512c008 RCX: 0000000000000001
[141812.036790] RDX: ffff88023fd95201 RSI: ffffffff8182264c RDI: 00000000ffffffff
[141812.036790] RBP: ffffc90009007bc0 R08: 0000000000000001 R09: 0000000000000001
[141812.036790] R10: 0000000000001000 R11: ffffffff82f5a0c9 R12: ffff88014e5947e8
[141812.036790] R13: 00000000000b4000 R14: ffff8801b234d008 R15: 0000000000000000
[141812.036790] FS:  00007fdba6ffd700(0000) GS:ffff88023fd80000(0000) knlGS:0000000000000000
[141812.036790] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[141812.036790] CR2: 00007fdb9c000010 CR3: 000000016efa2000 CR4: 00000000001406e0
[141812.036790] Call Trace:
[141812.036790]  btrfs_log_inode+0x9f0/0xd3d [btrfs]
[141812.036790]  ? __mutex_lock+0x120/0x3ce
[141812.036790]  btrfs_log_inode_parent+0x224/0x685 [btrfs]
[141812.036790]  ? lock_acquire+0x16b/0x1af
[141812.036790]  btrfs_log_dentry_safe+0x60/0x7b [btrfs]
[141812.036790]  btrfs_sync_file+0x32e/0x3f8 [btrfs]
[141812.036790]  vfs_fsync_range+0x8a/0x9d
[141812.036790]  vfs_fsync+0x1c/0x1e
[141812.036790]  do_fsync+0x31/0x4a
[141812.036790]  SyS_fdatasync+0x13/0x17
[141812.036790]  entry_SYSCALL_64_fastpath+0x18/0xad
[141812.036790] RIP: 0033:0x7fdbac41a47d
[141812.036790] RSP: 002b:00007fdba6ffce30 EFLAGS: 00000293 ORIG_RAX: 000000000000004b
[141812.036790] RAX: ffffffffffffffda RBX: ffffffff81092c9f RCX: 00007fdbac41a47d
[141812.036790] RDX: 0000004cf0160a40 RSI: 0000000000000000 RDI: 0000000000000006
[141812.036790] RBP: ffffc90009007f98 R08: 0000000000000000 R09: 0000000000000010
[141812.036790] R10: 00000000000002e8 R11: 0000000000000293 R12: ffffffff8110cd90
[141812.036790] R13: ffffc90009007f78 R14: 0000000000000000 R15: 0000000000000000
[141812.036790]  ? time_hardirqs_off+0x9/0x14
[141812.036790]  ? trace_hardirqs_off_caller+0x1f/0xa3
[141812.036790] Code: c7 d6 61 6b a0 48 89 e5 e8 ba ef a8 e0 0f 0b 55 89 f1 48 c7 c2 6d 65 6b a0 48 89 fe 48 c7 c7 81 65 6b a0 48 89 e5 e8 9c ef a8 e0 <0f> 0b 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89
[141812.036790] RIP: assfail.constprop.18+0x1c/0x1e [btrfs] RSP: ffffc90009007bc0
[141812.084448] ---[ end trace 44e472684c7a32cc ]---

Which happens because the code that logs a trailing hole when the no-holes
feature is enabled, did not consider that a compressed inline extent can
represent a range with a size matching the sector size, in which case
expanding the inode's i_size, through a truncate operation, won't lead
to padding with zeroes the page that represents the inline extent, and
therefore the inline extent remains after the truncation.

Fix this by adapting the assertion to accept inline extents representing
data with a sector size length if, and only if, the inline extents are
compressed.

A sample and trivial reproducer (for systems with a 4K page size) for this
issue:

  mkfs.btrfs -O no-holes -f /dev/sdc
  mount -o compress /dev/sdc /mnt
  xfs_io -f -c "pwrite -S 0xab 0 4K" /mnt/foobar
  sync
  xfs_io -c "truncate 32K" /mnt/foobar
  xfs_io -c "fsync" /mnt/foobar

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f05fcc67efa6..a2a84e266d37 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4448,7 +4448,10 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
 			len = btrfs_file_extent_inline_len(leaf,
 							   path->slots[0],
 							   extent);
-			ASSERT(len == i_size);
+			ASSERT(len == i_size ||
+			       (len == fs_info->sectorsize &&
+				btrfs_file_extent_compression(leaf, extent) !=
+				BTRFS_COMPRESS_NONE));
 			return 0;
 		}
 

From e6961cac730f62d9b07efb53835c4eb5ed407bde Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 3 Aug 2017 15:44:58 +0300
Subject: [PATCH 078/104] btrfs: Move skip checksum check from
 btrfs_submit_direct to __btrfs_submit_dio_bio

Currently the code checks whether we should do data checksumming in
btrfs_submit_direct and the boolean result of this check is passed to
btrfs_submit_direct_hook, in turn passing it to __btrfs_submit_dio_bio which
actually consumes it. The last function actually has all the necessary context
to figure out whether to skip the check or not, so let's move the check closer
to where it's being consumed. No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Chris Mason <clm@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6747292d437e..61f1ad89e97a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8514,8 +8514,7 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
 }
 
 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
-					 u64 file_offset, int skip_sum,
-					 int async_submit)
+					 u64 file_offset, int async_submit)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_dio_private *dip = bio->bi_private;
@@ -8533,7 +8532,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 			goto err;
 	}
 
-	if (skip_sum)
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		goto map;
 
 	if (write && async_submit) {
@@ -8563,8 +8562,7 @@ err:
 	return ret;
 }
 
-static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
-				    int skip_sum)
+static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
 {
 	struct inode *inode = dip->inode;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -8627,7 +8625,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
 		 */
 		atomic_inc(&dip->pending_bios);
 
-		ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
+		ret = __btrfs_submit_dio_bio(bio, inode, file_offset,
 					     async_submit);
 		if (ret) {
 			bio_put(bio);
@@ -8647,8 +8645,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
 	} while (submit_len > 0);
 
 submit:
-	ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
-				     async_submit);
+	ret = __btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
 	if (!ret)
 		return 0;
 
@@ -8673,12 +8670,9 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
 	struct btrfs_dio_private *dip = NULL;
 	struct bio *bio = NULL;
 	struct btrfs_io_bio *io_bio;
-	int skip_sum;
 	bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
 	int ret = 0;
 
-	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-
 	bio = btrfs_bio_clone(dio_bio);
 
 	dip = kzalloc(sizeof(*dip), GFP_NOFS);
@@ -8721,7 +8715,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
 			dio_data->unsubmitted_oe_range_end;
 	}
 
-	ret = btrfs_submit_direct_hook(dip, skip_sum);
+	ret = btrfs_submit_direct_hook(dip);
 	if (!ret)
 		return;
 

From c59efa7eb2400ae356553a8ed5e3f3ef4cc47831 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 4 Aug 2017 14:41:18 +0300
Subject: [PATCH 079/104] btrfs: Fix -EOVERFLOW handling in
 btrfs_ioctl_tree_search_v2

The buffer passed to btrfs_ioctl_tree_search* functions has to be at least
sizeof(struct btrfs_ioctl_search_header). If this is not the case then the
ioctl should return -EOVERFLOW and set the uarg->buf_size to the minimum
required size. Currently btrfs_ioctl_tree_search_v2 would return an -EOVERFLOW
error with ->buf_size being set to the value passed by user space. Fix this by
removing the size check and relying on search_ioctl, which already includes it
and correctly sets buf_size.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b21558bb1294..ae8fbf9d3de2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2169,9 +2169,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
 
 	buf_size = args.buf_size;
 
-	if (buf_size < sizeof(struct btrfs_ioctl_search_header))
-		return -EOVERFLOW;
-
 	/* limit result size to 16MB */
 	if (buf_size > buf_limit)
 		buf_size = buf_limit;

From f716abd55d1e141772fd44a99c07c89f4aff1978 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 9 Aug 2017 11:10:16 -0600
Subject: [PATCH 080/104] Btrfs: fix out of bounds array access while reading
 extent buffer

There is a corner case that slips through the checkers in functions
reading extent buffer, ie.

if (start < eb->len) and (start + len > eb->len),
then

a) map_private_extent_buffer() returns immediately because
it's thinking the range spans across two pages,

b) and the checkers in read_extent_buffer(), WARN_ON(start > eb->len)
and WARN_ON(start + len > eb->start + eb->len), both are OK in this
corner case, but it'd actually try to access the eb->pages out of
bounds because of (start + len > eb->len).

The case is found by switching extent inline ref type from shared data
ref to non-shared data ref, which is a kind of metadata corruption.

It'd use the wrong helper to access the eb,
eg. btrfs_extent_data_ref_root(eb, ref) is used but the %ref passing
here is "struct btrfs_shared_data_ref".  And if the extent item
happens to be the first item in the eb, then offset/length will get
over eb->len which ends up an invalid memory access.

This is adding proper checks in order to avoid invalid memory access,
ie. 'general protection fault', before it's too late.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ea4947c97505..d17783d70228 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5404,8 +5404,12 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
 	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 
-	WARN_ON(start > eb->len);
-	WARN_ON(start + len > eb->start + eb->len);
+	if (start + len > eb->len) {
+		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
+		     eb->start, eb->len, start, len);
+		memset(dst, 0, len);
+		return;
+	}
 
 	offset = (start_offset + start) & (PAGE_SIZE - 1);
 
@@ -5478,6 +5482,12 @@ int map_private_extent_buffer(const struct extent_buffer *eb,
 	unsigned long end_i = (start_offset + start + min_len - 1) >>
 		PAGE_SHIFT;
 
+	if (start + min_len > eb->len) {
+		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
+		       eb->start, eb->len, start, min_len);
+		return -EINVAL;
+	}
+
 	if (i != end_i)
 		return 1;
 
@@ -5489,12 +5499,6 @@ int map_private_extent_buffer(const struct extent_buffer *eb,
 		*map_start = ((u64)i << PAGE_SHIFT) - start_offset;
 	}
 
-	if (start + min_len > eb->len) {
-		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
-		       eb->start, eb->len, start, min_len);
-		return -EINVAL;
-	}
-
 	p = eb->pages[i];
 	kaddr = page_address(p);
 	*map = kaddr + offset;

From 72610b1b40005fa19a107ea62b8bcc0ca84ef11b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 10 Aug 2017 22:54:51 +0100
Subject: [PATCH 081/104] Btrfs: incremental send, fix emission of invalid
 clone operations

When doing an incremental send it's possible that the computed send stream
contains clone operations that will fail on the receiver if the receiver
has compression enabled and the clone operations target a sector sized
extent that starts at a zero file offset, is not compressed on the source
filesystem but ends up being compressed and inlined at the destination
filesystem.

Example scenario:

  $ mkfs.btrfs -f /dev/sdb
  $ mount -o compress /dev/sdb /mnt

  # By doing a direct IO write, the data is not compressed.
  $ xfs_io -f -d -c "pwrite -S 0xab 0 4K" /mnt/foobar
  $ btrfs subvolume snapshot -r /mnt /mnt/mysnap1

  $ xfs_io -c "reflink /mnt/foobar 0 8K 4K" /mnt/foobar
  $ btrfs subvolume snapshot -r /mnt /mnt/mysnap2

  $ btrfs send -f /tmp/1.snap /mnt/mysnap1
  $ btrfs send -f /tmp/2.snap -p /mnt/mysnap1 /mnt/mysnap2
  $ umount /mnt

  $ mkfs.btrfs -f /dev/sdc
  $ mount -o compress /dev/sdc /mnt
  $ btrfs receive -f /tmp/1.snap /mnt
  $ btrfs receive -f /tmp/2.snap /mnt
  ERROR: failed to clone extents to foobar
  Operation not supported

The same could be achieved by mounting the source filesystem without
compression and doing a buffered IO write instead of a direct IO one,
and mounting the destination filesystem with compression enabled.

So fix this by issuing regular write operations in the send stream
instead of clone operations when the source offset is zero and the
range has a length matching the sector size.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 59fb1ed6ca20..8f1d3d6e7087 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4992,6 +4992,25 @@ static int clone_range(struct send_ctx *sctx,
 	struct btrfs_key key;
 	int ret;
 
+	/*
+	 * Prevent cloning from a zero offset with a length matching the sector
+	 * size because in some scenarios this will make the receiver fail.
+	 *
+	 * For example, if in the source filesystem the extent at offset 0
+	 * has a length of sectorsize and it was written using direct IO, then
+	 * it can never be an inline extent (even if compression is enabled).
+	 * Then this extent can be cloned in the original filesystem to a non
+	 * zero file offset, but it may not be possible to clone in the
+	 * destination filesystem because it can be inlined due to compression
+	 * on the destination filesystem (as the receiver's write operations are
+	 * always done using buffered IO). The same happens when the original
+	 * filesystem does not have compression enabled but the destination
+	 * filesystem has.
+	 */
+	if (clone_root->offset == 0 &&
+	    len == sctx->send_root->fs_info->sectorsize)
+		return send_extent_data(sctx, offset, len);
+
 	path = alloc_path_for_send();
 	if (!path)
 		return -ENOMEM;

From 43a0111103af2d358f6d2d671d8429ba169e53bb Mon Sep 17 00:00:00 2001
From: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
Date: Fri, 18 Aug 2017 16:38:07 +0800
Subject: [PATCH 082/104] btrfs: use btrfsic_submit_bio instead of submit_bio
 in write_dev_flush

Although this bio has no data attached, it will reach this condition
(bio->bi_opf & REQ_PREFLUSH) and then update the flush_gen of dev_state
in __btrfsic_submit_bio. So we should still submit it through integrity
checker. Otherwise, the integrity checker will throw the following warning
when I mount a newly created btrfs filesystem.

[10264.755497] btrfs: attempt to write superblock which references block M @29523968 (sdb1/1111654400/0) which is not flushed out of disk's write cache (block flush_gen=1, dev->flush_gen=0)!
[10264.755498] btrfs: attempt to write superblock which references block M @29523968 (sdb1/37912576/0) which is not flushed out of disk's write cache (block flush_gen=1, dev->flush_gen=0)!

Signed-off-by: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3715478fffed..195634098f21 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3517,7 +3517,7 @@ static void write_dev_flush(struct btrfs_device *device)
 	init_completion(&device->flush_wait);
 	bio->bi_private = &device->flush_wait;
 
-	submit_bio(bio);
+	btrfsic_submit_bio(bio);
 	device->flush_bio_sent = 1;
 }
 

From 583b723151794e2ff1691f1510b4e43710293875 Mon Sep 17 00:00:00 2001
From: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Date: Fri, 28 Jul 2017 08:31:28 +0200
Subject: [PATCH 083/104] btrfs: Do not use data_alloc_cluster in ssd mode

    This patch provides a band aid to improve the 'out of the box'
behaviour of btrfs for disks that are detected as being an ssd.  In a
general purpose mixed workload scenario, the current ssd mode causes
overallocation of available raw disk space for data, while leaving
behind increasing amounts of unused fragmented free space. This
situation leads to early ENOSPC problems which are harming user
experience and adoption of btrfs as a general purpose filesystem.

This patch modifies the data extent allocation behaviour of the ssd mode
to make it behave identical to nossd mode.  The metadata behaviour and
additional ssd_spread option stay untouched so far.

Recommendations for future development are to reconsider the current
oversimplified nossd / ssd distinction and the broken detection
mechanism based on the rotational attribute in sysfs and provide
experienced users with a more flexible way to choose allocator behaviour
for data and metadata, optimized for certain use cases, while keeping
sane 'out of the box' default settings.  The internals of the current
btrfs code have more potential than what currently gets exposed to the
user to choose from.

    The SSD story...

    In the first year of btrfs development, around early 2008, btrfs
gained a mount option which enables specific functionality for
filesystems on solid state devices. The first occurrence of this
functionality is in commit e18e4809, labeled "Add mount -o ssd, which
includes optimizations for seek free storage".

The effect on allocating free space for doing (data) writes is to
'cluster' writes together, writing them out in contiguous space, as
opposed to a 'tetris' way of putting all separate writes into any free
space fragment that fits (which is what the -o nossd behaviour does).

A somewhat simplified explanation of what happens is that, when for
example, the 'cluster' size is set to 2MiB, when we do some writes, the
data allocator will search for a free space block that is 2MiB big, and
put the writes in there. The ssd mode itself might allow a 2MiB cluster
to be composed of multiple free space extents with some existing data in
between, while the additional ssd_spread mount option kills off this
option and requires fully free space.

The idea behind this is (commit 536ac8ae): "The [...] clusters make it
more likely a given IO will completely overwrite the ssd block, so it
doesn't have to do an internal rwm cycle."; ssd block meaning nand erase
block. So, effectively this means applying a "locality based algorithm"
and trying to outsmart the actual ssd.

Since then, various changes have been made to the involved code, but the
basic idea is still present, and gets activated whenever the ssd mount
option is active. This also happens by default, when the rotational flag
as seen at /sys/block/<device>/queue/rotational is set to 0.

    However, there's a number of problems with this approach.

    First, what the optimization is trying to do is outsmart the ssd by
assuming there is a relation between the physical address space of the
block device as seen by btrfs and the actual physical storage of the
ssd, and then adjusting data placement. However, since the introduction
of the Flash Translation Layer (FTL) which is a part of the internal
controller of an ssd, these attempts are futile. The use of good quality
FTL in consumer ssd products might have been limited in 2008, but this
situation has changed drastically soon after that time. Today, even the
flash memory in your automatic cat feeding machine or your grandma's
wheelchair has a full featured one.

Second, the behaviour as described above results in the filesystem being
filled up with badly fragmented free space extents because of relatively
small pieces of space that are freed up by deletes, but not selected
again as part of a 'cluster'. Since the algorithm prefers allocating a
new chunk over going back to tetris mode, the end result is a filesystem
in which all raw space is allocated, but which is composed of
underutilized chunks with a 'shotgun blast' pattern of fragmented free
space. Usually, the next problematic thing that happens is the
filesystem wanting to allocate new space for metadata, which causes the
filesystem to fail in spectacular ways.

Third, the default mount options you get for an ssd ('ssd' mode enabled,
'discard' not enabled), in combination with spreading out writes over
the full address space and ignoring freed up space leads to worst case
behaviour in providing information to the ssd itself, since it will
never learn that all the free space left behind is actually free.  There
are two ways to let an ssd know previously written data does not have to
be preserved, which are sending explicit signals using discard or
fstrim, or by simply overwriting the space with new data.  The worst
case behaviour is the btrfs ssd_spread mount option in combination with
not having discard enabled. It has a side effect of minimizing the reuse
of free space previously written in.

Fourth, the rotational flag in /sys/ does not reliably indicate if the
device is a locally attached ssd. For example, iSCSI or NBD displays as
non-rotational, while a loop device on an ssd shows up as rotational.

The combination of the second and third problem effectively means that
despite all the good intentions, the btrfs ssd mode reliably causes the
ssd hardware and the filesystem structures and performance to be choked
to death. The clickbait version of the title of this story would have
been "Btrfs ssd optimizations considered harmful for ssds".

The current nossd 'tetris' mode (even still without discard) allows a
pattern of overwriting much more previously used space, causing many
more implicit discards to happen because of the overwrite information
the ssd gets. The actual location in the physical address space, as seen
from the point of view of btrfs is irrelevant, because the actual writes
to the low level flash are reordered anyway thanks to the FTL.

    Changes made in the code

1. Make ssd mode data allocation identical to tetris mode, like nossd.
2. Adjust and clean up filesystem mount messages so that we can easily
identify if a kernel has this patch applied or not, when providing
support to end users. Also, make better use of the *_and_info helpers to
only trigger messages on actual state changes.

    Backporting notes

Notes for whoever wants to backport this patch to their 4.9 LTS kernel:
* First apply commit 951e7966 "btrfs: drop the nossd flag when
  remounting with -o ssd", or fixup the differences manually.
* The rest of the conflicts are because of the fs_info refactoring. So,
  for example, instead of using fs_info, it's root->fs_info in
  extent-tree.c

Signed-off-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h       |  6 +++---
 fs/btrfs/disk-io.c     |  6 ++----
 fs/btrfs/extent-tree.c | 11 ++++++-----
 fs/btrfs/super.c       | 16 +++++++++-------
 4 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a3db2b1738aa..02edcddbcc9c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -470,8 +470,8 @@ struct btrfs_block_rsv {
 
 /*
  * free clusters are used to claim free space in relatively large chunks,
- * allowing us to do less seeky writes.  They are used for all metadata
- * allocations and data allocations in ssd mode.
+ * allowing us to do less seeky writes. They are used for all metadata
+ * allocations. In ssd_spread mode they are also used for data allocations.
  */
 struct btrfs_free_cluster {
 	spinlock_t lock;
@@ -967,7 +967,7 @@ struct btrfs_fs_info {
 
 	struct reloc_control *reloc_ctl;
 
-	/* data_alloc_cluster is only used in ssd mode */
+	/* data_alloc_cluster is only used in ssd_spread mode */
 	struct btrfs_free_cluster data_alloc_cluster;
 
 	/* all metadata allocations go through this cluster */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 195634098f21..90b967ae46d0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3053,11 +3053,9 @@ retry_root_backup:
 	if (IS_ERR(fs_info->transaction_kthread))
 		goto fail_cleaner;
 
-	if (!btrfs_test_opt(fs_info, SSD) &&
-	    !btrfs_test_opt(fs_info, NOSSD) &&
+	if (!btrfs_test_opt(fs_info, NOSSD) &&
 	    !fs_info->fs_devices->rotating) {
-		btrfs_info(fs_info, "detected SSD devices, enabling SSD mode");
-		btrfs_set_opt(fs_info->mount_opt, SSD);
+		btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
 	}
 
 	/*
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index eff674bfd162..6a8da7c19182 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6654,19 +6654,20 @@ fetch_cluster_info(struct btrfs_fs_info *fs_info,
 		   struct btrfs_space_info *space_info, u64 *empty_cluster)
 {
 	struct btrfs_free_cluster *ret = NULL;
-	bool ssd = btrfs_test_opt(fs_info, SSD);
 
 	*empty_cluster = 0;
 	if (btrfs_mixed_space_info(space_info))
 		return ret;
 
-	if (ssd)
-		*empty_cluster = SZ_2M;
 	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 		ret = &fs_info->meta_alloc_cluster;
-		if (!ssd)
+		if (btrfs_test_opt(fs_info, SSD))
+			*empty_cluster = SZ_2M;
+		else
 			*empty_cluster = SZ_64K;
-	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
+		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
+		*empty_cluster = SZ_2M;
 		ret = &fs_info->data_alloc_cluster;
 	}
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8a9bcad3b06a..0b7a1d8cd08b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -549,20 +549,22 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			break;
 		case Opt_ssd:
 			btrfs_set_and_info(info, SSD,
-					   "use ssd allocation scheme");
+					   "enabling ssd optimizations");
 			btrfs_clear_opt(info->mount_opt, NOSSD);
 			break;
 		case Opt_ssd_spread:
+			btrfs_set_and_info(info, SSD,
+					   "enabling ssd optimizations");
 			btrfs_set_and_info(info, SSD_SPREAD,
-					   "use spread ssd allocation scheme");
-			btrfs_set_opt(info->mount_opt, SSD);
+					   "using spread ssd allocation scheme");
 			btrfs_clear_opt(info->mount_opt, NOSSD);
 			break;
 		case Opt_nossd:
-			btrfs_set_and_info(info, NOSSD,
-					     "not using ssd allocation scheme");
-			btrfs_clear_opt(info->mount_opt, SSD);
-			btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
+			btrfs_set_opt(info->mount_opt, NOSSD);
+			btrfs_clear_and_info(info, SSD,
+					     "not using ssd optimizations");
+			btrfs_clear_and_info(info, SSD_SPREAD,
+					     "not using spread ssd allocation scheme");
 			break;
 		case Opt_barrier:
 			btrfs_clear_and_info(info, NOBARRIER,

From 0ce1dd2a4ab5513cfa9cce3138251a3c71e446cd Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 28 Jul 2017 10:50:14 +0300
Subject: [PATCH 084/104] btrfs: Remove redundant setting of uuid in
 btrfs_block_header

btrfs_alloc_dev_extent currently unconditionally sets the uuid in the
leaf block header the function is working with. This is unnecessary
since this operation is performed by the core btree handling code
(splitting a node, allocating a new btree block etc). So let's remove
it.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 18231194835e..3561397a0c29 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1603,8 +1603,6 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
 
-	write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid);
-
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
 	btrfs_mark_buffer_dirty(leaf);
 out:

From 0dde10bed2c44a4024eb446cc72fe4e0cb97ec06 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Thu, 27 Jul 2017 14:30:23 -0700
Subject: [PATCH 085/104] btrfs: Remove extra parentheses from condition in
 copy_items()

There is no need for the extra pair of parentheses, remove it. This
fixes the following warning when building with clang:

fs/btrfs/tree-log.c:3694:10: warning: equality comparison with extraneous
  parentheses [-Wparentheses-equality]
                if ((i == (nr - 1)))
                     ~~^~~~~~~~~~~

Also remove the unnecessary parentheses around the subtraction.

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a2a84e266d37..ad7f4bab640b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3688,7 +3688,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 
 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
 
-		if ((i == (nr - 1)))
+		if (i == nr - 1)
 			last_key = ins_keys[i];
 
 		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {

From 0174484d619460a65e88f594c36983cd2b7f4128 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 27 Jul 2017 14:22:11 +0300
Subject: [PATCH 086/104] btrfs: Remove chunk_objectid argument from
 btrfs_make_block_group

btrfs_make_block_group is always called with chunk_objectid set to
BTRFS_FIRST_CHUNK_TREE_OBJECTID. There's no reason why this behavior will
change anytime soon, so let's remove the argument and decrease the cognitive
load when reading the code path. No functional change

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h       | 3 +--
 fs/btrfs/extent-tree.c | 6 +++---
 fs/btrfs/volumes.c     | 4 +---
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 02edcddbcc9c..ca087ad5ac48 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2676,8 +2676,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info);
 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_fs_info *fs_info, u64 bytes_used,
-			   u64 type, u64 chunk_objectid, u64 chunk_offset,
-			   u64 size);
+			   u64 type, u64 chunk_offset, u64 size);
 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 				struct btrfs_fs_info *fs_info,
 				const u64 chunk_offset);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6a8da7c19182..1a80f6e58296 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10180,8 +10180,7 @@ next:
 
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_fs_info *fs_info, u64 bytes_used,
-			   u64 type, u64 chunk_objectid, u64 chunk_offset,
-			   u64 size)
+			   u64 type, u64 chunk_offset, u64 size)
 {
 	struct btrfs_block_group_cache *cache;
 	int ret;
@@ -10193,7 +10192,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	btrfs_set_block_group_used(&cache->item, bytes_used);
-	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
+	btrfs_set_block_group_chunk_objectid(&cache->item,
+					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
 	btrfs_set_block_group_flags(&cache->item, type);
 
 	cache->flags = type;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3561397a0c29..1df1044e9008 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4826,9 +4826,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		goto error;
 	}
 
-	ret = btrfs_make_block_group(trans, info, 0, type,
-				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-				     start, num_bytes);
+	ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
 	if (ret)
 		goto error_del_extent;
 

From 408fbf19ad78d8ab0656289efea77c10a342f043 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 27 Jul 2017 14:37:29 +0300
Subject: [PATCH 087/104] btrfs: Remove extraneous chunk_objectid variable

BTRFS_FIRST_CHUNK_TREE_OBJECTID is the only objectid being used in the
chunk_tree. So remove a variable which is always set to that value and collapse
its usage in callees which are passed this variable. No functional changes

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1df1044e9008..a37a31ba6843 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2712,8 +2712,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 }
 
 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
-			    struct btrfs_fs_info *fs_info, u64 chunk_objectid,
-			    u64 chunk_offset)
+			    struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
 	struct btrfs_root *root = fs_info->chunk_root;
 	int ret;
@@ -2724,7 +2723,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	key.objectid = chunk_objectid;
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.offset = chunk_offset;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
@@ -2747,8 +2746,7 @@ out:
 	return ret;
 }
 
-static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info,
-			       u64 chunk_objectid, u64 chunk_offset)
+static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
 	struct btrfs_super_block *super_copy = fs_info->super_copy;
 	struct btrfs_disk_key *disk_key;
@@ -2781,7 +2779,7 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info,
 			ret = -EIO;
 			break;
 		}
-		if (key.objectid == chunk_objectid &&
+		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
 		    key.offset == chunk_offset) {
 			memmove(ptr, ptr + len, array_size - (cur + len));
 			array_size -= len;
@@ -2830,7 +2828,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map *em;
 	struct map_lookup *map;
 	u64 dev_extent_len = 0;
-	u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	int i, ret = 0;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 
@@ -2886,7 +2883,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
 
-	ret = btrfs_free_chunk(trans, fs_info, chunk_objectid, chunk_offset);
+	ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
@@ -2895,8 +2892,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
 
 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		ret = btrfs_del_sys_chunk(fs_info, chunk_objectid,
-					  chunk_offset);
+		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;

From d7d824966530acfe32b94d1ed672e6fe1638cd68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?=
 <ernesto.mnd.fernandez@gmail.com>
Date: Wed, 2 Aug 2017 03:18:27 -0300
Subject: [PATCH 088/104] btrfs: preserve i_mode if __btrfs_set_acl() fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When changing a file's acl mask, btrfs_set_acl() will first set the
group bits of i_mode to the value of the mask, and only then set the
actual extended attribute representing the new acl.

If the second part fails (due to lack of space, for example) and the
file had no acl attribute to begin with, the system will from now on
assume that the mask permission bits are actual group permission bits,
potentially granting access to the wrong users.

Prevent this by restoring the original mode bits if __btrfs_set_acl
fails.

Signed-off-by: Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/acl.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 8d8370ddb6b2..1ba49ebe67da 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -114,13 +114,17 @@ out:
 int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	int ret;
+	umode_t old_mode = inode->i_mode;
 
 	if (type == ACL_TYPE_ACCESS && acl) {
 		ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
 		if (ret)
 			return ret;
 	}
-	return __btrfs_set_acl(NULL, inode, acl, type);
+	ret = __btrfs_set_acl(NULL, inode, acl, type);
+	if (ret)
+		inode->i_mode = old_mode;
+	return ret;
 }
 
 /*

From 2073c4c2e51a9343af10862ba478e7a67d6caf04 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 31 Mar 2017 17:12:51 +0200
Subject: [PATCH 089/104] btrfs: scrub: use bool for flush_all_writes

flush_all_writes is an atomic but does not use atomic semantics at all;
it's just an on/off indicator, so we can use a bool.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 20c1ba19e665..810fce240489 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -182,8 +182,8 @@ struct scrub_ctx {
 	struct scrub_bio        *wr_curr_bio;
 	struct mutex            wr_lock;
 	int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
-	atomic_t                flush_all_writes;
 	struct btrfs_device     *wr_tgtdev;
+	bool                    flush_all_writes;
 
 	/*
 	 * statistics
@@ -717,7 +717,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 		WARN_ON(!fs_info->dev_replace.tgtdev);
 		sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
 		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
-		atomic_set(&sctx->flush_all_writes, 0);
+		sctx->flush_all_writes = false;
 	}
 
 	return sctx;
@@ -2402,8 +2402,7 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
 
 	scrub_block_put(sblock);
 
-	if (sctx->is_dev_replace &&
-	    atomic_read(&sctx->flush_all_writes)) {
+	if (sctx->is_dev_replace && sctx->flush_all_writes) {
 		mutex_lock(&sctx->wr_lock);
 		scrub_wr_submit(sctx);
 		mutex_unlock(&sctx->wr_lock);
@@ -2607,8 +2606,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 	sctx->first_free = sbio->index;
 	spin_unlock(&sctx->list_lock);
 
-	if (sctx->is_dev_replace &&
-	    atomic_read(&sctx->flush_all_writes)) {
+	if (sctx->is_dev_replace && sctx->flush_all_writes) {
 		mutex_lock(&sctx->wr_lock);
 		scrub_wr_submit(sctx);
 		mutex_unlock(&sctx->wr_lock);
@@ -3440,14 +3438,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 		 */
 		if (atomic_read(&fs_info->scrub_pause_req)) {
 			/* push queued extents */
-			atomic_set(&sctx->flush_all_writes, 1);
+			sctx->flush_all_writes = true;
 			scrub_submit(sctx);
 			mutex_lock(&sctx->wr_lock);
 			scrub_wr_submit(sctx);
 			mutex_unlock(&sctx->wr_lock);
 			wait_event(sctx->list_wait,
 				   atomic_read(&sctx->bios_in_flight) == 0);
-			atomic_set(&sctx->flush_all_writes, 0);
+			sctx->flush_all_writes = false;
 			scrub_blocked_if_needed(fs_info);
 		}
 
@@ -3892,7 +3890,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		 * write requests are really completed when bios_in_flight
 		 * changes to 0.
 		 */
-		atomic_set(&sctx->flush_all_writes, 1);
+		sctx->flush_all_writes = true;
 		scrub_submit(sctx);
 		mutex_lock(&sctx->wr_lock);
 		scrub_wr_submit(sctx);
@@ -3910,7 +3908,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		 */
 		wait_event(sctx->list_wait,
 			   atomic_read(&sctx->workers_pending) == 0);
-		atomic_set(&sctx->flush_all_writes, 0);
+		sctx->flush_all_writes = false;
 
 		scrub_pause_off(fs_info);
 

From 7736b0a4313c936dc8ccb87e8acf447e12ae2fd1 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 31 Mar 2017 18:02:48 +0200
Subject: [PATCH 090/104] btrfs: scrub: clean up division in
 __scrub_mark_bitmap

Use proper helpers for 64bit division and then cast to narrower type.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 810fce240489..99f3a7e9eb61 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2620,7 +2620,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
 				       u64 start, u64 len)
 {
 	u64 offset;
-	int nsectors;
+	u64 nsectors64;
+	u32 nsectors;
 	int sectorsize = sparity->sctx->fs_info->sectorsize;
 
 	if (len >= sparity->stripe_len) {
@@ -2631,7 +2632,10 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
 	start -= sparity->logic_start;
 	start = div64_u64_rem(start, sparity->stripe_len, &offset);
 	offset = div_u64(offset, sectorsize);
-	nsectors = (int)len / sectorsize;
+	nsectors64 = div_u64(len, sectorsize);
+
+	ASSERT(nsectors64 < UINT_MAX);
+	nsectors = (u32)nsectors64;
 
 	if (offset + nsectors <= sparity->nsectors) {
 		bitmap_set(bitmap, offset, nsectors);

From 1d1bf92d9dee0aeb14dee38151bcf214443f99d5 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 31 Mar 2017 18:02:48 +0200
Subject: [PATCH 091/104] btrfs: scrub: clean up division in scrub_find_csum

Use proper helpers for 64bit division.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 99f3a7e9eb61..a1b29d581dca 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2708,7 +2708,9 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
 	if (!sum)
 		return 0;
 
-	index = ((u32)(logical - sum->bytenr)) / sctx->fs_info->sectorsize;
+	index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
+	ASSERT(index < UINT_MAX);
+
 	num_sectors = sum->len / sctx->fs_info->sectorsize;
 	memcpy(csum, sum->sums + index, sctx->csum_size);
 	if (index == num_sectors - 1) {

From af1cbe0a66cb9e6c3f2acf7f0a7fc647b5dc4f2f Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 31 Mar 2017 18:42:57 +0200
Subject: [PATCH 092/104] btrfs: scrub: simplify scrub worker initialization

Minor simplification, merge calls to one.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a1b29d581dca..7c9c82bb6255 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4015,14 +4015,8 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	int max_active = fs_info->thread_pool_size;
 
 	if (fs_info->scrub_workers_refcnt == 0) {
-		if (is_dev_replace)
-			fs_info->scrub_workers =
-				btrfs_alloc_workqueue(fs_info, "scrub", flags,
-						      1, 4);
-		else
-			fs_info->scrub_workers =
-				btrfs_alloc_workqueue(fs_info, "scrub", flags,
-						      max_active, 4);
+		fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
+				flags, is_dev_replace ? 1 : max_active, 4);
 		if (!fs_info->scrub_workers)
 			goto fail_scrub_workers;
 

From 167ce953ca55bdee20fe56c3c0fa51002435f745 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 18 Aug 2017 15:15:18 -0600
Subject: [PATCH 093/104] Btrfs: add a helper to retrieve extent inline ref type

An invalid value of extent inline ref type may be read from a
malicious image which may force btrfs to crash.

This adds a helper which does sanity check for the ref type, so we can
know if it's sane, return the type, otherwise return an error.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minimal tweak const types, causing warnings due to other cleanup patches ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h       | 11 +++++++++++
 fs/btrfs/extent-tree.c | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ca087ad5ac48..542db9d0dbcd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2567,6 +2567,17 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
 
 /* extent-tree.c */
 
+enum btrfs_inline_ref_type {
+	BTRFS_REF_TYPE_INVALID =	 0,
+	BTRFS_REF_TYPE_BLOCK =		 1,
+	BTRFS_REF_TYPE_DATA =		 2,
+	BTRFS_REF_TYPE_ANY =		 3,
+};
+
+int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
+				     struct btrfs_extent_inline_ref *iref,
+				     enum btrfs_inline_ref_type is_data);
+
 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
 
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1a80f6e58296..794b06dd824a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1148,6 +1148,43 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
 }
 #endif
 
+/*
+ * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
+ * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
+ * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
+ */
+int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
+				     struct btrfs_extent_inline_ref *iref,
+				     enum btrfs_inline_ref_type is_data)
+{
+	int type = btrfs_extent_inline_ref_type(eb, iref);
+
+	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
+	    type == BTRFS_SHARED_DATA_REF_KEY ||
+	    type == BTRFS_EXTENT_DATA_REF_KEY) {
+		if (is_data == BTRFS_REF_TYPE_BLOCK) {
+			if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+			    type == BTRFS_SHARED_BLOCK_REF_KEY)
+				return type;
+		} else if (is_data == BTRFS_REF_TYPE_DATA) {
+			if (type == BTRFS_EXTENT_DATA_REF_KEY ||
+			    type == BTRFS_SHARED_DATA_REF_KEY)
+				return type;
+		} else {
+			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
+			return type;
+		}
+	}
+
+	btrfs_print_leaf((struct extent_buffer *)eb);
+	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
+		  eb->start, type);
+	WARN_ON(1);
+
+	return BTRFS_REF_TYPE_INVALID;
+}
+
 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 {
 	u32 high_crc = ~(u32)0;

From 3de28d579edbd35294bf44aee8402c804331bc37 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 18 Aug 2017 15:15:19 -0600
Subject: [PATCH 094/104] Btrfs: convert to use
 btrfs_get_extent_inline_ref_type

Since we have a helper which can do sanity check, this converts all
btrfs_extent_inline_ref_type to it.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c     | 11 +++++++++--
 fs/btrfs/extent-tree.c | 36 ++++++++++++++++++++++++++++++------
 fs/btrfs/relocation.c  | 13 +++++++++++--
 3 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6bae986bfcfb..b517ef1477ea 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -929,7 +929,11 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
 		int type;
 
 		iref = (struct btrfs_extent_inline_ref *)ptr;
-		type = btrfs_extent_inline_ref_type(leaf, iref);
+		type = btrfs_get_extent_inline_ref_type(leaf, iref,
+							BTRFS_REF_TYPE_ANY);
+		if (type == BTRFS_REF_TYPE_INVALID)
+			return -EINVAL;
+
 		offset = btrfs_extent_inline_ref_offset(leaf, iref);
 
 		switch (type) {
@@ -1776,7 +1780,10 @@ static int get_extent_inline_ref(unsigned long *ptr,
 
 	end = (unsigned long)ei + item_size;
 	*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
-	*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
+	*out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref,
+						     BTRFS_REF_TYPE_ANY);
+	if (*out_type == BTRFS_REF_TYPE_INVALID)
+		return -EINVAL;
 
 	*ptr += btrfs_extent_inline_ref_size(*out_type);
 	WARN_ON(*ptr > end);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 794b06dd824a..51a691532fd8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1454,12 +1454,18 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path,
 	struct btrfs_extent_data_ref *ref1;
 	struct btrfs_shared_data_ref *ref2;
 	u32 num_refs = 0;
+	int type;
 
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 	if (iref) {
-		if (btrfs_extent_inline_ref_type(leaf, iref) ==
-		    BTRFS_EXTENT_DATA_REF_KEY) {
+		/*
+		 * If type is invalid, we should have bailed out earlier than
+		 * this call.
+		 */
+		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
+		ASSERT(type != BTRFS_REF_TYPE_INVALID);
+		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 		} else {
@@ -1620,6 +1626,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 	int ret;
 	int err = 0;
 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+	int needed;
 
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -1711,6 +1718,11 @@ again:
 		BUG_ON(ptr > end);
 	}
 
+	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
+		needed = BTRFS_REF_TYPE_DATA;
+	else
+		needed = BTRFS_REF_TYPE_BLOCK;
+
 	err = -ENOENT;
 	while (1) {
 		if (ptr >= end) {
@@ -1718,7 +1730,12 @@ again:
 			break;
 		}
 		iref = (struct btrfs_extent_inline_ref *)ptr;
-		type = btrfs_extent_inline_ref_type(leaf, iref);
+		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
+		if (type == BTRFS_REF_TYPE_INVALID) {
+			err = -EINVAL;
+			goto out;
+		}
+
 		if (want < type)
 			break;
 		if (want > type) {
@@ -1910,7 +1927,12 @@ void update_inline_extent_backref(struct btrfs_fs_info *fs_info,
 	if (extent_op)
 		__run_delayed_extent_op(extent_op, leaf, ei);
 
-	type = btrfs_extent_inline_ref_type(leaf, iref);
+	/*
+	 * If type is invalid, we should have bailed out after
+	 * lookup_inline_extent_backref().
+	 */
+	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
+	ASSERT(type != BTRFS_REF_TYPE_INVALID);
 
 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -3195,6 +3217,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
 	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
 	u32 item_size;
+	int type;
 	int ret;
 
 	key.objectid = bytenr;
@@ -3236,8 +3259,9 @@ static noinline int check_committed_ref(struct btrfs_root *root,
 		goto out;
 
 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
-	if (btrfs_extent_inline_ref_type(leaf, iref) !=
-	    BTRFS_EXTENT_DATA_REF_KEY)
+
+	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
+	if (type != BTRFS_EXTENT_DATA_REF_KEY)
 		goto out;
 
 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1a532bb72eab..96f816aa9ed3 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -799,9 +799,17 @@ again:
 		if (ptr < end) {
 			/* update key for inline back ref */
 			struct btrfs_extent_inline_ref *iref;
+			int type;
 			iref = (struct btrfs_extent_inline_ref *)ptr;
-			key.type = btrfs_extent_inline_ref_type(eb, iref);
+			type = btrfs_get_extent_inline_ref_type(eb, iref,
+							BTRFS_REF_TYPE_BLOCK);
+			if (type == BTRFS_REF_TYPE_INVALID) {
+				err = -EINVAL;
+				goto out;
+			}
+			key.type = type;
 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+
 			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
 		}
@@ -3753,7 +3761,8 @@ int add_data_references(struct reloc_control *rc,
 
 	while (ptr < end) {
 		iref = (struct btrfs_extent_inline_ref *)ptr;
-		key.type = btrfs_extent_inline_ref_type(eb, iref);
+		key.type = btrfs_get_extent_inline_ref_type(eb, iref,
+							BTRFS_REF_TYPE_DATA);
 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
 			ret = __add_tree_block(rc, key.offset, blocksize,

From 4335958de2a43c6790c7f6aa0682aa7189983fa4 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 18 Aug 2017 15:15:20 -0600
Subject: [PATCH 095/104] Btrfs: remove BUG() in btrfs_extent_inline_ref_size

Now that btrfs_get_extent_inline_ref_type() can report if type is a
valid one and all callers can gracefully deal with that, we don't need
to crash here.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 542db9d0dbcd..b7cfc74c1757 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1804,7 +1804,6 @@ static inline u32 btrfs_extent_inline_ref_size(int type)
 	if (type == BTRFS_EXTENT_DATA_REF_KEY)
 		return sizeof(struct btrfs_extent_data_ref) +
 		       offsetof(struct btrfs_extent_inline_ref, offset);
-	BUG();
 	return 0;
 }
 

From 07638ea5987e51715b35eb5a9a9e908f18ffabf7 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 18 Aug 2017 15:15:21 -0600
Subject: [PATCH 096/104] Btrfs: remove BUG() in print_extent_item

btrfs_print_leaf() is used in btrfs_get_extent_inline_ref_type, so
here we really want to print the invalid value of ref type instead of
causing a kernel panic.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/print-tree.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6e7a8c40dcd9..c1acbdcb476c 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -121,7 +121,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
 			       offset, btrfs_shared_data_ref_count(eb, sref));
 			break;
 		default:
-			BUG();
+			btrfs_err(eb->fs_info,
+				  "extent %llu has invalid ref type %d",
+				  eb->start, type);
+			return;
 		}
 		ptr += btrfs_extent_inline_ref_size(type);
 	}

From b14c55a191263889c379aeee85223bb72501824d Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 18 Aug 2017 15:15:22 -0600
Subject: [PATCH 097/104] Btrfs: remove BUG() in add_data_reference

Now that we have a helper to report invalid value of extent inline ref
type, we need to quit gracefully instead of throwing out a kernel panic.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/relocation.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 96f816aa9ed3..1c086d0667be 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3772,7 +3772,10 @@ int add_data_references(struct reloc_control *rc,
 			ret = find_data_references(rc, extent_key,
 						   eb, dref, blocks);
 		} else {
-			BUG();
+			ret = -EINVAL;
+			btrfs_err(rc->extent_root->fs_info,
+		     "extent %llu slot %d has an invalid inline ref type",
+			     eb->start, path->slots[0]);
 		}
 		if (ret) {
 			err = ret;

From cdccee993f2f3466f69a358daec19de744a02f92 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 18 Aug 2017 15:15:23 -0600
Subject: [PATCH 098/104] Btrfs: remove BUG_ON in __add_tree_block

The BUG_ON() can be triggered when the caller is processing an invalid
extent inline ref, e.g.

a shared data ref is offered instead of an extent data ref, such that
it tries to find a non-existent tree block and then btrfs_search_slot
returns 1 for no such item.

This replaces the BUG_ON() with a WARN() followed by calling
btrfs_print_leaf() to show more details about what's going on and
returning -EINVAL to upper callers.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/relocation.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1c086d0667be..3a49a3c2fca4 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -32,6 +32,7 @@
 #include "free-space-cache.h"
 #include "inode-map.h"
 #include "qgroup.h"
+#include "print-tree.h"
 
 /*
  * backref_node, mapping_node and tree_block start with this
@@ -3483,7 +3484,16 @@ again:
 			goto again;
 		}
 	}
-	BUG_ON(ret);
+	if (ret) {
+		ASSERT(ret == 1);
+		btrfs_print_leaf(path->nodes[0]);
+		btrfs_err(fs_info,
+	     "tree block extent item (%llu) is not found in extent tree",
+		     bytenr);
+		WARN_ON(1);
+		ret = -EINVAL;
+		goto out;
+	}
 
 	ret = add_tree_block(rc, &key, path, blocks);
 out:

From 64ecdb647ddb83dcff9c8e2a5c40119f171ea004 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 18 Aug 2017 15:15:24 -0600
Subject: [PATCH 099/104] Btrfs: add one more sanity check for shared ref type

Every shared ref has a parent tree block, which can be obtained from
btrfs_extent_inline_ref_offset().  And the tree block must be aligned
to the nodesize, so we'd know this inline ref is not valid if this
block's bytenr is not aligned to the nodesize, in which case, most
likely the ref type has been misused.

This adds the above mentioned check and also updates
print_extent_item() called by btrfs_print_leaf() to point out the
invalid ref while printing the tree structure.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 29 +++++++++++++++++++++++++----
 fs/btrfs/print-tree.c  | 27 +++++++++++++++++++++------
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 51a691532fd8..96e49fd5b888 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1158,19 +1158,40 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 				     enum btrfs_inline_ref_type is_data)
 {
 	int type = btrfs_extent_inline_ref_type(eb, iref);
+	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
 
 	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
 	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
 	    type == BTRFS_SHARED_DATA_REF_KEY ||
 	    type == BTRFS_EXTENT_DATA_REF_KEY) {
 		if (is_data == BTRFS_REF_TYPE_BLOCK) {
-			if (type == BTRFS_TREE_BLOCK_REF_KEY ||
-			    type == BTRFS_SHARED_BLOCK_REF_KEY)
+			if (type == BTRFS_TREE_BLOCK_REF_KEY)
 				return type;
+			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
+				ASSERT(eb->fs_info);
+				/*
+				 * Every shared one has parent tree
+				 * block, which must be aligned to
+				 * nodesize.
+				 */
+				if (offset &&
+				    IS_ALIGNED(offset, eb->fs_info->nodesize))
+					return type;
+			}
 		} else if (is_data == BTRFS_REF_TYPE_DATA) {
-			if (type == BTRFS_EXTENT_DATA_REF_KEY ||
-			    type == BTRFS_SHARED_DATA_REF_KEY)
+			if (type == BTRFS_EXTENT_DATA_REF_KEY)
 				return type;
+			if (type == BTRFS_SHARED_DATA_REF_KEY) {
+				ASSERT(eb->fs_info);
+				/*
+				 * Every shared one has parent tree
+				 * block, which must be aligned to
+				 * nodesize.
+				 */
+				if (offset &&
+				    IS_ALIGNED(offset, eb->fs_info->nodesize))
+					return type;
+			}
 		} else {
 			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
 			return type;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index c1acbdcb476c..569205e651c7 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -44,7 +44,7 @@ static void print_dev_item(struct extent_buffer *eb,
 static void print_extent_data_ref(struct extent_buffer *eb,
 				  struct btrfs_extent_data_ref *ref)
 {
-	pr_info("\t\textent data backref root %llu objectid %llu offset %llu count %u\n",
+	pr_cont("extent data backref root %llu objectid %llu offset %llu count %u\n",
 	       btrfs_extent_data_ref_root(eb, ref),
 	       btrfs_extent_data_ref_objectid(eb, ref),
 	       btrfs_extent_data_ref_offset(eb, ref),
@@ -63,6 +63,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
 	u32 item_size = btrfs_item_size_nr(eb, slot);
 	u64 flags;
 	u64 offset;
+	int ref_index = 0;
 
 	if (item_size < sizeof(*ei)) {
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -104,12 +105,20 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
 		iref = (struct btrfs_extent_inline_ref *)ptr;
 		type = btrfs_extent_inline_ref_type(eb, iref);
 		offset = btrfs_extent_inline_ref_offset(eb, iref);
+		pr_info("\t\tref#%d: ", ref_index++);
 		switch (type) {
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			pr_info("\t\ttree block backref root %llu\n", offset);
+			pr_cont("tree block backref root %llu\n", offset);
 			break;
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			pr_info("\t\tshared block backref parent %llu\n", offset);
+			pr_cont("shared block backref parent %llu\n", offset);
+			/*
+			 * offset is supposed to be a tree block which
+			 * must be aligned to nodesize.
+			 */
+			if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
+				pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
+					offset, (unsigned long long)eb->fs_info->nodesize);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY:
 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -117,12 +126,18 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY:
 			sref = (struct btrfs_shared_data_ref *)(iref + 1);
-			pr_info("\t\tshared data backref parent %llu count %u\n",
+			pr_cont("shared data backref parent %llu count %u\n",
 			       offset, btrfs_shared_data_ref_count(eb, sref));
+			/*
+			 * offset is supposed to be a tree block which
+			 * must be aligned to nodesize.
+			 */
+			if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
+				pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
+				     offset, (unsigned long long)eb->fs_info->nodesize);
 			break;
 		default:
-			btrfs_err(eb->fs_info,
-				  "extent %llu has invalid ref type %d",
+			pr_cont("(extent %llu has INVALID ref type %d)\n",
 				  eb->start, type);
 			return;
 		}

From 1cd5447eb677822c5c22bb52161c2105507dcce0 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Thu, 17 Aug 2017 10:25:11 -0400
Subject: [PATCH 100/104] btrfs: pass fs_info to btrfs_del_root instead of
 tree_root

btrfs_del_root always uses the tree_root.  Let's pass fs_info instead.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h           | 4 ++--
 fs/btrfs/extent-tree.c     | 2 +-
 fs/btrfs/free-space-tree.c | 2 +-
 fs/btrfs/qgroup.c          | 3 +--
 fs/btrfs/root-tree.c       | 7 ++++---
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7cfc74c1757..2add002662f4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2988,8 +2988,8 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_fs_info *fs_info,
 		       u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
 		       const char *name, int name_len);
-int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   const struct btrfs_key *key);
+int btrfs_del_root(struct btrfs_trans_handle *trans,
+		   struct btrfs_fs_info *fs_info, const struct btrfs_key *key);
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      const struct btrfs_key *key,
 		      struct btrfs_root_item *item);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 96e49fd5b888..e2d7e86b51d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9266,7 +9266,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	if (err)
 		goto out_end_trans;
 
-	ret = btrfs_del_root(trans, tree_root, &root->root_key);
+	ret = btrfs_del_root(trans, fs_info, &root->root_key);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto out_end_trans;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index a5e34de06c2f..684f12247db7 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1257,7 +1257,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
 	if (ret)
 		goto abort;
 
-	ret = btrfs_del_root(trans, tree_root, &free_space_root->root_key);
+	ret = btrfs_del_root(trans, fs_info, &free_space_root->root_key);
 	if (ret)
 		goto abort;
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ddc37c537058..5c8b61c86e61 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -946,7 +946,6 @@ out:
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *quota_root;
 	int ret = 0;
 
@@ -968,7 +967,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto out;
 
-	ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
+	ret = btrfs_del_root(trans, fs_info, &quota_root->root_key);
 	if (ret)
 		goto out;
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 5b488af6f25e..9fb9896610e0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -335,10 +335,11 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
 	return err;
 }
 
-/* drop the root item for 'key' from 'root' */
-int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   const struct btrfs_key *key)
+/* drop the root item for 'key' from the tree root */
+int btrfs_del_root(struct btrfs_trans_handle *trans,
+		   struct btrfs_fs_info *fs_info, const struct btrfs_key *key)
 {
+	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_path *path;
 	int ret;
 

From 0ca00afb2b8ab37e543e4daa650bf6bdab2005da Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 18 Aug 2017 17:58:22 +0300
Subject: [PATCH 101/104] btrfs: Remove chunk_objectid parameter of
 btrfs_alloc_dev_extent

The function is always called with chunk_objectid set to
BTRFS_FIRST_CHUNK_TREE_OBJECTID. Let's collapse the parameter in the
function itself. No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a37a31ba6843..63608c5f4487 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1571,8 +1571,8 @@ out:
 
 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_device *device,
-				  u64 chunk_tree, u64 chunk_objectid,
-				  u64 chunk_offset, u64 start, u64 num_bytes)
+				  u64 chunk_tree, u64 chunk_offset, u64 start,
+				  u64 num_bytes)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1600,7 +1600,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	extent = btrfs_item_ptr(leaf, path->slots[0],
 				struct btrfs_dev_extent);
 	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
-	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
+	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
+					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
 
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
@@ -4904,7 +4905,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 			break;
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     chunk_root->root_key.objectid,
-					     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 					     chunk_offset, dev_offset,
 					     stripe_size);
 		if (ret)

From b5d9071c4f5c087614eb477bff04795e325bea70 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 18 Aug 2017 17:58:23 +0300
Subject: [PATCH 102/104] btrfs: remove superfluous chunk_tree argument from
 btrfs_alloc_dev_extent

Currently this function is always called with the object id of the root
key of the chunk_tree, which is always BTRFS_CHUNK_TREE_OBJECTID. So
let's subsume it straight into the function itself. No functional
change.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 63608c5f4487..d024f1b07282 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1571,8 +1571,7 @@ out:
 
 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_device *device,
-				  u64 chunk_tree, u64 chunk_offset, u64 start,
-				  u64 num_bytes)
+				  u64 chunk_offset, u64 start, u64 num_bytes)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1599,7 +1598,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	extent = btrfs_item_ptr(leaf, path->slots[0],
 				struct btrfs_dev_extent);
-	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
+	btrfs_set_dev_extent_chunk_tree(leaf, extent,
+					BTRFS_CHUNK_TREE_OBJECTID);
 	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
 					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
@@ -4903,10 +4903,8 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		ret = btrfs_update_device(trans, device);
 		if (ret)
 			break;
-		ret = btrfs_alloc_dev_extent(trans, device,
-					     chunk_root->root_key.objectid,
-					     chunk_offset, dev_offset,
-					     stripe_size);
+		ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
+					     dev_offset, stripe_size);
 		if (ret)
 			break;
 	}

From dc59215d4f42084ee13654bafe3e5130b146aeb7 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 1 Aug 2017 18:25:56 +0300
Subject: [PATCH 103/104] btrfs: remove unnecessary memory barrier in
 btrfs_direct_IO

Commit 38851cc19adb ("Btrfs: implement unlocked dio write") implemented
unlocked dio write, allowing multiple dio writers to write to
non-overlapping, and non-eof-extending regions. In doing so it also
introduced a broken memory barrier. It is broken due to 2 things:

1. Memory barriers _MUST_ always be paired, this is clearly not the case
   here

2. Checkpatch actually produces a warning if a memory barrier is
   introduced that doesn't have a comment explaining how it's being
   paired.

Specifically for inode::i_dio_count that's wrapped inside
inode_dio_begin, there is no explicit barrier semantics attached, so
removing it is fine as the atomic is used in the common waiter/wakeup
pattern.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ enhance changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 61f1ad89e97a..d184a46e46c4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8815,7 +8815,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		return 0;
 
 	inode_dio_begin(inode);
-	smp_mb__after_atomic();
 
 	/*
 	 * The generic stuff only does filemap_write_and_wait_range, which

From db95c876c568cef951fbbd4c0118cb5386e4bb99 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 18 Aug 2017 18:16:25 +0200
Subject: [PATCH 104/104] btrfs: submit superblock io with REQ_META and
 REQ_PRIO

The superblock is also metadata of the filesystem so the relevant IO
should be tagged as such. We also tag it as high priority, as it's the
last block committed for metadata from a given transaction. Any delays
would effectively block the whole transaction, also blocking any other
operation holding the device_list_mutex.

Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 90b967ae46d0..27d458640536 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3433,9 +3433,10 @@ static int write_dev_supers(struct btrfs_device *device,
 		 */
 		if (i == 0) {
 			ret = btrfsic_submit_bh(REQ_OP_WRITE,
-						REQ_SYNC | REQ_FUA, bh);
+				REQ_SYNC | REQ_FUA | REQ_META | REQ_PRIO, bh);
 		} else {
-			ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
+			ret = btrfsic_submit_bh(REQ_OP_WRITE,
+				REQ_SYNC | REQ_META | REQ_PRIO, bh);
 		}
 		if (ret)
 			errors++;