From ed150e1a5cf20c04cf0b2d2c34e498fc1d6519be Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 26 Aug 2016 15:58:40 +1000
Subject: [PATCH 1/9] xfs: don't perform lookups on zero-height btrees

If the caller passes in a cursor to a zero-height btree (which is
impossible), we never set block to anything but NULL, which causes the
later dereference of it to crash.  Instead, just return -EFSCORRUPTED.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_btree.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index b5c213a051cd..33f14067c320 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1814,6 +1814,10 @@ xfs_btree_lookup(
 
 	XFS_BTREE_STATS_INC(cur, lookup);
 
+	/* No such thing as a zero-level tree. */
+	if (cur->bc_nlevels == 0)
+		return -EFSCORRUPTED;
+
 	block = NULL;
 	keyno = 0;
 

From 738f57c16a2bb527c705641f0fc1c68ff8cba72a Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 26 Aug 2016 15:59:19 +1000
Subject: [PATCH 2/9] xfs: disallow mounting of realtime + rmap filesystems

Since the kernel doesn't currently support the realtime rmapbt,
don't allow such filesystems to be mounted.  Support will appear
in a future release.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_super.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 24ef83ef04de..fd6be45b3a1e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1574,9 +1574,16 @@ xfs_fs_fill_super(
 		}
 	}
 
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		if (mp->m_sb.sb_rblocks) {
+			xfs_alert(mp,
+	"EXPERIMENTAL reverse mapping btree not compatible with realtime device!");
+			error = -EINVAL;
+			goto out_filestream_unmount;
+		}
 		xfs_alert(mp,
 	"EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!");
+	}
 
 	error = xfs_mountfs(mp);
 	if (error)

From da1f039d6947b1a49f13b39a6de0df2a3e9e1ed1 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 26 Aug 2016 15:59:31 +1000
Subject: [PATCH 3/9] xfs: don't log the entire end of the AGF

When we're logging the last non-spare field in the AGF, we don't
need to log the spare fields, so plumb in a new AGF logging flag
to help us avoid that.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_alloc.c  | 2 ++
 fs/xfs/libxfs/xfs_format.h | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3dd8f1d54498..05b5243d89f6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2278,6 +2278,8 @@ xfs_alloc_log_agf(
 		offsetof(xfs_agf_t, agf_btreeblks),
 		offsetof(xfs_agf_t, agf_uuid),
 		offsetof(xfs_agf_t, agf_rmap_blocks),
+		/* needed so that we don't log the whole rest of the structure: */
+		offsetof(xfs_agf_t, agf_spare64),
 		sizeof(xfs_agf_t)
 	};
 
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index e6a8bea0f7ba..270fb5cf4fa1 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -674,7 +674,8 @@ typedef struct xfs_agf {
 #define	XFS_AGF_BTREEBLKS	0x00000800
 #define	XFS_AGF_UUID		0x00001000
 #define	XFS_AGF_RMAP_BLOCKS	0x00002000
-#define	XFS_AGF_NUM_BITS	14
+#define	XFS_AGF_SPARE64		0x00004000
+#define	XFS_AGF_NUM_BITS	15
 #define	XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)
 
 #define XFS_AGF_FLAGS \
@@ -691,7 +692,8 @@ typedef struct xfs_agf {
 	{ XFS_AGF_LONGEST,	"LONGEST" }, \
 	{ XFS_AGF_BTREEBLKS,	"BTREEBLKS" }, \
 	{ XFS_AGF_UUID,		"UUID" }, \
-	{ XFS_AGF_RMAP_BLOCKS,	"RMAP_BLOCKS" }
+	{ XFS_AGF_RMAP_BLOCKS,	"RMAP_BLOCKS" }, \
+	{ XFS_AGF_SPARE64,	"SPARE64" }
 
 /* disk block (xfs_daddr_t) in the AG */
 #define XFS_AGF_DADDR(mp)	((xfs_daddr_t)(1 << (mp)->m_sectbb_log))

From 722278997bc964349e23e7061d541f8df3133a04 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 26 Aug 2016 15:59:50 +1000
Subject: [PATCH 4/9] xfs: fix some key handling problems in
 _btree_simple_query_range

We only need the record's high key for the first record that we look
at; for all records, we /definitely/ need the regular record key.
Therefore, fix how the simple range query function gets its keys.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_btree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 33f14067c320..b70d9f918156 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4563,10 +4563,10 @@ xfs_btree_simple_query_range(
 		error = xfs_btree_get_rec(cur, &recp, &stat);
 		if (error || !stat)
 			break;
-		cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
 
 		/* Skip if high_key(rec) < low_key. */
 		if (firstrec) {
+			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
 			firstrec = false;
 			diff = cur->bc_ops->diff_two_keys(cur, low_key,
 					&rec_key);
@@ -4575,6 +4575,7 @@ xfs_btree_simple_query_range(
 		}
 
 		/* Stop if high_key < low_key(rec). */
+		cur->bc_ops->init_key_from_rec(&rec_key, recp);
 		diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
 		if (diff > 0)
 			break;

From 5b5c2dbd3c9bcfa89fba9709c12ecc0a445c6e40 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 26 Aug 2016 16:00:10 +1000
Subject: [PATCH 5/9] xfs: simple btree query range should look right if LE
 lookup fails

If the initial LOOKUP_LE in the simple query range fails to find
anything, we should attempt to increment the btree cursor to see
if there actually /are/ records for what we're trying to find.
Without this patch, a bnobt range query of (0, $agsize) returns
no results because the leftmost record never has a startblock
of zero.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_btree.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index b70d9f918156..08569792fe20 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4558,6 +4558,13 @@ xfs_btree_simple_query_range(
 	if (error)
 		goto out;
 
+	/* Nothing?  See if there's anything to the right. */
+	if (!stat) {
+		error = xfs_btree_increment(cur, 0, &stat);
+		if (error)
+			goto out;
+	}
+
 	while (stat) {
 		/* Find the record. */
 		error = xfs_btree_get_rec(cur, &recp, &stat);

From f3d7ebdeb2c297bd26272384e955033493ca291c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 26 Aug 2016 16:01:30 +1000
Subject: [PATCH 6/9] xfs: fix superblock inprogress check

From inspection, the superblock sb_inprogress check is done in the
verifier and triggered only for the primary superblock via a
"bp->b_bn == XFS_SB_DADDR" check.

Unfortunately, the primary superblock is an uncached buffer, and
hence it is configured by xfs_buf_read_uncached() with:

	bp->b_bn = XFS_BUF_DADDR_NULL;  /* always null for uncached buffers */

And so this check never triggers. Fix it.

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_sb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 0e3d4f5ec33c..4aecc5fefe96 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -583,7 +583,8 @@ xfs_sb_verify(
 	 * Only check the in progress field for the primary superblock as
 	 * mkfs.xfs doesn't clear it from secondary superblocks.
 	 */
-	return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+	return xfs_mount_validate_sb(mp, &sb,
+				     bp->b_maps[0].bm_bn == XFS_SB_DADDR,
 				     check_version);
 }
 

From 800b2694f890cc35a1bda63501fc71c94389d517 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Fri, 26 Aug 2016 16:01:59 +1000
Subject: [PATCH 7/9] xfs: prevent dropping ioend completions during buftarg
 wait

xfs_wait_buftarg() waits for all pending I/O, drains the ioend
completion workqueue and walks the LRU until all buffers in the cache
have been released. This is traditionally an unmount operation` but the
mechanism is also reused during filesystem freeze.

xfs_wait_buftarg() invokes drain_workqueue() as part of the quiesce,
which is intended more for a shutdown sequence in that it indicates to
the queue that new operations are not expected once the drain has begun.
New work jobs after this point result in a WARN_ON_ONCE() and are
otherwise dropped.

With filesystem freeze, however, read operations are allowed and can
proceed during or after the workqueue drain. If such a read occurs
during the drain sequence, the workqueue infrastructure complains about
the queued ioend completion work item and drops it on the floor. As a
result, the buffer remains on the LRU and the freeze never completes.

Despite the fact that the overall buffer cache cleanup is not necessary
during freeze, fix up this operation such that it is safe to invoke
during non-unmount quiesce operations. Replace the drain_workqueue()
call with flush_workqueue(), which runs a similar serialization on
pending workqueue jobs without causing new jobs to be dropped. This is
safe for unmount as unmount independently locks out new operations by
the time xfs_wait_buftarg() is invoked.

cc: <stable@vger.kernel.org>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_buf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 607cc29bba21..b5b9bffe3520 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1611,7 +1611,7 @@ xfs_wait_buftarg(
 	 */
 	while (percpu_counter_sum(&btp->bt_io_count))
 		delay(100);
-	drain_workqueue(btp->bt_mount->m_buf_workqueue);
+	flush_workqueue(btp->bt_mount->m_buf_workqueue);
 
 	/* loop until there is nothing left on the lru list. */
 	while (list_lru_count(&btp->bt_lru)) {

From 17de0a9ff3df8f54f2f47746d118112d4e61d973 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 29 Aug 2016 11:33:58 +1000
Subject: [PATCH 8/9] iomap: don't set FIEMAP_EXTENT_MERGED for extent based
 filesystems

Filesystems like XFS that use extents should not set the
FIEMAP_EXTENT_MERGED flag in the fiemap extent structures.  To allow
for both behaviors for the upcoming gfs2 usage split the iomap
type field into type and flags, and only set FIEMAP_EXTENT_MERGED if
the IOMAP_F_MERGED flag is set.  The flags field will also come in
handy for future features such as shared extents on reflink-enabled
file systems.

Reported-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/iomap.c            | 5 ++++-
 include/linux/iomap.h | 8 +++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 0342254646e3..706270f21b35 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -428,9 +428,12 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
 		break;
 	}
 
+	if (iomap->flags & IOMAP_F_MERGED)
+		flags |= FIEMAP_EXTENT_MERGED;
+
 	return fiemap_fill_next_extent(fi, iomap->offset,
 			iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
-			iomap->length, flags | FIEMAP_EXTENT_MERGED);
+			iomap->length, flags);
 
 }
 
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3267df461012..3d70ece10313 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -18,6 +18,11 @@ struct vm_fault;
 #define IOMAP_MAPPED	0x03	/* blocks allocated @blkno */
 #define IOMAP_UNWRITTEN	0x04	/* blocks allocated @blkno in unwritten state */
 
+/*
+ * Flags for iomap mappings:
+ */
+#define IOMAP_F_MERGED	0x01	/* contains multiple blocks/extents */
+
 /*
  * Magic value for blkno:
  */
@@ -27,7 +32,8 @@ struct iomap {
 	sector_t		blkno;	/* 1st sector of mapping, 512b units */
 	loff_t			offset;	/* file offset of mapping, bytes */
 	u64			length;	/* length of mapping, bytes */
-	int			type;	/* type of mapping */
+	u16			type;	/* type of mapping */
+	u16			flags;	/* flags for mapping */
 	struct block_device	*bdev;	/* block device for I/O */
 };
 

From ea78d80866ce375defb2fdd1c8a3aafec95e0f85 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 30 Aug 2016 13:51:39 +1000
Subject: [PATCH 9/9] xfs: track log done items directly in the deferred
 pending work item

Christoph reports slab corruption when a deferred refcount update
aborts during _defer_finish().  The cause of this was broken log item
state tracking in xfs_defer_pending -- upon an abort,
_defer_trans_abort() will call abort_intent on all intent items,
including the ones that have already had a done item attached.

This is incorrect because each intent item has 2 refcount: the first
is released when the intent item is committed to the log; and the
second is released when the _done_ item is committed to the log, or
by the intent creator if there is no done item.  In other words, once
we log the done item, responsibility for releasing the intent item's
second refcount is transferred to the done item and /must not/ be
performed by anything else.

The dfp_committed flag should have been tracking whether or not we had
a done item so that _defer_trans_abort could decide if it needs to
abort the intent item, but due to a thinko this was not the case.  Rip
it out and track the done item directly so that we do the right thing
w.r.t. intent item freeing.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reported-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_defer.c | 17 ++++-------------
 fs/xfs/libxfs/xfs_defer.h |  2 +-
 fs/xfs/xfs_trace.h        |  2 +-
 3 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 054a2032fdb3..c221d0ecd52e 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -194,7 +194,7 @@ xfs_defer_trans_abort(
 	/* Abort intent items. */
 	list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
 		trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
-		if (dfp->dfp_committed)
+		if (!dfp->dfp_done)
 			dfp->dfp_type->abort_intent(dfp->dfp_intent);
 	}
 
@@ -290,7 +290,6 @@ xfs_defer_finish(
 	struct xfs_defer_pending	*dfp;
 	struct list_head		*li;
 	struct list_head		*n;
-	void				*done_item = NULL;
 	void				*state;
 	int				error = 0;
 	void				(*cleanup_fn)(struct xfs_trans *, void *, int);
@@ -309,19 +308,11 @@ xfs_defer_finish(
 		if (error)
 			goto out;
 
-		/* Mark all pending intents as committed. */
-		list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) {
-			if (dfp->dfp_committed)
-				break;
-			trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp);
-			dfp->dfp_committed = true;
-		}
-
 		/* Log an intent-done item for the first pending item. */
 		dfp = list_first_entry(&dop->dop_pending,
 				struct xfs_defer_pending, dfp_list);
 		trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
-		done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
+		dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
 				dfp->dfp_count);
 		cleanup_fn = dfp->dfp_type->finish_cleanup;
 
@@ -331,7 +322,7 @@ xfs_defer_finish(
 			list_del(li);
 			dfp->dfp_count--;
 			error = dfp->dfp_type->finish_item(*tp, dop, li,
-					done_item, &state);
+					dfp->dfp_done, &state);
 			if (error) {
 				/*
 				 * Clean up after ourselves and jump out.
@@ -428,8 +419,8 @@ xfs_defer_add(
 		dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
 				KM_SLEEP | KM_NOFS);
 		dfp->dfp_type = defer_op_types[type];
-		dfp->dfp_committed = false;
 		dfp->dfp_intent = NULL;
+		dfp->dfp_done = NULL;
 		dfp->dfp_count = 0;
 		INIT_LIST_HEAD(&dfp->dfp_work);
 		list_add_tail(&dfp->dfp_list, &dop->dop_intake);
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index cc3981c48296..e96533d178cf 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -30,8 +30,8 @@ struct xfs_defer_op_type;
 struct xfs_defer_pending {
 	const struct xfs_defer_op_type	*dfp_type;	/* function pointers */
 	struct list_head		dfp_list;	/* pending items */
-	bool				dfp_committed;	/* committed trans? */
 	void				*dfp_intent;	/* log intent item */
+	void				*dfp_done;	/* log done item */
 	struct list_head		dfp_work;	/* work items */
 	unsigned int			dfp_count;	/* # extent items */
 };
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7e88bec3f359..d303a665dba9 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2295,7 +2295,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
 		__entry->dev = mp ? mp->m_super->s_dev : 0;
 		__entry->type = dfp->dfp_type->type;
 		__entry->intent = dfp->dfp_intent;
-		__entry->committed = dfp->dfp_committed;
+		__entry->committed = dfp->dfp_done != NULL;
 		__entry->nr = dfp->dfp_count;
 	),
 	TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n",