We've got ten GFS2 patches for this merge window.

1. Andreas Gruenbacher wrote a patch to replace the deprecated call to rhashtable_walk_init with rhashtable_walk_enter. 2. Andreas also wrote a patch to eliminate redundant code in two of our debugfs sequence files. 3. Andreas also cleaned up the rhashtable key ugliness Linus pointed out during this cycle, following Linus's suggestions. 4. Andreas also wrote a patch to take advantage of his new function rhashtable_lookup_get_insert_fast. This makes glock lookup faster and more bullet-proof. 5. Andreas also wrote a patch to revert a patch in the evict path that caused occasional deadlocks, and is no longer needed. 6. Andrew Price wrote a patch to re-enable fallocate for the rindex system file to enable gfs2_grow to grow properly on secondary file system grow operations. 7. I wrote a patch to initialize an inode number field to make certain kernel trace points more understandable. 8. I also wrote a patch that makes GFS2 file system "withdraw" work more like it should by ignoring operations after a withdraw that would formerly cause a BUG() and kernel panic. 9. I also reworked the entire truncate/delete algorithm, scrapping the old recursive algorithm in favor of a new non-recursive algorithm. This was done for performance: This way, GFS2 no longer needs to lock multiple resource groups while doing truncates and deletes of files that cross multiple resource group boundaries, allowing for better parallelism. It also solves a problem whereby deleting large files would request a large chunk of kernel memory, which resulted in a get_page_from_freelist warning. 10. Due to a regression found during testing, I added a new patch to correct "GFS2: Prevent BUG from occurring when normal Withdraws occur". 
-----BEGIN PGP SIGNATURE----- iQEcBAABAgAGBQJZDNnaAAoJENeLYdPf93o7B7kIAJzwz7vVDVg2TpWVhMmXIWhf rZx3Gth5F0h+ZHddW7HzTLg+64XQ5//GyDD3UDtCpkhl5SJH+nt3juHyPJlRwioT 0ua4SjyKLQSoJJVAEgAwu42QjORTXab7NjYn5LEhvRc0Gg/El9WGU+ZgmP2/aAvf KE2u/IEYNDkoJNS3Oqc7shajAyLYda6wCAASs/1ZGt9u48m/o/I23Zd7wr7EOkzw rd3gB0x80cJqDAB5IcymGOm111Tg4g34LwsRuyMnWE3H1jOgV+J515FVHEIvZuPq Wl9X7V8CzktI7nyLKVnZhpuv5JzyMq/vOPiD01tTFx8Oy1JCRezjmATXFjW/zIo= =MX3c -----END PGP SIGNATURE----- Merge tag 'gfs2-4.12.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2 Pull GFS2 updates from Bob Peterson: "We've got ten GFS2 patches for this merge window. - Andreas Gruenbacher wrote a patch to replace the deprecated call to rhashtable_walk_init with rhashtable_walk_enter. - Andreas also wrote a patch to eliminate redundant code in two of our debugfs sequence files. - Andreas also cleaned up the rhashtable key ugliness Linus pointed out during this cycle, following Linus's suggestions. - Andreas also wrote a patch to take advantage of his new function rhashtable_lookup_get_insert_fast. This makes glock lookup faster and more bullet-proof. - Andreas also wrote a patch to revert a patch in the evict path that caused occasional deadlocks, and is no longer needed. - Andrew Price wrote a patch to re-enable fallocate for the rindex system file to enable gfs2_grow to grow properly on secondary file system grow operations. - I wrote a patch to initialize an inode number field to make certain kernel trace points more understandable. - I also wrote a patch that makes GFS2 file system "withdraw" work more like it should by ignoring operations after a withdraw that would formerly cause a BUG() and kernel panic. - I also reworked the entire truncate/delete algorithm, scrapping the old recursive algorithm in favor of a new non-recursive algorithm. 
This was done for performance: This way, GFS2 no longer needs to lock multiple resource groups while doing truncates and deletes of files that cross multiple resource group boundaries, allowing for better parallelism. It also solves a problem whereby deleting large files would request a large chunk of kernel memory, which resulted in a get_page_from_freelist warning. - Due to a regression found during testing, I added a new patch to correct 'GFS2: Prevent BUG from occurring when normal Withdraws occur'." * tag 'gfs2-4.12.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2: GFS2: Allow glocks to be unlocked after withdraw GFS2: Non-recursive delete gfs2: Re-enable fallocate for the rindex Revert "GFS2: Wait for iopen glock dequeues" gfs2: Switch to rhashtable_lookup_get_insert_fast GFS2: Temporarily zero i_no_addr when creating a dinode gfs2: Don't pack struct lm_lockname gfs2: Deduplicate gfs2_{glocks,glstats}_open gfs2: Replace rhashtable_walk_init with rhashtable_walk_enter GFS2: Prevent BUG from occurring when normal Withdraws occur
2017-05-05 13:40:20 -07:00 · 2017-05-05 13:40:20 -07:00 · 1a5fb64fee
parent aeced66196 ed17545d01
commit 1a5fb64fee
8 changed files with 518 additions and 347 deletions
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@ -38,11 +38,6 @@ struct metapath {
 	__u16 mp_list[GFS2_MAX_META_HEIGHT];
 };

-struct strip_mine {
-	int sm_first;
-	unsigned int sm_height;
-};
-
 /**
 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
 * @ip: the inode
@ -252,6 +247,19 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
 	return 1;
 }

+/**
+ * metaptr1 - Return the first possible metadata pointer in a metapath buffer
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ *
+ * Returns: pointer to the first __be64 slot after the buffer's header, which
+ *          is a struct gfs2_dinode at height 0 and a struct gfs2_meta_header
+ *          at any other height
+ */
+static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
+{
+	struct buffer_head *bh = mp->mp_bh[height];
+	if (height == 0)
+		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
+	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
+}
+
 /**
 * metapointer - Return pointer to start of metadata in a buffer
 * @height: The metadata height (0 = dinode)
@ -264,10 +272,8 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)

 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 {
-	struct buffer_head *bh = mp->mp_bh[height];
-	unsigned int head_size = (height > 0) ?
-		sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
-	return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
+	__be64 *p = metaptr1(height, mp);
+	return p + mp->mp_list[height];
 }

 static void gfs2_metapath_ra(struct gfs2_glock *gl,
@ -295,6 +301,23 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
 	}
 }

+/**
+ * lookup_mp_height - helper function for lookup_metapath
+ * @ip: the inode
+ * @mp: the metapath
+ * @h: the height which needs looking up
+ *
+ * Reads the pointer selected by @mp at height @h and, if it is non-zero,
+ * asks gfs2_meta_indirect_buffer() to fill in mp->mp_bh[h + 1] for it.
+ *
+ * Returns: h + 1 if the pointer is zero (nothing to look up at that height),
+ *          otherwise the return value of gfs2_meta_indirect_buffer()
+ */
+static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
+{
+	__be64 *ptr = metapointer(h, mp);
+	u64 dblock = be64_to_cpu(*ptr);
+
+	if (!dblock)
+		return h + 1;
+
+	return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
+}
+
 /**
 * lookup_metapath - Walk the metadata tree to a specific point
 * @ip: The inode
@ -316,17 +339,10 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 {
 	unsigned int end_of_metadata = ip->i_height - 1;
 	unsigned int x;
-	__be64 *ptr;
-	u64 dblock;
 	int ret;

 	for (x = 0; x < end_of_metadata; x++) {
-		ptr = metapointer(x, mp);
-		dblock = be64_to_cpu(*ptr);
-		if (!dblock)
-			return x + 1;
-
-		ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]);
+		ret = lookup_mp_height(ip, mp, x);
 		if (ret)
 			return ret;
 	}
@ -334,6 +350,35 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 	return ip->i_height;
 }

+/**
+ * fillup_metapath - fill up buffers for the metadata path to a specific height
+ * @ip: The inode
+ * @mp: The metapath
+ * @h: The height to which it should be mapped
+ *
+ * Similar to lookup_metapath, but does lookups for a range of heights:
+ * starting below @h, it backs up to the nearest height that already has a
+ * buffer (or to height 0) and looks up each height from there up to @h.
+ *
+ * Returns: ip->i_height if @h is 0 or every lookup returned 0; otherwise the
+ *          first non-zero value returned by lookup_mp_height() (an error
+ *          from the buffer read, or the height just above a zero pointer)
+ */
+
+static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
+{
+	unsigned int start_h = h - 1;
+	int ret;
+
+	if (h) {
+		/* find the first buffer we need to look up. */
+		while (start_h > 0 && mp->mp_bh[start_h] == NULL)
+			start_h--;
+		for (; start_h < h; start_h++) {
+			ret = lookup_mp_height(ip, mp, start_h);
+			if (ret)
+				return ret;
+		}
+	}
+	return ip->i_height;
+}
+
 static inline void release_metapath(struct metapath *mp)
 {
 	int i;
@ -422,6 +467,13 @@ enum alloc_state {
 	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
 };

+static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
+{
+	if (hgt)
+		return sdp->sd_inptrs;
+	return sdp->sd_diptrs;
+}
+
 /**
 * gfs2_bmap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
@ -620,7 +672,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,

 	BUG_ON(maxlen == 0);

-	memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+	memset(&mp, 0, sizeof(mp));
 	bmap_lock(ip, create);
 	clear_buffer_mapped(bh_map);
 	clear_buffer_new(bh_map);
@ -701,252 +753,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 	return ret;
 }

-/**
- * do_strip - Look for a layer a particular layer of the file and strip it off
- * @ip: the inode
- * @dibh: the dinode buffer
- * @bh: A buffer of pointers
- * @top: The first pointer in the buffer
- * @bottom: One more than the last pointer
- * @height: the height this buffer is at
- * @sm: a pointer to a struct strip_mine
- *
- * Returns: errno
- */
-
-static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
-		    struct buffer_head *bh, __be64 *top, __be64 *bottom,
-		    unsigned int height, struct strip_mine *sm)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrp_list rlist;
-	struct gfs2_trans *tr;
-	u64 bn, bstart;
-	u32 blen, btotal;
-	__be64 *p;
-	unsigned int rg_blocks = 0;
-	int metadata;
-	unsigned int revokes = 0;
-	int x;
-	int error;
-	int jblocks_rqsted;
-
-	error = gfs2_rindex_update(sdp);
-	if (error)
-		return error;
-
-	if (!*top)
-		sm->sm_first = 0;
-
-	if (height != sm->sm_height)
-		return 0;
-
-	if (sm->sm_first) {
-		top++;
-		sm->sm_first = 0;
-	}
-
-	metadata = (height != ip->i_height - 1);
-	if (metadata)
-		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
-	else if (ip->i_depth)
-		revokes = sdp->sd_inptrs;
-
-	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
-	bstart = 0;
-	blen = 0;
-
-	for (p = top; p < bottom; p++) {
-		if (!*p)
-			continue;
-
-		bn = be64_to_cpu(*p);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart)
-				gfs2_rlist_add(ip, &rlist, bstart);
-
-			bstart = bn;
-			blen = 1;
-		}
-	}
-
-	if (bstart)
-		gfs2_rlist_add(ip, &rlist, bstart);
-	else
-		goto out; /* Nothing to do */
-
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
-
-	for (x = 0; x < rlist.rl_rgrps; x++) {
-		struct gfs2_rgrpd *rgd;
-		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
-		rg_blocks += rgd->rd_length;
-	}
-
-	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
-	if (error)
-		goto out_rlist;
-
-	if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
-		gfs2_rs_deltree(&ip->i_res);
-
-restart:
-	jblocks_rqsted = rg_blocks + RES_DINODE +
-		RES_INDIRECT + RES_STATFS + RES_QUOTA +
-		gfs2_struct2blk(sdp, revokes, sizeof(u64));
-	if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2))
-		jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2);
-	error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
-	if (error)
-		goto out_rg_gunlock;
-
-	tr = current->journal_info;
-	down_write(&ip->i_rw_mutex);
-
-	gfs2_trans_add_meta(ip->i_gl, dibh);
-	gfs2_trans_add_meta(ip->i_gl, bh);
-
-	bstart = 0;
-	blen = 0;
-	btotal = 0;
-
-	for (p = top; p < bottom; p++) {
-		if (!*p)
-			continue;
-
-		/* check for max reasonable journal transaction blocks */
-		if (tr->tr_num_buf_new + RES_STATFS +
-		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
-			if (rg_blocks >= tr->tr_num_buf_new)
-				rg_blocks -= tr->tr_num_buf_new;
-			else
-				rg_blocks = 0;
-			break;
-		}
-
-		bn = be64_to_cpu(*p);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart) {
-				__gfs2_free_blocks(ip, bstart, blen, metadata);
-				btotal += blen;
-			}
-
-			bstart = bn;
-			blen = 1;
-		}
-
-		*p = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
-	}
-	if (p == bottom)
-		rg_blocks = 0;
-
-	if (bstart) {
-		__gfs2_free_blocks(ip, bstart, blen, metadata);
-		btotal += blen;
-	}
-
-	gfs2_statfs_change(sdp, 0, +btotal, 0);
-	gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
-			  ip->i_inode.i_gid);
-
-	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
-
-	gfs2_dinode_out(ip, dibh->b_data);
-
-	up_write(&ip->i_rw_mutex);
-
-	gfs2_trans_end(sdp);
-
-	if (rg_blocks)
-		goto restart;
-
-out_rg_gunlock:
-	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
-out_rlist:
-	gfs2_rlist_free(&rlist);
-out:
-	return error;
-}
-
-/**
- * recursive_scan - recursively scan through the end of a file
- * @ip: the inode
- * @dibh: the dinode buffer
- * @mp: the path through the metadata to the point to start
- * @height: the height the recursion is at
- * @block: the indirect block to look at
- * @first: 1 if this is the first block
- * @sm: data opaque to this function to pass to @bc
- *
- * When this is first called @height and @block should be zero and
- * @first should be 1.
- *
- * Returns: errno
- */
-
-static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
-			  struct metapath *mp, unsigned int height,
-			  u64 block, int first, struct strip_mine *sm)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head *bh = NULL;
-	__be64 *top, *bottom;
-	u64 bn;
-	int error;
-	int mh_size = sizeof(struct gfs2_meta_header);
-
-	if (!height) {
-		error = gfs2_meta_inode_buffer(ip, &bh);
-		if (error)
-			return error;
-		dibh = bh;
-
-		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
-		bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
-	} else {
-		error = gfs2_meta_indirect_buffer(ip, height, block, &bh);
-		if (error)
-			return error;
-
-		top = (__be64 *)(bh->b_data + mh_size) +
-				  (first ? mp->mp_list[height] : 0);
-
-		bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
-	}
-
-	error = do_strip(ip, dibh, bh, top, bottom, height, sm);
-	if (error)
-		goto out;
-
-	if (height < ip->i_height - 1) {
-
-		gfs2_metapath_ra(ip->i_gl, bh, top);
-
-		for (; top < bottom; top++, first = 0) {
-			if (!*top)
-				continue;
-
-			bn = be64_to_cpu(*top);
-
-			error = recursive_scan(ip, dibh, mp, height + 1, bn,
-					       first, sm);
-			if (error)
-				break;
-		}
-	}
-out:
-	brelse(bh);
-	return error;
-}
-
-
 /**
 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
 *
@ -1106,41 +912,406 @@ out:
 	return error;
 }

-static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+/**
+ * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
+ * @ip: inode
+ * @rd_gh: holder of resource group glock
+ * @mp: current metapath fully populated with buffers
+ * @btotal: place to keep count of total blocks freed
+ * @hgt: height we're processing
+ * @preserve1: true if the pointer at the metapath position must be kept, in
+ *             which case sweeping starts one pointer later
+ *
+ * We sweep a metadata buffer (provided by the metapath) for blocks we need to
+ * free, and free them all. However, we do it one rgrp at a time. If this
+ * block has references to multiple rgrps, we break it into individual
+ * transactions. This allows other processes to use the rgrps while we're
+ * focused on a single one, for better concurrency / performance.
+ * At every transaction boundary, we rewrite the inode into the journal.
+ * That way the bitmaps are kept consistent with the inode and we can recover
+ * if we're interrupted by power-outages.
+ *
+ * Returns: 0, or return code if an error occurred.
+ *          *btotal has the total number of blocks freed
+ */
+static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
+			      const struct metapath *mp, u32 *btotal, int hgt,
+			      bool preserve1)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned int height = ip->i_height;
-	u64 lblock;
-	struct metapath mp;
-	int error;
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_trans *tr;
+	struct buffer_head *bh = mp->mp_bh[hgt];
+	__be64 *top, *bottom, *p;
+	int blks_outside_rgrp;
+	u64 bn, bstart, isize_blks;
+	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
+	int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
+	int ret = 0;
+	bool buf_in_tr = false; /* buffer was added to transaction */

-	if (!size)
+	if (gfs2_metatype_check(sdp, bh,
+				(hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
+		return -EIO;
+
+more_rgrps:
+	blks_outside_rgrp = 0;
+	bstart = 0;
+	blen = 0;
+	top = metapointer(hgt, mp); /* first ptr from metapath */
+	/* If we're keeping some data at the truncation point, we've got to
+	   preserve the metadata tree by adding 1 to the starting metapath. */
+	if (preserve1)
+		top++;
+
+	bottom = (__be64 *)(bh->b_data + bh->b_size);
+
+	for (p = top; p < bottom; p++) {
+		if (!*p)
+			continue;
+		bn = be64_to_cpu(*p);
+		if (gfs2_holder_initialized(rd_gh)) {
+			rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
+			gfs2_assert_withdraw(sdp,
+				     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
+		} else {
+			rgd = gfs2_blk2rgrpd(sdp, bn, false);
+			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+						 0, rd_gh);
+			if (ret)
+				goto out;
+
+			/* Must be done with the rgrp glock held: */
+			if (gfs2_rs_active(&ip->i_res) &&
+			    rgd == ip->i_res.rs_rbm.rgd)
+				gfs2_rs_deltree(&ip->i_res);
+		}
+
+		if (!rgrp_contains_block(rgd, bn)) {
+			blks_outside_rgrp++;
+			continue;
+		}
+
+		/* The size of our transactions will be unknown until we
+		   actually process all the metadata blocks that relate to
+		   the rgrp. So we estimate. We know it can't be more than
+		   the dinode's i_blocks and we don't want to exceed the
+		   journal flush threshold, sd_log_thresh2. */
+		if (current->journal_info == NULL) {
+			unsigned int jblocks_rqsted, revokes;
+
+			jblocks_rqsted = rgd->rd_length + RES_DINODE +
+				RES_INDIRECT;
+			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
+			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
+				jblocks_rqsted +=
+					atomic_read(&sdp->sd_log_thresh2);
+			else
+				jblocks_rqsted += isize_blks;
+			revokes = jblocks_rqsted;
+			if (meta)
+				revokes += hptrs(sdp, hgt);
+			else if (ip->i_depth)
+				revokes += sdp->sd_inptrs;
+			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
+			if (ret)
+				goto out_unlock;
+			down_write(&ip->i_rw_mutex);
+		}
+		/* check if we will exceed the transaction blocks requested */
+		tr = current->journal_info;
+		if (tr->tr_num_buf_new + RES_STATFS +
+		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
+			/* We set blks_outside_rgrp to ensure the loop will
+			   be repeated for the same rgrp, but with a new
+			   transaction. */
+			blks_outside_rgrp++;
+			/* This next part is tricky. If the buffer was added
+			   to the transaction, we've already set some block
+			   pointers to 0, so we better follow through and free
+			   them, or we will introduce corruption (so break).
+			   This may be impossible, or at least rare, but I
+			   decided to cover the case regardless.
+
+			   If the buffer was not added to the transaction
+			   (this call), doing so would exceed our transaction
+			   size, so we need to end the transaction and start a
+			   new one (so goto). */
+
+			if (buf_in_tr)
+				break;
+			goto out_unlock;
+		}
+
+		gfs2_trans_add_meta(ip->i_gl, bh);
+		buf_in_tr = true;
+		*p = 0;
+		if (bstart + blen == bn) {
+			blen++;
+			continue;
+		}
+		if (bstart) {
+			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+			(*btotal) += blen;
+			gfs2_add_inode_blocks(&ip->i_inode, -blen);
+		}
+		bstart = bn;
+		blen = 1;
+	}
+	if (bstart) {
+		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+		(*btotal) += blen;
+		gfs2_add_inode_blocks(&ip->i_inode, -blen);
+	}
+out_unlock:
+	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
+					    outside the rgrp we just processed,
+					    do it all over again. */
+		if (current->journal_info) {
+			struct buffer_head *dibh = mp->mp_bh[0];
+
+			/* Every transaction boundary, we rewrite the dinode
+			   to keep its di_blocks current in case of failure. */
+			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
+				CURRENT_TIME;
+			gfs2_trans_add_meta(ip->i_gl, dibh);
+			gfs2_dinode_out(ip, dibh->b_data);
+			up_write(&ip->i_rw_mutex);
+			gfs2_trans_end(sdp);
+		}
+		gfs2_glock_dq_uninit(rd_gh);
+		cond_resched();
+		goto more_rgrps;
+	}
+out:
+	return ret;
+}
+
+/**
+ * find_nonnull_ptr - find a non-null pointer given a metapath and height
+ * @sdp: the gfs2_sbd, used to get the number of pointers at height @h
+ * @mp: starting metapath
+ * @h: desired height to search
+ *
+ * Assumes the metapath is valid (with buffers) out to height h. The search
+ * starts at mp->mp_list[h] and advances mp->mp_list[h] as it goes, so on a
+ * true return the metapath indexes the non-null pointer that was found.
+ *
+ * Returns: true if a non-null pointer was found in the metapath buffer
+ *          false if all remaining pointers are NULL in the buffer
+ */
+static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
+			     unsigned int h)
+{
+	__be64 *ptr;
+	unsigned int ptrs = hptrs(sdp, h) - 1;
+
+	while (true) {
+		ptr = metapointer(h, mp);
+		if (*ptr) /* if we have a non-null pointer */
+			return true;
+
+		if (mp->mp_list[h] < ptrs)
+			mp->mp_list[h]++;
+		else
+			return false; /* no more pointers in this buffer */
+	}
+}
+
+enum dealloc_states {
+	DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
+	DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
+	DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
+	DEALLOC_DONE = 3,       /* process complete */
+};
+
+/**
+ * trunc_dealloc - truncate a file down to a desired size
+ * @ip: inode to truncate
+ * @newsize: The desired size of the file
+ *
+ * This function truncates a file to newsize. It works from the
+ * bottom up, and from the right to the left. In other words, it strips off
+ * the highest layer (data) before stripping any of the metadata. Doing it
+ * this way is best in case the operation is interrupted by power failure, etc.
+ * The dinode is rewritten in every transaction to guarantee integrity.
+ */
+static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct metapath mp;
+	struct buffer_head *dibh, *bh;
+	struct gfs2_holder rd_gh;
+	u64 lblock;
+	__u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
+	unsigned int strip_h = ip->i_height - 1;
+	u32 btotal = 0;
+	int ret, state;
+	int mp_h; /* metapath buffers are read in to this height */
+	sector_t last_ra = 0;
+	u64 prev_bnr = 0;
+	bool preserve1; /* need to preserve the first meta pointer? */
+
+	if (!newsize)
 		lblock = 0;
 	else
-		lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
+		lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;

+	memset(&mp, 0, sizeof(mp));
 	find_metapath(sdp, lblock, &mp, ip->i_height);
-	error = gfs2_rindex_update(sdp);
-	if (error)
-		return error;

-	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
-	if (error)
-		return error;
+	memcpy(&nbof, &mp.mp_list, sizeof(nbof));

-	while (height--) {
-		struct strip_mine sm;
-		sm.sm_first = !!size;
-		sm.sm_height = height;
+	ret = gfs2_meta_inode_buffer(ip, &dibh);
+	if (ret)
+		return ret;

-		error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
-		if (error)
+	mp.mp_bh[0] = dibh;
+	ret = lookup_metapath(ip, &mp);
+	if (ret == ip->i_height)
+		state = DEALLOC_MP_FULL; /* We have a complete metapath */
+	else
+		state = DEALLOC_FILL_MP; /* deal with partial metapath */
+
+	ret = gfs2_rindex_update(sdp);
+	if (ret)
+		goto out_metapath;
+
+	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (ret)
+		goto out_metapath;
+	gfs2_holder_mark_uninitialized(&rd_gh);
+
+	mp_h = strip_h;
+
+	while (state != DEALLOC_DONE) {
+		switch (state) {
+		/* Truncate a full metapath at the given strip height.
+		 * Note that strip_h == mp_h in order to be in this state. */
+		case DEALLOC_MP_FULL:
+			if (mp_h > 0) { /* issue read-ahead on metadata */
+				__be64 *top;
+
+				bh = mp.mp_bh[mp_h - 1];
+				if (bh->b_blocknr != last_ra) {
+					last_ra = bh->b_blocknr;
+					top = metaptr1(mp_h - 1, &mp);
+					gfs2_metapath_ra(ip->i_gl, bh, top);
+				}
+			}
+			/* If we're truncating to a non-zero size and the mp is
+			   at the beginning of file for the strip height, we
+			   need to preserve the first metadata pointer. */
+			preserve1 = (newsize &&
+				     (mp.mp_list[mp_h] == nbof[mp_h]));
+			bh = mp.mp_bh[mp_h];
+			gfs2_assert_withdraw(sdp, bh);
+			if (gfs2_assert_withdraw(sdp,
+						 prev_bnr != bh->b_blocknr)) {
+				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
+				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
+				       sdp->sd_fsname,
+				       (unsigned long long)ip->i_no_addr,
+				       prev_bnr, ip->i_height, strip_h, mp_h);
+			}
+			prev_bnr = bh->b_blocknr;
+			ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
+						 mp_h, preserve1);
+			/* If we hit an error or just swept dinode buffer,
+			   just exit. */
+			if (ret || !mp_h) {
+				state = DEALLOC_DONE;
+				break;
+			}
+			state = DEALLOC_MP_LOWER;
 			break;
+
+		/* lower the metapath strip height */
+		case DEALLOC_MP_LOWER:
+			/* We're done with the current buffer, so release it,
+			   unless it's the dinode buffer. Then back up to the
+			   previous pointer. */
+			if (mp_h) {
+				brelse(mp.mp_bh[mp_h]);
+				mp.mp_bh[mp_h] = NULL;
+			}
+			/* If we can't get any lower in height, we've stripped
+			   off all we can. Next step is to back up and start
+			   stripping the previous level of metadata. */
+			if (mp_h == 0) {
+				strip_h--;
+				memcpy(&mp.mp_list, &nbof, sizeof(nbof));
+				mp_h = strip_h;
+				state = DEALLOC_FILL_MP;
+				break;
+			}
+			mp.mp_list[mp_h] = 0;
+			mp_h--; /* search one metadata height down */
+			if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
+				break; /* loop around in the same state */
+			mp.mp_list[mp_h]++;
+			/* Here we've found a part of the metapath that is not
+			 * allocated. We need to search at that height for the
+			 * next non-null pointer. */
+			if (find_nonnull_ptr(sdp, &mp, mp_h)) {
+				state = DEALLOC_FILL_MP;
+				mp_h++;
+			}
+			/* No more non-null pointers at this height. Back up
+			   to the previous height and try again. */
+			break; /* loop around in the same state */
+
+		/* Fill the metapath with buffers to the given height. */
+		case DEALLOC_FILL_MP:
+			/* Fill the buffers out to the current height. */
+			ret = fillup_metapath(ip, &mp, mp_h);
+			if (ret < 0)
+				goto out;
+
+			/* If buffers found for the entire strip height */
+			if ((ret == ip->i_height) && (mp_h == strip_h)) {
+				state = DEALLOC_MP_FULL;
+				break;
+			}
+			if (ret < ip->i_height) /* We have a partial height */
+				mp_h = ret - 1;
+
+			/* If we find a non-null block pointer, crawl a bit
+			   higher up in the metapath and try again, otherwise
+			   we need to look lower for a new starting point. */
+			if (find_nonnull_ptr(sdp, &mp, mp_h))
+				mp_h++;
+			else
+				state = DEALLOC_MP_LOWER;
+			break;
+		}
 	}

-	gfs2_quota_unhold(ip);
+	if (btotal) {
+		if (current->journal_info == NULL) {
+			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
+					       RES_QUOTA, 0);
+			if (ret)
+				goto out;
+			down_write(&ip->i_rw_mutex);
+		}
+		gfs2_statfs_change(sdp, 0, +btotal, 0);
+		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
+				  ip->i_inode.i_gid);
+		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		up_write(&ip->i_rw_mutex);
+		gfs2_trans_end(sdp);
+	}

-	return error;
+out:
+	if (gfs2_holder_initialized(&rd_gh))
+		gfs2_glock_dq_uninit(&rd_gh);
+	if (current->journal_info) {
+		up_write(&ip->i_rw_mutex);
+		gfs2_trans_end(sdp);
+		cond_resched();
+	}
+	gfs2_quota_unhold(ip);
+out_metapath:
+	release_metapath(&mp);
+	return ret;
 }

 static int trunc_end(struct gfs2_inode *ip)
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@ -911,11 +911,15 @@ out_qunlock:
 static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(file);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int ret;

-	if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+	/* fallocate is needed by gfs2_grow to reserve space in the rindex */
+	if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex)
 		return -EOPNOTSUPP;

 	inode_lock(inode);
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(lru_lock);

 static struct rhashtable_params ht_parms = {
 	.nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
-	.key_len = sizeof(struct lm_lockname),
+	.key_len = offsetofend(struct lm_lockname, ln_type),
 	.key_offset = offsetof(struct gfs2_glock, gl_name),
 	.head_offset = offsetof(struct gfs2_glock, gl_node),
 };
@ -449,6 +449,9 @@ __acquires(&gl->gl_lockref.lock)
 	unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
 	int ret;

+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
+	    target != LM_ST_UNLOCKED)
+		return;
 	lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
 		      LM_FLAG_PRIORITY);
 	GLOCK_BUG_ON(gl, gl->gl_state == target);
@ -484,7 +487,8 @@ __acquires(&gl->gl_lockref.lock)
 		}
 		else if (ret) {
 			pr_err("lm_lock ret %d\n", ret);
-			GLOCK_BUG_ON(gl, 1);
+			GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN,
+						   &sdp->sd_flags));
 		}
 	} else { /* lock_nolock */
 		finish_xmote(gl, target);
@ -653,10 +657,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	struct lm_lockname name = { .ln_number = number,
 				    .ln_type = glops->go_type,
 				    .ln_sbd = sdp };
-	struct gfs2_glock *gl, *tmp = NULL;
+	struct gfs2_glock *gl, *tmp;
 	struct address_space *mapping;
 	struct kmem_cache *cachep;
-	int ret, tries = 0;
+	int ret = 0;

 	rcu_read_lock();
 	gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
@ -721,35 +725,32 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	}

 again:
-	ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node,
-					    ht_parms);
-	if (ret == 0) {
+	rcu_read_lock();
+	tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node,
+						ht_parms);
+	if (!tmp) {
 		*glp = gl;
-		return 0;
+		goto out;
 	}
+	if (IS_ERR(tmp)) {
+		ret = PTR_ERR(tmp);
+		goto out_free;
+	}
+	if (lockref_get_not_dead(&tmp->gl_lockref)) {
+		*glp = tmp;
+		goto out_free;
+	}
+	rcu_read_unlock();
+	cond_resched();
+	goto again;

-	if (ret == -EEXIST) {
-		ret = 0;
-		rcu_read_lock();
-		tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
-		if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
-			if (++tries < 100) {
-				rcu_read_unlock();
-				cond_resched();
-				goto again;
-			}
-			tmp = NULL;
-			ret = -ENOMEM;
-		}
-		rcu_read_unlock();
-	} else {
-		WARN_ON_ONCE(ret);
-	}
+out_free:
 	kfree(gl->gl_lksb.sb_lvbptr);
 	kmem_cache_free(cachep, gl);
 	atomic_dec(&sdp->sd_glock_disposal);
-	*glp = tmp;

+out:
+	rcu_read_unlock();
 	return ret;
 }

@ -1918,10 +1919,10 @@ static const struct seq_operations gfs2_sbstats_seq_ops = {

 #define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL)

-static int gfs2_glocks_open(struct inode *inode, struct file *file)
+static int __gfs2_glocks_open(struct inode *inode, struct file *file,
+			      const struct seq_operations *ops)
 {
-	int ret = seq_open_private(file, &gfs2_glock_seq_ops,
-				   sizeof(struct gfs2_glock_iter));
+	int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter));
 	if (ret == 0) {
 		struct seq_file *seq = file->private_data;
 		struct gfs2_glock_iter *gi = seq->private;
@@ -1932,11 +1933,16 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
 		if (seq->buf)
 			seq->size = GFS2_SEQ_GOODSIZE;
 		gi->gl = NULL;
-		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
+		rhashtable_walk_enter(&gl_hash_table, &gi->hti);
 	}
 	return ret;
 }

+static int gfs2_glocks_open(struct inode *inode, struct file *file)
+{
+	return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops);
+}
+
 static int gfs2_glocks_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *seq = file->private_data;
@@ -1949,20 +1955,7 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file)

 static int gfs2_glstats_open(struct inode *inode, struct file *file)
 {
-	int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
-				   sizeof(struct gfs2_glock_iter));
-	if (ret == 0) {
-		struct seq_file *seq = file->private_data;
-		struct gfs2_glock_iter *gi = seq->private;
-		gi->sdp = inode->i_private;
-		gi->last_pos = 0;
-		seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
-		if (seq->buf)
-			seq->size = GFS2_SEQ_GOODSIZE;
-		gi->gl = NULL;
-		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
-	}
-	return ret;
+	return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops);
 }

 static int gfs2_sbstats_open(struct inode *inode, struct file *file)
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -203,11 +203,15 @@ enum {
 	DFL_DLM_RECOVERY	= 6,
 };

+/*
+ * We are using struct lm_lockname as an rhashtable key.  Avoid holes within
+ * the struct; padding at the end is fine.
+ */
 struct lm_lockname {
-	struct gfs2_sbd *ln_sbd;
 	u64 ln_number;
+	struct gfs2_sbd *ln_sbd;
 	unsigned int ln_type;
-} __packed __aligned(sizeof(int));
+};

 #define lm_name_equal(name1, name2) \
        (((name1)->ln_number == (name2)->ln_number) &&	\
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -202,8 +202,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 fail_refresh:
 	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 	ip->i_iopen_gh.gh_gl->gl_object = NULL;
-	gfs2_glock_dq_wait(&ip->i_iopen_gh);
-	gfs2_holder_uninit(&ip->i_iopen_gh);
+	gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 fail_put:
 	if (io_gl)
 		gfs2_glock_put(io_gl);
@@ -667,6 +666,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	ip->i_height = 0;
 	ip->i_depth = 0;
 	ip->i_entries = 0;
+	ip->i_no_addr = 0; /* Temporarily zero until real addr is assigned */

 	switch(mode & S_IFMT) {
 	case S_IFREG:
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -483,13 +483,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 	}
 }

-static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
-{
-	u64 first = rgd->rd_data0;
-	u64 last = first + rgd->rd_data;
-	return first <= block && block < last;
-}
-
 /**
 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
 * @sdp: The GFS2 superblock
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
 	return rs && !RB_EMPTY_NODE(&rs->rs_node);
 }

+static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
+{
+	u64 first = rgd->rd_data0;
+	u64 last = first + rgd->rd_data;
+	return first <= block && block < last;
+}
+
 extern void check_and_update_goal(struct gfs2_inode *ip);
 #endif /* __RGRP_DOT_H__ */
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -793,7 +793,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)

 	if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
 		return;
-
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return;
 	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
 		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 		if (ret) {
@@ -1538,8 +1539,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
 	if (unlikely(error)) {
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-		gfs2_glock_dq_wait(&ip->i_iopen_gh);
-		gfs2_holder_uninit(&ip->i_iopen_gh);
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 		goto out;
 	}

@@ -1617,7 +1617,7 @@ out_unlock:
 	if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
 		if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
 			ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-			gfs2_glock_dq_wait(&ip->i_iopen_gh);
+			gfs2_glock_dq(&ip->i_iopen_gh);
 		}
 		gfs2_holder_uninit(&ip->i_iopen_gh);
 	}
@@ -1639,8 +1639,7 @@ out:
 	if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
 		ip->i_iopen_gh.gh_gl->gl_object = NULL;
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-		gfs2_glock_dq_wait(&ip->i_iopen_gh);
-		gfs2_holder_uninit(&ip->i_iopen_gh);
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 	}
 }