xfs: merge bmap records for faster scrubs [v24.5]

I started looking into performance problems with the data fork scrubber
 in generic/333, and noticed a few things that needed improving.  First,
 due to design reasons, it's possible for file forks btrees to contain
 multiple contiguous mappings to the same physical space.  Instead of
 checking each ondisk mapping individually, it's much faster to combine
 them when possible and check the combined mapping because that's fewer
 trips through the rmap btree, and we can drop this check-around
 behavior that it does when an rmapbt lookup produces a record that
 starts before or ends after a particular bmbt mapping.
 
 Second, I noticed that the bmbt scrubber decides to walk every reverse
 mapping in the filesystem if the file fork is in btree format.  This is
 very costly, and only necessary if the inode repair code had to zap a
 fork to convince iget to work.  Constraining the full-rmap scan to this
 one case means we can skip it for normal files, which drives the runtime
 of this test from 8 hours down to 45 minutes (observed with realtime
 reflink and rebuild-all mode.)
 
 Signed-off-by: Darrick J. Wong <djwong@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQQ2qTKExjcn+O1o2YRKO3ySh0YRpgUCZDdPcQAKCRBKO3ySh0YR
 pl1UAPoDtMaFrsLvz7clh31S6Yi+X8oCB/iJZXWl7HXaNsIjUQEA253GuiOj80Rz
 IHYo3t0KPYTm2Mc/7kBFQcctFbisDwE=
 =zFQ+
 -----END PGP SIGNATURE-----

Merge tag 'scrub-merge-bmap-records-6.4_2023-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into guilt/xfs-for-next

xfs: merge bmap records for faster scrubs [v24.5]

I started looking into performance problems with the data fork scrubber
in generic/333, and noticed a few things that needed improving.  First,
due to design reasons, it's possible for file forks btrees to contain
multiple contiguous mappings to the same physical space.  Instead of
checking each ondisk mapping individually, it's much faster to combine
them when possible and check the combined mapping because that's fewer
trips through the rmap btree, and we can drop this check-around
behavior that it does when an rmapbt lookup produces a record that
starts before or ends after a particular bmbt mapping.

Second, I noticed that the bmbt scrubber decides to walk every reverse
mapping in the filesystem if the file fork is in btree format.  This is
very costly, and only necessary if the inode repair code had to zap a
fork to convince iget to work.  Constraining the full-rmap scan to this
one case means we can skip it for normal files, which drives the runtime
of this test from 8 hours down to 45 minutes (observed with realtime
reflink and rebuild-all mode.)

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
This commit is contained in:
Dave Chinner 2023-04-14 07:10:53 +10:00 committed by Dave Chinner
commit d808a8e6b9
2 changed files with 241 additions and 138 deletions

View File

@ -145,7 +145,7 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
{ BMAP_COWFORK, "COW" }
/* Return true if the extent is an allocated extent, written or not. */
static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec)
{
return irec->br_startblock != HOLESTARTBLOCK &&
irec->br_startblock != DELAYSTARTBLOCK &&

View File

@ -96,11 +96,23 @@ out:
struct xchk_bmap_info {
struct xfs_scrub *sc;
/* Incore extent tree cursor */
struct xfs_iext_cursor icur;
xfs_fileoff_t lastoff;
/* Previous fork mapping that we examined */
struct xfs_bmbt_irec prev_rec;
/* Is this a realtime fork? */
bool is_rt;
/* May mappings point to shared space? */
bool is_shared;
/* Was the incore extent tree loaded? */
bool was_loaded;
/* Which inode fork are we checking? */
int whichfork;
};
@ -153,49 +165,7 @@ xchk_bmap_get_rmap(
return has_rmap;
}
static inline bool
xchk_bmap_has_prev(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_bmbt_irec got;
struct xfs_ifork *ifp;
ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got))
return false;
if (got.br_startoff + got.br_blockcount != irec->br_startoff)
return false;
if (got.br_startblock + got.br_blockcount != irec->br_startblock)
return false;
if (got.br_state != irec->br_state)
return false;
return true;
}
static inline bool
xchk_bmap_has_next(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_bmbt_irec got;
struct xfs_ifork *ifp;
ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got))
return false;
if (irec->br_startoff + irec->br_blockcount != got.br_startoff)
return false;
if (irec->br_startblock + irec->br_blockcount != got.br_startblock)
return false;
if (got.br_state != irec->br_state)
return false;
return true;
}
/* Make sure that we have rmapbt records for this extent. */
/* Make sure that we have rmapbt records for this data/attr fork extent. */
STATIC void
xchk_bmap_xref_rmap(
struct xchk_bmap_info *info,
@ -204,41 +174,39 @@ xchk_bmap_xref_rmap(
{
struct xfs_rmap_irec rmap;
unsigned long long rmap_end;
uint64_t owner;
uint64_t owner = info->sc->ip->i_ino;
if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm))
return;
if (info->whichfork == XFS_COW_FORK)
owner = XFS_RMAP_OWN_COW;
else
owner = info->sc->ip->i_ino;
/* Find the rmap record for this irec. */
if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
return;
/* Check the rmap. */
rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
if (rmap.rm_startblock > agbno ||
agbno + irec->br_blockcount > rmap_end)
/*
* The rmap must be an exact match for this incore file mapping record,
* which may have arisen from multiple ondisk records.
*/
if (rmap.rm_startblock != agbno)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
/*
* Check the logical offsets if applicable. CoW staging extents
* don't track logical offsets since the mappings only exist in
* memory.
*/
if (info->whichfork != XFS_COW_FORK) {
rmap_end = (unsigned long long)rmap.rm_offset +
rmap.rm_blockcount;
if (rmap.rm_offset > irec->br_startoff ||
irec->br_startoff + irec->br_blockcount > rmap_end)
xchk_fblock_xref_set_corrupt(info->sc,
info->whichfork, irec->br_startoff);
}
rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
if (rmap_end != agbno + irec->br_blockcount)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
/* Check the logical offsets. */
if (rmap.rm_offset != irec->br_startoff)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
rmap_end = (unsigned long long)rmap.rm_offset + rmap.rm_blockcount;
if (rmap_end != irec->br_startoff + irec->br_blockcount)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
/* Check the owner */
if (rmap.rm_owner != owner)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@ -250,8 +218,7 @@ xchk_bmap_xref_rmap(
* records because the blocks are owned (on-disk) by the refcountbt,
* which doesn't track unwritten state.
*/
if (owner != XFS_RMAP_OWN_COW &&
!!(irec->br_state == XFS_EXT_UNWRITTEN) !=
if (!!(irec->br_state == XFS_EXT_UNWRITTEN) !=
!!(rmap.rm_flags & XFS_RMAP_UNWRITTEN))
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@ -263,34 +230,60 @@ xchk_bmap_xref_rmap(
if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
}
/* Make sure that we have rmapbt records for this COW fork extent. */
STATIC void
xchk_bmap_xref_rmap_cow(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec,
xfs_agblock_t agbno)
{
struct xfs_rmap_irec rmap;
unsigned long long rmap_end;
uint64_t owner = XFS_RMAP_OWN_COW;
if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm))
return;
/* Find the rmap record for this irec. */
if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
return;
/*
* If the rmap starts before this bmbt record, make sure there's a bmbt
* record for the previous offset that is contiguous with this mapping.
* Skip this for CoW fork extents because the refcount btree (and not
* the inode) is the ondisk owner for those extents.
* CoW staging extents are owned by the refcount btree, so the rmap
* can start before and end after the physical space allocated to this
* mapping. There are no offsets to check.
*/
if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno &&
!xchk_bmap_has_prev(info, irec)) {
if (rmap.rm_startblock > agbno)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
return;
}
/*
* If the rmap ends after this bmbt record, make sure there's a bmbt
* record for the next offset that is contiguous with this mapping.
* Skip this for CoW fork extents because the refcount btree (and not
* the inode) is the ondisk owner for those extents.
*/
rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
if (info->whichfork != XFS_COW_FORK &&
rmap_end > agbno + irec->br_blockcount &&
!xchk_bmap_has_next(info, irec)) {
if (rmap_end < agbno + irec->br_blockcount)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
/* Check the owner */
if (rmap.rm_owner != owner)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
/*
* No flags allowed. Note that the (in-memory) CoW fork distinguishes
* between unwritten and written extents, but we don't track that in
* the rmap records because the blocks are owned (on-disk) by the
* refcountbt, which doesn't track unwritten state.
*/
if (rmap.rm_flags & XFS_RMAP_ATTR_FORK)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (rmap.rm_flags & XFS_RMAP_UNWRITTEN)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
return;
}
}
/* Cross-reference a single rtdev extent record. */
@ -329,9 +322,9 @@ xchk_bmap_iextent_xref(
xchk_xref_is_used_space(info->sc, agbno, len);
xchk_xref_is_not_inode_chunk(info->sc, agbno, len);
xchk_bmap_xref_rmap(info, irec, agbno);
switch (info->whichfork) {
case XFS_DATA_FORK:
xchk_bmap_xref_rmap(info, irec, agbno);
if (!xfs_is_reflink_inode(info->sc->ip)) {
xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino,
info->whichfork, irec->br_startoff);
@ -344,6 +337,7 @@ xchk_bmap_iextent_xref(
irec->br_blockcount);
break;
case XFS_ATTR_FORK:
xchk_bmap_xref_rmap(info, irec, agbno);
xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino,
info->whichfork, irec->br_startoff);
xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount,
@ -354,6 +348,7 @@ xchk_bmap_iextent_xref(
irec->br_blockcount);
break;
case XFS_COW_FORK:
xchk_bmap_xref_rmap_cow(info, irec, agbno);
xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount,
&XFS_RMAP_OINFO_COW);
xchk_xref_is_cow_staging(info->sc, agbno,
@ -405,7 +400,8 @@ xchk_bmap_iextent(
* Check for out-of-order extents. This record could have come
* from the incore list, for which there is no ordering check.
*/
if (irec->br_startoff < info->lastoff)
if (irec->br_startoff < info->prev_rec.br_startoff +
info->prev_rec.br_blockcount)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@ -415,15 +411,7 @@ xchk_bmap_iextent(
xchk_bmap_dirattr_extent(ip, info, irec);
/* There should never be a "hole" extent in either extent list. */
if (irec->br_startblock == HOLESTARTBLOCK)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
/* Make sure the extent points to a valid place. */
if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (info->is_rt &&
!xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount))
xchk_fblock_set_corrupt(info->sc, info->whichfork,
@ -647,46 +635,58 @@ xchk_bmap_check_ag_rmaps(
return error;
}
/*
* Decide if we want to walk every rmap btree in the fs to make sure that each
* rmap for this file fork has corresponding bmbt entries.
*/
static bool
xchk_bmap_want_check_rmaps(
struct xchk_bmap_info *info)
{
struct xfs_scrub *sc = info->sc;
struct xfs_ifork *ifp;
if (!xfs_has_rmapbt(sc->mp))
return false;
if (info->whichfork == XFS_COW_FORK)
return false;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return false;
/* Don't support realtime rmap checks yet. */
if (info->is_rt)
return false;
/*
* The inode repair code zaps broken inode forks by resetting them back
* to EXTENTS format and zero extent records. If we encounter a fork
* in this state along with evidence that the fork isn't supposed to be
* empty, we need to scan the reverse mappings to decide if we're going
* to rebuild the fork. Data forks with nonzero file size are scanned.
* xattr forks are never empty of content, so they are always scanned.
*/
ifp = xfs_ifork_ptr(sc->ip, info->whichfork);
if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) {
if (info->whichfork == XFS_DATA_FORK &&
i_size_read(VFS_I(sc->ip)) == 0)
return false;
return true;
}
return false;
}
/* Make sure each rmap has a corresponding bmbt entry. */
STATIC int
xchk_bmap_check_rmaps(
struct xfs_scrub *sc,
int whichfork)
{
struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
struct xfs_perag *pag;
xfs_agnumber_t agno;
bool zero_size;
int error;
if (!xfs_has_rmapbt(sc->mp) ||
whichfork == XFS_COW_FORK ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return 0;
/* Don't support realtime rmap checks yet. */
if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
return 0;
ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
/*
* Only do this for complex maps that are in btree format, or for
* situations where we would seem to have a size but zero extents.
* The inode repair code can zap broken iforks, which means we have
* to flag this bmap as corrupt if there are rmaps that need to be
* reattached.
*/
if (whichfork == XFS_DATA_FORK)
zero_size = i_size_read(VFS_I(sc->ip)) == 0;
else
zero_size = false;
if (ifp->if_format != XFS_DINODE_FMT_BTREE &&
(zero_size || ifp->if_nextents > 0))
return 0;
for_each_perag(sc->mp, agno, pag) {
error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
if (error ||
@ -712,7 +712,8 @@ xchk_bmap_iextent_delalloc(
* Check for out-of-order extents. This record could have come
* from the incore list, for which there is no ordering check.
*/
if (irec->br_startoff < info->lastoff)
if (irec->br_startoff < info->prev_rec.br_startoff +
info->prev_rec.br_blockcount)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@ -726,6 +727,101 @@ xchk_bmap_iextent_delalloc(
irec->br_startoff);
}
/* Decide if this individual fork mapping is ok. */
static bool
xchk_bmap_iext_mapping(
struct xchk_bmap_info *info,
const struct xfs_bmbt_irec *irec)
{
/* There should never be a "hole" extent in either extent list. */
if (irec->br_startblock == HOLESTARTBLOCK)
return false;
if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
return false;
return true;
}
/* Are these two mappings contiguous with each other? */
static inline bool
xchk_are_bmaps_contiguous(
const struct xfs_bmbt_irec *b1,
const struct xfs_bmbt_irec *b2)
{
/* Don't try to combine unallocated mappings. */
if (!xfs_bmap_is_real_extent(b1))
return false;
if (!xfs_bmap_is_real_extent(b2))
return false;
/* Does b2 come right after b1 in the logical and physical range? */
if (b1->br_startoff + b1->br_blockcount != b2->br_startoff)
return false;
if (b1->br_startblock + b1->br_blockcount != b2->br_startblock)
return false;
if (b1->br_state != b2->br_state)
return false;
return true;
}
/*
* Walk the incore extent records, accumulating consecutive contiguous records
* into a single incore mapping. Returns true if @irec has been set to a
* mapping or false if there are no more mappings. Caller must ensure that
* @info.icur is zeroed before the first call.
*/
static int
xchk_bmap_iext_iter(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_bmbt_irec got;
struct xfs_ifork *ifp;
xfs_filblks_t prev_len;
ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
/* Advance to the next iextent record and check the mapping. */
xfs_iext_next(ifp, &info->icur);
if (!xfs_iext_get_extent(ifp, &info->icur, irec))
return false;
if (!xchk_bmap_iext_mapping(info, irec)) {
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
return false;
}
/*
* Iterate subsequent iextent records and merge them with the one
* that we just read, if possible.
*/
prev_len = irec->br_blockcount;
while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) {
if (!xchk_are_bmaps_contiguous(irec, &got))
break;
if (!xchk_bmap_iext_mapping(info, &got)) {
xchk_fblock_set_corrupt(info->sc, info->whichfork,
got.br_startoff);
return false;
}
/*
* Notify the user of mergeable records in the data or attr
* forks. CoW forks only exist in memory so we ignore them.
*/
if (info->whichfork != XFS_COW_FORK &&
prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK)
xchk_ino_set_preen(info->sc, info->sc->ip->i_ino);
irec->br_blockcount += got.br_blockcount;
prev_len = got.br_blockcount;
xfs_iext_next(ifp, &info->icur);
}
return true;
}
/*
* Scrub an inode fork's block mappings.
*
@ -805,10 +901,15 @@ xchk_bmap(
if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
goto out;
/* Scrub extent records. */
info.lastoff = 0;
ifp = xfs_ifork_ptr(ip, whichfork);
for_each_xfs_iext(ifp, &info.icur, &irec) {
/*
* Scrub extent records. We use a special iterator function here that
* combines adjacent mappings if they are logically and physically
* contiguous. For large allocations that require multiple bmbt
* records, this reduces the number of cross-referencing calls, which
* reduces runtime. Cross referencing with the rmap is simpler because
* the rmap must match the combined mapping exactly.
*/
while (xchk_bmap_iext_iter(&info, &irec)) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
goto out;
@ -823,12 +924,14 @@ xchk_bmap(
xchk_bmap_iextent_delalloc(ip, &info, &irec);
else
xchk_bmap_iextent(ip, &info, &irec);
info.lastoff = irec.br_startoff + irec.br_blockcount;
memcpy(&info.prev_rec, &irec, sizeof(struct xfs_bmbt_irec));
}
error = xchk_bmap_check_rmaps(sc, whichfork);
if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
goto out;
if (xchk_bmap_want_check_rmaps(&info)) {
error = xchk_bmap_check_rmaps(sc, whichfork);
if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
goto out;
}
out:
return error;
}