From 06734e3c95a34e4d71342f0583f1bb88c61ed9b7 Mon Sep 17 00:00:00 2001 From: Keyur Patel Date: Mon, 29 Jun 2020 14:44:35 -0700 Subject: [PATCH 001/117] xfs: Couple of typo fixes in comments ./xfs/libxfs/xfs_inode_buf.c:56: unnecssary ==> unnecessary ./xfs/libxfs/xfs_inode_buf.c:59: behavour ==> behaviour ./xfs/libxfs/xfs_inode_buf.c:206: unitialized ==> uninitialized Signed-off-by: Keyur Patel Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_inode_buf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 6f84ea85fdd8..5c93e8e6de74 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -53,10 +53,10 @@ xfs_inobp_check( * If the readahead buffer is invalid, we need to mark it with an error and * clear the DONE status of the buffer so that a followup read will re-read it * from disk. We don't report the error otherwise to avoid warnings during log - * recovery and we don't get unnecssary panics on debug kernels. We use EIO here + * recovery and we don't get unnecessary panics on debug kernels. We use EIO here * because all we want to do is say readahead failed; there is no-one to report * the error to, so this will distinguish it from a non-ra verifier failure. - * Changes to this readahead error behavour also need to be reflected in + * Changes to this readahead error behaviour also need to be reflected in * xfs_dquot_buf_readahead_verify(). */ static void @@ -203,7 +203,7 @@ xfs_inode_from_disk( /* * First get the permanent information that is needed to allocate an * inode. If the inode is unused, mode is zero and we shouldn't mess - * with the unitialized part of it. + * with the uninitialized part of it. */ to->di_flushiter = be16_to_cpu(from->di_flushiter); inode->i_generation = be32_to_cpu(from->di_gen); From f74681ba2006434be195402e0b15fc5763cddd7e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 29 Jun 2020 14:44:36 -0700 Subject: [PATCH 002/117] xfs: preserve rmapbt swapext block reservation from freed blocks The rmapbt extent swap algorithm remaps individual extents between the source inode and the target to trigger reverse mapping metadata updates. If either inode straddles a format or other bmap allocation boundary, the individual unmap and map cycles can trigger repeated bmap block allocations and frees as the extent count bounces back and forth across the boundary. While net block usage is bounded across the swap operation, this behavior can prematurely exhaust the transaction block reservation because it continuously drains as the transaction rolls. Each allocation accounts against the reservation and each free returns to global free space on transaction roll. The previous workaround to this problem attempted to detect this boundary condition and provide surplus block reservation to accommodate it. This is insufficient because more remaps can occur than the extent counts imply; for example, if the start offset boundaries are not aligned between the two inodes. To address this problem more generically and dynamically, add a transaction accounting mode that returns freed blocks to the transaction reservation instead of the superblock counters on transaction roll, and use it when the rmapbt based algorithm is active. This allows the chain of remap transactions to preserve the block reservation based on its own frees and prevent premature exhaustion regardless of the remap pattern.
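To illustrate the accounting rule this mode introduces, here is a minimal userspace model; the names and structure are simplified stand-ins for illustration, not the kernel code:

/*
 * Minimal model of the RES_FDBLKS rule: when the flag is set, blocks
 * freed inside the transaction top up the transaction's own block
 * reservation instead of returning to the global free space counter,
 * capped so the 32-bit reservation counter cannot overflow.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_TRANS_RES_FDBLKS 0x80 /* stand-in for XFS_TRANS_RES_FDBLKS */

struct model_trans {
	unsigned int flags;
	unsigned int blk_res; /* blocks reserved for this transaction */
	int64_t fdblocks_delta; /* pending change to the global counter */
};

/* delta > 0 means blocks were freed, delta < 0 means blocks were used. */
static void model_mod_fdblocks(struct model_trans *tp, int64_t delta)
{
	if (delta > 0 && (tp->flags & MODEL_TRANS_RES_FDBLKS)) {
		int64_t take = delta;

		if (take > (int64_t)(UINT32_MAX - tp->blk_res))
			take = UINT32_MAX - tp->blk_res;
		tp->blk_res += take; /* freed blocks refill the reservation */
		delta -= take;
	}
	tp->fdblocks_delta += delta; /* any remainder goes to the global pool */
}

int main(void)
{
	struct model_trans tp = { .flags = MODEL_TRANS_RES_FDBLKS, .blk_res = 8 };

	model_mod_fdblocks(&tp, 4); /* a bmapbt block freed mid-chain */
	printf("blk_res=%u global_delta=%lld\n",
	       tp.blk_res, (long long)tp.fdblocks_delta); /* 12 and 0 */
	return 0;
}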
Note that this is only safe for superblocks with lazy sb accounting, but the latter is required for v5 supers and the rmap feature depends on v5. Fixes: b3fed434822d0 ("xfs: account format bouncing into rmapbt swapext tx reservation") Root-caused-by: Darrick J. Wong Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_shared.h | 1 + fs/xfs/xfs_bmap_util.c | 18 +++++++++--------- fs/xfs/xfs_trans.c | 19 ++++++++++++++++++- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c45acbd3add9..708feb8eac76 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -65,6 +65,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ #define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ +#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index f37f5cc4b19f..afdc7f8e0e70 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1567,6 +1567,7 @@ xfs_swap_extents( int lock_flags; uint64_t f; int resblks = 0; + unsigned int flags = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -1630,17 +1631,16 @@ xfs_swap_extents( resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w); /* - * Handle the corner case where either inode might straddle the - * btree format boundary. If so, the inode could bounce between - * btree <-> extent format on unmap -> remap cycles, freeing and - * allocating a bmapbt block each time. + * If either inode straddles a bmapbt block allocation boundary, + * the rmapbt algorithm triggers repeated allocs and frees as + * extents are remapped. This can exhaust the block reservation + * prematurely and cause shutdown. Return freed blocks to the + * transaction reservation to counter this behavior. 
*/ - if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1)) - resblks += XFS_IFORK_MAXEXT(ip, w); - if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1)) - resblks += XFS_IFORK_MAXEXT(tip, w); + flags |= XFS_TRANS_RES_FDBLKS; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags, + &tp); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 3c94e5ff4316..0ad72a83edac 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -107,7 +107,8 @@ xfs_trans_dup( ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE) | - (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); + (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) | + (tp->t_flags & XFS_TRANS_RES_FDBLKS); /* We gave our writer reference to the new transaction */ tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); @@ -272,6 +273,8 @@ xfs_trans_alloc( */ WARN_ON(resp->tr_logres > 0 && mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || + xfs_sb_version_haslazysbcount(&mp->m_sb)); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; @@ -365,6 +368,20 @@ xfs_trans_mod_sb( tp->t_blk_res_used += (uint)-delta; if (tp->t_blk_res_used > tp->t_blk_res) xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) { + int64_t blkres_delta; + + /* + * Return freed blocks directly to the reservation + * instead of the global pool, being careful not to + * overflow the trans counter. This is used to preserve + * reservation across chains of transaction rolls that + * repeatedly free and allocate blocks. + */ + blkres_delta = min_t(int64_t, delta, + UINT_MAX - tp->t_blk_res); + tp->t_blk_res += blkres_delta; + delta -= blkres_delta; } tp->t_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) From eb0efe5063bb10bcb653e4f8e92a74719c03a347 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jun 2020 14:47:17 -0700 Subject: [PATCH 003/117] xfs: don't eat an EIO/ENOSPC writeback error when scrubbing data fork The data fork scrubber calls filemap_write_and_wait to flush dirty pages and delalloc reservations out to disk prior to checking the data fork's extent mappings. Unfortunately, this means that scrub can consume the EIO/ENOSPC errors that would otherwise have stayed around in the address space until (we hope) the writer application calls fsync to persist data and collect errors. The end result is that programs that wrote to a file might never see the error code and proceed as if nothing were wrong. xfs_scrub is not in a position to notify file writers about the writeback failure, and it's only here to check metadata, not file contents. Therefore, if writeback fails, we should stuff the error code back into the address space so that an fsync by the writer application can pick that up. Fixes: 99d9d8d05da2 ("xfs: scrub inode block mappings") Signed-off-by: Darrick J. 
Wong Reviewed-by: Brian Foster Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 7badd6dfe544..955302e7cdde 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -45,9 +45,27 @@ xchk_setup_inode_bmap( */ if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { + struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + inode_dio_wait(VFS_I(sc->ip)); - error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping); - if (error) + + /* + * Try to flush all incore state to disk before we examine the + * space mappings for the data fork. Leave accumulated errors + * in the mapping for the writer threads to consume. + * + * On ENOSPC or EIO writeback errors, we continue into the + * extent mapping checks because write failures do not + * necessarily imply anything about the correctness of the file + * metadata. The metadata and the file data could be on + * completely separate devices; a media failure might only + * affect a subset of the disk, etc. We can handle delalloc + * extents in the scrubber, so leaving them in memory is fine. + */ + error = filemap_fdatawrite(mapping); + if (!error) + error = filemap_fdatawait_keep_errors(mapping); + if (error && (error != -ENOSPC && error != -EIO)) goto out; } From 83895227aba1ade33e81f586aa7b6b1e143096a5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jun 2020 14:47:18 -0700 Subject: [PATCH 004/117] xfs: fix reflink quota reservation accounting error Quota reservations are supposed to account for the blocks that might be allocated due to a bmap btree split. Reflink doesn't do this, so fix this to make the quota accounting more accurate before we start rearranging things. Fixes: 862bb360ef56 ("xfs: reflink extents from one file to another") Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_reflink.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 107bf2a2f344..d89201d40891 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1003,6 +1003,7 @@ xfs_reflink_remap_extent( xfs_filblks_t rlen; xfs_filblks_t unmap_len; xfs_off_t newlen; + int64_t qres; int error; unmap_len = irec->br_startoff + irec->br_blockcount - destoff; @@ -1025,13 +1026,19 @@ xfs_reflink_remap_extent( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - /* If we're not just clearing space, then do we have enough quota? */ - if (real_extent) { - error = xfs_trans_reserve_quota_nblks(tp, ip, - irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_cancel; - } + /* + * Reserve quota for this operation. We don't know if the first unmap + * in the dest file will cause a bmap btree split, so we always reserve + * at least enough blocks for that split. If the extent being mapped + * in is written, we need to reserve quota for that too. + */ + qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + if (real_extent) + qres += irec->br_blockcount; + error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_cancel; trace_xfs_reflink_remap(ip, irec->br_startoff, irec->br_blockcount, irec->br_startblock); From 877f58f53684f14ca3202640f70592bf44890924 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 29 Jun 2020 14:47:18 -0700 Subject: [PATCH 005/117] xfs: rename xfs_bmap_is_real_extent to is_written_extent The name of this predicate is a little misleading -- it decides if the extent mapping is allocated and written. Change the name to be more direct, as we're going to add a new predicate in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_bmap.h | 2 +- fs/xfs/libxfs/xfs_rtbitmap.c | 2 +- fs/xfs/xfs_reflink.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 6028a3c825ba..2b18338d0643 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -163,7 +163,7 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. */ -static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) { return irec->br_state != XFS_EXT_UNWRITTEN && irec->br_startblock != HOLESTARTBLOCK && diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 9498ced947be..1d9fa8a300f1 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -70,7 +70,7 @@ xfs_rtbuf_get( if (error) return error; - if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_real_extent(&map))) + if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) return -EFSCORRUPTED; ASSERT(map.br_startblock != NULLFSBLOCK); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d89201d40891..22fdea6d69d3 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -179,7 +179,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { + if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { *shared = false; return 0; } @@ -655,7 +655,7 @@ xfs_reflink_end_cow_extent( * preallocations can leak into the range we are called upon, and we * need to skip them. */ - if (!xfs_bmap_is_real_extent(&got)) { + if (!xfs_bmap_is_written_extent(&got)) { *end_fsb = del.br_startoff; goto out_cancel; } @@ -996,7 +996,7 @@ xfs_reflink_remap_extent( xfs_off_t new_isize) { struct xfs_mount *mp = ip->i_mount; - bool real_extent = xfs_bmap_is_real_extent(irec); + bool real_extent = xfs_bmap_is_written_extent(irec); struct xfs_trans *tp; unsigned int resblks; struct xfs_bmbt_irec uirec; From 00fd1d56dd08a8ceaa9e4ee1a41fefd9f6c6bc7d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jun 2020 14:47:18 -0700 Subject: [PATCH 006/117] xfs: redesign the reflink remap loop to fix blkres depletion crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing reflink remapping loop has some structural problems that need addressing: The biggest problem is that we create one transaction for each extent in the source file without accounting for the number of mappings there are for the same range in the destination file. In other words, we don't know the number of remap operations that will be necessary and we therefore cannot guess the block reservation required. On highly fragmented filesystems (e.g. ones with active dedupe) we guess wrong, run out of block reservation, and fail. 
The second problem is that we don't actually use the bmap intents to their full potential -- instead of calling bunmapi directly and having to deal with its backwards operation, we could call the deferred ops xfs_bmap_unmap_extent and xfs_refcount_decrease_extent instead. This makes the frontend loop much simpler. Solve all of these problems by refactoring the remapping loops so that we only perform one remapping operation per transaction, and each operation only tries to remap a single extent from source to dest. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reported-by: Edwin Török Tested-by: Edwin Török --- fs/xfs/libxfs/xfs_bmap.h | 13 ++- fs/xfs/xfs_reflink.c | 242 +++++++++++++++++++++------------------ fs/xfs/xfs_trace.h | 52 +-------- 3 files changed, 143 insertions(+), 164 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 2b18338d0643..e1bd484e5548 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -158,6 +158,13 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) { BMAP_ATTRFORK, "ATTR" }, \ { BMAP_COWFORK, "COW" } +/* Return true if the extent is an allocated extent, written or not. */ +static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +{ + return irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !isnullstartblock(irec->br_startblock); +} /* * Return true if the extent is a real, allocated extent, or false if it is a @@ -165,10 +172,8 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) */ static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) { - return irec->br_state != XFS_EXT_UNWRITTEN && - irec->br_startblock != HOLESTARTBLOCK && - irec->br_startblock != DELAYSTARTBLOCK && - !isnullstartblock(irec->br_startblock); + return xfs_bmap_is_real_extent(irec) && + irec->br_state != XFS_EXT_UNWRITTEN; } /* diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 22fdea6d69d3..e7dd8950d40a 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -984,41 +984,28 @@ xfs_reflink_ag_has_free_space( } /* - * Unmap a range of blocks from a file, then map other blocks into the hole. - * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). - * The extent irec is mapped into dest at irec->br_startoff. + * Remap the given extent into the file. The dmap blockcount will be set to + * the number of blocks that were actually remapped. 
*/ STATIC int xfs_reflink_remap_extent( struct xfs_inode *ip, - struct xfs_bmbt_irec *irec, - xfs_fileoff_t destoff, + struct xfs_bmbt_irec *dmap, xfs_off_t new_isize) { + struct xfs_bmbt_irec smap; struct xfs_mount *mp = ip->i_mount; - bool real_extent = xfs_bmap_is_written_extent(irec); struct xfs_trans *tp; - unsigned int resblks; - struct xfs_bmbt_irec uirec; - xfs_filblks_t rlen; - xfs_filblks_t unmap_len; xfs_off_t newlen; - int64_t qres; + int64_t qres, qdelta; + unsigned int resblks; + bool smap_real; + bool dmap_written = xfs_bmap_is_written_extent(dmap); + int nimaps; int error; - unmap_len = irec->br_startoff + irec->br_blockcount - destoff; - trace_xfs_reflink_punch_range(ip, destoff, unmap_len); - - /* No reflinking if we're low on space */ - if (real_extent) { - error = xfs_reflink_ag_has_free_space(mp, - XFS_FSB_TO_AGNO(mp, irec->br_startblock)); - if (error) - goto out; - } - /* Start a rolling transaction to switch the mappings */ - resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out; @@ -1027,92 +1014,121 @@ xfs_reflink_remap_extent( xfs_trans_ijoin(tp, ip, 0); /* - * Reserve quota for this operation. We don't know if the first unmap - * in the dest file will cause a bmap btree split, so we always reserve - * at least enough blocks for that split. If the extent being mapped - * in is written, we need to reserve quota for that too. + * Read what's currently mapped in the destination file into smap. + * If smap isn't a hole, we will have to remove it before we can add + * dmap to the destination file. */ + nimaps = 1; + error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount, + &smap, &nimaps, 0); + if (error) + goto out_cancel; + ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff); + smap_real = xfs_bmap_is_real_extent(&smap); + + /* + * We can only remap as many blocks as the smaller of the two extent + * maps, because we can only remap one extent at a time. + */ + dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount); + ASSERT(dmap->br_blockcount == smap.br_blockcount); + + trace_xfs_reflink_remap_extent_dest(ip, &smap); + + /* No reflinking if the AG of the dest mapping is low on space. */ + if (dmap_written) { + error = xfs_reflink_ag_has_free_space(mp, + XFS_FSB_TO_AGNO(mp, dmap->br_startblock)); + if (error) + goto out_cancel; + } + + /* + * Compute quota reservation if we think the quota block counter for + * this file could increase. + * + * We start by reserving enough blocks to handle a bmbt split. + * + * If we are mapping a written extent into the file, we need to have + * enough quota block count reservation to handle the blocks in that + * extent. + * + * Note that if we're replacing a delalloc reservation with a written + * extent, we have to take the full quota reservation because removing + * the delalloc reservation gives the block count back to the quota + * count. This is suboptimal, but the VFS flushed the dest range + * before we started. That should have removed all the delalloc + * reservations, but we code defensively. 
+ */ + qdelta = 0; qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - if (real_extent) - qres += irec->br_blockcount; + if (dmap_written) + qres += dmap->br_blockcount; error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, XFS_QMOPT_RES_REGBLKS); if (error) goto out_cancel; - trace_xfs_reflink_remap(ip, irec->br_startoff, - irec->br_blockcount, irec->br_startblock); - - /* Unmap the old blocks in the data fork. */ - rlen = unmap_len; - while (rlen) { - ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1); - if (error) - goto out_cancel; + if (smap_real) { + /* + * If the extent we're unmapping is backed by storage (written + * or not), unmap the extent and drop its refcount. + */ + xfs_bmap_unmap_extent(tp, ip, &smap); + xfs_refcount_decrease_extent(tp, &smap); + qdelta -= smap.br_blockcount; + } else if (smap.br_startblock == DELAYSTARTBLOCK) { + xfs_filblks_t len = smap.br_blockcount; /* - * Trim the extent to whatever got unmapped. - * Remember, bunmapi works backwards. + * If the extent we're unmapping is a delalloc reservation, + * we can use the regular bunmapi function to release the + * incore state. Dropping the delalloc reservation takes care + * of the quota reservation for us. */ - uirec.br_startblock = irec->br_startblock + rlen; - uirec.br_startoff = irec->br_startoff + rlen; - uirec.br_blockcount = unmap_len - rlen; - uirec.br_state = irec->br_state; - unmap_len = rlen; - - /* If this isn't a real mapping, we're done. */ - if (!real_extent || uirec.br_blockcount == 0) - goto next_extent; - - trace_xfs_reflink_remap(ip, uirec.br_startoff, - uirec.br_blockcount, uirec.br_startblock); - - /* Update the refcount tree */ - xfs_refcount_increase_extent(tp, &uirec); - - /* Map the new blocks into the data fork. */ - xfs_bmap_map_extent(tp, ip, &uirec); - - /* Update quota accounting. */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, - uirec.br_blockcount); - - /* Update dest isize if needed. */ - newlen = XFS_FSB_TO_B(mp, - uirec.br_startoff + uirec.br_blockcount); - newlen = min_t(xfs_off_t, newlen, new_isize); - if (newlen > i_size_read(VFS_I(ip))) { - trace_xfs_reflink_update_inode_size(ip, newlen); - i_size_write(VFS_I(ip), newlen); - ip->i_d.di_size = newlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } - -next_extent: - /* Process all the deferred stuff. */ - error = xfs_defer_finish(&tp); + error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1); if (error) goto out_cancel; + ASSERT(len == 0); } + /* + * If the extent we're sharing is backed by written storage, increase + * its refcount and map it into the file. + */ + if (dmap_written) { + xfs_refcount_increase_extent(tp, dmap); + xfs_bmap_map_extent(tp, ip, dmap); + qdelta += dmap->br_blockcount; + } + + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta); + + /* Update dest isize if needed. */ + newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); + newlen = min_t(xfs_off_t, newlen, new_isize); + if (newlen > i_size_read(VFS_I(ip))) { + trace_xfs_reflink_update_inode_size(ip, newlen); + i_size_write(VFS_I(ip), newlen); + ip->i_d.di_size = newlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + + /* Commit everything and unlock. 
*/ error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - goto out; - return 0; + goto out_unlock; out_cancel: xfs_trans_cancel(tp); +out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: - trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); + if (error) + trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); return error; } -/* - * Iteratively remap one file's extents (and holes) to another's. - */ +/* Remap a range of one file to the other. */ int xfs_reflink_remap_blocks( struct xfs_inode *src, @@ -1123,25 +1139,22 @@ xfs_reflink_remap_blocks( loff_t *remapped) { struct xfs_bmbt_irec imap; - xfs_fileoff_t srcoff; - xfs_fileoff_t destoff; + struct xfs_mount *mp = src->i_mount; + xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in); + xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out); xfs_filblks_t len; - xfs_filblks_t range_len; xfs_filblks_t remapped_len = 0; xfs_off_t new_isize = pos_out + remap_len; int nimaps; int error = 0; - destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); - srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); - len = XFS_B_TO_FSB(src->i_mount, remap_len); + len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), + XFS_MAX_FILEOFF); - /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ - while (len) { - uint lock_mode; + trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); - trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, - dest, destoff); + while (len > 0) { + unsigned int lock_mode; /* Read extent from the source file */ nimaps = 1; @@ -1150,18 +1163,25 @@ xfs_reflink_remap_blocks( xfs_iunlock(src, lock_mode); if (error) break; - ASSERT(nimaps == 1); + /* + * The caller supposedly flushed all dirty pages in the source + * file range, which means that writeback should have allocated + * or deleted all delalloc reservations in that range. If we + * find one, that's a good sign that something is seriously + * wrong here. + */ + ASSERT(nimaps == 1 && imap.br_startoff == srcoff); + if (imap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + error = -EFSCORRUPTED; + break; + } - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, - &imap); + trace_xfs_reflink_remap_extent_src(src, &imap); - /* Translate imap into the destination file. */ - range_len = imap.br_startoff + imap.br_blockcount - srcoff; - imap.br_startoff += destoff - srcoff; - - /* Clear dest from destoff to the end of imap and map it in. */ - error = xfs_reflink_remap_extent(dest, &imap, destoff, - new_isize); + /* Remap into the destination file at the given offset. 
*/ + imap.br_startoff = destoff; + error = xfs_reflink_remap_extent(dest, &imap, new_isize); if (error) break; @@ -1171,10 +1191,10 @@ xfs_reflink_remap_blocks( } /* Advance drange/srange */ - srcoff += range_len; - destoff += range_len; - len -= range_len; - remapped_len += range_len; + srcoff += imap.br_blockcount; + destoff += imap.br_blockcount; + len -= imap.br_blockcount; + remapped_len += imap.br_blockcount; } if (error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 460136628a79..50c478374a31 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3052,8 +3052,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); -DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); -TRACE_EVENT(xfs_reflink_remap_blocks_loop, +TRACE_EVENT(xfs_reflink_remap_blocks, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, xfs_fileoff_t doffset), @@ -3084,59 +3083,14 @@ TRACE_EVENT(xfs_reflink_remap_blocks_loop, __entry->dest_ino, __entry->dest_lblk) ); -TRACE_EVENT(xfs_reflink_punch_range, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, - xfs_extlen_t len), - TP_ARGS(ip, lblk, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_extlen_t, len) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->len = len; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->len) -); -TRACE_EVENT(xfs_reflink_remap, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, - xfs_extlen_t len, xfs_fsblock_t new_pblk), - TP_ARGS(ip, lblk, len, new_pblk), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_extlen_t, len) - __field(xfs_fsblock_t, new_pblk) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->len = len; - __entry->new_pblk = new_pblk; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->len, - __entry->new_pblk) -); DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); +DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src); +DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest); /* dedupe tracepoints */ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); From aa5d0ba0b5dbb5105276214c7f9124855b20f75e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jun 2020 14:47:19 -0700 Subject: [PATCH 007/117] xfs: only reserve quota blocks for bmbt changes if we're changing the data fork Now that we've reworked xfs_reflink_remap_extent to remap only one extent per transaction, we actually know if the extent being removed is an allocated mapping. This means that we now know ahead of time if we're going to be touching the data fork. 
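Condensed into a hypothetical userspace helper (the constant and the parameter names are made up for illustration; this is a sketch of the decision, not the kernel function), the reservation logic is:

/*
 * Reserve for a bmbt split only if the data fork will change, and for
 * the incoming blocks only if a written extent is being mapped in.
 * SPLIT_RES is a made-up stand-in for XFS_EXTENTADD_SPACE_RES().
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPLIT_RES 5

static int64_t quota_res(bool smap_real, bool dmap_written,
			 int64_t blockcount)
{
	int64_t qres = 0;

	if (smap_real || dmap_written)
		qres = SPLIT_RES; /* the data fork will be modified */
	if (dmap_written)
		qres += blockcount; /* incoming blocks are charged to quota */
	return qres;
}

int main(void)
{
	/* remapping a hole over a hole touches nothing: no reservation */
	printf("hole over hole: %lld\n", (long long)quota_res(false, false, 8));
	/* mapping a written extent into a hole: split + extent blocks */
	printf("written into hole: %lld\n", (long long)quota_res(false, true, 8));
	return 0;
}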
Since we only need blocks for a bmbt split if we're going to update the data fork, we only need to get quota reservation if we know we're going to touch the data fork. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_reflink.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e7dd8950d40a..3073c608216c 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1047,7 +1047,11 @@ xfs_reflink_remap_extent( * Compute quota reservation if we think the quota block counter for * this file could increase. * - * We start by reserving enough blocks to handle a bmbt split. + * Adding a written extent to the extent map can cause a bmbt split, + * and removing a mapped extent from the extent can cause a bmbt split. + * The two operations cannot both cause a split since they operate on + * the same index in the bmap btree, so we only need a reservation for + * one bmbt split if either thing is happening. * * If we are mapping a written extent into the file, we need to have * enough quota block count reservation to handle the blocks in that @@ -1060,14 +1064,17 @@ xfs_reflink_remap_extent( * before we started. That should have removed all the delalloc * reservations, but we code defensively. */ - qdelta = 0; - qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + qres = qdelta = 0; + if (smap_real || dmap_written) + qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); if (dmap_written) qres += dmap->br_blockcount; - error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_cancel; + if (qres > 0) { + error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_cancel; + } if (smap_real) { /* From 94b941fd7a98cebd6b77b8925c54ef76bbf7473f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jun 2020 14:47:19 -0700 Subject: [PATCH 008/117] xfs: only reserve quota blocks if we're mapping into a hole When logging quota block count updates during a reflink operation, we only log the /delta/ of the block count changes to the dquot. Since we now know ahead of time the extent type of both dmap and smap (and that they have the same length), we know that we only need to reserve quota blocks for dmap's blockcount if we're mapping it into a hole. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_reflink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 3073c608216c..35a17ca5b508 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1055,7 +1055,9 @@ xfs_reflink_remap_extent( * * If we are mapping a written extent into the file, we need to have * enough quota block count reservation to handle the blocks in that - * extent. + * extent. We log only the delta to the quota block counts, so if the + * extent we're unmapping also has blocks allocated to it, we don't + * need a quota reservation for the extent itself. 
* * Note that if we're replacing a delalloc reservation with a written * extent, we have to take the full quota reservation because removing @@ -1067,7 +1069,7 @@ xfs_reflink_remap_extent( qres = qdelta = 0; if (smap_real || dmap_written) qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - if (dmap_written) + if (!smap_real && dmap_written) qres += dmap->br_blockcount; if (qres > 0) { error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, From 168eae803cede459d67ed0ab3ddb19539700a78a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jun 2020 14:47:19 -0700 Subject: [PATCH 009/117] xfs: reflink can skip remap existing mappings If the source and destination map are identical, we can skip the remap step to save some time. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_reflink.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 35a17ca5b508..0a3681646fc9 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1035,6 +1035,22 @@ xfs_reflink_remap_extent( trace_xfs_reflink_remap_extent_dest(ip, &smap); + /* + * Two extents mapped to the same physical block must not have + * different states; that's filesystem corruption. Move on to the next + * extent if they're both holes or both the same physical extent. + */ + if (dmap->br_startblock == smap.br_startblock) { + if (dmap->br_state != smap.br_state) + error = -EFSCORRUPTED; + goto out_cancel; + } + + /* If both extents are unwritten, leave them alone. */ + if (dmap->br_state == XFS_EXT_UNWRITTEN && + smap.br_state == XFS_EXT_UNWRITTEN) + goto out_cancel; + /* No reflinking if the AG of the dest mapping is low on space. */ if (dmap_written) { error = xfs_reflink_ag_has_free_space(mp, From 451d34ee075023d790213885a947fc7a71f26e6d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jun 2020 14:47:20 -0700 Subject: [PATCH 010/117] xfs: fix xfs_reflink_remap_prep calling conventions Fix the return value of xfs_reflink_remap_prep so that its return value conventions match the rest of xfs. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_reflink.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 00db81eac80d..b375fae811f2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1035,7 +1035,7 @@ xfs_file_remap_range( /* Prepare and then clone file data. 
*/ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, &len, remap_flags); - if (ret < 0 || len == 0) + if (ret || len == 0) return ret; trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 0a3681646fc9..f07a09d35a68 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1367,7 +1367,7 @@ xfs_reflink_remap_prep( struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); bool same_inode = (inode_in == inode_out); - ssize_t ret; + int ret; /* Lock both files against IO */ ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); @@ -1391,7 +1391,7 @@ xfs_reflink_remap_prep( ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags); - if (ret < 0 || *len == 0) + if (ret || *len == 0) goto out_unlock; /* Attach dquots to dest inode before changing block map */ @@ -1426,7 +1426,7 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; - return 1; + return 0; out_unlock: xfs_reflink_remap_unlock(file_in, file_out); return ret; From 10b4bd6c9cbc1434c9aca523dac9c4313b56f1ae Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jun 2020 14:47:20 -0700 Subject: [PATCH 011/117] xfs: refactor locking and unlocking two inodes against userspace IO Refactor the two functions that we use to lock and unlock two inodes to block userspace from initiating IO against a file, whether via system calls or mmap activity. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_reflink.c | 52 +++++++++++++++++++++++++++----------------- fs/xfs/xfs_reflink.h | 3 +-- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b375fae811f2..f189bdcbeddd 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1065,7 +1065,7 @@ xfs_file_remap_range( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_log_force_inode(dest); out_unlock: - xfs_reflink_remap_unlock(file_in, file_out); + xfs_reflink_remap_unlock(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return remapped > 0 ? remapped : ret; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f07a09d35a68..c1f2222ffece 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1284,24 +1284,42 @@ retry: return 0; } -/* Unlock both inodes after they've been prepped for a range clone. */ +/* + * Lock two files so that userspace cannot initiate I/O via file syscalls or + * mmap activity. + */ +static int +xfs_reflink_remap_lock( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + int ret; + + ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); + if (ret) + return ret; + if (ip1 == ip2) + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); + else + xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, + ip2, XFS_MMAPLOCK_EXCL); + return 0; +} + +/* Unlock both files to allow IO and mmap activity. 
*/ void xfs_reflink_remap_unlock( - struct file *file_in, - struct file *file_out) + struct xfs_inode *ip1, + struct xfs_inode *ip2) { - struct inode *inode_in = file_inode(file_in); - struct xfs_inode *src = XFS_I(inode_in); - struct inode *inode_out = file_inode(file_out); - struct xfs_inode *dest = XFS_I(inode_out); - bool same_inode = (inode_in == inode_out); + bool same_inode = (ip1 == ip2); - xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); if (!same_inode) - xfs_iunlock(src, XFS_MMAPLOCK_EXCL); - inode_unlock(inode_out); + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + inode_unlock(VFS_I(ip2)); if (!same_inode) - inode_unlock(inode_in); + inode_unlock(VFS_I(ip1)); } /* @@ -1366,18 +1384,12 @@ xfs_reflink_remap_prep( struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); - bool same_inode = (inode_in == inode_out); int ret; /* Lock both files against IO */ - ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); + ret = xfs_reflink_remap_lock(src, dest); if (ret) return ret; - if (same_inode) - xfs_ilock(src, XFS_MMAPLOCK_EXCL); - else - xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest, - XFS_MMAPLOCK_EXCL); /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; @@ -1428,7 +1440,7 @@ xfs_reflink_remap_prep( return 0; out_unlock: - xfs_reflink_remap_unlock(file_in, file_out); + xfs_reflink_remap_unlock(src, dest); return ret; } diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 3e4fd46373ab..ceeb59b86b29 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -56,7 +56,6 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, loff_t *remapped); extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags); -extern void xfs_reflink_remap_unlock(struct file *file_in, - struct file *file_out); +extern void xfs_reflink_remap_unlock(struct xfs_inode *ip1, struct xfs_inode *ip2); #endif /* __XFS_REFLINK_H */ From e2aaee9cd34d8396a48abf0b1be81a464c1d51c5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jun 2020 14:47:20 -0700 Subject: [PATCH 012/117] xfs: move helpers that lock and unlock two inodes against userspace IO Move the double-inode locking helpers to xfs_inode.c since they're not specific to reflink. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_inode.c | 93 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 3 ++ fs/xfs/xfs_reflink.c | 97 +------------------------------------------- fs/xfs/xfs_reflink.h | 1 - 5 files changed, 99 insertions(+), 97 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f189bdcbeddd..97aa74800bd9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1065,7 +1065,7 @@ xfs_file_remap_range( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_log_force_inode(dest); out_unlock: - xfs_reflink_remap_unlock(src, dest); + xfs_iunlock2_io_mmap(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return remapped > 0 ? 
remapped : ret; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9aea7d68d8ab..24edec472a7c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3881,3 +3881,96 @@ xfs_log_force_inode( return 0; return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); } + +/* + * Grab the exclusive iolock for a data copy from src to dest, making sure to + * abide vfs locking order (lowest pointer value goes first) and breaking the + * layout leases before proceeding. The loop is needed because we cannot call + * the blocking break_layout() with the iolocks held, and therefore have to + * back out both locks. + */ +static int +xfs_iolock_two_inodes_and_break_layout( + struct inode *src, + struct inode *dest) +{ + int error; + + if (src > dest) + swap(src, dest); + +retry: + /* Wait to break both inodes' layouts before we start locking. */ + error = break_layout(src, true); + if (error) + return error; + if (src != dest) { + error = break_layout(dest, true); + if (error) + return error; + } + + /* Lock one inode and make sure nobody got in and leased it. */ + inode_lock(src); + error = break_layout(src, false); + if (error) { + inode_unlock(src); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + if (src == dest) + return 0; + + /* Lock the other inode and make sure nobody got in and leased it. */ + inode_lock_nested(dest, I_MUTEX_NONDIR2); + error = break_layout(dest, false); + if (error) { + inode_unlock(src); + inode_unlock(dest); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + return 0; +} + +/* + * Lock two inodes so that userspace cannot initiate I/O via file syscalls or + * mmap activity. + */ +int +xfs_ilock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + int ret; + + ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); + if (ret) + return ret; + if (ip1 == ip2) + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); + else + xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, + ip2, XFS_MMAPLOCK_EXCL); + return 0; +} + +/* Unlock both inodes to allow IO and mmap activity. */ +void +xfs_iunlock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + bool same_inode = (ip1 == ip2); + + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + inode_unlock(VFS_I(ip2)); + if (!same_inode) + inode_unlock(VFS_I(ip1)); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 47d3b391030d..1534386b430c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -499,4 +499,7 @@ void xfs_iunlink_destroy(struct xfs_perag *pag); void xfs_end_io(struct work_struct *work); +int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c1f2222ffece..aac83f9d6107 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1229,99 +1229,6 @@ xfs_reflink_remap_blocks( return error; } -/* - * Grab the exclusive iolock for a data copy from src to dest, making sure to - * abide vfs locking order (lowest pointer value goes first) and breaking the - * layout leases before proceeding. The loop is needed because we cannot call - * the blocking break_layout() with the iolocks held, and therefore have to - * back out both locks. 
- */ -static int -xfs_iolock_two_inodes_and_break_layout( - struct inode *src, - struct inode *dest) -{ - int error; - - if (src > dest) - swap(src, dest); - -retry: - /* Wait to break both inodes' layouts before we start locking. */ - error = break_layout(src, true); - if (error) - return error; - if (src != dest) { - error = break_layout(dest, true); - if (error) - return error; - } - - /* Lock one inode and make sure nobody got in and leased it. */ - inode_lock(src); - error = break_layout(src, false); - if (error) { - inode_unlock(src); - if (error == -EWOULDBLOCK) - goto retry; - return error; - } - - if (src == dest) - return 0; - - /* Lock the other inode and make sure nobody got in and leased it. */ - inode_lock_nested(dest, I_MUTEX_NONDIR2); - error = break_layout(dest, false); - if (error) { - inode_unlock(src); - inode_unlock(dest); - if (error == -EWOULDBLOCK) - goto retry; - return error; - } - - return 0; -} - -/* - * Lock two files so that userspace cannot initiate I/O via file syscalls or - * mmap activity. - */ -static int -xfs_reflink_remap_lock( - struct xfs_inode *ip1, - struct xfs_inode *ip2) -{ - int ret; - - ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); - if (ret) - return ret; - if (ip1 == ip2) - xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); - else - xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, - ip2, XFS_MMAPLOCK_EXCL); - return 0; -} - -/* Unlock both files to allow IO and mmap activity. */ -void -xfs_reflink_remap_unlock( - struct xfs_inode *ip1, - struct xfs_inode *ip2) -{ - bool same_inode = (ip1 == ip2); - - xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); - inode_unlock(VFS_I(ip2)); - if (!same_inode) - inode_unlock(VFS_I(ip1)); -} - /* * If we're reflinking to a point past the destination file's EOF, we must * zero any speculative post-EOF preallocations that sit between the old EOF @@ -1387,7 +1294,7 @@ xfs_reflink_remap_prep( int ret; /* Lock both files against IO */ - ret = xfs_reflink_remap_lock(src, dest); + ret = xfs_ilock2_io_mmap(src, dest); if (ret) return ret; @@ -1440,7 +1347,7 @@ xfs_reflink_remap_prep( return 0; out_unlock: - xfs_reflink_remap_unlock(src, dest); + xfs_iunlock2_io_mmap(src, dest); return ret; } diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index ceeb59b86b29..487b00434b96 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -56,6 +56,5 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, loff_t *remapped); extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags); -extern void xfs_reflink_remap_unlock(struct xfs_inode *ip1, struct xfs_inode *ip2); #endif /* __XFS_REFLINK_H */ From cd647d5651c0b0deaa26c1acb9e1789437ba9bc7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 30 Jun 2020 11:28:53 -0700 Subject: [PATCH 013/117] xfs: use MMAPLOCK around filemap_map_pages() The page faultround path ->map_pages is implemented in XFS via filemap_map_pages(). This function checks that pages found in page cache lookups have not raced with truncate based invalidation by checking page->mapping is correct and page->index is within EOF. However, we've known for a long time that this is not sufficient to protect against races with invalidations done by operations that do not change EOF. e.g. hole punching and other fallocate() based direct extent manipulations. 
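The serialisation required here is the classic shared/exclusive pattern. As a rough userspace analogy only (a pthread rwlock, not the kernel's locking primitives), faults take the lock shared while invalidation takes it exclusive:

/*
 * Userspace analogy: page faults take the lock shared so they can run
 * concurrently with each other; truncate and hole punch take it
 * exclusive so no fault can race with the invalidation.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmaplock = PTHREAD_RWLOCK_INITIALIZER;

static void fault_path(void) /* analogue of the ->map_pages path */
{
	pthread_rwlock_rdlock(&mmaplock);
	/* ... look up page cache pages and map them ... */
	pthread_rwlock_unlock(&mmaplock);
}

static void invalidate_path(void) /* analogue of hole punch/truncate */
{
	pthread_rwlock_wrlock(&mmaplock);
	/* ... remove pages and manipulate extents ... */
	pthread_rwlock_unlock(&mmaplock);
}

int main(void)
{
	fault_path();
	invalidate_path();
	puts("serialised");
	return 0;
}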
The way we protect against these races is we wrap the page fault operations in a XFS_MMAPLOCK_SHARED lock so they serialise against fallocate and truncate before calling into the filemap function that processes the fault. Do the same for XFS's ->map_pages implementation to close this potential data corruption issue. Signed-off-by: Dave Chinner Reviewed-by: Amir Goldstein Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 97aa74800bd9..cc6528726187 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1263,10 +1263,23 @@ xfs_filemap_pfn_mkwrite( return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } +static void +xfs_filemap_map_pages( + struct vm_fault *vmf, + pgoff_t start_pgoff, + pgoff_t end_pgoff) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + filemap_map_pages(vmf, start_pgoff, end_pgoff); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); +} + static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, .huge_fault = xfs_filemap_huge_fault, - .map_pages = filemap_map_pages, + .map_pages = xfs_filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, .pfn_mkwrite = xfs_filemap_pfn_mkwrite, }; From 0d5a57140b3e942a26352815a2d1defe6eebde35 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 2 Jul 2020 08:37:28 -0700 Subject: [PATCH 014/117] xfs: remove useless definitions in xfs_linux.h Remove current_pid(), current_test_flags() and current_clear_flags_nested(), because they are useless. Signed-off-by: Yafang Shao Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_linux.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9f70d2f68e05..ab737fed7b12 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -102,12 +102,8 @@ typedef __u32 xfs_nlink_t; #define xfs_cowb_secs xfs_params.cowb_timer.val #define current_cpu() (raw_smp_processor_id()) -#define current_pid() (current->pid) -#define current_test_flags(f) (current->flags & (f)) #define current_set_flags_nested(sp, f) \ (*(sp) = current->flags, current->flags |= (f)) -#define current_clear_flags_nested(sp, f) \ - (*(sp) = current->flags, current->flags &= ~(f)) #define current_restore_flags_nested(sp, f) \ (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) From 96355d5a1f0ee6dcc182c37db4894ec0c29f1692 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:48:45 -0700 Subject: [PATCH 015/117] xfs: Don't allow logging of XFS_ISTALE inodes In tracking down a problem in this patchset, I discovered we are reclaiming dirty stale inodes. This wasn't discovered until inodes were always attached to the cluster buffer and then the rcu callback that freed inodes was assert failing because the inode still had an active pointer to the cluster buffer after it had been reclaimed. Debugging the issue indicated that this was a pre-existing issue resulting from the way the inodes are handled in xfs_inactive_ifree. When we free a cluster buffer from xfs_ifree_cluster, all the inodes in cache are marked XFS_ISTALE. Those that are clean have nothing else done to them and so eventually get cleaned up by background reclaim. i.e. it is assumed we'll never dirty/relog an inode marked XFS_ISTALE. 
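That invariant is what the asserts added below encode. A toy userspace model of it, with made-up names:

/*
 * Toy model of the invariant: once an inode is marked stale because
 * its cluster buffer was freed, it must never be joined to or logged
 * in a transaction again.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct model_inode {
	bool stale; /* set when the backing cluster buffer is freed */
	bool dirty; /* set when the inode is logged */
};

static void model_log_inode(struct model_inode *ip)
{
	/* mirrors the ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)) idea */
	assert(!ip->stale);
	ip->dirty = true;
}

int main(void)
{
	struct model_inode ip = { false, false };

	model_log_inode(&ip); /* fine: the inode is live */
	ip.stale = true; /* cluster buffer freed, inode marked stale */
	/* calling model_log_inode(&ip) here would now trip the assert */
	printf("dirty=%d stale=%d\n", ip.dirty, ip.stale);
	return 0;
}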
On journal commit, dirty stale inodes are handled by both the buffer and inode log items: they run through xfs_istale_done() and are removed from the AIL (buffer log item commit), or the log item simply unpins them because the buffer log item will clean them. What happens to any specific inode is entirely dependent on which log item wins the commit race, but the result is the same - stale inodes are clean, not attached to the cluster buffer, and not in the AIL. Hence inode reclaim can just free these inodes without further care. However, if the stale inode is relogged, it gets dirtied again and relogged into the CIL. Most of the time this isn't an issue, because relogging simply changes the inode's location in the current checkpoint. Problems arise, however, when the CIL checkpoints between two transactions in the xfs_inactive_ifree() deferops processing. This results in the XFS_ISTALE inode being redirtied and inserted into the CIL without any of the other stale cluster buffer infrastructure being in place. Hence on journal commit, it simply gets unpinned, so it remains dirty in memory. Everything in inode writeback avoids XFS_ISTALE inodes so it can't be written back, and it is not tracked in the AIL so there's not even a trigger to attempt to clean the inode. Hence the inode just sits dirty in memory until inode reclaim comes along, sees that it is XFS_ISTALE, and goes to reclaim it. This reclaiming of a dirty inode caused use-after-free, list corruptions and other nasty issues later in this patchset. Hence this patch addresses a violation of the "never log XFS_ISTALE inodes" rule caused by the deferops processing rolling a transaction and relogging a stale inode in xfs_inactive_ifree(). It also adds a bunch of asserts to catch this problem in debug kernels so that we don't reintroduce this problem in future. Reproducer for this issue was generic/558 on a v4 filesystem. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_trans_inode.c | 2 ++ fs/xfs/xfs_icache.c | 3 ++- fs/xfs/xfs_inode.c | 25 ++++++++++++++++++++++--- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index b5dfb6654842..4504d215cd59 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -36,6 +36,7 @@ xfs_trans_ijoin( ASSERT(iip->ili_lock_flags == 0); iip->ili_lock_flags = lock_flags; + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); /* * Get a log_item_desc to point at the new item.
@@ -89,6 +90,7 @@ xfs_trans_log_inode( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); /* * Don't bother with i_lock for the I_DIRTY_TIME check here, as races diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5daef654956c..59dea8178ae3 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1141,7 +1141,7 @@ restart: goto out_ifunlock; xfs_iunpin_wait(ip); } - if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { + if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); goto reclaim; } @@ -1228,6 +1228,7 @@ reclaim: xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(xfs_inode_clean(ip)); __xfs_inode_free(ip); return error; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 24edec472a7c..917998801a99 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1740,10 +1740,31 @@ xfs_inactive_ifree( return error; } + /* + * We do not hold the inode locked across the entire rolling transaction + * here. We only need to hold it for the first transaction that + * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the + * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode + * here breaks the relationship between cluster buffer invalidation and + * stale inode invalidation on cluster buffer item journal commit + * completion, and can result in leaving dirty stale inodes hanging + * around in memory. + * + * We have no need for serialising this inode operation against other + * operations - we freed the inode and hence reallocation is required + * and that will serialise on reallocating the space the deferops need + * to free. Hence we can unlock the inode on the first commit of + * the transaction rather than roll it right through the deferops. This + * avoids relogging the XFS_ISTALE inode. + * + * We check that xfs_ifree() hasn't grown an internal transaction roll + * by asserting that the inode is still locked when it returns. + */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_ifree(tp, ip); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (error) { /* * If we fail to free the inode, shut down. The cancel @@ -1756,7 +1777,6 @@ xfs_inactive_ifree( xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); } xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1774,7 +1794,6 @@ xfs_inactive_ifree( xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; } From 1dfde687a65fec73e6914c184ecf8e9e54ccfe74 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:48:45 -0700 Subject: [PATCH 016/117] xfs: remove logged flag from inode log item This was used to track if the item had logged fields being flushed to disk. We log everything in the inode these days, so this logic is no longer needed. Remove it. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_inode.c | 13 ++++--------- fs/xfs/xfs_inode_item.c | 35 ++++++++++------------------------- fs/xfs/xfs_inode_item.h | 1 - 3 files changed, 14 insertions(+), 35 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 917998801a99..2f65fe70d305 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2679,7 +2679,6 @@ xfs_ifree_cluster( list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_type == XFS_LI_INODE) { iip = (struct xfs_inode_log_item *)lip; - ASSERT(iip->ili_logged == 1); lip->li_cb = xfs_istale_done; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, @@ -2708,7 +2707,6 @@ xfs_ifree_cluster( iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; - iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -3840,19 +3838,16 @@ xfs_iflush_int( * * We can play with the ili_fields bits here, because the inode lock * must be held exclusively in order to set bits there and the flush - * lock protects the ili_last_fields bits. Set ili_logged so the flush - * done routine can tell whether or not to look in the AIL. Also, store - * the current LSN of the inode so that we can tell whether the item has - * moved in the AIL from xfs_iflush_done(). In order to read the lsn we - * need the AIL lock, because it is a 64 bit value that cannot be read - * atomically. + * lock protects the ili_last_fields bits. Store the current LSN of the + * inode so that we can tell whether the item has moved in the AIL from + * xfs_iflush_done(). In order to read the lsn we need the AIL lock, + * because it is a 64 bit value that cannot be read atomically. */ error = 0; flush_out: iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; - iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index ba47bf65b772..b17384aa8df4 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -528,8 +528,6 @@ xfs_inode_item_push( } ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); - ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); - spin_unlock(&lip->li_ailp->ail_lock); error = xfs_iflush(ip, &bp); @@ -690,30 +688,24 @@ xfs_iflush_done( continue; list_move_tail(&blip->li_bio_list, &tmp); - /* - * while we have the item, do the unlocked check for needing - * the AIL lock. - */ + + /* Do an unlocked check for needing the AIL lock. */ iip = INODE_ITEM(blip); - if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || + if (blip->li_lsn == iip->ili_flush_lsn || test_bit(XFS_LI_FAILED, &blip->li_flags)) need_ail++; } /* make sure we capture the state of the initial inode. */ iip = INODE_ITEM(lip); - if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || + if (lip->li_lsn == iip->ili_flush_lsn || test_bit(XFS_LI_FAILED, &lip->li_flags)) need_ail++; /* - * We only want to pull the item from the AIL if it is - * actually there and its location in the log has not - * changed since we started the flush. Thus, we only bother - * if the ili_logged flag is set and the inode's lsn has not - * changed. First we check the lsn outside - * the lock since it's cheaper, and then we recheck while - * holding the lock before removing the inode from the AIL. + * We only want to pull the item from the AIL if it is actually there + * and its location in the log has not changed since we started the + * flush. 
Thus, we only bother if the inode's lsn has not changed. */ if (need_ail) { xfs_lsn_t tail_lsn = 0; @@ -721,8 +713,7 @@ xfs_iflush_done( /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->ail_lock); list_for_each_entry(blip, &tmp, li_bio_list) { - if (INODE_ITEM(blip)->ili_logged && - blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) { + if (blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) { /* * xfs_ail_update_finish() only cares about the * lsn of the first tail item removed, any * others will be at the same or higher lsn so * we just ignore them. */ xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip); if (!tail_lsn && lsn) tail_lsn = lsn; } else { xfs_clear_li_failed(blip); } } xfs_ail_update_finish(ailp, tail_lsn); } /* - * clean up and unlock the flush lock now we are done. We can clear the + * Clean up and unlock the flush lock now we are done. We can clear the * ili_last_fields bits now that we know that the data corresponding to * them is safely on disk. */ list_for_each_entry_safe(blip, n, &tmp, li_bio_list) { list_del_init(&blip->li_bio_list); iip = INODE_ITEM(blip); - iip->ili_logged = 0; iip->ili_last_fields = 0; xfs_ifunlock(iip->ili_inode); } @@ -768,16 +758,11 @@ xfs_iflush_abort( if (iip) { xfs_trans_ail_delete(&iip->ili_item, 0); - iip->ili_logged = 0; - /* - * Clear the ili_last_fields bits now that we know that the - * data corresponding to them is safely on disk. - */ - iip->ili_last_fields = 0; /* * Clear the inode logging fields so no more flushes are * attempted. */ + iip->ili_last_fields = 0; iip->ili_fields = 0; iip->ili_fsync_fields = 0; }
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 60b34bb66e8e..4de5070e0765 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -19,7 +19,6 @@ struct xfs_inode_log_item { xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ unsigned short ili_lock_flags; /* lock flags */ - unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* logged since last fsync */

From 1319ebefd6ed7a9988b7b4bc9317fbcf61a28bfc Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 29 Jun 2020 14:48:46 -0700
Subject: [PATCH 017/117] xfs: add an inode item lock

The inode log item is kind of special in that it can be aggregating new
changes in memory at the same time existing changes are being written back to
disk. This means there are fields in the log item that are accessed
concurrently from contexts that don't share any locking at all. e.g. updating
ili_last_fields occurs at flush time under the ILOCK_EXCL and flush lock,
under the flush lock at IO completion time, and is read under the ILOCK_EXCL
when the inode is logged. Hence there is no actual serialisation between
reading the field during logging of the inode in transactions vs clearing the
field in IO completion.

We currently get away with this because we are only clearing fields in IO
completion, and nothing bad happens if we accidentally log more of the inode
than we actually modify. Worst case is we consume a tiny bit more memory and
log bandwidth.

However, if we want to do more complex state manipulations on the log item
that require updates at all three of these potential locations, we need to
have some mechanism of serialising those operations. To do this, introduce a
spinlock into the log item to serialise internal state.
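To make that serialisation concrete, a minimal sketch of the update pattern
the new lock enables - illustrative only, condensing the logging, flush and IO
completion hunks that follow:

    /* transaction commit context: dirtying the inode (under ILOCK_EXCL) */
    spin_lock(&iip->ili_lock);
    iip->ili_fsync_fields |= flags;
    iip->ili_fields |= (flags | iip->ili_last_fields);
    spin_unlock(&iip->ili_lock);

    /* flush context: fields move to the "being written" state */
    spin_lock(&iip->ili_lock);
    iip->ili_last_fields = iip->ili_fields;
    iip->ili_fields = 0;
    iip->ili_fsync_fields = 0;
    spin_unlock(&iip->ili_lock);

    /* IO completion context: the flushed fields are now stable on disk */
    spin_lock(&iip->ili_lock);
    iip->ili_last_fields = 0;
    spin_unlock(&iip->ili_lock);
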
This could be done via the xfs_inode i_flags_lock, but this then leads to potential lock inversion issues where inode flag updates need to occur inside locks that best nest inside the inode log item locks (e.g. marking inodes stale during inode cluster freeing). Using a separate spinlock avoids these sorts of problems and simplifies future code. This does not touch the use of ili_fields in the item formatting code - that is entirely protected by the ILOCK_EXCL at this point in time, so it remains untouched. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_trans_inode.c | 52 ++++++++++++++++----------------- fs/xfs/xfs_file.c | 9 ++++-- fs/xfs/xfs_inode.c | 20 ++++++++----- fs/xfs/xfs_inode_item.c | 7 +++++ fs/xfs/xfs_inode_item.h | 18 ++++++++++-- 5 files changed, 66 insertions(+), 40 deletions(-) diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 4504d215cd59..c66d9d1dd58b 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -82,16 +82,20 @@ xfs_trans_ichgtime( */ void xfs_trans_log_inode( - xfs_trans_t *tp, - xfs_inode_t *ip, - uint flags) + struct xfs_trans *tp, + struct xfs_inode *ip, + uint flags) { - struct inode *inode = VFS_I(ip); + struct xfs_inode_log_item *iip = ip->i_itemp; + struct inode *inode = VFS_I(ip); + uint iversion_flags = 0; - ASSERT(ip->i_itemp != NULL); + ASSERT(iip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); + tp->t_flags |= XFS_TRANS_DIRTY; + /* * Don't bother with i_lock for the I_DIRTY_TIME check here, as races * don't matter - we either will need an extra transaction in 24 hours @@ -104,15 +108,6 @@ xfs_trans_log_inode( spin_unlock(&inode->i_lock); } - /* - * Record the specific change for fdatasync optimisation. This - * allows fdatasync to skip log forces for inodes that are only - * timestamp dirty. We do this before the change count so that - * the core being logged in this case does not impact on fdatasync - * behaviour. - */ - ip->i_itemp->ili_fsync_fields |= flags; - /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. While we have the @@ -122,23 +117,28 @@ xfs_trans_log_inode( * set however, then go ahead and bump the i_version counter * unconditionally. */ - if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) && - IS_I_VERSION(VFS_I(ip))) { - if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) - flags |= XFS_ILOG_CORE; + if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { + if (IS_I_VERSION(inode) && + inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) + iversion_flags = XFS_ILOG_CORE; } - tp->t_flags |= XFS_TRANS_DIRTY; + /* + * Record the specific change for fdatasync optimisation. This allows + * fdatasync to skip log forces for inodes that are only timestamp + * dirty. + */ + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields |= flags; /* - * Always OR in the bits from the ili_last_fields field. - * This is to coordinate with the xfs_iflush() and xfs_iflush_done() - * routines in the eventual clearing of the ili_fields bits. - * See the big comment in xfs_iflush() for an explanation of - * this coordination mechanism. + * Always OR in the bits from the ili_last_fields field. This is to + * coordinate with the xfs_iflush() and xfs_iflush_done() routines in + * the eventual clearing of the ili_fields bits. 
See the big comment in + * xfs_iflush() for an explanation of this coordination mechanism. */ - flags |= ip->i_itemp->ili_last_fields; - ip->i_itemp->ili_fields |= flags; + iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); + spin_unlock(&iip->ili_lock); } int diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cc6528726187..01c098834c4b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -94,6 +94,7 @@ xfs_file_fsync( { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_mount *mp = ip->i_mount; int error = 0; int log_flushed = 0; @@ -137,13 +138,15 @@ xfs_file_fsync( xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { if (!datasync || - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - lsn = ip->i_itemp->ili_last_lsn; + (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + lsn = iip->ili_last_lsn; } if (lsn) { error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); - ip->i_itemp->ili_fsync_fields = 0; + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); } xfs_iunlock(ip, XFS_ILOCK_SHARED); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2f65fe70d305..d6da08165a2e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2704,9 +2704,11 @@ xfs_ifree_cluster( continue; iip = ip->i_itemp; + spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -2742,6 +2744,7 @@ xfs_ifree( { int error; struct xfs_icluster xic = { 0 }; + struct xfs_inode_log_item *iip = ip->i_itemp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); @@ -2779,7 +2782,9 @@ xfs_ifree( ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; /* Don't attempt to replay owner changes for a deleted inode */ - ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); + spin_lock(&iip->ili_lock); + iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); + spin_unlock(&iip->ili_lock); /* * Bump the generation count so no one will be confused @@ -3835,20 +3840,19 @@ xfs_iflush_int( * know that the information those bits represent is permanently on * disk. As long as the flush completes before the inode is logged * again, then both ili_fields and ili_last_fields will be cleared. - * - * We can play with the ili_fields bits here, because the inode lock - * must be held exclusively in order to set bits there and the flush - * lock protects the ili_last_fields bits. Store the current LSN of the - * inode so that we can tell whether the item has moved in the AIL from - * xfs_iflush_done(). In order to read the lsn we need the AIL lock, - * because it is a 64 bit value that cannot be read atomically. */ error = 0; flush_out: + spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); + /* + * Store the current LSN of the inode so that we can tell whether the + * item has moved in the AIL from xfs_iflush_done(). 
+ */ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b17384aa8df4..6ef9cbcfc94a 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -637,6 +637,7 @@ xfs_inode_item_init( iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0); iip->ili_inode = ip; + spin_lock_init(&iip->ili_lock); xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, &xfs_inode_item_ops); } @@ -738,7 +739,11 @@ xfs_iflush_done( list_for_each_entry_safe(blip, n, &tmp, li_bio_list) { list_del_init(&blip->li_bio_list); iip = INODE_ITEM(blip); + + spin_lock(&iip->ili_lock); iip->ili_last_fields = 0; + spin_unlock(&iip->ili_lock); + xfs_ifunlock(iip->ili_inode); } list_del(&tmp); @@ -762,9 +767,11 @@ xfs_iflush_abort( * Clear the inode logging fields so no more flushes are * attempted. */ + spin_lock(&iip->ili_lock); iip->ili_last_fields = 0; iip->ili_fields = 0; iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); } /* * Release the inode's flush lock since we're done with it. diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 4de5070e0765..4a10a1b92ee9 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -16,12 +16,24 @@ struct xfs_mount; struct xfs_inode_log_item { struct xfs_log_item ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ - xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ - xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ - unsigned short ili_lock_flags; /* lock flags */ + unsigned short ili_lock_flags; /* inode lock flags */ + /* + * The ili_lock protects the interactions between the dirty state and + * the flush state of the inode log item. This allows us to do atomic + * modifications of multiple state fields without having to hold a + * specific inode lock to serialise them. + * + * We need atomic changes between inode dirtying, inode flushing and + * inode completion, but these all hold different combinations of + * ILOCK and iflock and hence we need some other method of serialising + * updates to the flush state. + */ + spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* logged since last fsync */ + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ }; static inline int xfs_inode_clean(xfs_inode_t *ip) From f593bf144c7dfee9715aa787ebbbe5dd8882e8e9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:48:46 -0700 Subject: [PATCH 018/117] xfs: mark inode buffers in cache Inode buffers always have write IO callbacks, so by marking them directly we can avoid needing to attach ->b_iodone functions to them. This avoids an indirect call, and makes future modifications much simpler. While this is largely a refactor of existing functionality, we broaden the scope of the flag to beyond where inodes are explicitly attached because future changes need to know what type of log items are attached to the buffer. Adding this buffer flag may invoke the inode iodone callback in cases where it wouldn't have been previously, but this is not a functional change because the callback is identical to the normal buffer write iodone callback when inodes are not attached. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_buf.c | 21 ++++++++++++++++----- fs/xfs/xfs_buf.h | 38 +++++++++++++++++++++++++------------- fs/xfs/xfs_buf_item.c | 42 +++++++++++++++++++++++++++++++----------- fs/xfs/xfs_buf_item.h | 1 + fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_trans_buf.c | 3 +++ 6 files changed, 77 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 20b748f7e186..ae0c923574df 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -14,6 +14,8 @@ #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" #include "xfs_errortag.h" #include "xfs_error.h" @@ -1202,12 +1204,21 @@ xfs_buf_ioend( bp->b_flags |= XBF_DONE; } - if (bp->b_iodone) + if (read) + goto out_finish; + + if (bp->b_flags & _XBF_INODES) { + xfs_buf_inode_iodone(bp); + return; + } + + if (bp->b_iodone) { (*(bp->b_iodone))(bp); - else if (bp->b_flags & XBF_ASYNC) - xfs_buf_relse(bp); - else - complete(&bp->b_iowait); + return; + } + +out_finish: + xfs_buf_ioend_finish(bp); } static void diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 050c53b739e2..2400cb90a04c 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -30,15 +30,18 @@ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ #define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */ -/* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ +/* buffer type flags for write callbacks */ +#define _XBF_INODES (1 << 16)/* inode buffer */ /* flags used only internally */ #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +/* flags used only as arguments to access routines */ +#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */ + typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ @@ -50,12 +53,13 @@ typedef unsigned int xfs_buf_flags_t; { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ - { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ + { _XBF_INODES, "INODES" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ - { _XBF_DELWRI_Q, "DELWRI_Q" } - + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + /* The following interface flags should never be set */ \ + { XBF_TRYLOCK, "TRYLOCK" }, \ + { XBF_UNMAPPED, "UNMAPPED" } /* * Internal state flags. 
@@ -257,9 +261,23 @@ extern void xfs_buf_unlock(xfs_buf_t *); #define xfs_buf_islocked(bp) \ ((bp)->b_sema.count <= 0) +static inline void xfs_buf_relse(xfs_buf_t *bp) +{ + xfs_buf_unlock(bp); + xfs_buf_rele(bp); +} + /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); extern void xfs_buf_ioend(struct xfs_buf *bp); +static inline void xfs_buf_ioend_finish(struct xfs_buf *bp) +{ + if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); + else + complete(&bp->b_iowait); +} + extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) @@ -324,12 +342,6 @@ static inline int xfs_buf_ispinned(struct xfs_buf *bp) return atomic_read(&bp->b_pin_count); } -static inline void xfs_buf_relse(xfs_buf_t *bp) -{ - xfs_buf_unlock(bp); - xfs_buf_rele(bp); -} - static inline int xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) { diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 9e75e8d6042e..8659cf4282a6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1158,20 +1158,15 @@ out_stale: return false; } -/* - * This is the iodone() function for buffers which have had callbacks attached - * to them by xfs_buf_attach_iodone(). We need to iterate the items on the - * callback list, mark the buffer as having no more callbacks and then push the - * buffer through IO completion processing. - */ -void -xfs_buf_iodone_callbacks( +static void +xfs_buf_run_callbacks( struct xfs_buf *bp) { + /* - * If there is an error, process it. Some errors require us - * to run callbacks after failure processing is done so we - * detect that and take appropriate action. + * If there is an error, process it. Some errors require us to run + * callbacks after failure processing is done so we detect that and take + * appropriate action. */ if (bp->b_error && xfs_buf_iodone_callback_error(bp)) return; @@ -1188,9 +1183,34 @@ xfs_buf_iodone_callbacks( bp->b_log_item = NULL; list_del_init(&bp->b_li_list); bp->b_iodone = NULL; +} + +/* + * This is the iodone() function for buffers which have had callbacks attached + * to them by xfs_buf_attach_iodone(). We need to iterate the items on the + * callback list, mark the buffer as having no more callbacks and then push the + * buffer through IO completion processing. + */ +void +xfs_buf_iodone_callbacks( + struct xfs_buf *bp) +{ + xfs_buf_run_callbacks(bp); xfs_buf_ioend(bp); } +/* + * Inode buffer iodone callback function. + */ +void +xfs_buf_inode_iodone( + struct xfs_buf *bp) +{ + xfs_buf_run_callbacks(bp); + xfs_buf_ioend_finish(bp); +} + + /* * This is the iodone() function for buffers which have been * logged. It is called when they are eventually flushed out. diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index c9c57e2da932..a342933ad9b8 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -59,6 +59,7 @@ void xfs_buf_attach_iodone(struct xfs_buf *, struct xfs_log_item *); void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +void xfs_buf_inode_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d6da08165a2e..4621d67f3428 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3862,13 +3862,13 @@ flush_out: * completion on the buffer to remove the inode from the AIL and release * the flush lock. 
*/ + bp->b_flags |= _XBF_INODES; xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); ASSERT(!list_empty(&bp->b_li_list)); - ASSERT(bp->b_iodone != NULL); return error; }
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 08174ffa2118..552d0869aa0f 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -626,6 +626,7 @@ xfs_trans_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; + bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -651,6 +652,7 @@ xfs_trans_stale_inode_buf( bip->bli_flags |= XFS_BLI_STALE_INODE; bip->bli_item.li_cb = xfs_buf_iodone; + bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } @@ -675,6 +677,7 @@ xfs_trans_inode_alloc_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; + bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); }

From 0c7e5afbea9962bc65c54337c30559bf913a97d6 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 29 Jun 2020 14:48:46 -0700
Subject: [PATCH 019/117] xfs: mark dquot buffers in cache

dquot buffers always have write IO callbacks, so by marking them directly we
can avoid needing to attach ->b_iodone functions to them. This avoids an
indirect call, and makes future modifications much simpler.

This is largely a rearrangement of the code at this point - there are no IO
completion functionality changes, just a change in how the code is run.

Signed-off-by: Dave Chinner
Reviewed-by: Brian Foster
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong

---
 fs/xfs/xfs_buf.c | 5 +++++
 fs/xfs/xfs_buf.h | 2 ++
 fs/xfs/xfs_buf_item.c | 10 ++++++++++
 fs/xfs/xfs_buf_item.h | 1 +
 fs/xfs/xfs_dquot.c | 1 +
 fs/xfs/xfs_trans_buf.c | 1 +
 6 files changed, 20 insertions(+)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ae0c923574df..517932675b12 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1212,6 +1212,11 @@ xfs_buf_ioend( return; } + if (bp->b_flags & _XBF_DQUOTS) { + xfs_buf_dquot_iodone(bp); + return; + } + if (bp->b_iodone) { (*(bp->b_iodone))(bp); return;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 2400cb90a04c..c1d0843206dd 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -32,6 +32,7 @@ /* buffer type flags for write callbacks */ #define _XBF_INODES (1 << 16)/* inode buffer */ +#define _XBF_DQUOTS (1 << 17)/* dquot buffer */ /* flags used only internally */ #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ @@ -54,6 +55,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_INODES, "INODES" }, \ + { _XBF_DQUOTS, "DQUOTS" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 8659cf4282a6..a42cdf9ccc47 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1210,6 +1210,16 @@ xfs_buf_inode_iodone( xfs_buf_ioend_finish(bp); } +/* + * Dquot buffer iodone callback function.
+ */ void xfs_buf_dquot_iodone( struct xfs_buf *bp) { xfs_buf_run_callbacks(bp); xfs_buf_ioend_finish(bp); }
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index a342933ad9b8..27d13d29b5bb 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -60,6 +60,7 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); +void xfs_buf_dquot_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index d5b7f03e93c8..2e2146fa0914 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1179,6 +1179,7 @@ xfs_qm_dqflush( * Attach an iodone routine so that we can remove this dquot from the * AIL and release the flush lock once the dquot is synced to disk. */ + bp->b_flags |= _XBF_DQUOTS; xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, &dqp->q_logitem.qli_item);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 552d0869aa0f..93d62cb864c1 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -788,5 +788,6 @@ xfs_trans_dquot_buf( break; } + bp->b_flags |= _XBF_DQUOTS; xfs_trans_buf_set_type(tp, bp, type); }

From 9fe5c77cbe3cacc60d03ae5940033e4173fc1847 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 29 Jun 2020 14:48:47 -0700
Subject: [PATCH 020/117] xfs: mark log recovery buffers for completion

Log recovery has its own buffer write completion handler for buffers that it
directly recovers. Convert these to direct calls by flagging these buffers as
being log recovery buffers. The flag will get cleared by the log recovery IO
completion routine, so it will never leak out of log recovery.

Signed-off-by: Dave Chinner
Reviewed-by: Brian Foster
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong

---
 fs/xfs/xfs_buf.c | 10 ++++++++++
 fs/xfs/xfs_buf.h | 2 ++
 fs/xfs/xfs_buf_item_recover.c | 5 ++---
 fs/xfs/xfs_dquot_item_recover.c | 2 +-
 fs/xfs/xfs_inode_item_recover.c | 2 +-
 fs/xfs/xfs_log_recover.c | 5 ++---
 6 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 517932675b12..6a2c942372fc 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -14,6 +14,7 @@ #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_log_recover.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_errortag.h" #include "xfs_error.h" @@ -1207,6 +1208,15 @@ xfs_buf_ioend( if (read) goto out_finish; + /* + * If this is a log recovery buffer, we aren't doing transactional IO + * yet so we need to let it handle IO completions.
+ */ + if (bp->b_flags & _XBF_LOGRECOVERY) { + xlog_recover_iodone(bp); + return; + } + if (bp->b_flags & _XBF_INODES) { xfs_buf_inode_iodone(bp); return; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c1d0843206dd..30dabc5bae96 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -33,6 +33,7 @@ /* buffer type flags for write callbacks */ #define _XBF_INODES (1 << 16)/* inode buffer */ #define _XBF_DQUOTS (1 << 17)/* dquot buffer */ +#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */ /* flags used only internally */ #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ @@ -56,6 +57,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_INODES, "INODES" }, \ { _XBF_DQUOTS, "DQUOTS" }, \ + { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 04faa7310c4f..74c851f60eee 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -419,8 +419,7 @@ xlog_recover_validate_buf_type( if (bp->b_ops) { struct xfs_buf_log_item *bip; - ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); - bp->b_iodone = xlog_recover_iodone; + bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_item_init(bp, mp); bip = bp->b_log_item; bip->bli_item.li_lsn = current_lsn; @@ -963,7 +962,7 @@ xlog_recover_buf_commit_pass2( error = xfs_bwrite(bp); } else { ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; + bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_delwri_queue(bp, buffer_list); } diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 3400be4c88f0..f9ea9f55aa7c 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -153,7 +153,7 @@ xlog_recover_dquot_commit_pass2( ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; + bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_delwri_queue(bp, buffer_list); out_release: diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index dc3e26ff16c9..5e0d291835b3 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -376,7 +376,7 @@ out_owner_change: xfs_dinode_calc_crc(log->l_mp, dip); ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; + bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_delwri_queue(bp, buffer_list); out_release: diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ec015df55b77..52a65a74208f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -287,9 +287,8 @@ xlog_recover_iodone( if (bp->b_log_item) xfs_buf_item_relse(bp); ASSERT(bp->b_log_item == NULL); - - bp->b_iodone = NULL; - xfs_buf_ioend(bp); + bp->b_flags &= ~_XBF_LOGRECOVERY; + xfs_buf_ioend_finish(bp); } /* From b01d1461ae6d98165cddab6f7219b459e2ac413d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:48:47 -0700 Subject: [PATCH 021/117] xfs: call xfs_buf_iodone directly All unmarked dirty buffers should be in the AIL and have log items attached to them. Hence when they are written, we will run a callback to remove the item from the AIL if appropriate. Now that we've handled inode and dquot buffers, all remaining calls are to xfs_buf_iodone() and so we can hard code this rather than use an indirect call. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Amir Goldstein Reviewed-by: Brian Foster Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_buf.c | 24 +++++++-------------- fs/xfs/xfs_buf.h | 6 +----- fs/xfs/xfs_buf_item.c | 48 ++++++++++++------------------------------ fs/xfs/xfs_buf_item.h | 4 ++-- fs/xfs/xfs_trans_buf.c | 13 +++--------- 5 files changed, 28 insertions(+), 67 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6a2c942372fc..dda0c9445879 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -658,7 +658,6 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; bp->b_ops = NULL; } @@ -1194,10 +1193,13 @@ xfs_buf_ioend( if (!bp->b_error && bp->b_io_error) xfs_buf_ioerror(bp, bp->b_io_error); - /* Only validate buffers that were read without errors */ - if (read && !bp->b_error && bp->b_ops) { - ASSERT(!bp->b_iodone); - bp->b_ops->verify_read(bp); + if (read) { + if (!bp->b_error && bp->b_ops) + bp->b_ops->verify_read(bp); + if (!bp->b_error) + bp->b_flags |= XBF_DONE; + xfs_buf_ioend_finish(bp); + return; } if (!bp->b_error) { @@ -1205,9 +1207,6 @@ xfs_buf_ioend( bp->b_flags |= XBF_DONE; } - if (read) - goto out_finish; - /* * If this is a log recovery buffer, we aren't doing transactional IO * yet so we need to let it handle IO completions. @@ -1226,14 +1225,7 @@ xfs_buf_ioend( xfs_buf_dquot_iodone(bp); return; } - - if (bp->b_iodone) { - (*(bp->b_iodone))(bp); - return; - } - -out_finish: - xfs_buf_ioend_finish(bp); + xfs_buf_iodone(bp); } static void diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 30dabc5bae96..755b652e695a 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -18,6 +18,7 @@ /* * Base types */ +struct xfs_buf; #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) @@ -102,10 +103,6 @@ typedef struct xfs_buftarg { struct ratelimit_state bt_ioerror_rl; } xfs_buftarg_t; -struct xfs_buf; -typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); - - #define XB_PAGES 2 struct xfs_buf_map { @@ -158,7 +155,6 @@ typedef struct xfs_buf { xfs_buftarg_t *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ struct work_struct b_ioend_work; - xfs_buf_iodone_t b_iodone; /* I/O completion function */ struct completion b_iowait; /* queue for I/O waiters */ struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a42cdf9ccc47..d87ae6363a13 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -460,7 +460,6 @@ xfs_buf_item_unpin( xfs_buf_do_callbacks(bp); bp->b_log_item = NULL; list_del_init(&bp->b_li_list); - bp->b_iodone = NULL; } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); @@ -936,11 +935,7 @@ xfs_buf_item_free( } /* - * This is called when the buf log item is no longer needed. It should - * free the buf log item associated with the given buffer and clear - * the buffer's pointer to the buf log item. If there are no more - * items in the list, clear the b_iodone field of the buffer (see - * xfs_buf_attach_iodone() below). + * xfs_buf_item_relse() is called when the buf log item is no longer needed. 
*/ void xfs_buf_item_relse( @@ -952,9 +947,6 @@ xfs_buf_item_relse( ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); bp->b_log_item = NULL; - if (list_empty(&bp->b_li_list)) - bp->b_iodone = NULL; - xfs_buf_rele(bp); xfs_buf_item_free(bip); } @@ -962,10 +954,7 @@ xfs_buf_item_relse( /* * Add the given log item with its callback to the list of callbacks - * to be called when the buffer's I/O completes. If it is not set - * already, set the buffer's b_iodone() routine to be - * xfs_buf_iodone_callbacks() and link the log item into the list of - * items rooted at b_li_list. + * to be called when the buffer's I/O completes. */ void xfs_buf_attach_iodone( @@ -977,10 +966,6 @@ xfs_buf_attach_iodone( lip->li_cb = cb; list_add_tail(&lip->li_bio_list, &bp->b_li_list); - - ASSERT(bp->b_iodone == NULL || - bp->b_iodone == xfs_buf_iodone_callbacks); - bp->b_iodone = xfs_buf_iodone_callbacks; } /* @@ -1096,7 +1081,6 @@ xfs_buf_iodone_callback_error( goto out_stale; trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - ASSERT(bp->b_iodone != NULL); cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); @@ -1182,21 +1166,6 @@ xfs_buf_run_callbacks( xfs_buf_do_callbacks(bp); bp->b_log_item = NULL; list_del_init(&bp->b_li_list); - bp->b_iodone = NULL; -} - -/* - * This is the iodone() function for buffers which have had callbacks attached - * to them by xfs_buf_attach_iodone(). We need to iterate the items on the - * callback list, mark the buffer as having no more callbacks and then push the - * buffer through IO completion processing. - */ -void -xfs_buf_iodone_callbacks( - struct xfs_buf *bp) -{ - xfs_buf_run_callbacks(bp); - xfs_buf_ioend(bp); } /* @@ -1221,6 +1190,17 @@ xfs_buf_dquot_iodone( xfs_buf_ioend_finish(bp); } +/* + * Dirty buffer iodone callback function. + */ +void +xfs_buf_iodone( + struct xfs_buf *bp) +{ + xfs_buf_run_callbacks(bp); + xfs_buf_ioend_finish(bp); +} + /* * This is the iodone() function for buffers which have been * logged. It is called when they are eventually flushed out. @@ -1229,7 +1209,7 @@ xfs_buf_dquot_iodone( * care of cleaning up the buffer itself. */ void -xfs_buf_iodone( +xfs_buf_item_iodone( struct xfs_buf *bp, struct xfs_log_item *lip) { diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 27d13d29b5bb..610cd0019328 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -57,10 +57,10 @@ bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, struct xfs_log_item *), struct xfs_log_item *); -void xfs_buf_iodone_callbacks(struct xfs_buf *); -void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +void xfs_buf_item_iodone(struct xfs_buf *, struct xfs_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); void xfs_buf_dquot_iodone(struct xfs_buf *); +void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 93d62cb864c1..6752676b94fe 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -465,24 +465,17 @@ xfs_trans_dirty_buf( ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT(bp->b_iodone == NULL || - bp->b_iodone == xfs_buf_iodone_callbacks); /* * Mark the buffer as needing to be written out eventually, * and set its iodone function to remove the buffer's buf log * item from the AIL and free it when the buffer is flushed - * to disk. 
See xfs_buf_attach_iodone() for more details - * on li_cb and xfs_buf_iodone_callbacks(). - * If we end up aborting this transaction, we trap this buffer - * inside the b_bdstrat callback so that this won't get written to - * disk. + * to disk. */ bp->b_flags |= XBF_DONE; ASSERT(atomic_read(&bip->bli_refcount) > 0); - bp->b_iodone = xfs_buf_iodone_callbacks; - bip->bli_item.li_cb = xfs_buf_iodone; + bip->bli_item.li_cb = xfs_buf_item_iodone; /* * If we invalidated the buffer within this transaction, then @@ -651,7 +644,7 @@ xfs_trans_stale_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; - bip->bli_item.li_cb = xfs_buf_iodone; + bip->bli_item.li_cb = xfs_buf_item_iodone; bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); }

From a7e134ef37172fd4f13bbb11f8f440c807ba294b Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 29 Jun 2020 14:48:47 -0700
Subject: [PATCH 022/117] xfs: clean up whacky buffer log item list reinit

When we've emptied the buffer log item list, it does a list_del_init on itself
to reset its pointers to itself. This is unnecessary as the list is already
empty at this point - these calls are left-over fragments from the list_head
conversion of the buffer log item list. Remove them.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
Reviewed-by: Christoph Hellwig
Reviewed-by: Brian Foster
Signed-off-by: Darrick J. Wong

---
 fs/xfs/xfs_buf_item.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index d87ae6363a13..5b3cd5e90947 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -459,7 +459,6 @@ xfs_buf_item_unpin( if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_do_callbacks(bp); bp->b_log_item = NULL; - list_del_init(&bp->b_li_list); } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); @@ -1165,7 +1164,6 @@ xfs_buf_run_callbacks( xfs_buf_do_callbacks(bp); bp->b_log_item = NULL; - list_del_init(&bp->b_li_list); }

From aac855ab1a98d9c20762047f26af47d391c3ba7a Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 29 Jun 2020 14:48:48 -0700
Subject: [PATCH 023/117] xfs: make inode IO completion buffer centric

Having different IO completion callbacks for different inode states makes
things complex. We can detect if the inode is stale via the XFS_ISTALE flag in
IO completion, so we don't need a special callback just for this.

This means inodes only have a single iodone callback, and inode IO completion
is entirely buffer centric at this point. Hence we no longer need to use a log
item callback at all, as we can just call xfs_iflush_done() directly from the
buffer completions and walk the buffer log item list to complete all the
inodes under IO.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Reviewed-by: Darrick J. Wong
Reviewed-by: Brian Foster
Signed-off-by: Darrick J. Wong

---
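For illustration, the buffer-centric completion boils down to the following
walk - a condensed sketch of the new xfs_iflush_done() in the hunks below,
with the AIL removal batching and locking omitted:

    struct xfs_log_item *lip, *n;
    struct xfs_inode_log_item *iip;
    LIST_HEAD(tmp);

    /* take each attached inode and act on its state directly */
    list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
        iip = INODE_ITEM(lip);
        if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) {
            /* stale inodes were invalidated - just abort the flush */
            list_del_init(&lip->li_bio_list);
            xfs_iflush_abort(iip->ili_inode);
            continue;
        }
        /* live inodes complete the flush and may be removed from the AIL */
        list_move_tail(&lip->li_bio_list, &tmp);
    }
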
 fs/xfs/xfs_buf_item.c | 35 ++++++++++++++++++----
 fs/xfs/xfs_inode.c | 6 ++--
 fs/xfs/xfs_inode_item.c | 65 ++++++++++++++---------------------------
 fs/xfs/xfs_inode_item.h | 5 ++--
 4 files changed, 56 insertions(+), 55 deletions(-)

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 5b3cd5e90947..a4e416af5c61 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -13,6 +13,8 @@ #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_buf_item.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" #include "xfs_log.h" @@ -457,7 +459,8 @@ xfs_buf_item_unpin( * the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_do_callbacks(bp); + lip->li_cb(bp, lip); + xfs_iflush_done(bp); bp->b_log_item = NULL; } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); @@ -1141,8 +1144,8 @@ out_stale: return false; } -static void -xfs_buf_run_callbacks( +static inline bool +xfs_buf_had_callback_errors( struct xfs_buf *bp) { @@ -1152,7 +1155,7 @@ xfs_buf_run_callbacks( * appropriate action. */ if (bp->b_error && xfs_buf_iodone_callback_error(bp)) - return; + return true; /* * Successful IO or permanent error. Either way, we can clear the @@ -1161,7 +1164,16 @@ xfs_buf_run_callbacks( bp->b_last_error = 0; bp->b_retries = 0; bp->b_first_retry_time = 0; + return false; +} +static void +xfs_buf_run_callbacks( + struct xfs_buf *bp) +{ + + if (xfs_buf_had_callback_errors(bp)) + return; xfs_buf_do_callbacks(bp); bp->b_log_item = NULL; } @@ -1173,7 +1185,20 @@ void xfs_buf_inode_iodone( struct xfs_buf *bp) { - xfs_buf_run_callbacks(bp); + struct xfs_buf_log_item *blip = bp->b_log_item; + struct xfs_log_item *lip; + + if (xfs_buf_had_callback_errors(bp)) + return; + + /* If there is a buf_log_item attached, run its callback */ + if (blip) { + lip = &blip->bli_item; + lip->li_cb(bp, lip); + bp->b_log_item = NULL; + } + + xfs_iflush_done(bp); xfs_buf_ioend_finish(bp); }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4621d67f3428..721b8420be04 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2679,7 +2679,6 @@ xfs_ifree_cluster( list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_type == XFS_LI_INODE) { iip = (struct xfs_inode_log_item *)lip; - lip->li_cb = xfs_istale_done; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -2712,8 +2711,7 @@ xfs_ifree_cluster( xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); - xfs_buf_attach_iodone(bp, xfs_istale_done, - &iip->ili_item); + xfs_buf_attach_iodone(bp, NULL, &iip->ili_item); if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -3863,7 +3861,7 @@ flush_out: * the flush lock. */ bp->b_flags |= _XBF_INODES; - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + xfs_buf_attach_iodone(bp, NULL, &iip->ili_item); /* generate the checksum.
*/ xfs_dinode_calc_crc(mp, dip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6ef9cbcfc94a..7049f2ae8d18 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -668,40 +668,34 @@ xfs_inode_item_destroy( */ void xfs_iflush_done( - struct xfs_buf *bp, - struct xfs_log_item *lip) + struct xfs_buf *bp) { struct xfs_inode_log_item *iip; - struct xfs_log_item *blip, *n; - struct xfs_ail *ailp = lip->li_ailp; + struct xfs_log_item *lip, *n; + struct xfs_ail *ailp = bp->b_mount->m_ail; int need_ail = 0; LIST_HEAD(tmp); /* - * Scan the buffer IO completions for other inodes being completed and - * attach them to the current inode log item. + * Pull the attached inodes from the buffer one at a time and take the + * appropriate action on them. */ - - list_add_tail(&lip->li_bio_list, &tmp); - - list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) { - if (lip->li_cb != xfs_iflush_done) + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + iip = INODE_ITEM(lip); + if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) { + list_del_init(&lip->li_bio_list); + xfs_iflush_abort(iip->ili_inode); continue; + } - list_move_tail(&blip->li_bio_list, &tmp); + list_move_tail(&lip->li_bio_list, &tmp); /* Do an unlocked check for needing the AIL lock. */ - iip = INODE_ITEM(blip); - if (blip->li_lsn == iip->ili_flush_lsn || - test_bit(XFS_LI_FAILED, &blip->li_flags)) + if (lip->li_lsn == iip->ili_flush_lsn || + test_bit(XFS_LI_FAILED, &lip->li_flags)) need_ail++; } - - /* make sure we capture the state of the initial inode. */ - iip = INODE_ITEM(lip); - if (lip->li_lsn == iip->ili_flush_lsn || - test_bit(XFS_LI_FAILED, &lip->li_flags)) - need_ail++; + ASSERT(list_empty(&bp->b_li_list)); /* * We only want to pull the item from the AIL if it is actually there @@ -713,19 +707,13 @@ xfs_iflush_done( /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->ail_lock); - list_for_each_entry(blip, &tmp, li_bio_list) { - if (blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) { - /* - * xfs_ail_update_finish() only cares about the - * lsn of the first tail item removed, any - * others will be at the same or higher lsn so - * we just ignore them. - */ - xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip); + list_for_each_entry(lip, &tmp, li_bio_list) { + if (lip->li_lsn == INODE_ITEM(lip)->ili_flush_lsn) { + xfs_lsn_t lsn = xfs_ail_delete_one(ailp, lip); if (!tail_lsn && lsn) tail_lsn = lsn; } else { - xfs_clear_li_failed(blip); + xfs_clear_li_failed(lip); } } xfs_ail_update_finish(ailp, tail_lsn); @@ -736,9 +724,9 @@ xfs_iflush_done( * ili_last_fields bits now that we know that the data corresponding to * them is safely on disk. 
*/ - list_for_each_entry_safe(blip, n, &tmp, li_bio_list) { - list_del_init(&blip->li_bio_list); - iip = INODE_ITEM(blip); + list_for_each_entry_safe(lip, n, &tmp, li_bio_list) { + list_del_init(&lip->li_bio_list); + iip = INODE_ITEM(lip); spin_lock(&iip->ili_lock); iip->ili_last_fields = 0; @@ -746,7 +734,6 @@ xfs_iflush_done( xfs_ifunlock(iip->ili_inode); } - list_del(&tmp); } /* @@ -779,14 +766,6 @@ xfs_iflush_abort( xfs_ifunlock(ip); } -void -xfs_istale_done( - struct xfs_buf *bp, - struct xfs_log_item *lip) -{ - xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); -} - /* * convert an xfs_inode_log_format struct from the old 32 bit version * (which can have different field alignments) to the native 64 bit version diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 4a10a1b92ee9..048b5e7dee90 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -36,15 +36,14 @@ struct xfs_inode_log_item { xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ }; -static inline int xfs_inode_clean(xfs_inode_t *ip) +static inline int xfs_inode_clean(struct xfs_inode *ip) { return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); } extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); -extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); -extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_iflush_done(struct xfs_buf *); extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); From 6f5de1808e3663917b5c682e2d91d95645ce2df2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:48:59 -0700 Subject: [PATCH 024/117] xfs: use direct calls for dquot IO completion Similar to inodes, we can call the dquot IO completion functions directly from the buffer completion code, removing another user of log item callbacks for IO completion processing. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 18 +++++++++++++++++- fs/xfs/xfs_dquot.c | 18 ++++++++++++++---- fs/xfs/xfs_quota.h | 9 +++++++++ 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a4e416af5c61..f46e5ec28111 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -15,6 +15,9 @@ #include "xfs_buf_item.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_quota.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" #include "xfs_log.h" @@ -1209,7 +1212,20 @@ void xfs_buf_dquot_iodone( struct xfs_buf *bp) { - xfs_buf_run_callbacks(bp); + struct xfs_buf_log_item *blip = bp->b_log_item; + struct xfs_log_item *lip; + + if (xfs_buf_had_callback_errors(bp)) + return; + + /* a newly allocated dquot buffer might have a log item attached */ + if (blip) { + lip = &blip->bli_item; + lip->li_cb(bp, lip); + bp->b_log_item = NULL; + } + + xfs_dquot_done(bp); xfs_buf_ioend_finish(bp); } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 2e2146fa0914..403bc4e9f21f 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1048,9 +1048,8 @@ xfs_qm_dqrele( * from the AIL if it has not been re-logged, and unlocking the dquot's * flush lock. This behavior is very similar to that of inodes.. 
*/ -STATIC void +static void xfs_qm_dqflush_done( - struct xfs_buf *bp, struct xfs_log_item *lip) { struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; @@ -1091,6 +1090,18 @@ xfs_qm_dqflush_done( xfs_dqfunlock(dqp); } +void +xfs_dquot_done( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip, *n; + + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + list_del_init(&lip->li_bio_list); + xfs_qm_dqflush_done(lip); + } +} + /* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. @@ -1180,8 +1191,7 @@ xfs_qm_dqflush( * AIL and release the flush lock once the dquot is synced to disk. */ bp->b_flags |= _XBF_DQUOTS; - xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, - &dqp->q_logitem.qli_item); + xfs_buf_attach_iodone(bp, NULL, &dqp->q_logitem.qli_item); /* * If the buffer is pinned then push on the log so we won't diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index aa8fc1f55fbd..c92ae5e02ce8 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -13,6 +13,7 @@ */ struct xfs_trans; +struct xfs_buf; /* * This check is done typically without holding the inode lock; @@ -107,6 +108,8 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); +void xfs_dquot_done(struct xfs_buf *); + #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -148,6 +151,12 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) + +static inline void xfs_dquot_done(struct xfs_buf *bp) +{ + return; +} + #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ From fec671cd350ff3ef737a83236ab2c6d3e4b8d600 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:14 -0700 Subject: [PATCH 025/117] xfs: clean up the buffer iodone callback functions Now that we've sorted inode and dquot buffers, we can apply the same cleanups to dirty buffers with buffer log items. They only have one callback, too, so we don't need the log item callback. Collapse the iodone functions and remove all the now unnecessary infrastructure around callback processing. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 140 +++++++++-------------------------------- fs/xfs/xfs_buf_item.h | 1 - fs/xfs/xfs_trans_buf.c | 2 - 3 files changed, 29 insertions(+), 114 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f46e5ec28111..0ece5de9dd71 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -30,7 +30,7 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } -STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +static void xfs_buf_item_done(struct xfs_buf *bp); /* Is this log iovec plausibly large enough to contain the buffer log format? */ bool @@ -462,9 +462,8 @@ xfs_buf_item_unpin( * the AIL lock. 
*/ if (bip->bli_flags & XFS_BLI_STALE_INODE) { - lip->li_cb(bp, lip); + xfs_buf_item_done(bp); xfs_iflush_done(bp); - bp->b_log_item = NULL; } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); @@ -973,46 +972,6 @@ xfs_buf_attach_iodone( list_add_tail(&lip->li_bio_list, &bp->b_li_list); } -/* - * We can have many callbacks on a buffer. Running the callbacks individually - * can cause a lot of contention on the AIL lock, so we allow for a single - * callback to be able to scan the remaining items in bp->b_li_list for other - * items of the same type and callback to be processed in the first call. - * - * As a result, the loop walking the callback list below will also modify the - * list. it removes the first item from the list and then runs the callback. - * The loop then restarts from the new first item int the list. This allows the - * callback to scan and modify the list attached to the buffer and we don't - * have to care about maintaining a next item pointer. - */ -STATIC void -xfs_buf_do_callbacks( - struct xfs_buf *bp) -{ - struct xfs_buf_log_item *blip = bp->b_log_item; - struct xfs_log_item *lip; - - /* If there is a buf_log_item attached, run its callback */ - if (blip) { - lip = &blip->bli_item; - lip->li_cb(bp, lip); - } - - while (!list_empty(&bp->b_li_list)) { - lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, - li_bio_list); - - /* - * Remove the item from the list, so we don't have any - * confusion if the item is added to another buf. - * Don't touch the log item after calling its - * callback, because it could have freed itself. - */ - list_del_init(&lip->li_bio_list); - lip->li_cb(bp, lip); - } -} - /* * Invoke the error state callback for each log item affected by the failed I/O. * @@ -1025,8 +984,8 @@ STATIC void xfs_buf_do_callbacks_fail( struct xfs_buf *bp) { + struct xfs_ail *ailp = bp->b_mount->m_ail; struct xfs_log_item *lip; - struct xfs_ail *ailp; /* * Buffer log item errors are handled directly by xfs_buf_item_push() @@ -1036,9 +995,6 @@ xfs_buf_do_callbacks_fail( if (list_empty(&bp->b_li_list)) return; - lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, - li_bio_list); - ailp = lip->li_ailp; spin_lock(&ailp->ail_lock); list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_ops->iop_error) @@ -1051,22 +1007,11 @@ static bool xfs_buf_iodone_callback_error( struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_log_item; - struct xfs_log_item *lip; - struct xfs_mount *mp; + struct xfs_mount *mp = bp->b_mount; static ulong lasttime; static xfs_buftarg_t *lasttarg; struct xfs_error_cfg *cfg; - /* - * The failed buffer might not have a buf_log_item attached or the - * log_item list might be empty. Get the mp from the available - * xfs_log_item - */ - lip = list_first_entry_or_null(&bp->b_li_list, struct xfs_log_item, - li_bio_list); - mp = lip ? lip->li_mountp : bip->bli_item.li_mountp; - /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. @@ -1171,14 +1116,27 @@ xfs_buf_had_callback_errors( } static void -xfs_buf_run_callbacks( +xfs_buf_item_done( struct xfs_buf *bp) { + struct xfs_buf_log_item *bip = bp->b_log_item; - if (xfs_buf_had_callback_errors(bp)) + if (!bip) return; - xfs_buf_do_callbacks(bp); + + /* + * If we are forcibly shutting down, this may well be off the AIL + * already. That's because we simulate the log-committed callbacks to + * unpin these buffers. 
Or we may never have put this item on AIL + * because the transaction was aborted forcibly. + * xfs_trans_ail_delete() takes care of these. + * + * Either way, AIL is useless if we're forcing a shutdown. + */ + xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE); bp->b_log_item = NULL; + xfs_buf_item_free(bip); + xfs_buf_rele(bp); } /* @@ -1188,19 +1146,10 @@ void xfs_buf_inode_iodone( struct xfs_buf *bp) { - struct xfs_buf_log_item *blip = bp->b_log_item; - struct xfs_log_item *lip; - if (xfs_buf_had_callback_errors(bp)) return; - /* If there is a buf_log_item attached, run its callback */ - if (blip) { - lip = &blip->bli_item; - lip->li_cb(bp, lip); - bp->b_log_item = NULL; - } - + xfs_buf_item_done(bp); xfs_iflush_done(bp); xfs_buf_ioend_finish(bp); } @@ -1212,59 +1161,28 @@ void xfs_buf_dquot_iodone( struct xfs_buf *bp) { - struct xfs_buf_log_item *blip = bp->b_log_item; - struct xfs_log_item *lip; - if (xfs_buf_had_callback_errors(bp)) return; /* a newly allocated dquot buffer might have a log item attached */ - if (blip) { - lip = &blip->bli_item; - lip->li_cb(bp, lip); - bp->b_log_item = NULL; - } - + xfs_buf_item_done(bp); xfs_dquot_done(bp); xfs_buf_ioend_finish(bp); } /* * Dirty buffer iodone callback function. + * + * Note that for things like remote attribute buffers, there may not be a buffer + * log item here, so processing the buffer log item must remain optional. */ void xfs_buf_iodone( struct xfs_buf *bp) { - xfs_buf_run_callbacks(bp); + if (xfs_buf_had_callback_errors(bp)) + return; + + xfs_buf_item_done(bp); xfs_buf_ioend_finish(bp); } - -/* - * This is the iodone() function for buffers which have been - * logged. It is called when they are eventually flushed out. - * It should remove the buf item from the AIL, and free the buf item. - * It is called by xfs_buf_iodone_callbacks() above which will take - * care of cleaning up the buffer itself. - */ -void -xfs_buf_item_iodone( - struct xfs_buf *bp, - struct xfs_log_item *lip) -{ - ASSERT(BUF_ITEM(lip)->bli_buf == bp); - - xfs_buf_rele(bp); - - /* - * If we are forcibly shutting down, this may well be off the AIL - * already. That's because we simulate the log-committed callbacks to - * unpin these buffers. Or we may never have put this item on AIL - * because of the transaction was aborted forcibly. - * xfs_trans_ail_delete() takes care of these. - * - * Either way, AIL is useless if we're forcing a shutdown.
- */ - xfs_trans_ail_delete(lip, SHUTDOWN_CORRUPT_INCORE); - xfs_buf_item_free(BUF_ITEM(lip)); -} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 610cd0019328..7c0bd2a210af 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -57,7 +57,6 @@ bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, struct xfs_log_item *), struct xfs_log_item *); -void xfs_buf_item_iodone(struct xfs_buf *, struct xfs_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); void xfs_buf_dquot_iodone(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6752676b94fe..11cd666cd99a 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -475,7 +475,6 @@ xfs_trans_dirty_buf( bp->b_flags |= XBF_DONE; ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_item.li_cb = xfs_buf_item_iodone; /* * If we invalidated the buffer within this transaction, then @@ -644,7 +643,6 @@ xfs_trans_stale_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; - bip->bli_item.li_cb = xfs_buf_item_iodone; bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } From 2ef3f7f5db15aea47b92fd770bc45cf317aa2b97 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:14 -0700 Subject: [PATCH 026/117] xfs: get rid of log item callbacks They are not used anymore, so remove them from the log item and the buffer iodone attachment interfaces. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 17 ----------------- fs/xfs/xfs_buf_item.h | 3 --- fs/xfs/xfs_dquot.c | 6 +++--- fs/xfs/xfs_inode.c | 5 +++-- fs/xfs/xfs_trans.h | 4 ---- 5 files changed, 6 insertions(+), 29 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 0ece5de9dd71..09bfe9c52dbd 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -955,23 +955,6 @@ xfs_buf_item_relse( xfs_buf_item_free(bip); } - -/* - * Add the given log item with its callback to the list of callbacks - * to be called when the buffer's I/O completes. - */ -void -xfs_buf_attach_iodone( - struct xfs_buf *bp, - void (*cb)(struct xfs_buf *, struct xfs_log_item *), - struct xfs_log_item *lip) -{ - ASSERT(xfs_buf_islocked(bp)); - - lip->li_cb = cb; - list_add_tail(&lip->li_bio_list, &bp->b_li_list); -} - /* * Invoke the error state callback for each log item affected by the failed I/O. 
* diff --git a/fs/xfs/xfs_buf_item.h index 7c0bd2a210af..23507cbb4c41 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -54,9 +54,6 @@ void xfs_buf_item_relse(struct xfs_buf *); bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); -void xfs_buf_attach_iodone(struct xfs_buf *, - void(*)(struct xfs_buf *, struct xfs_log_item *), - struct xfs_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); void xfs_buf_dquot_iodone(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 403bc4e9f21f..d5984a926d1d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1187,11 +1187,11 @@ xfs_qm_dqflush( } /* - * Attach an iodone routine so that we can remove this dquot from the - * AIL and release the flush lock once the dquot is synced to disk. + * Attach the dquot to the buffer so that we can remove this dquot from + * the AIL and release the flush lock once the dquot is synced to disk. */ bp->b_flags |= _XBF_DQUOTS; - xfs_buf_attach_iodone(bp, NULL, &dqp->q_logitem.qli_item); + list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list); /* * If the buffer is pinned then push on the log so we won't diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 721b8420be04..77cc9cbcd311 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2711,7 +2711,8 @@ xfs_ifree_cluster( xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); - xfs_buf_attach_iodone(bp, NULL, &iip->ili_item); + list_add_tail(&iip->ili_item.li_bio_list, + &bp->b_li_list); if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -3861,7 +3862,7 @@ flush_out: * the flush lock. */ bp->b_flags |= _XBF_INODES; - xfs_buf_attach_iodone(bp, NULL, &iip->ili_item); + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 8308bf6d7e40..99a9ab9cab25 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -37,10 +37,6 @@ struct xfs_log_item { unsigned long li_flags; /* misc flags */ struct xfs_buf *li_buf; /* real buffer pointer */ struct list_head li_bio_list; /* buffer item list */ - void (*li_cb)(struct xfs_buf *, - struct xfs_log_item *); - /* buffer item iodone */ - /* callback func */ const struct xfs_item_ops *li_ops; /* function list */ /* delayed logging */ From 428947e9d525ae3a03dbdce4cdbcb2afe020732d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:14 -0700 Subject: [PATCH 027/117] xfs: handle buffer log item IO errors directly Currently when a buffer with attached log items has an IO error it calls ->iop_error for each attached log item. These all call xfs_set_li_failed() to handle the error, but we are about to change the way log items manage buffers. Hence we first need to remove the per-item dependency on buffer handling done by xfs_set_li_failed(). We already have specific buffer type IO completion routines, so move the log item error handling out of the generic error handling and into the log item specific functions so we can implement per-type error handling easily. This requires a more complex return value from the error handling code so that we can take the correct action that the failure handling requires.
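To make the shape of that return value concrete, here is a rough, self-contained userspace sketch of the triage-and-dispatch pattern (all types, names and the retry policy below are hypothetical simplifications, not the kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical stand-in for a buffer with IO error retry state. */
	struct buf {
		bool	async;		/* asynchronous write? */
		int	error;		/* last IO error, 0 if none */
		int	retries;	/* failed submissions so far */
		int	max_retries;	/* permanent failure threshold */
	};

	enum ioerr {
		IOERR_FINISH,	/* clear retry state, run completions */
		IOERR_DONE,	/* write resubmitted, run nothing */
		IOERR_FAIL,	/* transient failure, caller flags its items */
	};

	/* Shared error triage, loosely analogous to xfs_buf_iodone_error(). */
	static enum ioerr buf_iodone_error(struct buf *bp)
	{
		if (!bp->async)
			return IOERR_FINISH;	/* sync waiter sees bp->error */
		if (bp->retries < bp->max_retries) {
			bp->retries++;
			bp->error = 0;		/* clear the error and resubmit */
			return IOERR_DONE;
		}
		return IOERR_FAIL;		/* out of retries */
	}

	/* Per-type completion: only the IOERR_FAIL branch differs by type. */
	static void inode_buf_iodone(struct buf *bp)
	{
		if (bp->error) {
			switch (buf_iodone_error(bp)) {
			case IOERR_DONE:
				return;		/* IO is in flight again */
			case IOERR_FAIL:
				puts("flag attached inode items as failed");
				return;
			case IOERR_FINISH:
				break;		/* fall through to completions */
			}
		}
		bp->retries = 0;		/* success resets retry state */
		puts("run inode completions");
	}

	int main(void)
	{
		struct buf bp = { .async = true, .error = 5, .max_retries = 1 };

		inode_buf_iodone(&bp);	/* first failure: resubmitted */
		bp.error = 5;
		inode_buf_iodone(&bp);	/* retries exhausted: items flagged */
		return 0;
	}

The shared triage decides what happened; each per-type handler only supplies its own failure action.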
This results in some repeated boilerplate in the functions, but that can be cleaned up later once all the changes cascade through this code. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 228 ++++++++++++++++++++++++++++-------------- 1 file changed, 151 insertions(+), 77 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 09bfe9c52dbd..f80fc5bd3bff 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -986,21 +986,24 @@ xfs_buf_do_callbacks_fail( spin_unlock(&ailp->ail_lock); } +/* + * Decide if we're going to retry the write after a failure, and prepare + * the buffer for retrying the write. + */ static bool -xfs_buf_iodone_callback_error( +xfs_buf_ioerror_fail_without_retry( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; static ulong lasttime; static xfs_buftarg_t *lasttarg; - struct xfs_error_cfg *cfg; /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. */ if (XFS_FORCED_SHUTDOWN(mp)) - goto out_stale; + return true; if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { @@ -1011,91 +1014,114 @@ xfs_buf_iodone_callback_error( /* synchronous writes will have callers process the error */ if (!(bp->b_flags & XBF_ASYNC)) + return true; + return false; +} + +static bool +xfs_buf_ioerror_retry( + struct xfs_buf *bp, + struct xfs_error_cfg *cfg) +{ + if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) && + bp->b_last_error == bp->b_error) + return false; + + bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); + bp->b_last_error = bp->b_error; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; + return true; +} + +/* + * Account for this latest trip around the retry handler, and decide if + * we've failed enough times to constitute a permanent failure. + */ +static bool +xfs_buf_ioerror_permanent( + struct xfs_buf *bp, + struct xfs_error_cfg *cfg) +{ + struct xfs_mount *mp = bp->b_mount; + + if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && + ++bp->b_retries > cfg->max_retries) + return true; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) + return true; + + /* At unmount we may treat errors differently */ + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + return true; + + return false; +} + +/* + * On a sync write or shutdown we just want to stale the buffer and let the + * caller handle the error in bp->b_error appropriately. + * + * If the write was asynchronous then no one will be looking for the error. If + * this is the first failure of this type, clear the error state and write the + * buffer out again. This means we always retry an async write failure at least + * once, but we also need to set the buffer up to behave correctly now for + * repeated failures. + * + * If we get repeated async write failures, then we take action according to the + * error configuration we have been set up to use. 
+ * + * Multi-state return value: + * + * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions + * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions + * XBF_IOERROR_FAIL: transient error, run failure callback completions and then + * release the buffer + */ +enum { + XBF_IOERROR_FINISH, + XBF_IOERROR_DONE, + XBF_IOERROR_FAIL, +}; + +static int +xfs_buf_iodone_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_error_cfg *cfg; + + if (xfs_buf_ioerror_fail_without_retry(bp)) goto out_stale; trace_xfs_buf_item_iodone_async(bp, _RET_IP_); cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); - - /* - * If the write was asynchronous then no one will be looking for the - * error. If this is the first failure of this type, clear the error - * state and write the buffer out again. This means we always retry an - * async write failure at least once, but we also need to set the buffer - * up to behave correctly now for repeated failures. - */ - if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) || - bp->b_last_error != bp->b_error) { - bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); - bp->b_last_error = bp->b_error; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - !bp->b_first_retry_time) - bp->b_first_retry_time = jiffies; - + if (xfs_buf_ioerror_retry(bp, cfg)) { xfs_buf_ioerror(bp, 0); xfs_buf_submit(bp); - return true; + return XBF_IOERROR_DONE; } - /* - * Repeated failure on an async write. Take action according to the - * error configuration we have been set up to use. - */ - - if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && - ++bp->b_retries > cfg->max_retries) - goto permanent_error; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) - goto permanent_error; - - /* At unmount we may treat errors differently */ - if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) - goto permanent_error; - - /* - * Still a transient error, run IO completion failure callbacks and let - * the higher layers retry the buffer. - */ - xfs_buf_do_callbacks_fail(bp); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return true; - /* * Permanent error - we need to trigger a shutdown if we haven't already * to indicate that inconsistency will result from this action. */ -permanent_error: - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_buf_ioerror_permanent(bp, cfg)) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out_stale; + } + + /* Still considered a transient error. Caller will schedule retries. */ + return XBF_IOERROR_FAIL; + out_stale: xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; trace_xfs_buf_error_relse(bp, _RET_IP_); - return false; -} - -static inline bool -xfs_buf_had_callback_errors( - struct xfs_buf *bp) -{ - - /* - * If there is an error, process it. Some errors require us to run - * callbacks after failure processing is done so we detect that and take - * appropriate action. - */ - if (bp->b_error && xfs_buf_iodone_callback_error(bp)) - return true; - - /* - * Successful IO or permanent error. Either way, we can clear the - * retry state here in preparation for the next error that may occur. 
- */ - bp->b_last_error = 0; - bp->b_retries = 0; - bp->b_first_retry_time = 0; - return false; + return XBF_IOERROR_FINISH; } static void @@ -1122,6 +1148,15 @@ xfs_buf_item_done( xfs_buf_rele(bp); } +static inline void +xfs_buf_clear_ioerror_retry_state( + struct xfs_buf *bp) +{ + bp->b_last_error = 0; + bp->b_retries = 0; + bp->b_first_retry_time = 0; +} + /* * Inode buffer iodone callback function. */ @@ -1129,9 +1164,22 @@ void xfs_buf_inode_iodone( struct xfs_buf *bp) { - if (xfs_buf_had_callback_errors(bp)) - return; + if (bp->b_error) { + int ret = xfs_buf_iodone_error(bp); + if (ret == XBF_IOERROR_FINISH) + goto finish_iodone; + if (ret == XBF_IOERROR_DONE) + return; + ASSERT(ret == XBF_IOERROR_FAIL); + xfs_buf_do_callbacks_fail(bp); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return; + } + +finish_iodone: + xfs_buf_clear_ioerror_retry_state(bp); xfs_buf_item_done(bp); xfs_iflush_done(bp); xfs_buf_ioend_finish(bp); } @@ -1144,9 +1192,22 @@ void xfs_buf_dquot_iodone( struct xfs_buf *bp) { - if (xfs_buf_had_callback_errors(bp)) - return; + if (bp->b_error) { + int ret = xfs_buf_iodone_error(bp); + if (ret == XBF_IOERROR_FINISH) + goto finish_iodone; + if (ret == XBF_IOERROR_DONE) + return; + ASSERT(ret == XBF_IOERROR_FAIL); + xfs_buf_do_callbacks_fail(bp); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return; + } + +finish_iodone: + xfs_buf_clear_ioerror_retry_state(bp); /* a newly allocated dquot buffer might have a log item attached */ xfs_buf_item_done(bp); xfs_dquot_done(bp); @@ -1163,9 +1224,22 @@ void xfs_buf_iodone( struct xfs_buf *bp) { - if (xfs_buf_had_callback_errors(bp)) - return; + if (bp->b_error) { + int ret = xfs_buf_iodone_error(bp); + if (ret == XBF_IOERROR_FINISH) + goto finish_iodone; + if (ret == XBF_IOERROR_DONE) + return; + ASSERT(ret == XBF_IOERROR_FAIL); + xfs_buf_do_callbacks_fail(bp); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return; + } + +finish_iodone: + xfs_buf_clear_ioerror_retry_state(bp); xfs_buf_item_done(bp); xfs_buf_ioend_finish(bp); } From 3536b61e74aa232d0ae42cff57b80278724f209c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:15 -0700 Subject: [PATCH 028/117] xfs: unwind log item error flagging When a buffer IO error occurs, we want to mark all the log items attached to the buffer as failed. Open code the error handling loop so that we can modify the flagging for the different types of objects directly and independently of each other. This also allows us to remove the ->iop_error method from the log item operations. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 48 ++++++++++++----------------------------- fs/xfs/xfs_dquot_item.c | 18 ---------------- fs/xfs/xfs_inode_item.c | 18 ---------------- fs/xfs/xfs_trans.h | 1 - 4 files changed, 14 insertions(+), 71 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f80fc5bd3bff..d61f20b989cd 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -12,6 +12,7 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -955,37 +956,6 @@ xfs_buf_item_relse( xfs_buf_item_free(bip); } -/* - * Invoke the error state callback for each log item affected by the failed I/O. - * - * If a metadata buffer write fails with a non-permanent error, the buffer is - * eventually resubmitted and so the completion callbacks are not run.
The error - * state may need to be propagated to the log items attached to the buffer, - * however, so the next AIL push of the item knows hot to handle it correctly. - */ -STATIC void -xfs_buf_do_callbacks_fail( - struct xfs_buf *bp) -{ - struct xfs_ail *ailp = bp->b_mount->m_ail; - struct xfs_log_item *lip; - - /* - * Buffer log item errors are handled directly by xfs_buf_item_push() - * and xfs_buf_iodone_callback_error, and they have no IO error - * callbacks. Check only for items in b_li_list. - */ - if (list_empty(&bp->b_li_list)) - return; - - spin_lock(&ailp->ail_lock); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (lip->li_ops->iop_error) - lip->li_ops->iop_error(lip, bp); - } - spin_unlock(&ailp->ail_lock); -} - /* * Decide if we're going to retry the write after a failure, and prepare * the buffer for retrying the write. @@ -1165,6 +1135,7 @@ xfs_buf_inode_iodone( struct xfs_buf *bp) { if (bp->b_error) { + struct xfs_log_item *lip; int ret = xfs_buf_iodone_error(bp); if (ret == XBF_IOERROR_FINISH) @@ -1172,7 +1143,11 @@ xfs_buf_inode_iodone( if (ret == XBF_IOERROR_DONE) return; ASSERT(ret == XBF_IOERROR_FAIL); - xfs_buf_do_callbacks_fail(bp); + spin_lock(&bp->b_mount->m_ail->ail_lock); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + xfs_set_li_failed(lip, bp); + } + spin_unlock(&bp->b_mount->m_ail->ail_lock); xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return; @@ -1193,6 +1168,7 @@ xfs_buf_dquot_iodone( struct xfs_buf *bp) { if (bp->b_error) { + struct xfs_log_item *lip; int ret = xfs_buf_iodone_error(bp); if (ret == XBF_IOERROR_FINISH) @@ -1200,7 +1176,11 @@ xfs_buf_dquot_iodone( if (ret == XBF_IOERROR_DONE) return; ASSERT(ret == XBF_IOERROR_FAIL); - xfs_buf_do_callbacks_fail(bp); + spin_lock(&bp->b_mount->m_ail->ail_lock); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + xfs_set_li_failed(lip, bp); + } + spin_unlock(&bp->b_mount->m_ail->ail_lock); xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return; @@ -1232,7 +1212,7 @@ xfs_buf_iodone( if (ret == XBF_IOERROR_DONE) return; ASSERT(ret == XBF_IOERROR_FAIL); - xfs_buf_do_callbacks_fail(bp); + ASSERT(list_empty(&bp->b_li_list)); xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return; diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 349c92d26570..d7e4de7151d7 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -113,23 +113,6 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } -/* - * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer - * have been failed during writeback - * - * this informs the AIL that the dquot is already flush locked on the next push, - * and acquires a hold on the buffer to ensure that it isn't reclaimed before - * dirty data makes it to disk. 
- */ -STATIC void -xfs_dquot_item_error( - struct xfs_log_item *lip, - struct xfs_buf *bp) -{ - ASSERT(!completion_done(&DQUOT_ITEM(lip)->qli_dquot->q_flush)); - xfs_set_li_failed(lip, bp); -} - STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, @@ -216,7 +199,6 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_release = xfs_qm_dquot_logitem_release, .iop_committing = xfs_qm_dquot_logitem_committing, .iop_push = xfs_qm_dquot_logitem_push, - .iop_error = xfs_dquot_item_error }; /* diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7049f2ae8d18..86c783dec2ba 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -464,23 +464,6 @@ xfs_inode_item_unpin( wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } -/* - * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer - * have been failed during writeback - * - * This informs the AIL that the inode is already flush locked on the next push, - * and acquires a hold on the buffer to ensure that it isn't reclaimed before - * dirty data makes it to disk. - */ -STATIC void -xfs_inode_item_error( - struct xfs_log_item *lip, - struct xfs_buf *bp) -{ - ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); - xfs_set_li_failed(lip, bp); -} - STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, @@ -619,7 +602,6 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, .iop_committing = xfs_inode_item_committing, - .iop_error = xfs_inode_item_error }; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 99a9ab9cab25..b752501818d2 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -74,7 +74,6 @@ struct xfs_item_ops { void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); - void (*iop_error)(struct xfs_log_item *, xfs_buf_t *); int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); }; From e98084b8bef7e357dbd201b162fea0817d1908c5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:15 -0700 Subject: [PATCH 029/117] xfs: move xfs_clear_li_failed out of xfs_ail_delete_one() xfs_ail_delete_one() is called directly from dquot and inode IO completion, as well as from the generic xfs_trans_ail_delete() function. Inodes are about to have their own failure handling, and dquots will in future, too. Pull the clearing of the LI_FAILED flag up into the callers so we can customise the code appropriately. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_dquot.c | 6 +----- fs/xfs/xfs_inode_item.c | 3 +-- fs/xfs/xfs_trans_ail.c | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index d5984a926d1d..76353c9a723e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1070,16 +1070,12 @@ xfs_qm_dqflush_done( test_bit(XFS_LI_FAILED, &lip->li_flags))) { spin_lock(&ailp->ail_lock); + xfs_clear_li_failed(lip); if (lip->li_lsn == qip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); } else { - /* - * Clear the failed state since we are about to drop the - * flush lock - */ - xfs_clear_li_failed(lip); spin_unlock(&ailp->ail_lock); } } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 86c783dec2ba..0ba75764a8dc 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -690,12 +690,11 @@ xfs_iflush_done( /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->ail_lock); list_for_each_entry(lip, &tmp, li_bio_list) { + xfs_clear_li_failed(lip); if (lip->li_lsn == INODE_ITEM(lip)->ili_flush_lsn) { xfs_lsn_t lsn = xfs_ail_delete_one(ailp, lip); if (!tail_lsn && lsn) tail_lsn = lsn; - } else { - xfs_clear_li_failed(lip); } } xfs_ail_update_finish(ailp, tail_lsn); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index ac5019361a13..ac33f6393f99 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -843,7 +843,6 @@ xfs_ail_delete_one( trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); - xfs_clear_li_failed(lip); clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; @@ -874,6 +873,7 @@ xfs_trans_ail_delete( } /* xfs_ail_update_finish() drops the AIL lock */ + xfs_clear_li_failed(lip); tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); } From 298f7bec503f30bd98242ec02df6abe13b31a677 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:15 -0700 Subject: [PATCH 030/117] xfs: pin inode backing buffer to the inode log item When we dirty an inode, we are going to have to write it to disk at some point in the near future. This requires the inode cluster backing buffer to be present in memory. Unfortunately, under severe memory pressure we can reclaim the inode backing buffer while the inode is dirty in memory, resulting in stalling the AIL pushing because it has to do a read-modify-write cycle on the cluster buffer. When we have no memory available, the read of the cluster buffer blocks the AIL pushing process, and this causes all sorts of issues for memory reclaim as it requires inode writeback to make forwards progress. Allocating a cluster buffer causes more memory pressure, and results in more cluster buffers to be reclaimed, resulting in more RMW cycles to be done in the AIL context and everything then backs up on AIL progress. Only the synchronous inode cluster writeback in the inode reclaim code provides some level of forwards progress guarantees that prevent OOM-killer rampages in this situation. Fix this by pinning the inode backing buffer to the inode log item when the inode is first dirtied (i.e. in xfs_trans_log_inode()). This may mean the first modification of an inode that has been held in cache for a long time may block on a cluster buffer read, but we can do that in transaction context and block safely until the buffer has been allocated and read.
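As a minimal userspace sketch of this pin-on-first-dirty, unpin-on-clean pattern (the refcounted buffer and helpers below are hypothetical stand-ins; the kernel code pins with xfs_buf_hold() and records the buffer in the log item's li_buf):

	#include <assert.h>
	#include <stdlib.h>

	struct buffer {
		int refcount;
	};

	struct log_item {
		struct buffer *buf;	/* pinned backing buffer, or NULL */
	};

	static void buf_hold(struct buffer *b) { b->refcount++; }

	static void buf_rele(struct buffer *b)
	{
		if (--b->refcount == 0)
			free(b);	/* last reference: memory can be reclaimed */
	}

	/* First modification: pin the buffer so reclaim cannot free it. */
	static void item_mark_dirty(struct log_item *li, struct buffer *b)
	{
		if (!li->buf) {		/* only pin once per dirty cycle */
			buf_hold(b);
			li->buf = b;
		}
	}

	/* Item written back and clean: drop the pin. */
	static void item_mark_clean(struct log_item *li)
	{
		struct buffer *b = li->buf;

		li->buf = NULL;
		if (b)
			buf_rele(b);
	}

	int main(void)
	{
		struct buffer *b = malloc(sizeof(*b));
		struct log_item li = { .buf = NULL };

		b->refcount = 1;		/* the cache's reference */
		item_mark_dirty(&li, b);	/* pinned: refcount == 2 */
		assert(b->refcount == 2);
		item_mark_clean(&li);		/* unpinned: refcount == 1 */
		assert(b->refcount == 1);
		buf_rele(b);			/* cache drops it: freed */
		return 0;
	}
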
Once we have the cluster buffer, the inode log item takes a reference to it, pinning it in memory, and attaches it to the log item for future reference. This means we can always grab the cluster buffer from the inode log item when we need it. When the inode is finally cleaned and removed from the AIL, we can drop the reference the inode log item holds on the cluster buffer. Once all inodes on the cluster buffer are clean, the cluster buffer will be unpinned and it will be available for memory reclaim to reclaim again. This avoids the issues with needing to do RMW cycles in the AIL pushing context, and hence allows complete non-blocking inode flushing to be performed by the AIL pushing context. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_inode_buf.c | 3 +- fs/xfs/libxfs/xfs_trans_inode.c | 53 +++++++++++++++++++++++---- fs/xfs/xfs_buf_item.c | 4 +-- fs/xfs/xfs_inode_item.c | 63 ++++++++++++++++++++++++++------- fs/xfs/xfs_trans_ail.c | 8 +++-- 5 files changed, 106 insertions(+), 25 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 5c93e8e6de74..b4a6c091571e 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -176,7 +176,8 @@ xfs_imap_to_bp( } *bpp = bp; - *dipp = xfs_buf_offset(bp, imap->im_boffset); + if (dipp) + *dipp = xfs_buf_offset(bp, imap->im_boffset); return 0; } diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index c66d9d1dd58b..ad5974365c58 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -72,13 +74,19 @@ xfs_trans_ichgtime( } /* - * This is called to mark the fields indicated in fieldmask as needing - * to be logged when the transaction is committed. The inode must - * already be associated with the given transaction. + * This is called to mark the fields indicated in fieldmask as needing to be + * logged when the transaction is committed. The inode must already be + * associated with the given transaction. * - * The values for fieldmask are defined in xfs_inode_item.h. We always - * log all of the core inode if any of it has changed, and we always log - * all of the inline data/extents/b-tree root if any of them has changed. + * The values for fieldmask are defined in xfs_inode_item.h. We always log all + * of the core inode if any of it has changed, and we always log all of the + * inline data/extents/b-tree root if any of them has changed. + * + * Grab and pin the cluster buffer associated with this inode to avoid RMW + * cycles at inode writeback time. Avoid the need to add error handling to every + * xfs_trans_log_inode() call by shutting down on read error. This will cause + * transactions to fail and everything to error out, just like if we return a + * read error in a dirty transaction and cancel it. */ void xfs_trans_log_inode( @@ -131,6 +139,39 @@ xfs_trans_log_inode( spin_lock(&iip->ili_lock); iip->ili_fsync_fields |= flags; + if (!iip->ili_item.li_buf) { + struct xfs_buf *bp; + int error; + + /* + * We hold the ILOCK here, so this inode is not going to be + * flushed while we are here. 
Further, because there is no + * buffer attached to the item, we know that there is no IO in + * progress, so nothing will clear the ili_fields while we read + * in the buffer. Hence we can safely drop the spin lock and + * read the buffer knowing that the state will not change from + * here. + */ + spin_unlock(&iip->ili_lock); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, NULL, + &bp, 0); + if (error) { + xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); + return; + } + + /* + * We need an explicit buffer reference for the log item but + * don't want the buffer to remain attached to the transaction. + * Hold the buffer but release the transaction reference. + */ + xfs_buf_hold(bp); + xfs_trans_brelse(tp, bp); + + spin_lock(&iip->ili_lock); + iip->ili_item.li_buf = bp; + } + /* * Always OR in the bits from the ili_last_fields field. This is to * coordinate with the xfs_iflush() and xfs_iflush_done() routines in diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index d61f20b989cd..ecb3362395af 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1143,11 +1143,9 @@ xfs_buf_inode_iodone( if (ret == XBF_IOERROR_DONE) return; ASSERT(ret == XBF_IOERROR_FAIL); - spin_lock(&bp->b_mount->m_ail->ail_lock); list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - xfs_set_li_failed(lip, bp); + set_bit(XFS_LI_FAILED, &lip->li_flags); } - spin_unlock(&bp->b_mount->m_ail->ail_lock); xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 0ba75764a8dc..64bdda72f7b2 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -439,6 +439,7 @@ xfs_inode_item_pin( struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(lip->li_buf); trace_xfs_inode_pin(ip, _RET_IP_); atomic_inc(&ip->i_pincount); @@ -450,6 +451,12 @@ xfs_inode_item_pin( * item which was previously pinned with a call to xfs_inode_item_pin(). * * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. + * + * Note that unpin can race with inode cluster buffer freeing marking the buffer + * stale. In that case, flush completions are run from the buffer unpin call, + * which may happen before the inode is unpinned. If we lose the race, there + * will be no buffer attached to the log item, but the inode will be marked + * XFS_ISTALE. */ STATIC void xfs_inode_item_unpin( @@ -459,6 +466,7 @@ xfs_inode_item_unpin( struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); + ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); @@ -629,10 +637,15 @@ xfs_inode_item_init( */ void xfs_inode_item_destroy( - xfs_inode_t *ip) + struct xfs_inode *ip) { - kmem_free(ip->i_itemp->ili_item.li_lv_shadow); - kmem_cache_free(xfs_ili_zone, ip->i_itemp); + struct xfs_inode_log_item *iip = ip->i_itemp; + + ASSERT(iip->ili_item.li_buf == NULL); + + ip->i_itemp = NULL; + kmem_free(iip->ili_item.li_lv_shadow); + kmem_cache_free(xfs_ili_zone, iip); } @@ -673,11 +686,10 @@ xfs_iflush_done( list_move_tail(&lip->li_bio_list, &tmp); /* Do an unlocked check for needing the AIL lock. 
*/ - if (lip->li_lsn == iip->ili_flush_lsn || + if (iip->ili_flush_lsn == lip->li_lsn || test_bit(XFS_LI_FAILED, &lip->li_flags)) need_ail++; } - ASSERT(list_empty(&bp->b_li_list)); /* * We only want to pull the item from the AIL if it is actually there @@ -690,7 +702,7 @@ xfs_iflush_done( /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->ail_lock); list_for_each_entry(lip, &tmp, li_bio_list) { - xfs_clear_li_failed(lip); + clear_bit(XFS_LI_FAILED, &lip->li_flags); if (lip->li_lsn == INODE_ITEM(lip)->ili_flush_lsn) { xfs_lsn_t lsn = xfs_ail_delete_one(ailp, lip); if (!tail_lsn && lsn) @@ -706,14 +718,29 @@ xfs_iflush_done( * them is safely on disk. */ list_for_each_entry_safe(lip, n, &tmp, li_bio_list) { + bool drop_buffer = false; + list_del_init(&lip->li_bio_list); iip = INODE_ITEM(lip); spin_lock(&iip->ili_lock); - iip->ili_last_fields = 0; - spin_unlock(&iip->ili_lock); + /* + * Remove the reference to the cluster buffer if the inode is + * clean in memory. Drop the buffer reference once we've dropped + * the locks we hold. + */ + ASSERT(iip->ili_item.li_buf == bp); + if (!iip->ili_fields) { + iip->ili_item.li_buf = NULL; + drop_buffer = true; + } + iip->ili_last_fields = 0; + iip->ili_flush_lsn = 0; + spin_unlock(&iip->ili_lock); xfs_ifunlock(iip->ili_inode); + if (drop_buffer) + xfs_buf_rele(bp); } } @@ -725,12 +752,20 @@ xfs_iflush_done( */ void xfs_iflush_abort( - struct xfs_inode *ip) + struct xfs_inode *ip) { - struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_buf *bp = NULL; if (iip) { + /* + * Clear the failed bit before removing the item from the AIL so + * xfs_trans_ail_delete() doesn't try to clear and release the + * buffer attached to the log item before we are done with it. + */ + clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); xfs_trans_ail_delete(&iip->ili_item, 0); + /* * Clear the inode logging fields so no more flushes are * attempted. @@ -739,12 +774,14 @@ xfs_iflush_abort( iip->ili_last_fields = 0; iip->ili_fields = 0; iip->ili_fsync_fields = 0; + iip->ili_flush_lsn = 0; + bp = iip->ili_item.li_buf; + iip->ili_item.li_buf = NULL; spin_unlock(&iip->ili_lock); } - /* - * Release the inode's flush lock since we're done with it. - */ xfs_ifunlock(ip); + if (bp) + xfs_buf_rele(bp); } /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index ac33f6393f99..c3be6e440134 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -377,8 +377,12 @@ xfsaild_resubmit_item( } /* protected by ail_lock */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - xfs_clear_li_failed(lip); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + if (bp->b_flags & _XBF_INODES) + clear_bit(XFS_LI_FAILED, &lip->li_flags); + else + xfs_clear_li_failed(lip); + } xfs_buf_unlock(bp); return XFS_ITEM_SUCCESS; From 993f951f501c85e963b3664739c07196a286eac7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:16 -0700 Subject: [PATCH 031/117] xfs: make inode reclaim almost non-blocking Now that dirty inode writeback doesn't cause read-modify-write cycles on the inode cluster buffer under memory pressure, the need to throttle memory reclaim to the rate at which we can clean dirty inodes goes away. That is due to the fact that we no longer thrash inode cluster buffers under memory pressure to clean dirty inodes. This means inode writeback no longer stalls on memory allocation or read IO, and hence can be done asynchronously without generating memory pressure. 
As a result, blocking inode writeback in reclaim is no longer necessary to prevent reclaim priority windup as cleaning dirty inodes is no longer dependent on having memory reserves available for the filesystem to make progress reclaiming inodes. Hence we can convert inode reclaim to be non-blocking for shrinker callouts, both for direct reclaim and kswapd. On a vanilla kernel, running a 16-way fsmark create workload on a 4 node/16p/16GB RAM machine, I can reliably pin 14.75GB of RAM via userspace mlock(). The OOM killer gets invoked at 15GB of pinned RAM. Without the inode cluster pinning, this non-blocking reclaim patch triggers premature OOM killer invocation with the same memory pinning, sometimes with as much as 45% of RAM being free. It's trivially easy to trigger the OOM killer when reclaim does not block. With pinning inode clusters in RAM and then adding this patch, I can reliably pin 14.5GB of RAM and still have the fsmark workload run to completion. The OOM killer gets invoked at 14.75GB of pinned RAM, which is only a small amount of memory less than the vanilla kernel. It is much more reliable than just with async reclaim alone. simoops shows that allocation stalls go away when async reclaim is used. Vanilla kernel: Run time: 1924 seconds Read latency (p50: 3,305,472) (p95: 3,723,264) (p99: 4,001,792) Write latency (p50: 184,064) (p95: 553,984) (p99: 807,936) Allocation latency (p50: 2,641,920) (p95: 3,911,680) (p99: 4,464,640) work rate = 13.45/sec (avg 13.44/sec) (p50: 13.46) (p95: 13.58) (p99: 13.70) alloc stall rate = 3.80/sec (avg: 2.59) (p50: 2.54) (p95: 2.96) (p99: 3.02) With inode cluster pinning and async reclaim: Run time: 1924 seconds Read latency (p50: 3,305,472) (p95: 3,715,072) (p99: 3,977,216) Write latency (p50: 187,648) (p95: 553,984) (p99: 789,504) Allocation latency (p50: 2,748,416) (p95: 3,919,872) (p99: 4,448,256) work rate = 13.28/sec (avg 13.32/sec) (p50: 13.26) (p95: 13.34) (p99: 13.34) alloc stall rate = 0.02/sec (avg: 0.02) (p50: 0.01) (p95: 0.03) (p99: 0.03) Latencies don't really change much, nor does the work rate. However, allocation almost never stalls with these changes, whilst the vanilla kernel is sometimes reporting 20 stalls/s over a 60s sample period. This difference is due to inode reclaim being largely non-blocking now. IOWs, once we have pinned inode cluster buffers, we can make inode reclaim non-blocking without a major risk of premature and/or spurious OOM killer invocation, and without any changes to memory reclaim infrastructure. Signed-off-by: Dave Chinner Reviewed-by: Amir Goldstein Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 59dea8178ae3..01f8efc5f59c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1402,7 +1402,7 @@ xfs_reclaim_inodes_nr( xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); + return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); } /* From 617825fe3489ac231790e5c843107168838b8547 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:16 -0700 Subject: [PATCH 032/117] xfs: remove IO submission from xfs_reclaim_inode() We no longer need to issue IO from shrinker based inode reclaim to prevent spurious OOM killer invocation.
This leaves only the global filesystem management operations such as unmount needing to writeback dirty inodes and reclaim them. Instead of using the reclaim pass to write dirty inodes before reclaiming them, use the AIL to push all the dirty inodes before we try to reclaim them. This allows us to remove all the conditional SYNC_WAIT locking and the writeback code from xfs_reclaim_inode() and greatly simplify the checks we need to do to reclaim an inode. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 117 ++++++++++++-------------------------------- 1 file changed, 31 insertions(+), 86 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 01f8efc5f59c..fa4df9b4edb5 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1111,24 +1111,17 @@ xfs_reclaim_inode_grab( * dirty, async => requeue * dirty, sync => flush, wait and reclaim */ -STATIC int +static bool xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, int sync_mode) { - struct xfs_buf *bp = NULL; xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ - int error; -restart: - error = 0; xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!xfs_iflock_nowait(ip)) { - if (!(sync_mode & SYNC_WAIT)) - goto out; - xfs_iflock(ip); - } + if (!xfs_iflock_nowait(ip)) + goto out; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); @@ -1136,52 +1129,12 @@ restart: xfs_iflush_abort(ip); goto reclaim; } - if (xfs_ipincount(ip)) { - if (!(sync_mode & SYNC_WAIT)) - goto out_ifunlock; - xfs_iunpin_wait(ip); - } - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - goto reclaim; - } - - /* - * Never flush out dirty data during non-blocking reclaim, as it would - * just contend with AIL pushing trying to do the same job. - */ - if (!(sync_mode & SYNC_WAIT)) + if (xfs_ipincount(ip)) + goto out_ifunlock; + if (!xfs_inode_clean(ip)) goto out_ifunlock; - /* - * Now we have an inode that needs flushing. - * - * Note that xfs_iflush will never block on the inode buffer lock, as - * xfs_ifree_cluster() can lock the inode buffer before it locks the - * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_imap_to_bp() to get the cluster buffer would - * result in an ABBA deadlock with xfs_ifree_cluster(). - * - * As xfs_ifree_cluser() must gather all inodes that are active in the - * cache to mark them stale, if we hit this case we don't actually want - * to do IO here - we want the inode marked stale so we can simply - * reclaim it. Hence if we get an EAGAIN error here, just unlock the - * inode, back off and try again. Hopefully the next pass through will - * see the stale flag set on the inode. - */ - error = xfs_iflush(ip, &bp); - if (error == -EAGAIN) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* backoff longer than in xfs_ifree_cluster */ - delay(2); - goto restart; - } - - if (!error) { - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - } - + xfs_ifunlock(ip); reclaim: ASSERT(!xfs_isiflocked(ip)); @@ -1231,21 +1184,14 @@ reclaim: ASSERT(xfs_inode_clean(ip)); __xfs_inode_free(ip); - return error; + return true; out_ifunlock: xfs_ifunlock(ip); out: - xfs_iflags_clear(ip, XFS_IRECLAIM); xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * We could return -EAGAIN here to make reclaim rescan the inode tree in - * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and the reclaim work never goes back to - * the idle state. 
Instead, return 0 to let the next scheduled - * background reclaim attempt to reclaim the inode again. - */ - return 0; + xfs_iflags_clear(ip, XFS_IRECLAIM); + return false; } /* @@ -1253,21 +1199,22 @@ out: * corrupted, we still want to try to reclaim all the inodes. If we don't, * then a shut down during filesystem unmount reclaim walk leak all the * unreclaimed inodes. + * + * Returns non-zero if any AGs or inodes were skipped in the reclaim pass + * so that callers that want to block until all dirty inodes are written back + * and reclaimed can sanely loop. */ -STATIC int +static int xfs_reclaim_inodes_ag( struct xfs_mount *mp, int flags, int *nr_to_scan) { struct xfs_perag *pag; - int error = 0; - int last_error = 0; xfs_agnumber_t ag; int trylock = flags & SYNC_TRYLOCK; int skipped; -restart: ag = 0; skipped = 0; while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { @@ -1341,9 +1288,8 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = xfs_reclaim_inode(batch[i], pag, flags); - if (error && last_error != -EFSCORRUPTED) - last_error = error; + if (!xfs_reclaim_inode(batch[i], pag, flags)) + skipped++; } *nr_to_scan -= XFS_LOOKUP_BATCH; @@ -1359,19 +1305,7 @@ restart: mutex_unlock(&pag->pag_ici_reclaim_lock); xfs_perag_put(pag); } - - /* - * if we skipped any AG, and we still have scan count remaining, do - * another pass this time using blocking reclaim semantics (i.e - * waiting on the reclaim locks and ignoring the reclaim cursors). This - * ensure that when we get more reclaimers than AGs we block rather - * than spin trying to execute reclaim. - */ - if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { - trylock = 0; - goto restart; - } - return last_error; + return skipped; } int @@ -1380,8 +1314,18 @@ xfs_reclaim_inodes( int mode) { int nr_to_scan = INT_MAX; + int skipped; - return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); + xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); + if (!(mode & SYNC_WAIT)) + return 0; + + do { + xfs_ail_push_all_sync(mp->m_ail); + skipped = xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); + } while (skipped > 0); + + return 0; } /* @@ -1402,7 +1346,8 @@ xfs_reclaim_inodes_nr( xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); + xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); + return 0; } /* From 0e8e2c6343dd74a4f55f8507a9fae9064d456436 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:16 -0700 Subject: [PATCH 033/117] xfs: allow multiple reclaimers per AG Inode reclaim will still throttle direct reclaim on the per-ag reclaim locks. This is no longer necessary as reclaim can run non-blocking now. Hence we can remove these locks so that we don't arbitrarily block reclaimers just because there are more direct reclaimers than there are AGs. This can result in multiple reclaimers working on the same range of an AG, but this doesn't cause any apparent issues. Optimising the spread of concurrent reclaimers for best efficiency can be done in a future patchset. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_icache.c | 31 ++++++++++++------------------- fs/xfs/xfs_mount.c | 4 ---- fs/xfs/xfs_mount.h | 1 - 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fa4df9b4edb5..592eab23c6e7 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1211,12 +1211,9 @@ xfs_reclaim_inodes_ag( int *nr_to_scan) { struct xfs_perag *pag; - xfs_agnumber_t ag; - int trylock = flags & SYNC_TRYLOCK; - int skipped; + xfs_agnumber_t ag = 0; + int skipped = 0; - ag = 0; - skipped = 0; while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { unsigned long first_index = 0; int done = 0; @@ -1224,15 +1221,13 @@ xfs_reclaim_inodes_ag( ag = pag->pag_agno + 1; - if (trylock) { - if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { - skipped++; - xfs_perag_put(pag); - continue; - } - first_index = pag->pag_ici_reclaim_cursor; - } else - mutex_lock(&pag->pag_ici_reclaim_lock); + /* + * If the cursor is not zero, we haven't scanned the whole AG + * so we might have skipped inodes here. + */ + first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); + if (first_index) + skipped++; do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; @@ -1298,11 +1293,9 @@ xfs_reclaim_inodes_ag( } while (nr_found && !done && *nr_to_scan > 0); - if (trylock && !done) - pag->pag_ici_reclaim_cursor = first_index; - else - pag->pag_ici_reclaim_cursor = 0; - mutex_unlock(&pag->pag_ici_reclaim_lock); + if (done) + first_index = 0; + WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); xfs_perag_put(pag); } return skipped; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d5dcf9869860..03158b42a194 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -148,7 +148,6 @@ xfs_free_perag( ASSERT(atomic_read(&pag->pag_ref) == 0); xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); - mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -200,7 +199,6 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; spin_lock_init(&pag->pag_ici_lock); - mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); if (xfs_buf_hash_init(pag)) goto out_free_pag; @@ -242,7 +240,6 @@ xfs_initialize_perag( out_hash_destroy: xfs_buf_hash_destroy(pag); out_free_pag: - mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ @@ -252,7 +249,6 @@ out_unwind_new_pags: break; xfs_buf_hash_destroy(pag); xfs_iunlink_destroy(pag); - mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 3725d25ad97e..a72cfcaa4ad1 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -354,7 +354,6 @@ typedef struct xfs_perag { spinlock_t pag_ici_lock; /* incore inode cache lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ - struct mutex pag_ici_reclaim_lock; /* serialisation point */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* buffer cache index */ From 9552e14d3e879a3b4281427ef368271f371ea167 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:17 -0700 Subject: [PATCH 034/117] xfs: don't block inode reclaim on the ILOCK When we attempt to reclaim an inode, the first thing we do is take the inode lock. This is blocking right now, so if the inode is being accessed by something else (e.g. being flushed to the cluster buffer) we will block here.
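Modelled as a hypothetical userspace sketch, with a pthread mutex standing in for the ILOCK (the real change uses xfs_ilock_nowait() and xfs_iflock_nowait()), the non-blocking attempt looks roughly like this:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct inode {
		pthread_mutex_t	lock;	/* stand-in for the XFS ILOCK */
		bool		dirty;
	};

	/*
	 * Non-blocking reclaim attempt: if the inode is locked by someone
	 * else (e.g. being flushed to its cluster buffer), skip it rather
	 * than wait. The scan moves on and revisits it on a later pass.
	 */
	static bool try_reclaim(struct inode *ip)
	{
		if (pthread_mutex_trylock(&ip->lock) != 0)
			return false;	/* contended: skip, don't block */
		if (ip->dirty) {
			pthread_mutex_unlock(&ip->lock);
			return false;	/* dirty: leave it for AIL writeback */
		}
		/* ... clean and unlocked: safe to free the inode here ... */
		pthread_mutex_unlock(&ip->lock);
		return true;
	}

	int main(void)
	{
		struct inode ip = { .lock = PTHREAD_MUTEX_INITIALIZER,
				    .dirty = false };

		printf("reclaimed: %s\n", try_reclaim(&ip) ? "yes" : "no");
		return 0;
	}
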
Change this to a trylock so that we do not block inode reclaim unnecessarily here. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 592eab23c6e7..f387ec21dd35 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1119,9 +1119,10 @@ xfs_reclaim_inode( { xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!xfs_iflock_nowait(ip)) + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) goto out; + if (!xfs_iflock_nowait(ip)) + goto out_iunlock; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); @@ -1188,8 +1189,9 @@ reclaim: out_ifunlock: xfs_ifunlock(ip); -out: +out_iunlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: xfs_iflags_clear(ip, XFS_IRECLAIM); return false; } From 50718b8d73dda01bb168f9f3b16f6311a2debe7b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 1 Jul 2020 10:21:05 -0700 Subject: [PATCH 035/117] xfs: remove SYNC_TRYLOCK from inode reclaim All background reclaim is SYNC_TRYLOCK already, and even blocking reclaim (SYNC_WAIT) can use trylock mechanisms as xfs_reclaim_inodes_ag() will keep cycling until there are no more reclaimable inodes. Hence we can kill SYNC_TRYLOCK from inode reclaim and make everything unconditionally non-blocking. We remove all the optimistic "avoid blocking on locks" checks done in xfs_reclaim_inode_grab() as nothing blocks on locks anymore. Further, checking XFS_IFLOCK optimistically can result in detecting inodes in the process of being cleaned (i.e. between being removed from the AIL and having the flush lock dropped), so for xfs_reclaim_inodes() to reliably reclaim all inodes we need to drop these checks anyway. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 63 ++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f387ec21dd35..8d18117242e1 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -174,7 +174,7 @@ xfs_reclaim_worker( struct xfs_mount *mp = container_of(to_delayed_work(work), struct xfs_mount, m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_reclaim_inodes(mp, 0); xfs_reclaim_work_queue(mp); } @@ -1028,48 +1028,37 @@ xfs_cowblocks_worker( /* * Grab the inode for reclaim exclusively. - * Return 0 if we grabbed it, non-zero otherwise. + * + * We have found this inode via a lookup under RCU, so the inode may have + * already been freed, or it may be in the process of being recycled by + * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode + * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE + * will not be set. Hence we need to check for both these flag conditions to + * avoid inodes that are no longer reclaim candidates. + * + * Note: checking for other state flags here, under the i_flags_lock or not, is + * racy and should be avoided. Those races should be resolved only after we have + * ensured that we are able to reclaim this inode and the world can see that we + * are going to reclaim it. + * + * Return true if we grabbed it, false otherwise. 
*/ -STATIC int +static bool xfs_reclaim_inode_grab( - struct xfs_inode *ip, - int flags) + struct xfs_inode *ip) { ASSERT(rcu_read_lock_held()); - /* quick check for stale RCU freed inode */ - if (!ip->i_ino) - return 1; - - /* - * If we are asked for non-blocking operation, do unlocked checks to - * see if the inode already is being flushed or in reclaim to avoid - * lock traffic. - */ - if ((flags & SYNC_TRYLOCK) && - __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) - return 1; - - /* - * The radix tree lock here protects a thread in xfs_iget from racing - * with us starting reclaim on the inode. Once we have the - * XFS_IRECLAIM flag set it will not touch us. - * - * Due to RCU lookup, we may find inodes that have been freed and only - * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that - * aren't candidates for reclaim at all, so we must check the - * XFS_IRECLAIMABLE is set first before proceeding to reclaim. - */ spin_lock(&ip->i_flags_lock); if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || __xfs_iflags_test(ip, XFS_IRECLAIM)) { /* not a reclaim candidate. */ spin_unlock(&ip->i_flags_lock); - return 1; + return false; } __xfs_iflags_set(ip, XFS_IRECLAIM); spin_unlock(&ip->i_flags_lock); - return 0; + return true; } /* @@ -1114,8 +1103,7 @@ xfs_reclaim_inode_grab( static bool xfs_reclaim_inode( struct xfs_inode *ip, - struct xfs_perag *pag, - int sync_mode) + struct xfs_perag *pag) { xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ @@ -1209,7 +1197,6 @@ out: static int xfs_reclaim_inodes_ag( struct xfs_mount *mp, - int flags, int *nr_to_scan) { struct xfs_perag *pag; @@ -1254,7 +1241,7 @@ xfs_reclaim_inodes_ag( for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_reclaim_inode_grab(ip, flags)) + if (done || !xfs_reclaim_inode_grab(ip)) batch[i] = NULL; /* @@ -1285,7 +1272,7 @@ xfs_reclaim_inodes_ag( for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - if (!xfs_reclaim_inode(batch[i], pag, flags)) + if (!xfs_reclaim_inode(batch[i], pag)) skipped++; } @@ -1311,13 +1298,13 @@ xfs_reclaim_inodes( int nr_to_scan = INT_MAX; int skipped; - xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); + xfs_reclaim_inodes_ag(mp, &nr_to_scan); if (!(mode & SYNC_WAIT)) return 0; do { xfs_ail_push_all_sync(mp->m_ail); - skipped = xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); + skipped = xfs_reclaim_inodes_ag(mp, &nr_to_scan); } while (skipped > 0); return 0; @@ -1341,7 +1328,7 @@ xfs_reclaim_inodes_nr( xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); + xfs_reclaim_inodes_ag(mp, &nr_to_scan); return 0; } From 4d0bab3a44686f26be7ee7295c6c1987605ae35e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 1 Jul 2020 10:21:28 -0700 Subject: [PATCH 036/117] xfs: remove SYNC_WAIT from xfs_reclaim_inodes() Clean up xfs_reclaim_inodes() callers. Most callers want blocking behaviour, so just make the existing SYNC_WAIT behaviour the default. For the xfs_reclaim_worker(), just call xfs_reclaim_inodes_ag() directly because we just want optimistic clean inode reclaim to be done in the background. For xfs_quiesce_attr() we can just remove the inode reclaim calls as they are a historic relic that was required to flush dirty inodes that contained unlogged changes. We now log all changes to the inodes, so the sync AIL push from xfs_log_quiesce() called by xfs_quiesce_attr() will do all the required inode writeback for freeze. 
Seeing as we now want to loop until all reclaimable inodes have been reclaimed, make xfs_reclaim_inodes() loop on the XFS_ICI_RECLAIM_TAG tag rather than having xfs_reclaim_inodes_ag() tell it that inodes were skipped. This is much more reliable and will always loop until all reclaimable inodes are reclaimed. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 79 +++++++++++++++++---------------------------- fs/xfs/xfs_icache.h | 2 +- fs/xfs/xfs_mount.c | 11 +++---- fs/xfs/xfs_super.c | 3 -- 4 files changed, 35 insertions(+), 60 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 8d18117242e1..f4e7b98d9639 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -160,24 +160,6 @@ xfs_reclaim_work_queue( rcu_read_unlock(); } -/* - * This is a fast pass over the inode cache to try to get reclaim moving on as - * many inodes as possible in a short period of time. It kicks itself every few - * seconds, as well as being kicked by the inode cache shrinker when memory - * goes low. It scans as quickly as possible avoiding locked inodes or those - * already being flushed, and once done schedules a future pass. - */ -void -xfs_reclaim_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_reclaim_work); - - xfs_reclaim_inodes(mp, 0); - xfs_reclaim_work_queue(mp); -} - static void xfs_perag_set_reclaim_tag( struct xfs_perag *pag) @@ -1100,7 +1082,7 @@ xfs_reclaim_inode_grab( * dirty, async => requeue * dirty, sync => flush, wait and reclaim */ -static bool +static void xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag) @@ -1173,7 +1155,7 @@ reclaim: ASSERT(xfs_inode_clean(ip)); __xfs_inode_free(ip); - return true; + return; out_ifunlock: xfs_ifunlock(ip); @@ -1181,7 +1163,6 @@ out_iunlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: xfs_iflags_clear(ip, XFS_IRECLAIM); - return false; } /* @@ -1194,14 +1175,13 @@ out: * so that callers that want to block until all dirty inodes are written back * and reclaimed can sanely loop. */ -static int +static void xfs_reclaim_inodes_ag( struct xfs_mount *mp, int *nr_to_scan) { struct xfs_perag *pag; xfs_agnumber_t ag = 0; - int skipped = 0; while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { unsigned long first_index = 0; @@ -1210,14 +1190,7 @@ xfs_reclaim_inodes_ag( ag = pag->pag_agno + 1; - /* - * If the cursor is not zero, we haven't scanned the whole AG - * so we might have skipped inodes here. 
- */ first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); - if (first_index) - skipped++; - do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; int i; @@ -1270,16 +1243,12 @@ xfs_reclaim_inodes_ag( rcu_read_unlock(); for (i = 0; i < nr_found; i++) { - if (!batch[i]) - continue; - if (!xfs_reclaim_inode(batch[i], pag)) - skipped++; + if (batch[i]) + xfs_reclaim_inode(batch[i], pag); } *nr_to_scan -= XFS_LOOKUP_BATCH; - cond_resched(); - } while (nr_found && !done && *nr_to_scan > 0); if (done) @@ -1287,27 +1256,18 @@ xfs_reclaim_inodes_ag( WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); xfs_perag_put(pag); } - return skipped; } -int +void xfs_reclaim_inodes( - xfs_mount_t *mp, - int mode) + struct xfs_mount *mp) { int nr_to_scan = INT_MAX; - int skipped; - xfs_reclaim_inodes_ag(mp, &nr_to_scan); - if (!(mode & SYNC_WAIT)) - return 0; - - do { + while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); - skipped = xfs_reclaim_inodes_ag(mp, &nr_to_scan); - } while (skipped > 0); - - return 0; + xfs_reclaim_inodes_ag(mp, &nr_to_scan); + }; } /* @@ -1426,6 +1386,25 @@ xfs_inode_matches_eofb( return true; } +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + int nr_to_scan = INT_MAX; + + xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_reclaim_work_queue(mp); +} + STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 93b54e7d55f0..ae92ca53de42 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -51,7 +51,7 @@ void xfs_inode_free(struct xfs_inode *ip); void xfs_reclaim_worker(struct work_struct *work); -int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); +void xfs_reclaim_inodes(struct xfs_mount *mp); int xfs_reclaim_inodes_count(struct xfs_mount *mp); long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 03158b42a194..c8ae49a1e99c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1011,7 +1011,7 @@ xfs_mountfs( * quota inodes. */ cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_reclaim_inodes(mp); xfs_health_unmount(mp); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; @@ -1088,13 +1088,12 @@ xfs_unmountfs( xfs_ail_push_all_sync(mp->m_ail); /* - * And reclaim all inodes. At this point there should be no dirty - * inodes and none should be pinned or locked, but use synchronous - * reclaim just to be sure. We can stop background inode reclaim - * here as well if it is still running. + * Reclaim all inodes. At this point there should be no dirty inodes and + * none should be pinned or locked. Stop background inode reclaim here + * if it is still running. 
*/ cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_reclaim_inodes(mp); xfs_health_unmount(mp); xfs_qm_unmount(mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 379cbff438bc..5a5d9453cf51 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -890,9 +890,6 @@ xfs_quiesce_attr( /* force the log to unpin objects from the now complete transactions */ xfs_log_force(mp, XFS_LOG_SYNC); - /* reclaim inodes to do any IO before the freeze completes */ - xfs_reclaim_inodes(mp, 0); - xfs_reclaim_inodes(mp, SYNC_WAIT); /* Push the superblock and write an unmount record */ error = xfs_log_sbcount(mp); From 02511a5a6a49f9730ad215caa77e4d980008c6c6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:18 -0700 Subject: [PATCH 037/117] xfs: clean up inode reclaim comments Inode reclaim is quite different now to the way described in various comments, so update all the comments explaining what it does and how it works. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 128 ++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 93 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f4e7b98d9639..dc90a81abb1a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -141,11 +141,8 @@ xfs_inode_free( } /* - * Queue a new inode reclaim pass if there are reclaimable inodes and there - * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs periodic sync default of 30s. Perhaps this should have it's own - * tunable, but that can be done if this method proves to be ineffective or too - * aggressive. + * Queue background inode reclaim work if there are reclaimable inodes and there + * isn't reclaim work already scheduled or in progress. */ static void xfs_reclaim_work_queue( @@ -600,48 +597,31 @@ out_destroy: } /* - * Look up an inode by number in the given file system. - * The inode is looked up in the cache held in each AG. - * If the inode is found in the cache, initialise the vfs inode - * if necessary. + * Look up an inode by number in the given file system. The inode is looked up + * in the cache held in each AG. If the inode is found in the cache, initialise + * the vfs inode if necessary. * - * If it is not in core, read it in from the file system's device, - * add it to the cache and initialise the vfs inode. + * If it is not in core, read it in from the file system's device, add it to the + * cache and initialise the vfs inode. * * The inode is locked according to the value of the lock_flags parameter. - * This flag parameter indicates how and if the inode's IO lock and inode lock - * should be taken. - * - * mp -- the mount point structure for the current file system. It points - * to the inode hash table. - * tp -- a pointer to the current transaction if there is one. This is - * simply passed through to the xfs_iread() call. - * ino -- the number of the inode desired. This is the unique identifier - * within the file system for the inode being requested. - * lock_flags -- flags indicating how to lock the inode. See the comment - * for xfs_ilock() for a list of valid values. + * Inode lookup is only done during metadata operations and not as part of the + * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup. 
*/ int xfs_iget( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp) + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + struct xfs_inode **ipp) { - xfs_inode_t *ip; - int error; - xfs_perag_t *pag; - xfs_agino_t agino; + struct xfs_inode *ip; + struct xfs_perag *pag; + xfs_agino_t agino; + int error; - /* - * xfs_reclaim_inode() uses the ILOCK to ensure an inode - * doesn't get freed while it's being referenced during a - * radix tree traversal here. It assumes this function - * aqcuires only the ILOCK (and therefore it has no need to - * involve the IOLOCK in this synchronization). - */ ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); /* reject inode numbers outside existing AGs */ @@ -758,15 +738,7 @@ xfs_inode_walk_ag_grab( ASSERT(rcu_read_lock_held()); - /* - * check for stale RCU freed inode - * - * If the inode has been reallocated, it doesn't matter if it's not in - * the AG we are walking - we are walking for writeback, so if it - * passes all the "valid inode" checks and is dirty, then we'll write - * it back anyway. If it has been reallocated and still being - * initialised, the XFS_INEW check below will catch it. - */ + /* Check for stale RCU freed inode */ spin_lock(&ip->i_flags_lock); if (!ip->i_ino) goto out_unlock_noent; @@ -1044,43 +1016,16 @@ xfs_reclaim_inode_grab( } /* - * Inodes in different states need to be treated differently. The following - * table lists the inode states and the reclaim actions necessary: + * Inode reclaim is non-blocking, so the default action if progress cannot be + * made is to "requeue" the inode for reclaim by unlocking it and clearing the + * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about + * blocking anymore and hence we can wait for the inode to be able to reclaim + * it. * - * inode state iflush ret required action - * --------------- ---------- --------------- - * bad - reclaim - * shutdown EIO unpin and reclaim - * clean, unpinned 0 reclaim - * stale, unpinned 0 reclaim - * clean, pinned(*) 0 requeue - * stale, pinned EAGAIN requeue - * dirty, async - requeue - * dirty, sync 0 reclaim - * - * (*) dgc: I don't think the clean, pinned state is possible but it gets - * handled anyway given the order of checks implemented. - * - * Also, because we get the flush lock first, we know that any inode that has - * been flushed delwri has had the flush completed by the time we check that - * the inode is clean. - * - * Note that because the inode is flushed delayed write by AIL pushing, the - * flush lock may already be held here and waiting on it can result in very - * long latencies. Hence for sync reclaims, where we wait on the flush lock, - * the caller should push the AIL first before trying to reclaim inodes to - * minimise the amount of time spent waiting. For background relaim, we only - * bother to reclaim clean inodes anyway. - * - * Hence the order of actions after gaining the locks should be: - * bad => reclaim - * shutdown => unpin and reclaim - * pinned, async => requeue - * pinned, sync => unpin - * stale => reclaim - * clean => reclaim - * dirty, async => requeue - * dirty, sync => flush, wait and reclaim + * We do no IO here - if callers require inodes to be cleaned they must push the + * AIL first to trigger writeback of dirty inodes. This enables writeback to be + * done in the background in a non-blocking manner, and enables memory reclaim + * to make progress without blocking. 
*/ static void xfs_reclaim_inode( @@ -1271,13 +1216,11 @@ xfs_reclaim_inodes( } /* - * Scan a certain number of inodes for reclaim. - * - * When called we make sure that there is a background (fast) inode reclaim in - * progress, while we will throttle the speed of reclaim via doing synchronous - * reclaim of inodes. That means if we come across dirty inodes, we wait for - * them to be cleaned, which we hope will not be very long due to the - * background walker having already kicked the IO off on those dirty inodes. + * The shrinker infrastructure determines how many inodes we should scan for + * reclaim. We want as many clean inodes ready to reclaim as possible, so we + * push the AIL here. We also want to proactively free up memory if we can to + * minimise the amount of work memory reclaim has to do so we kick the + * background reclaim if it isn't already scheduled. */ long xfs_reclaim_inodes_nr( @@ -1390,8 +1333,7 @@ xfs_inode_matches_eofb( * This is a fast pass over the inode cache to try to get reclaim moving on as * many inodes as possible in a short period of time. It kicks itself every few * seconds, as well as being kicked by the inode cache shrinker when memory - * goes low. It scans as quickly as possible avoiding locked inodes or those - * already being flushed, and once done schedules a future pass. + * goes low. */ void xfs_reclaim_worker( From 71e3e35646861f2f9b8d36e00720904ed3ca31cb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:18 -0700 Subject: [PATCH 038/117] xfs: rework stale inodes in xfs_ifree_cluster Once we have inodes pinning the cluster buffer and attached whenever they are dirty, we no longer have a guarantee that the items are flush locked when we lock the cluster buffer. Hence we cannot just walk the buffer log item list and modify the attached inodes. If the inode is not flush locked, we have to ILOCK it first and then flush lock it to do all the prerequisite checks needed to avoid races with other code. This is already handled by xfs_ifree_get_one_inode(), so rework the inode iteration loop and function to update all inodes in cache whether they are attached to the buffer or not. Note: we also remove the copying of the log item lsn to the ili_flush_lsn as xfs_iflush_done() now uses the XFS_ISTALE flag to trigger aborts and so flush lsn matching is not needed in IO completion for processing freed inodes. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 165 ++++++++++++++++++--------------------------- 1 file changed, 64 insertions(+), 101 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 77cc9cbcd311..1c3a8bed4875 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2517,17 +2517,19 @@ out: } /* - * Look up the inode number specified and mark it stale if it is found. If it is - * dirty, return the inode so it can be attached to the cluster buffer so it can - * be processed appropriately when the cluster free transaction completes. + * Look up the inode number specified and if it is not already marked XFS_ISTALE + * mark it stale. We should only find clean inodes in this lookup that aren't + * already stale. 
*/ -static struct xfs_inode * -xfs_ifree_get_one_inode( - struct xfs_perag *pag, +static void +xfs_ifree_mark_inode_stale( + struct xfs_buf *bp, struct xfs_inode *free_ip, xfs_ino_t inum) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = bp->b_mount; + struct xfs_perag *pag = bp->b_pag; + struct xfs_inode_log_item *iip; struct xfs_inode *ip; retry: @@ -2535,8 +2537,10 @@ retry: ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); /* Inode not in memory, nothing to do */ - if (!ip) - goto out_rcu_unlock; + if (!ip) { + rcu_read_unlock(); + return; + } /* * because this is an RCU protected lookup, we could find a recently @@ -2547,9 +2551,9 @@ retry: spin_lock(&ip->i_flags_lock); if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { spin_unlock(&ip->i_flags_lock); - goto out_rcu_unlock; + rcu_read_unlock(); + return; } - spin_unlock(&ip->i_flags_lock); /* * Don't try to lock/unlock the current inode, but we _cannot_ skip the @@ -2559,43 +2563,53 @@ retry: */ if (ip != free_ip) { if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); delay(1); goto retry; } - - /* - * Check the inode number again in case we're racing with - * freeing in xfs_reclaim_inode(). See the comments in that - * function for more information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_rcu_unlock; - } } + ip->i_flags |= XFS_ISTALE; + spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); - /* - * We don't need to attach clean inodes or those only with unlogged - * changes (which we throw away, anyway). + * If we can't get the flush lock, the inode is already attached. All + * we needed to do here is mark the inode stale so buffer IO completion + * will remove it from the AIL. */ - if (!ip->i_itemp || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_no_inode; + iip = ip->i_itemp; + if (!xfs_iflock_nowait(ip)) { + ASSERT(!list_empty(&iip->ili_item.li_bio_list)); + ASSERT(iip->ili_last_fields); + goto out_iunlock; } - return ip; + ASSERT(!iip || list_empty(&iip->ili_item.li_bio_list)); -out_rcu_unlock: - rcu_read_unlock(); -out_no_inode: - return NULL; + /* + * Clean inodes can be released immediately. Everything else has to go + * through xfs_iflush_abort() on journal commit as the flock + * synchronises removal of the inode from the cluster buffer against + * inode reclaim. + */ + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + goto out_iunlock; + } + + /* we have a dirty inode in memory that has not yet been flushed. 
*/ + ASSERT(iip->ili_fields); + spin_lock(&iip->ili_lock); + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); + ASSERT(iip->ili_last_fields); + +out_iunlock: + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); } /* @@ -2605,26 +2619,20 @@ out_no_inode: */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, + struct xfs_inode *free_ip, + struct xfs_trans *tp, struct xfs_icluster *xic) { - xfs_mount_t *mp = free_ip->i_mount; + struct xfs_mount *mp = free_ip->i_mount; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + struct xfs_buf *bp; + xfs_daddr_t blkno; + xfs_ino_t inum = xic->first_ino; int nbufs; int i, j; int ioffset; - xfs_daddr_t blkno; - xfs_buf_t *bp; - xfs_inode_t *ip; - struct xfs_inode_log_item *iip; - struct xfs_log_item *lip; - struct xfs_perag *pag; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - xfs_ino_t inum; int error; - inum = xic->first_ino; - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { @@ -2653,10 +2661,8 @@ xfs_ifree_cluster( error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, mp->m_bsize * igeo->blocks_per_cluster, XBF_UNMAPPED, &bp); - if (error) { - xfs_perag_put(pag); + if (error) return error; - } /* * This buffer may not have been correctly initialised as we @@ -2670,59 +2676,16 @@ xfs_ifree_cluster( bp->b_ops = &xfs_inode_buf_ops; /* - * Walk the inodes already attached to the buffer and mark them - * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. By marking them all - * stale first, we will not attempt to lock them in the loop - * below as the XFS_ISTALE flag will be set. + * Now we need to set all the cached clean inodes as XFS_ISTALE, + * too. This requires lookups, and will skip inodes that we've + * already marked XFS_ISTALE. */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (lip->li_type == XFS_LI_INODE) { - iip = (struct xfs_inode_log_item *)lip; - xfs_trans_ail_copy_lsn(mp->m_ail, - &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - } - } - - - /* - * For each inode in memory attempt to add it to the inode - * buffer and set it up for being staled on buffer IO - * completion. This is safe as we've locked out tail pushing - * and flushing by locking the buffer. - * - * We have already marked every inode that was part of a - * transaction stale above, which means there is no point in - * even trying to lock them. 
- */
-		for (i = 0; i < igeo->inodes_per_cluster; i++) {
-			ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
-			if (!ip)
-				continue;
-
-			iip = ip->i_itemp;
-			spin_lock(&iip->ili_lock);
-			iip->ili_last_fields = iip->ili_fields;
-			iip->ili_fields = 0;
-			iip->ili_fsync_fields = 0;
-			spin_unlock(&iip->ili_lock);
-			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
-						&iip->ili_item.li_lsn);
-
-			list_add_tail(&iip->ili_item.li_bio_list,
-						&bp->b_li_list);
-
-			if (ip != free_ip)
-				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
+		for (i = 0; i < igeo->inodes_per_cluster; i++)
+			xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
 
 		xfs_trans_stale_inode_buf(tp, bp);
 		xfs_trans_binval(tp, bp);
 	}
-
-	xfs_perag_put(pag);
 	return 0;
 }
 
From 48d55e2ae3ce837598c073995bbbac5d24a35fe1 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 29 Jun 2020 14:49:18 -0700
Subject: [PATCH 039/117] xfs: attach inodes to the cluster buffer when
 dirtied

Rather than attach inodes to the cluster buffer just when we are
doing IO, attach the inodes to the cluster buffer when they are
dirtied. This means the buffer always carries a list of dirty inodes
that reference it, and we can use that list to make more fundamental
changes to inode writeback that aren't otherwise possible.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
---
 fs/xfs/libxfs/xfs_trans_inode.c |  9 ++++++---
 fs/xfs/xfs_buf_item.c           |  1 +
 fs/xfs/xfs_icache.c             |  1 +
 fs/xfs/xfs_inode.c              | 24 +++++-------------------
 fs/xfs/xfs_inode_item.c         | 16 ++++++++++++++--
 5 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index ad5974365c58..e15129647e00 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -163,13 +163,16 @@ xfs_trans_log_inode(
 	/*
 	 * We need an explicit buffer reference for the log item but
 	 * don't want the buffer to remain attached to the transaction.
-	 * Hold the buffer but release the transaction reference.
+	 * Hold the buffer but release the transaction reference once
+	 * we've attached the inode log item to the buffer log item
+	 * list.
*/ xfs_buf_hold(bp); - xfs_trans_brelse(tp, bp); - spin_lock(&iip->ili_lock); iip->ili_item.li_buf = bp; + bp->b_flags |= _XBF_INODES; + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); + xfs_trans_brelse(tp, bp); } /* diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ecb3362395af..e9428c30862a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -465,6 +465,7 @@ xfs_buf_item_unpin( if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_item_done(bp); xfs_iflush_done(bp); + ASSERT(list_empty(&bp->b_li_list)); } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index dc90a81abb1a..58a750ce689c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -115,6 +115,7 @@ __xfs_inode_free( { /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list)); XFS_STATS_DEC(ip->i_mount, vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1c3a8bed4875..c4586ac3656a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2584,27 +2584,24 @@ retry: ASSERT(iip->ili_last_fields); goto out_iunlock; } - ASSERT(!iip || list_empty(&iip->ili_item.li_bio_list)); /* - * Clean inodes can be released immediately. Everything else has to go - * through xfs_iflush_abort() on journal commit as the flock - * synchronises removal of the inode from the cluster buffer against - * inode reclaim. + * Inodes not attached to the buffer can be released immediately. + * Everything else has to go through xfs_iflush_abort() on journal + * commit as the flock synchronises removal of the inode from the + * cluster buffer against inode reclaim. */ - if (xfs_inode_clean(ip)) { + if (!iip || list_empty(&iip->ili_item.li_bio_list)) { xfs_ifunlock(ip); goto out_iunlock; } /* we have a dirty inode in memory that has not yet been flushed. */ - ASSERT(iip->ili_fields); spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); - list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); ASSERT(iip->ili_last_fields); out_iunlock: @@ -3818,19 +3815,8 @@ flush_out: xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); - /* - * Attach the inode item callback to the buffer whether the flush - * succeeded or not. If not, the caller will shut down and fail I/O - * completion on the buffer to remove the inode from the AIL and release - * the flush lock. - */ - bp->b_flags |= _XBF_INODES; - list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); - /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); - - ASSERT(!list_empty(&bp->b_li_list)); return error; } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 64bdda72f7b2..697248b7eb2b 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -660,6 +660,10 @@ xfs_inode_item_destroy( * list for other inodes that will run this function. We remove them from the * buffer list so we can process all the inode IO completions in one AIL lock * traversal. + * + * Note: Now that we attach the log item to the buffer when we first log the + * inode in memory, we can have unflushed inodes on the buffer list here. These + * inodes will have a zero ili_last_fields, so skip over them here. 
*/ void xfs_iflush_done( @@ -677,12 +681,15 @@ xfs_iflush_done( */ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { iip = INODE_ITEM(lip); + if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) { - list_del_init(&lip->li_bio_list); xfs_iflush_abort(iip->ili_inode); continue; } + if (!iip->ili_last_fields) + continue; + list_move_tail(&lip->li_bio_list, &tmp); /* Do an unlocked check for needing the AIL lock. */ @@ -728,12 +735,16 @@ xfs_iflush_done( /* * Remove the reference to the cluster buffer if the inode is * clean in memory. Drop the buffer reference once we've dropped - * the locks we hold. + * the locks we hold. If the inode is dirty in memory, we need + * to put the inode item back on the buffer list for another + * pass through the flush machinery. */ ASSERT(iip->ili_item.li_buf == bp); if (!iip->ili_fields) { iip->ili_item.li_buf = NULL; drop_buffer = true; + } else { + list_add(&lip->li_bio_list, &bp->b_li_list); } iip->ili_last_fields = 0; iip->ili_flush_lsn = 0; @@ -777,6 +788,7 @@ xfs_iflush_abort( iip->ili_flush_lsn = 0; bp = iip->ili_item.li_buf; iip->ili_item.li_buf = NULL; + list_del_init(&iip->ili_item.li_bio_list); spin_unlock(&iip->ili_lock); } xfs_ifunlock(ip); From 90c60e16401248a4900f3f9387f563d0178dcf34 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:19 -0700 Subject: [PATCH 040/117] xfs: xfs_iflush() is no longer necessary Now we have a cached buffer on inode log items, we don't need to do buffer lookups when flushing inodes anymore - all we need to do is lock the buffer and we are ready to go. This largely gets rid of the need for xfs_iflush(), which is essentially just a mechanism to look up the buffer and flush the inode to it. Instead, we can just call xfs_iflush_cluster() with a few modifications to ensure it also flushes the inode we already hold locked. This allows the AIL inode item pushing to be almost entirely non-blocking in XFS - we won't block unless memory allocation for the cluster inode lookup blocks or the block device queues are full. Writeback during inode reclaim becomes a little more complex because we now have to lock the buffer ourselves, but otherwise this change is largely a functional no-op that removes a whole lot of code. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 107 ++++++---------------------------------- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_inode_item.c | 53 +++++++------------- 3 files changed, 34 insertions(+), 128 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c4586ac3656a..4a9539048639 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3450,7 +3450,18 @@ out_release_wip: return error; } -STATIC int +/* + * Non-blocking flush of dirty inode metadata into the backing buffer. + * + * The caller must have a reference to the inode and hold the cluster buffer + * locked. The function will walk across all the inodes on the cluster buffer it + * can find and lock without blocking, and flush them to the cluster buffer. + * + * On success, the caller must write out the buffer returned in *bp and + * release it. On failure, the filesystem will be shut down, the buffer will + * have been unlocked and released, and EFSCORRUPTED will be returned. 
+ */ +int xfs_iflush_cluster( struct xfs_inode *ip, struct xfs_buf *bp) @@ -3485,8 +3496,6 @@ xfs_iflush_cluster( for (i = 0; i < nr_found; i++) { cip = cilist[i]; - if (cip == ip) - continue; /* * because this is an RCU protected lookup, we could find a @@ -3577,99 +3586,11 @@ out_free: kmem_free(cilist); out_put: xfs_perag_put(pag); - return error; -} - -/* - * Flush dirty inode metadata into the backing buffer. - * - * The caller must have the inode lock and the inode flush lock held. The - * inode lock will still be held upon return to the caller, and the inode - * flush lock will be released after the inode has reached the disk. - * - * The caller must write out the buffer returned in *bpp and release it. - */ -int -xfs_iflush( - struct xfs_inode *ip, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - int error; - - XFS_STATS_INC(mp, xs_iflush_count); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || - ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - *bpp = NULL; - - xfs_iunpin_wait(ip); - - /* - * For stale inodes we cannot rely on the backing buffer remaining - * stale in cache for the remaining life of the stale inode and so - * xfs_imap_to_bp() below may give us a buffer that no longer contains - * inodes below. We have to check this after ensuring the inode is - * unpinned so that it is safe to reclaim the stale inode after the - * flush call. - */ - if (xfs_iflags_test(ip, XFS_ISTALE)) { - xfs_ifunlock(ip); - return 0; - } - - /* - * Get the buffer containing the on-disk inode. We are doing a try-lock - * operation here, so we may get an EAGAIN error. In that case, return - * leaving the inode dirty. - * - * If we get any other error, we effectively have a corruption situation - * and we cannot flush the inode. Abort the flush and shut down. - */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK); - if (error == -EAGAIN) { - xfs_ifunlock(ip); - return error; - } - if (error) - goto abort; - - /* - * If the buffer is pinned then push on the log now so we won't - * get stuck waiting in the write for too long. - */ - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - - /* - * Flush the provided inode then attempt to gather others from the - * cluster into the write. - * - * Note: Once we attempt to flush an inode, we must run buffer - * completion callbacks on any failure. If this fails, simulate an I/O - * failure on the buffer and shut down. 
- */ - error = xfs_iflush_int(ip, bp); - if (!error) - error = xfs_iflush_cluster(ip, bp); if (error) { bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); - goto shutdown; + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } - - *bpp = bp; - return 0; - -abort: - xfs_iflush_abort(ip); -shutdown: - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } @@ -3687,7 +3608,7 @@ xfs_iflush_int( ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(iip->ili_item.li_buf == bp); dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1534386b430c..c482e7306fe0 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -426,7 +426,7 @@ int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) -int xfs_iflush(struct xfs_inode *, struct xfs_buf **); +int xfs_iflush_cluster(struct xfs_inode *, struct xfs_buf *); void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 697248b7eb2b..e8eda2ac25fb 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -485,53 +485,38 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error; - if (xfs_ipincount(ip) > 0) + ASSERT(iip->ili_item.li_buf); + + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || + (ip->i_flags & XFS_ISTALE)) return XFS_ITEM_PINNED; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + /* If the inode is already flush locked, we're already flushing. */ + if (xfs_isiflocked(ip)) + return XFS_ITEM_FLUSHING; + + if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; - /* - * Re-check the pincount now that we stabilized the value by - * taking the ilock. - */ - if (xfs_ipincount(ip) > 0) { - rval = XFS_ITEM_PINNED; - goto out_unlock; - } - - /* - * Stale inode items should force out the iclog. - */ - if (ip->i_flags & XFS_ISTALE) { - rval = XFS_ITEM_PINNED; - goto out_unlock; - } - - /* - * Someone else is already flushing the inode. Nothing we can do - * here but wait for the flush to finish and remove the item from - * the AIL. - */ - if (!xfs_iflock_nowait(ip)) { - rval = XFS_ITEM_FLUSHING; - goto out_unlock; - } - - ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); spin_unlock(&lip->li_ailp->ail_lock); - error = xfs_iflush(ip, &bp); + /* + * We need to hold a reference for flushing the cluster buffer as it may + * fail the buffer without IO submission. In which case, we better get a + * reference for that completion because otherwise we don't get a + * reference for IO until we queue the buffer for delwri submission. + */ + xfs_buf_hold(bp); + error = xfs_iflush_cluster(ip, bp); if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } else if (error == -EAGAIN) + } else { rval = XFS_ITEM_LOCKED; + } spin_lock(&lip->li_ailp->ail_lock); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_SHARED); return rval; } From e6187b3444e88ed9aa5f3843603e1f024b6d0309 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:19 -0700 Subject: [PATCH 041/117] xfs: rename xfs_iflush_int() with xfs_iflush() gone, we can rename xfs_iflush_int() back to xfs_iflush(). Also move it up above xfs_iflush_cluster() so we don't need the forward definition any more. 
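In shorthand, the declaration change looks like this (taken from the
hunks below; the function body moves wholesale and is otherwise
untouched):

	/* before: forward declaration needed, definition lived below its caller */
	STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);

	/* after: static definition sits above xfs_iflush_cluster(), its only caller */
	static int
	xfs_iflush(
		struct xfs_inode	*ip,
		struct xfs_buf		*bp)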
Signed-off-by: Dave Chinner Reviewed-by: Amir Goldstein Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 293 ++++++++++++++++++++++----------------------- 1 file changed, 146 insertions(+), 147 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4a9539048639..31e105f95739 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -44,7 +44,6 @@ kmem_zone_t *xfs_inode_zone; */ #define XFS_ITRUNC_MAX_EXTENTS 2 -STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *); STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *); @@ -3450,152 +3449,8 @@ out_release_wip: return error; } -/* - * Non-blocking flush of dirty inode metadata into the backing buffer. - * - * The caller must have a reference to the inode and hold the cluster buffer - * locked. The function will walk across all the inodes on the cluster buffer it - * can find and lock without blocking, and flush them to the cluster buffer. - * - * On success, the caller must write out the buffer returned in *bp and - * release it. On failure, the filesystem will be shut down, the buffer will - * have been unlocked and released, and EFSCORRUPTED will be returned. - */ -int -xfs_iflush_cluster( - struct xfs_inode *ip, - struct xfs_buf *bp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - unsigned long first_index, mask; - int cilist_size; - struct xfs_inode **cilist; - struct xfs_inode *cip; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - int error = 0; - int nr_found; - int clcount = 0; - int i; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - - cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); - cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); - if (!cilist) - goto out_put; - - mask = ~(igeo->inodes_per_cluster - 1); - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - rcu_read_lock(); - /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, - first_index, igeo->inodes_per_cluster); - if (nr_found == 0) - goto out_free; - - for (i = 0; i < nr_found; i++) { - cip = cilist[i]; - - /* - * because this is an RCU protected lookup, we could find a - * recently freed or even reallocated inode during the lookup. - * We need to check under the i_flags_lock for a valid inode - * here. Skip it if it is not valid or the wrong inode. - */ - spin_lock(&cip->i_flags_lock); - if (!cip->i_ino || - __xfs_iflags_test(cip, XFS_ISTALE)) { - spin_unlock(&cip->i_flags_lock); - continue; - } - - /* - * Once we fall off the end of the cluster, no point checking - * any more inodes in the list because they will also all be - * outside the cluster. - */ - if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { - spin_unlock(&cip->i_flags_lock); - break; - } - spin_unlock(&cip->i_flags_lock); - - /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. - */ - if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) - continue; - - /* - * Try to get locks. If any are unavailable or it is pinned, - * then this inode cannot be flushed and is skipped. 
- */ - - if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) - continue; - if (!xfs_iflock_nowait(cip)) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - if (xfs_ipincount(cip)) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - - /* - * Check the inode number again, just to be certain we are not - * racing with freeing in xfs_reclaim_inode(). See the comments - * in that function for more information as to why the initial - * check is not sufficient. - */ - if (!cip->i_ino) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - /* - * arriving here means that this inode can be flushed. First - * re-check that it's dirty before flushing. - */ - if (!xfs_inode_clean(cip)) { - error = xfs_iflush_int(cip, bp); - if (error) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto out_free; - } - clcount++; - } else { - xfs_ifunlock(cip); - } - xfs_iunlock(cip, XFS_ILOCK_SHARED); - } - - if (clcount) { - XFS_STATS_INC(mp, xs_icluster_flushcnt); - XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); - } - -out_free: - rcu_read_unlock(); - kmem_free(cilist); -out_put: - xfs_perag_put(pag); - if (error) { - bp->b_flags |= XBF_ASYNC; - xfs_buf_ioend_fail(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } - return error; -} - -STATIC int -xfs_iflush_int( +static int +xfs_iflush( struct xfs_inode *ip, struct xfs_buf *bp) { @@ -3741,6 +3596,150 @@ flush_out: return error; } +/* + * Non-blocking flush of dirty inode metadata into the backing buffer. + * + * The caller must have a reference to the inode and hold the cluster buffer + * locked. The function will walk across all the inodes on the cluster buffer it + * can find and lock without blocking, and flush them to the cluster buffer. + * + * On success, the caller must write out the buffer returned in *bp and + * release it. On failure, the filesystem will be shut down, the buffer will + * have been unlocked and released, and EFSCORRUPTED will be returned. + */ +int +xfs_iflush_cluster( + struct xfs_inode *ip, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + unsigned long first_index, mask; + int cilist_size; + struct xfs_inode **cilist; + struct xfs_inode *cip; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + int error = 0; + int nr_found; + int clcount = 0; + int i; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); + cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); + if (!cilist) + goto out_put; + + mask = ~(igeo->inodes_per_cluster - 1); + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; + rcu_read_lock(); + /* really need a gang lookup range call here */ + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, + first_index, igeo->inodes_per_cluster); + if (nr_found == 0) + goto out_free; + + for (i = 0; i < nr_found; i++) { + cip = cilist[i]; + + /* + * because this is an RCU protected lookup, we could find a + * recently freed or even reallocated inode during the lookup. + * We need to check under the i_flags_lock for a valid inode + * here. Skip it if it is not valid or the wrong inode. + */ + spin_lock(&cip->i_flags_lock); + if (!cip->i_ino || + __xfs_iflags_test(cip, XFS_ISTALE)) { + spin_unlock(&cip->i_flags_lock); + continue; + } + + /* + * Once we fall off the end of the cluster, no point checking + * any more inodes in the list because they will also all be + * outside the cluster. 
+	 */
+		if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
+			spin_unlock(&cip->i_flags_lock);
+			break;
+		}
+		spin_unlock(&cip->i_flags_lock);
+
+		/*
+		 * Do an un-protected check to see if the inode is dirty and
+		 * is a candidate for flushing.  These checks will be repeated
+		 * later after the appropriate locks are acquired.
+		 */
+		if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
+			continue;
+
+		/*
+		 * Try to get locks.  If any are unavailable or it is pinned,
+		 * then this inode cannot be flushed and is skipped.
+		 */
+
+		if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
+			continue;
+		if (!xfs_iflock_nowait(cip)) {
+			xfs_iunlock(cip, XFS_ILOCK_SHARED);
+			continue;
+		}
+		if (xfs_ipincount(cip)) {
+			xfs_ifunlock(cip);
+			xfs_iunlock(cip, XFS_ILOCK_SHARED);
+			continue;
+		}
+
+
+		/*
+		 * Check the inode number again, just to be certain we are not
+		 * racing with freeing in xfs_reclaim_inode(). See the comments
+		 * in that function for more information as to why the initial
+		 * check is not sufficient.
+		 */
+		if (!cip->i_ino) {
+			xfs_ifunlock(cip);
+			xfs_iunlock(cip, XFS_ILOCK_SHARED);
+			continue;
+		}
+
+		/*
+		 * arriving here means that this inode can be flushed. First
+		 * re-check that it's dirty before flushing.
+		 */
+		if (!xfs_inode_clean(cip)) {
+			error = xfs_iflush(cip, bp);
+			if (error) {
+				xfs_iunlock(cip, XFS_ILOCK_SHARED);
+				goto out_free;
+			}
+			clcount++;
+		} else {
+			xfs_ifunlock(cip);
+		}
+		xfs_iunlock(cip, XFS_ILOCK_SHARED);
+	}
+
+	if (clcount) {
+		XFS_STATS_INC(mp, xs_icluster_flushcnt);
+		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
+	}
+
+out_free:
+	rcu_read_unlock();
+	kmem_free(cilist);
+out_put:
+	xfs_perag_put(pag);
+	if (error) {
+		bp->b_flags |= XBF_ASYNC;
+		xfs_buf_ioend_fail(bp);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+	}
+	return error;
+}
+
 /* Release an inode. */
 void
 xfs_irele(

From 5717ea4d527acbec9300cb083b100dd0003ac777 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 29 Jun 2020 14:49:20 -0700
Subject: [PATCH 042/117] xfs: rework xfs_iflush_cluster() dirty inode
 iteration

Now that we have all the dirty inodes attached to the cluster
buffer, we don't actually have to do radix tree lookups to find
them. Sure, the radix tree is efficient, but walking a linked list
of just the dirty inodes attached to the buffer is much better.

We are also no longer dependent on having a locked inode passed into
the function to determine where to start the lookup. This means we
can drop it from the function call and treat all inodes the same.

We also make xfs_iflush_cluster skip inodes marked with
XFS_IRECLAIM. This way we avoid races with inodes that reclaim is
actively referencing or that are being re-initialised by inode
lookup. If they are actually dirty, they'll get written by a future
cluster flush....

We also add a shutdown check after obtaining the flush lock so that
we catch inodes that are dirty in memory and may have inconsistent
state due to the shutdown in progress. We abort these inodes
directly and so they remove themselves from the buffer list and
the AIL rather than having to wait for the buffer to be failed and
callbacks run to process them correctly.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J.
Wong --- fs/xfs/xfs_inode.c | 203 ++++++++++++++++++---------------------- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_inode_item.c | 8 +- 3 files changed, 99 insertions(+), 114 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 31e105f95739..ece3622f6d28 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3603,141 +3603,120 @@ flush_out: * locked. The function will walk across all the inodes on the cluster buffer it * can find and lock without blocking, and flush them to the cluster buffer. * - * On success, the caller must write out the buffer returned in *bp and - * release it. On failure, the filesystem will be shut down, the buffer will - * have been unlocked and released, and EFSCORRUPTED will be returned. + * On successful flushing of at least one inode, the caller must write out the + * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and + * the caller needs to release the buffer. On failure, the filesystem will be + * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED + * will be returned. */ int xfs_iflush_cluster( - struct xfs_inode *ip, struct xfs_buf *bp) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - unsigned long first_index, mask; - int cilist_size; - struct xfs_inode **cilist; - struct xfs_inode *cip; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - int error = 0; - int nr_found; + struct xfs_mount *mp = bp->b_mount; + struct xfs_log_item *lip, *n; + struct xfs_inode *ip; + struct xfs_inode_log_item *iip; int clcount = 0; - int i; + int error = 0; - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - - cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); - cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); - if (!cilist) - goto out_put; - - mask = ~(igeo->inodes_per_cluster - 1); - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - rcu_read_lock(); - /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, - first_index, igeo->inodes_per_cluster); - if (nr_found == 0) - goto out_free; - - for (i = 0; i < nr_found; i++) { - cip = cilist[i]; + /* + * We must use the safe variant here as on shutdown xfs_iflush_abort() + * can remove itself from the list. + */ + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + iip = (struct xfs_inode_log_item *)lip; + ip = iip->ili_inode; /* - * because this is an RCU protected lookup, we could find a - * recently freed or even reallocated inode during the lookup. - * We need to check under the i_flags_lock for a valid inode - * here. Skip it if it is not valid or the wrong inode. + * Quick and dirty check to avoid locks if possible. */ - spin_lock(&cip->i_flags_lock); - if (!cip->i_ino || - __xfs_iflags_test(cip, XFS_ISTALE)) { - spin_unlock(&cip->i_flags_lock); + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) + continue; + if (xfs_ipincount(ip)) + continue; + + /* + * The inode is still attached to the buffer, which means it is + * dirty but reclaim might try to grab it. Check carefully for + * that, and grab the ilock while still holding the i_flags_lock + * to guarantee reclaim will not be able to reclaim this inode + * once we drop the i_flags_lock. 
+ */ + spin_lock(&ip->i_flags_lock); + ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { + spin_unlock(&ip->i_flags_lock); continue; } /* - * Once we fall off the end of the cluster, no point checking - * any more inodes in the list because they will also all be - * outside the cluster. + * ILOCK will pin the inode against reclaim and prevent + * concurrent transactions modifying the inode while we are + * flushing the inode. */ - if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { - spin_unlock(&cip->i_flags_lock); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Skip inodes that are already flush locked as they have + * already been written to the buffer. + */ + if (!xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + continue; + } + + /* + * Abort flushing this inode if we are shut down because the + * inode may not currently be in the AIL. This can occur when + * log I/O failure unpins the inode without inserting into the + * AIL, leaving a dirty/unpinned inode attached to the buffer + * that otherwise looks like it should be flushed. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_iunpin_wait(ip); + /* xfs_iflush_abort() drops the flush lock */ + xfs_iflush_abort(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + error = -EIO; + continue; + } + + /* don't block waiting on a log force to unpin dirty inodes */ + if (xfs_ipincount(ip)) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + continue; + } + + if (!xfs_inode_clean(ip)) + error = xfs_iflush(ip, bp); + else + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) break; - } - spin_unlock(&cip->i_flags_lock); - - /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. - */ - if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) - continue; - - /* - * Try to get locks. If any are unavailable or it is pinned, - * then this inode cannot be flushed and is skipped. - */ - - if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) - continue; - if (!xfs_iflock_nowait(cip)) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - if (xfs_ipincount(cip)) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - - /* - * Check the inode number again, just to be certain we are not - * racing with freeing in xfs_reclaim_inode(). See the comments - * in that function for more information as to why the initial - * check is not sufficient. - */ - if (!cip->i_ino) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - /* - * arriving here means that this inode can be flushed. First - * re-check that it's dirty before flushing. 
- */ - if (!xfs_inode_clean(cip)) { - error = xfs_iflush(cip, bp); - if (error) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto out_free; - } - clcount++; - } else { - xfs_ifunlock(cip); - } - xfs_iunlock(cip, XFS_ILOCK_SHARED); + clcount++; } - if (clcount) { - XFS_STATS_INC(mp, xs_icluster_flushcnt); - XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); - } - -out_free: - rcu_read_unlock(); - kmem_free(cilist); -out_put: - xfs_perag_put(pag); if (error) { bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; } - return error; + + if (!clcount) + return -EAGAIN; + + XFS_STATS_INC(mp, xs_icluster_flushcnt); + XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); + return 0; + } /* Release an inode. */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c482e7306fe0..e9a8bb184d1f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -426,7 +426,7 @@ int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) -int xfs_iflush_cluster(struct xfs_inode *, struct xfs_buf *); +int xfs_iflush_cluster(struct xfs_buf *); void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index e8eda2ac25fb..4e7fce8d4f7c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -507,12 +507,18 @@ xfs_inode_item_push( * reference for IO until we queue the buffer for delwri submission. */ xfs_buf_hold(bp); - error = xfs_iflush_cluster(ip, bp); + error = xfs_iflush_cluster(bp); if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); } else { + /* + * Release the buffer if we were unable to flush anything. On + * any other error, the buffer has already been released. + */ + if (error == -EAGAIN) + xfs_buf_relse(bp); rval = XFS_ITEM_LOCKED; } From a69a1dc2842e4548efca956c86e0816f2662ccb7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:20 -0700 Subject: [PATCH 043/117] xfs: factor xfs_iflush_done xfs_iflush_done() does 3 distinct operations to the inodes attached to the buffer. Separate these operations out into functions so that it is easier to modify these operations independently in future. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode_item.c | 167 +++++++++++++++++++++------------------- 1 file changed, 87 insertions(+), 80 deletions(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 4e7fce8d4f7c..3840117f8a5e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -641,101 +641,63 @@ xfs_inode_item_destroy( /* - * This is the inode flushing I/O completion routine. It is called - * from interrupt level when the buffer containing the inode is - * flushed to disk. It is responsible for removing the inode item - * from the AIL if it has not been re-logged, and unlocking the inode's - * flush lock. - * - * To reduce AIL lock traffic as much as possible, we scan the buffer log item - * list for other inodes that will run this function. We remove them from the - * buffer list so we can process all the inode IO completions in one AIL lock - * traversal. - * - * Note: Now that we attach the log item to the buffer when we first log the - * inode in memory, we can have unflushed inodes on the buffer list here. 
These - * inodes will have a zero ili_last_fields, so skip over them here. + * We only want to pull the item from the AIL if it is actually there + * and its location in the log has not changed since we started the + * flush. Thus, we only bother if the inode's lsn has not changed. */ -void -xfs_iflush_done( - struct xfs_buf *bp) +static void +xfs_iflush_ail_updates( + struct xfs_ail *ailp, + struct list_head *list) +{ + struct xfs_log_item *lip; + xfs_lsn_t tail_lsn = 0; + + /* this is an opencoded batch version of xfs_trans_ail_delete */ + spin_lock(&ailp->ail_lock); + list_for_each_entry(lip, list, li_bio_list) { + xfs_lsn_t lsn; + + clear_bit(XFS_LI_FAILED, &lip->li_flags); + if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn) + continue; + + lsn = xfs_ail_delete_one(ailp, lip); + if (!tail_lsn && lsn) + tail_lsn = lsn; + } + xfs_ail_update_finish(ailp, tail_lsn); +} + +/* + * Walk the list of inodes that have completed their IOs. If they are clean + * remove them from the list and dissociate them from the buffer. Buffers that + * are still dirty remain linked to the buffer and on the list. Caller must + * handle them appropriately. + */ +static void +xfs_iflush_finish( + struct xfs_buf *bp, + struct list_head *list) { - struct xfs_inode_log_item *iip; struct xfs_log_item *lip, *n; - struct xfs_ail *ailp = bp->b_mount->m_ail; - int need_ail = 0; - LIST_HEAD(tmp); - /* - * Pull the attached inodes from the buffer one at a time and take the - * appropriate action on them. - */ - list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { - iip = INODE_ITEM(lip); - - if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) { - xfs_iflush_abort(iip->ili_inode); - continue; - } - - if (!iip->ili_last_fields) - continue; - - list_move_tail(&lip->li_bio_list, &tmp); - - /* Do an unlocked check for needing the AIL lock. */ - if (iip->ili_flush_lsn == lip->li_lsn || - test_bit(XFS_LI_FAILED, &lip->li_flags)) - need_ail++; - } - - /* - * We only want to pull the item from the AIL if it is actually there - * and its location in the log has not changed since we started the - * flush. Thus, we only bother if the inode's lsn has not changed. - */ - if (need_ail) { - xfs_lsn_t tail_lsn = 0; - - /* this is an opencoded batch version of xfs_trans_ail_delete */ - spin_lock(&ailp->ail_lock); - list_for_each_entry(lip, &tmp, li_bio_list) { - clear_bit(XFS_LI_FAILED, &lip->li_flags); - if (lip->li_lsn == INODE_ITEM(lip)->ili_flush_lsn) { - xfs_lsn_t lsn = xfs_ail_delete_one(ailp, lip); - if (!tail_lsn && lsn) - tail_lsn = lsn; - } - } - xfs_ail_update_finish(ailp, tail_lsn); - } - - /* - * Clean up and unlock the flush lock now we are done. We can clear the - * ili_last_fields bits now that we know that the data corresponding to - * them is safely on disk. - */ - list_for_each_entry_safe(lip, n, &tmp, li_bio_list) { + list_for_each_entry_safe(lip, n, list, li_bio_list) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); bool drop_buffer = false; - list_del_init(&lip->li_bio_list); - iip = INODE_ITEM(lip); - spin_lock(&iip->ili_lock); /* * Remove the reference to the cluster buffer if the inode is - * clean in memory. Drop the buffer reference once we've dropped - * the locks we hold. If the inode is dirty in memory, we need - * to put the inode item back on the buffer list for another - * pass through the flush machinery. + * clean in memory and drop the buffer reference once we've + * dropped the locks we hold. 
*/ ASSERT(iip->ili_item.li_buf == bp); if (!iip->ili_fields) { iip->ili_item.li_buf = NULL; + list_del_init(&lip->li_bio_list); drop_buffer = true; - } else { - list_add(&lip->li_bio_list, &bp->b_li_list); } iip->ili_last_fields = 0; iip->ili_flush_lsn = 0; @@ -746,6 +708,51 @@ xfs_iflush_done( } } +/* + * Inode buffer IO completion routine. It is responsible for removing inodes + * attached to the buffer from the AIL if they have not been re-logged, as well + * as completing the flush and unlocking the inode. + */ +void +xfs_iflush_done( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip, *n; + LIST_HEAD(flushed_inodes); + LIST_HEAD(ail_updates); + + /* + * Pull the attached inodes from the buffer one at a time and take the + * appropriate action on them. + */ + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) { + xfs_iflush_abort(iip->ili_inode); + continue; + } + if (!iip->ili_last_fields) + continue; + + /* Do an unlocked check for needing the AIL lock. */ + if (iip->ili_flush_lsn == lip->li_lsn || + test_bit(XFS_LI_FAILED, &lip->li_flags)) + list_move_tail(&lip->li_bio_list, &ail_updates); + else + list_move_tail(&lip->li_bio_list, &flushed_inodes); + } + + if (!list_empty(&ail_updates)) { + xfs_iflush_ail_updates(bp->b_mount->m_ail, &ail_updates); + list_splice_tail(&ail_updates, &flushed_inodes); + } + + xfs_iflush_finish(bp, &flushed_inodes); + if (!list_empty(&flushed_inodes)) + list_splice_tail(&flushed_inodes, &bp->b_li_list); +} + /* * This is the inode flushing abort routine. It is called from xfs_iflush when * the filesystem is shutting down to clean up the inode state. It is From e2705b0304778916db87831217ec642e34d9d9fa Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:20 -0700 Subject: [PATCH 044/117] xfs: remove xfs_inobp_check() This debug code is called on every xfs_iflush() call, which then checks every inode in the buffer for a non-zero unlinked list field. Hence it checks every inode in the cluster buffer every time a single inode on that cluster is flushed. This is resulting in: - 38.91% 5.33% [kernel] [k] xfs_iflush - 17.70% xfs_iflush - 9.93% xfs_inobp_check 4.36% xfs_buf_offset 10% of the CPU time spent flushing inodes is repeatedly checking unlinked fields in the buffer. We don't need to do this. The other place we call xfs_inobp_check() is xfs_iunlink_update_dinode(), and this is after we've done this assert for the agino we are about to write into that inode: ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); which means we've already checked that the agino we are about to write is not 0 on debug kernels. The inode buffer verifiers do everything else we need, so let's just remove this debug code. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_inode_buf.c | 24 ------------------------ fs/xfs/libxfs/xfs_inode_buf.h | 6 ------ fs/xfs/xfs_inode.c | 2 -- 3 files changed, 32 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index b4a6c091571e..8d5dd08eab75 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -20,30 +20,6 @@ #include -/* - * Check that none of the inode's in the buffer have a next - * unlinked field of 0.
- */ -#if defined(DEBUG) -void -xfs_inobp_check( - xfs_mount_t *mp, - xfs_buf_t *bp) -{ - int i; - xfs_dinode_t *dip; - - for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { - dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); - if (!dip->di_next_unlinked) { - xfs_alert(mp, - "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", - i, (long long)bp->b_bn); - } - } -} -#endif - /* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 865ac493c72a..6b08b9d060c2 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -52,12 +52,6 @@ int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); -#if defined(DEBUG) -void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); -#else -#define xfs_inobp_check(mp, bp) -#endif /* DEBUG */ - xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ece3622f6d28..5c07bf491d9f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2165,7 +2165,6 @@ xfs_iunlink_update_dinode( xfs_dinode_calc_crc(mp, dip); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); - xfs_inobp_check(mp, ibp); } /* Set an in-core inode's unlinked pointer and return the old value. */ @@ -3558,7 +3557,6 @@ xfs_iflush( xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); - xfs_inobp_check(mp, bp); /* * We've recorded everything logged in the inode, so we'd like to clear From f866560be21966421c4a6e83079a6b76fadf667f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Jul 2020 08:42:12 -0700 Subject: [PATCH 045/117] xfs: rtbitmap scrubber should verify written extents Ensure that the realtime bitmap file is backed entirely by written extents. No holes, no unwritten blocks, etc. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Collins --- fs/xfs/scrub/rtbitmap.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index c642bc206c41..c777c98c50c3 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -13,6 +13,7 @@ #include "xfs_trans.h" #include "xfs_rtalloc.h" #include "xfs_inode.h" +#include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -58,6 +59,41 @@ xchk_rtbitmap_rec( return 0; } +/* Make sure the entire rtbitmap file is mapped with written extents. */ +STATIC int +xchk_rtbitmap_check_extents( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_bmbt_irec map; + xfs_rtblock_t off; + int nmap; + int error = 0; + + for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + if (xchk_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + break; + + /* Make sure we have a written extent. 
*/ + nmap = 1; + error = xfs_bmapi_read(mp->m_rbmip, off, + mp->m_sb.sb_rbmblocks - off, &map, &nmap, + XFS_DATA_FORK); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + break; + + if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + break; + } + + off += map.br_blockcount; + } + + return error; +} + /* Scrub the realtime bitmap. */ int xchk_rtbitmap( @@ -70,6 +106,10 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; + error = xchk_rtbitmap_check_extents(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; From 2fb94e36b6833332057043699d3338228f6e1e2b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Jul 2020 08:42:13 -0700 Subject: [PATCH 046/117] xfs: rtbitmap scrubber should check inode size Make sure the rtbitmap is large enough to store the entire bitmap. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Collins --- fs/xfs/scrub/rtbitmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index c777c98c50c3..76e4ffe0315b 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -101,6 +101,13 @@ xchk_rtbitmap( { int error; + /* Is the size of the rtbitmap correct? */ + if (sc->mp->m_rbmip->i_d.di_size != + XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { + xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + return 0; + } + /* Invoke the fork scrubber. */ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) From c3f2375b90d0d26f257b926a0d236371df483ca1 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 8 Jul 2020 10:21:44 -0700 Subject: [PATCH 047/117] xfs: Fix false positive lockdep warning with sb_internal & fs_reclaim Depending on the workloads, the following circular locking dependency warning between sb_internal (a percpu rwsem) and fs_reclaim (a pseudo lock) may show up: ====================================================== WARNING: possible circular locking dependency detected 5.0.0-rc1+ #60 Tainted: G W ------------------------------------------------------ fsfreeze/4346 is trying to acquire lock: 0000000026f1d784 (fs_reclaim){+.+.}, at: fs_reclaim_acquire.part.19+0x5/0x30 but task is already holding lock: 0000000072bfc54b (sb_internal){++++}, at: percpu_down_write+0xb4/0x650 which lock already depends on the new lock. 
: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sb_internal); lock(fs_reclaim); lock(sb_internal); lock(fs_reclaim); *** DEADLOCK *** 4 locks held by fsfreeze/4346: #0: 00000000b478ef56 (sb_writers#8){++++}, at: percpu_down_write+0xb4/0x650 #1: 000000001ec487a9 (&type->s_umount_key#28){++++}, at: freeze_super+0xda/0x290 #2: 000000003edbd5a0 (sb_pagefaults){++++}, at: percpu_down_write+0xb4/0x650 #3: 0000000072bfc54b (sb_internal){++++}, at: percpu_down_write+0xb4/0x650 stack backtrace: Call Trace: dump_stack+0xe0/0x19a print_circular_bug.isra.10.cold.34+0x2f4/0x435 check_prev_add.constprop.19+0xca1/0x15f0 validate_chain.isra.14+0x11af/0x3b50 __lock_acquire+0x728/0x1200 lock_acquire+0x269/0x5a0 fs_reclaim_acquire.part.19+0x29/0x30 fs_reclaim_acquire+0x19/0x20 kmem_cache_alloc+0x3e/0x3f0 kmem_zone_alloc+0x79/0x150 xfs_trans_alloc+0xfa/0x9d0 xfs_sync_sb+0x86/0x170 xfs_log_sbcount+0x10f/0x140 xfs_quiesce_attr+0x134/0x270 xfs_fs_freeze+0x4a/0x70 freeze_super+0x1af/0x290 do_vfs_ioctl+0xedc/0x16c0 ksys_ioctl+0x41/0x80 __x64_sys_ioctl+0x73/0xa9 do_syscall_64+0x18f/0xd23 entry_SYSCALL_64_after_hwframe+0x49/0xbe This is a false positive as all the dirty pages are flushed out before the filesystem can be frozen. One way to avoid this splat is to add GFP_NOFS to the affected allocation calls by using the memalloc_nofs_save()/memalloc_nofs_restore() pair. This shouldn't matter unless the system is really running out of memory. In that particular case, the filesystem freeze operation may fail while it was succeeding previously. Without this patch, the command sequence below will show that the lock dependency chain sb_internal -> fs_reclaim exists. # fsfreeze -f /home # fsfreeze --unfreeze /home # grep -i fs_reclaim -C 3 /proc/lockdep_chains | grep -C 5 sb_internal After applying the patch, such sb_internal -> fs_reclaim lock dependency chain can no longer be found. Because of that, the locking dependency warning will not be shown. Suggested-by: Dave Chinner Signed-off-by: Waiman Long Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_super.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 5a5d9453cf51..5ef5d8416f70 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -910,11 +910,21 @@ xfs_fs_freeze( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); + unsigned int flags; + int ret; + /* + * The filesystem is now frozen far enough that memory reclaim + * cannot safely operate on the filesystem. Hence we need to + * set a GFP_NOFS context here to avoid recursion deadlocks. + */ + flags = memalloc_nofs_save(); xfs_stop_block_reaping(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return xfs_sync_sb(mp, true); + ret = xfs_sync_sb(mp, true); + memalloc_nofs_restore(flags); + return ret; } STATIC int From 92a005448f6fed70b5e7a9f29a1f930118449f1b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 13 Jul 2020 09:13:00 -0700 Subject: [PATCH 048/117] xfs: get rid of unnecessary xfs_perag_{get,put} pairs In the course of some operations, we look up the perag from the mount multiple times to get or change perag information. These are often very short pieces of code, so while the lookup cost is generally low, the cost of the lookup is far higher than the cost of the operation we are doing on the perag. 
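To make the cost imbalance concrete, the repeated pattern looks roughly like this (an illustrative sketch only, not code from any patch in this series; the variables match the declarations used in the xfs_alloc.c hunks below):

	struct xfs_agf *agf = agbp->b_addr;
	struct xfs_perag *pag;

	/* today: a radix tree lookup and a put around one trivial update */
	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
	pag->pagf_freeblks += len;
	xfs_perag_put(pag);

	/* with this change: reuse the reference the AGF buffer already owns */
	agbp->b_pag->pagf_freeblks += len;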
Since we changed buffers to hold references to the perag they are cached in, many modification contexts already hold active references to the perag that are held across these operations. This is especially true for any operation that is serialised by an allocation group header buffer. In these cases, we can just use the buffer's reference to the perag to avoid needing to do lookups to access the perag. This means that many operations don't need to do perag lookups at all to access the perag because they've already looked up objects that own persistent references and hence can use that reference instead. Cc: Dave Chinner Cc: "Darrick J. Wong" Signed-off-by: Gao Xiang Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 4 ++-- fs/xfs/libxfs/xfs_ag_resv.h | 12 ---------- fs/xfs/libxfs/xfs_alloc.c | 22 ++++++----------- fs/xfs/libxfs/xfs_alloc_btree.c | 8 ++----- fs/xfs/libxfs/xfs_ialloc.c | 28 ++++++---------------- fs/xfs/libxfs/xfs_refcount_btree.c | 4 +--- fs/xfs/libxfs/xfs_rmap_btree.c | 9 ++++--- fs/xfs/xfs_inode.c | 38 +++++++++--------------------- 8 files changed, 34 insertions(+), 91 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 9d84007a5c65..8cf73fe4338e 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -563,7 +563,8 @@ xfs_ag_get_geometry( error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp); if (error) goto out_agi; - pag = xfs_perag_get(mp, agno); + + pag = agi_bp->b_pag; /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); @@ -583,7 +584,6 @@ xfs_ag_get_geometry( xfs_ag_geom_health(pag, ageo); /* Release resources. */ - xfs_perag_put(pag); xfs_buf_relse(agf_bp); out_agi: xfs_buf_relse(agi_bp); diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index f3fd0ee9a7f7..8a8eb4bc48bb 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -37,16 +37,4 @@ xfs_ag_resv_rmapbt_alloc( xfs_perag_put(pag); } -static inline void -xfs_ag_resv_rmapbt_free( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, agno); - xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); - xfs_perag_put(pag); -} - #endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 203e74fa64aa..bf4d07e5c73f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -710,13 +710,12 @@ xfs_alloc_read_agfl( STATIC int xfs_alloc_update_counters( struct xfs_trans *tp, - struct xfs_perag *pag, struct xfs_buf *agbp, long len) { struct xfs_agf *agf = agbp->b_addr; - pag->pagf_freeblks += len; + agbp->b_pag->pagf_freeblks += len; be32_add_cpu(&agf->agf_freeblks, len); xfs_trans_agblocks_delta(tp, len); @@ -1175,8 +1174,7 @@ xfs_alloc_ag_vextent( } if (!args->wasfromfl) { - error = xfs_alloc_update_counters(args->tp, args->pag, - args->agbp, + error = xfs_alloc_update_counters(args->tp, args->agbp, -((long)(args->len))); if (error) return error; @@ -1887,7 +1885,6 @@ xfs_free_ag_extent( enum xfs_ag_resv_type type) { struct xfs_mount *mp; - struct xfs_perag *pag; struct xfs_btree_cur *bno_cur; struct xfs_btree_cur *cnt_cur; xfs_agblock_t gtbno; /* start of right neighbor */ @@ -2167,10 +2164,8 @@ xfs_free_ag_extent( /* * Update the freespace totals in the ag and superblock. 
*/ - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_update_counters(tp, pag, agbp, len); - xfs_ag_resv_free_extent(pag, type, tp, len); - xfs_perag_put(pag); + error = xfs_alloc_update_counters(tp, agbp, len); + xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); if (error) goto error0; @@ -2689,7 +2684,7 @@ xfs_alloc_get_freelist( if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; - pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); @@ -2701,7 +2696,6 @@ xfs_alloc_get_freelist( pag->pagf_btreeblks++; logflags |= XFS_AGF_BTREEBLKS; } - xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; @@ -2797,7 +2791,7 @@ xfs_alloc_put_freelist( if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; - pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); @@ -2809,7 +2803,6 @@ xfs_alloc_put_freelist( pag->pagf_btreeblks--; logflags |= XFS_AGF_BTREEBLKS; } - xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); @@ -3006,7 +2999,7 @@ xfs_alloc_read_agf( ASSERT(!(*bpp)->b_error); agf = (*bpp)->b_addr; - pag = xfs_perag_get(mp, agno); + pag = (*bpp)->b_pag; if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); @@ -3034,7 +3027,6 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif - xfs_perag_put(pag); return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 60c453cb3ee3..3d1226aa2eb5 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -38,16 +38,14 @@ xfs_allocbt_set_root( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); pag->pagf_levels[btnum] += inc; - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -115,7 +113,6 @@ xfs_allocbt_update_lastrec( int reason) { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); struct xfs_perag *pag; __be32 len; int numrecs; @@ -160,9 +157,8 @@ xfs_allocbt_update_lastrec( } agf->agf_longest = len; - pag = xfs_perag_get(cur->bc_mp, seqno); + pag = cur->bc_ag.agbp->b_pag; pag->pagf_longest = be32_to_cpu(len); - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST); } diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 7fcf62b324b0..f742a96a2fe1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -888,10 +888,9 @@ sparse_alloc: */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - pag = xfs_perag_get(args.mp, agno); + pag = agbp->b_pag; pag->pagi_freecount += newlen; pag->pagi_count += newlen; - xfs_perag_put(pag); agi->agi_newino = cpu_to_be32(newino); /* @@ -1134,7 +1133,7 @@ xfs_dialloc_ag_inobt( xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); - struct xfs_perag *pag; + struct 
xfs_perag *pag = agbp->b_pag; struct xfs_btree_cur *cur, *tcur; struct xfs_inobt_rec_incore rec, trec; xfs_ino_t ino; @@ -1143,8 +1142,6 @@ xfs_dialloc_ag_inobt( int i, j; int searchdistance = 10; - pag = xfs_perag_get(mp, agno); - ASSERT(pag->pagi_init); ASSERT(pag->pagi_inodeok); ASSERT(pag->pagi_freecount > 0); @@ -1384,14 +1381,12 @@ alloc_inode: xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); - xfs_perag_put(pag); *inop = ino; return 0; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_perag_put(pag); return error; } @@ -1587,7 +1582,6 @@ xfs_dialloc_ag( xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); - struct xfs_perag *pag; struct xfs_btree_cur *cur; /* finobt cursor */ struct xfs_btree_cur *icur; /* inobt cursor */ struct xfs_inobt_rec_incore rec; @@ -1599,8 +1593,6 @@ xfs_dialloc_ag( if (!xfs_sb_version_hasfinobt(&mp->m_sb)) return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); - pag = xfs_perag_get(mp, agno); - /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -1667,7 +1659,7 @@ xfs_dialloc_ag( */ be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - pag->pagi_freecount--; + agbp->b_pag->pagi_freecount--; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); @@ -1680,7 +1672,6 @@ xfs_dialloc_ag( xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_perag_put(pag); *inop = ino; return 0; @@ -1688,7 +1679,6 @@ error_icur: xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); error_cur: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_perag_put(pag); return error; } @@ -1945,7 +1935,6 @@ xfs_difree_inobt( { struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); - struct xfs_perag *pag; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int ilen; @@ -2007,6 +1996,8 @@ xfs_difree_inobt( if (!(mp->m_flags & XFS_MOUNT_IKEEP) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { + struct xfs_perag *pag = agbp->b_pag; + xic->deleted = true; xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); @@ -2020,10 +2011,8 @@ xfs_difree_inobt( be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); - pag = xfs_perag_get(mp, agno); pag->pagi_freecount -= ilen - 1; pag->pagi_count -= ilen; - xfs_perag_put(pag); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); @@ -2049,9 +2038,7 @@ xfs_difree_inobt( */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - pag = xfs_perag_get(mp, agno); - pag->pagi_freecount++; - xfs_perag_put(pag); + agbp->b_pag->pagi_freecount++; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } @@ -2661,7 +2648,7 @@ xfs_ialloc_read_agi( return error; agi = (*bpp)->b_addr; - pag = xfs_perag_get(mp, agno); + pag = (*bpp)->b_pag; if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); @@ -2674,7 +2661,6 @@ xfs_ialloc_read_agi( */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || XFS_FORCED_SHUTDOWN(mp)); - xfs_perag_put(pag); return 0; } diff --git 
a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 7fd6044a4f78..c5296c124a4c 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -37,15 +37,13 @@ xfs_refcountbt_set_root( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_refcount_root = ptr->s; be32_add_cpu(&agf->agf_refcount_level, inc); pag->pagf_refcount_level += inc; - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index b7c05314d07c..94948a53569f 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -63,16 +63,14 @@ xfs_rmapbt_set_root( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); pag->pagf_levels[btnum] += inc; - xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -123,6 +121,7 @@ xfs_rmapbt_free_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag; xfs_agblock_t bno; int error; @@ -139,8 +138,8 @@ xfs_rmapbt_free_block( XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); - xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_ag.agno); - + pag = cur->bc_ag.agbp->b_pag; + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); return 0; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5c07bf491d9f..407d6299606d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2265,7 +2265,6 @@ xfs_iunlink( } if (next_agino != NULLAGINO) { - struct xfs_perag *pag; xfs_agino_t old_agino; /* @@ -2282,9 +2281,7 @@ xfs_iunlink( * agino has been unlinked, add a backref from the next inode * back to agino. */ - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_add_backref(pag, agino, next_agino); - xfs_perag_put(pag); + error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino); if (error) return error; } @@ -2420,7 +2417,6 @@ xfs_iunlink_remove( struct xfs_buf *agibp; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; - struct xfs_perag *pag = NULL; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; @@ -2464,32 +2460,22 @@ xfs_iunlink_remove( * this inode's backref to point from the next inode. */ if (next_agino != NULLAGINO) { - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_change_backref(pag, next_agino, + error = xfs_iunlink_change_backref(agibp->b_pag, next_agino, NULLAGINO); if (error) - goto out; + return error; } - if (head_agino == agino) { - /* Point the head of the list to the next unlinked inode. */ - error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, - next_agino); - if (error) - goto out; - } else { + if (head_agino != agino) { struct xfs_imap imap; xfs_agino_t prev_agino; - if (!pag) - pag = xfs_perag_get(mp, agno); - /* We need to search the list for the inode being freed. 
*/ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, &prev_agino, &imap, &last_dip, &last_ibp, - pag); + agibp->b_pag); if (error) - goto out; + return error; /* Point the previous inode on the list to the next inode. */ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, @@ -2503,15 +2489,13 @@ * change_backref takes care of deleting the backref if * next_agino is NULLAGINO. */ - error = xfs_iunlink_change_backref(pag, agino, next_agino); - if (error) - goto out; + return xfs_iunlink_change_backref(agibp->b_pag, agino, + next_agino); } -out: - if (pag) - xfs_perag_put(pag); - return error; + /* Point the head of the list to the next unlinked inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); } /* From 76622c88c2ce89d9ce494a0f656a2ad0abd67a69 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jul 2020 09:14:50 -0700 Subject: [PATCH 049/117] xfs: remove SYNC_WAIT and SYNC_TRYLOCK These two definitions are unused now. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Chaitanya Kulkarni --- fs/xfs/xfs_icache.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index ae92ca53de42..3a4c8b382cd0 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -17,9 +17,6 @@ struct xfs_eofblocks { __u64 eof_min_file_size; }; -#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ -#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ - /* * tags for inode radix tree */ From 8464e650b957888e3fb4789b4b16bf66840a287a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 13 Jul 2020 09:17:32 -0700 Subject: [PATCH 050/117] xfs: remove duplicated include from xfs_buf_item.c Remove duplicated include. Signed-off-by: YueHaibing Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Chaitanya Kulkarni --- fs/xfs/xfs_buf_item.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index e9428c30862a..ed1bf1d99483 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -19,7 +19,6 @@ #include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" -#include "xfs_trans_priv.h" #include "xfs_trace.h" #include "xfs_log.h" From 4750a171c3290f9bbebca16c6372db723a4cfa3b Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 15 Jul 2020 08:30:37 -0700 Subject: [PATCH 051/117] xfs: preserve inode versioning across remounts The MS_I_VERSION mount flag is exposed via the VFS, as documented in the mount manpages etc; see the iversion and noiversion mount options in mount(8). As a result, mount -o remount looks for this option in /proc/mounts and will only send the I_VERSION flag back in during remount if it is present. Since it's not there, a remount will /remove/ the I_VERSION flag at the vfs level, and iversion functionality is lost. xfs v5 superblocks intend to always have i_version enabled; it is set as a default at mount time, but is lost during remount for the reasons above. The generic fix would be to expose this documented option in /proc/mounts, but since that was rejected, fix it up again in the xfs remount path instead, so that at least xfs won't suffer from this misbehavior. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Reviewed-by: Darrick J.
Wong --- fs/xfs/xfs_super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 5ef5d8416f70..71ac6c1cdc36 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1721,6 +1721,10 @@ xfs_fc_reconfigure( int flags = fc->sb_flags; int error; + /* version 5 superblocks always support version counters. */ + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + fc->sb_flags |= SB_I_VERSION; + error = xfs_fc_validate_params(new_mp); if (error) return error; From f376b45e861d8b7b34bf0eceeecfdd00dbe65cde Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 16 Jul 2020 07:39:29 -0700 Subject: [PATCH 052/117] xfs: drain the buf delwri queue before xfsaild idles xfsaild is racy with respect to transaction abort and shutdown in that the task can idle or exit with an empty AIL but buffers still on the delwri queue. This was partly addressed by cancelling the delwri queue before the task exits to prevent memory leaks, but it's also possible for xfsaild to empty and idle with buffers on the delwri queue. For example, a transaction that pins a buffer that also happens to sit on the AIL delwri queue will explicitly remove the associated log item from the AIL if the transaction aborts. The side effect of this is an unmount hang in xfs_wait_buftarg() as the associated buffers remain held by the delwri queue indefinitely. This is reproduced on repeated runs of generic/531 with an fs format (-mrmapbt=1 -bsize=1k) that happens to also reproduce transaction aborts. Update xfsaild to not idle until both the AIL and associated delwri queue are empty and update the push code to continue delwri queue submission attempts even when the AIL is empty. This allows the AIL to eventually release aborted buffers stranded on the delwri queue when they are unlocked by the associated transaction. This should have no significant effect on normal runtime behavior because the xfsaild currently idles only when the AIL is empty and in practice the AIL is rarely empty with a populated delwri queue. The items must be AIL resident to land in the queue in the first place and generally aren't removed until writeback completes. Note that the pre-existing delwri queue cancel logic in the exit path is retained because task stop is external, could technically come at any point, and xfsaild is still responsible to release its buffer references before it exits. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_trans_ail.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index c3be6e440134..0c783d339675 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -448,16 +448,10 @@ xfsaild_push( target = ailp->ail_target; ailp->ail_target_prev = target; + /* we're done if the AIL is empty or our push has reached the end */ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); - if (!lip) { - /* - * If the AIL is empty or our push has reached the end we are - * done now. 
- */ - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + if (!lip) goto out_done; - } XFS_STATS_INC(mp, xs_push_ail); @@ -539,6 +533,8 @@ xfsaild_push( break; lsn = lip->li_lsn; } + +out_done: xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); @@ -546,7 +542,6 @@ xfsaild_push( ailp->ail_log_flush++; if (!count || XFS_LSN_CMP(lsn, target) >= 0) { -out_done: /* * We reached the target or the AIL is empty, so wait a bit * longer for I/O to complete and remove pushed items from the @@ -638,7 +633,8 @@ xfsaild( */ smp_rmb(); if (!xfs_ail_min(ailp) && - ailp->ail_target == ailp->ail_target_prev) { + ailp->ail_target == ailp->ail_target_prev && + list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); freezable_schedule(); tout = 0; From b2a8864728683443f34a9fd33a2b78b860934cc1 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 15 Jul 2020 18:44:50 -0700 Subject: [PATCH 053/117] xfs: fix inode allocation block res calculation precedence The block reservation calculation for inode allocation is supposed to consist of the blocks required for the inode chunk plus (maxlevels-1) of the inode btree multiplied by the number of inode btrees in the fs (2 when finobt is enabled, 1 otherwise). Instead, the macro returns (ialloc_blocks + 2) due to a precedence error in the calculation logic. This leads to block reservation overruns via generic/531 on small block filesystems with finobt enabled. Add braces to fix the calculation and reserve the appropriate number of blocks. Fixes: 9d43b180af67 ("xfs: update inode allocation/free transaction reservations for finobt") Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_trans_space.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 88221c7a04cc..c6df01a2a158 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -57,7 +57,7 @@ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ (M_IGEO(mp)->ialloc_blks + \ - (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ + ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \ (M_IGEO(mp)->inobt_maxlevels - 1))) /* From c97738a960a86081a147e7d436138e6481757445 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:36:08 -0700 Subject: [PATCH 054/117] xfs: clear XFS_DQ_FREEING if we can't lock the dquot buffer to flush In commit 8d3d7e2b35ea, we changed xfs_qm_dqpurge to bail out if we can't lock the dquot buf to flush the dquot. This prevents the AIL from blocking on the dquot, but it also forgets to clear the FREEING flag on its way out. A subsequent purge attempt will see the FREEING flag is set and bail out, which leads to dqpurge_all failing to purge all the dquots. (copy-pasting from Dave Chinner's identical patch) This was found by inspection after having xfs/305 hang 1 in ~50 iterations in a quotaoff operation: [ 8872.301115] xfs_quota D13888 92262 91813 0x00004002 [ 8872.302538] Call Trace: [ 8872.303193] __schedule+0x2d2/0x780 [ 8872.304108] ? do_raw_spin_unlock+0x57/0xd0 [ 8872.305198] schedule+0x6e/0xe0 [ 8872.306021] schedule_timeout+0x14d/0x300 [ 8872.307060] ? __next_timer_interrupt+0xe0/0xe0 [ 8872.308231] ? 
xfs_qm_dqusage_adjust+0x200/0x200 [ 8872.309422] schedule_timeout_uninterruptible+0x2a/0x30 [ 8872.310759] xfs_qm_dquot_walk.isra.0+0x15a/0x1b0 [ 8872.311971] xfs_qm_dqpurge_all+0x7f/0x90 [ 8872.313022] xfs_qm_scall_quotaoff+0x18d/0x2b0 [ 8872.314163] xfs_quota_disable+0x3a/0x60 [ 8872.315179] kernel_quotactl+0x7e2/0x8d0 [ 8872.316196] ? __do_sys_newstat+0x51/0x80 [ 8872.317238] __x64_sys_quotactl+0x1e/0x30 [ 8872.318266] do_syscall_64+0x46/0x90 [ 8872.319193] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 8872.320490] RIP: 0033:0x7f46b5490f2a [ 8872.321414] Code: Bad RIP value. Returning -EAGAIN from xfs_qm_dqpurge() without clearing the XFS_DQ_FREEING flag means the xfs_qm_dqpurge_all() code can never free the dquot, and we loop forever waiting for the XFS_DQ_FREEING flag to go away on the dquot that leaked it via -EAGAIN. Fixes: 8d3d7e2b35ea ("xfs: trylock underlying buffer on dquot flush") Signed-off-by: Darrick J. Wong Reviewed-by: Allison Collins Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner Reviewed-by: Dave Chinner --- fs/xfs/xfs_qm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index d6cd83317344..938023dd8ce5 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -148,6 +148,7 @@ xfs_qm_dqpurge( error = xfs_bwrite(bp); xfs_buf_relse(bp); } else if (error == -EAGAIN) { + dqp->dq_flags &= ~XFS_DQ_FREEING; goto out_unlock; } xfs_dqflock(dqp); From f959b5d037e71a4d69b5bf71faffa065d9269b4a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:36:09 -0700 Subject: [PATCH 055/117] xfs: fix inode quota reservation checks xfs_trans_dqresv is the function that we use to make reservations against resource quotas. Each resource contains two counters: the q_core counter, which tracks resources allocated on disk; and the dquot reservation counter, which tracks how much of that resource has either been allocated or reserved by threads that are working on metadata updates. For disk blocks, we compare the proposed reservation counter against the hard and soft limits to decide if we're going to fail the operation. However, for inodes we inexplicably compare against the q_core counter, not the incore reservation count. Since the q_core counter is always lower than the reservation count and we unlock the dquot between reservation and transaction commit, this means that multiple threads can reserve the last inode count before we hit the hard limit, and when they commit, we'll be well over the hard limit. Fix this by checking against the incore inode reservation counter, since we would appear to maintain that correctly (and that's what we report in GETQUOTA). Signed-off-by: Darrick J. Wong Reviewed-by: Allison Collins Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans_dquot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c0f73b82c055..ed0ce8b301b4 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -647,7 +647,7 @@ xfs_trans_dqresv( } } if (ninos > 0) { - total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos; + total_count = dqp->q_res_icount + ninos; timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); warnlimit = defq->iwarnlimit; From afeda6000b0cb6d873920be4ce67f351f017a654 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 14 Jul 2020 10:36:09 -0700 Subject: [PATCH 056/117] xfs: validate ondisk/incore dquot flags While loading dquot records off disk, make sure that the quota type flags are the same between the incore dquot and the ondisk dquot. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R --- fs/xfs/xfs_dquot.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 76353c9a723e..7503c6695569 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -23,6 +23,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_bmap_btree.h" +#include "xfs_error.h" /* * Lock order: @@ -524,13 +525,26 @@ xfs_dquot_alloc( } /* Copy the in-core quota fields in from the on-disk buffer. */ -STATIC void +STATIC int xfs_dquot_from_disk( struct xfs_dquot *dqp, struct xfs_buf *bp) { struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; + /* + * Ensure that we got the type and ID we were looking for. + * Everything else was checked by the dquot buffer verifier. + */ + if ((ddqp->d_flags & XFS_DQ_ALLTYPES) != dqp->dq_flags || + ddqp->d_id != dqp->q_core.d_id) { + xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, + "Metadata corruption detected at %pS, quota %u", + __this_address, be32_to_cpu(dqp->q_core.d_id)); + xfs_alert(bp->b_mount, "Unmount and run xfs_repair"); + return -EFSCORRUPTED; + } + /* copy everything from disk dquot to the incore dquot */ memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot)); @@ -544,6 +558,7 @@ xfs_dquot_from_disk( /* initialize the dquot speculative prealloc thresholds */ xfs_dquot_set_prealloc_limits(dqp); + return 0; } /* Allocate and initialize the dquot buffer for this in-core dquot. */ @@ -617,9 +632,11 @@ xfs_qm_dqread( * further. */ ASSERT(xfs_buf_islocked(bp)); - xfs_dquot_from_disk(dqp, bp); - + error = xfs_dquot_from_disk(dqp, bp); xfs_buf_relse(bp); + if (error) + goto err; + *dqpp = dqp; return error; From 41ed4a5f2ba41882d8fbdf4cf455855e80ab6b90 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:36:09 -0700 Subject: [PATCH 057/117] xfs: move the flags argument of xfs_qm_scall_trunc_qfiles to XFS_QMOPT_* Since xfs_qm_scall_trunc_qfiles can take a bitset of quota types that we want to truncate, change the flags argument to take XFS_QMOPT_[UGP}QUOTA so that the next patch can start to deprecate XFS_DQ_*. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R --- fs/xfs/xfs_qm_syscalls.c | 8 ++++---- fs/xfs/xfs_quotaops.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 7effd7a28136..35fad348e3a2 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -322,23 +322,23 @@ xfs_qm_scall_trunc_qfiles( int error = -EINVAL; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || - (flags & ~XFS_DQ_ALLTYPES)) { + (flags & ~XFS_QMOPT_QUOTALL)) { xfs_debug(mp, "%s: flags=%x m_qflags=%x", __func__, flags, mp->m_qflags); return -EINVAL; } - if (flags & XFS_DQ_USER) { + if (flags & XFS_QMOPT_UQUOTA) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); if (error) return error; } - if (flags & XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_GQUOTA) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); if (error) return error; } - if (flags & XFS_DQ_PROJ) + if (flags & XFS_QMOPT_PQUOTA) error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); return error; diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index bf809b77a316..0868e6ee2219 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -205,11 +205,11 @@ xfs_fs_rm_xquota( return -EINVAL; if (uflags & FS_USER_QUOTA) - flags |= XFS_DQ_USER; + flags |= XFS_QMOPT_UQUOTA; if (uflags & FS_GROUP_QUOTA) - flags |= XFS_DQ_GROUP; + flags |= XFS_QMOPT_GQUOTA; if (uflags & FS_PROJ_QUOTA) - flags |= XFS_DQ_PROJ; + flags |= XFS_QMOPT_PQUOTA; return xfs_qm_scall_trunc_qfiles(mp, flags); } From 0dcc0728c119cb1b295c18c379bb39416ce86aeb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:12 -0700 Subject: [PATCH 058/117] xfs: refactor quotacheck flags usage We only use the XFS_QMOPT flags in quotacheck to signal the quota type, so rip out all the flags handling and just pass the type all the way through. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R --- fs/xfs/xfs_qm.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 938023dd8ce5..259ec5738c33 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -902,17 +902,13 @@ xfs_qm_reset_dqcounts_all( xfs_dqid_t firstid, xfs_fsblock_t bno, xfs_filblks_t blkcnt, - uint flags, + uint type, struct list_head *buffer_list) { struct xfs_buf *bp; - int error; - int type; + int error = 0; ASSERT(blkcnt > 0); - type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : - (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); - error = 0; /* * Blkcnt arg can be a very big number, and might even be @@ -972,7 +968,7 @@ STATIC int xfs_qm_reset_dqcounts_buf( struct xfs_mount *mp, struct xfs_inode *qip, - uint flags, + uint type, struct list_head *buffer_list) { struct xfs_bmbt_irec *map; @@ -1048,7 +1044,7 @@ xfs_qm_reset_dqcounts_buf( error = xfs_qm_reset_dqcounts_all(mp, firstid, map[i].br_startblock, map[i].br_blockcount, - flags, buffer_list); + type, buffer_list); if (error) goto out; } @@ -1292,7 +1288,7 @@ xfs_qm_quotacheck( * We don't log our changes till later. 
*/ if (uip) { - error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_DQ_USER, &buffer_list); if (error) goto error_return; @@ -1300,7 +1296,7 @@ xfs_qm_quotacheck( } if (gip) { - error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_DQ_GROUP, &buffer_list); if (error) goto error_return; @@ -1308,7 +1304,7 @@ xfs_qm_quotacheck( } if (pip) { - error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_DQ_PROJ, &buffer_list); if (error) goto error_return; From 985a78fdde15e1730383f99867ca38b5648444bf Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:13 -0700 Subject: [PATCH 059/117] xfs: rename dquot incore state flags Rename the existing incore dquot "dq_flags" field to "q_flags" to match everything else in the structure, then move the two actual dquot state flags to the XFS_DQFLAG_ namespace from XFS_DQ_. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R --- fs/xfs/libxfs/xfs_quota_defs.h | 10 +++++----- fs/xfs/xfs_dquot.c | 6 +++--- fs/xfs/xfs_dquot.h | 5 +++-- fs/xfs/xfs_qm.c | 12 ++++++------ fs/xfs/xfs_qm_syscalls.c | 2 +- fs/xfs/xfs_trace.h | 4 ++-- fs/xfs/xfs_trans_dquot.c | 2 +- 7 files changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 56d9dd787e7b..e2da08055e6b 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -24,17 +24,17 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_DQ_USER 0x0001 /* a user quota */ #define XFS_DQ_PROJ 0x0002 /* project quota */ #define XFS_DQ_GROUP 0x0004 /* a group quota */ -#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */ +#define XFS_DQFLAG_DIRTY 0x0008 /* dquot is dirty */ +#define XFS_DQFLAG_FREEING 0x0010 /* dquot is being torn down */ #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) -#define XFS_DQ_FLAGS \ +#define XFS_DQFLAG_STRINGS \ { XFS_DQ_USER, "USER" }, \ { XFS_DQ_PROJ, "PROJ" }, \ { XFS_DQ_GROUP, "GROUP" }, \ - { XFS_DQ_DIRTY, "DIRTY" }, \ - { XFS_DQ_FREEING, "FREEING" } + { XFS_DQFLAG_DIRTY, "DIRTY" }, \ + { XFS_DQFLAG_FREEING, "FREEING" } /* * We have the possibility of all three quota types being active at once, and diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7503c6695569..5408ec82e0d9 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -723,7 +723,7 @@ restart: } xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) { + if (dqp->q_flags & XFS_DQFLAG_FREEING) { xfs_dqunlock(dqp); mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_freeing(dqp); @@ -1179,7 +1179,7 @@ xfs_qm_dqflush( /* * Clear the dirty field and remember the flush lsn for later use. 
*/ - dqp->dq_flags &= ~XFS_DQ_DIRTY; + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, &dqp->q_logitem.qli_item.li_lsn); @@ -1220,7 +1220,7 @@ xfs_qm_dqflush( return 0; out_abort: - dqp->dq_flags &= ~XFS_DQ_DIRTY; + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_delete(lip, 0); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); out_unlock: diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 71e36c85e20b..6d43c48c67a1 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -31,9 +31,10 @@ enum { * The incore dquot structure */ struct xfs_dquot { - uint dq_flags; struct list_head q_lru; struct xfs_mount *q_mount; + uint8_t dq_flags; + uint16_t q_flags; uint q_nrefs; xfs_daddr_t q_blkno; int q_bufoffset; @@ -145,7 +146,7 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) } #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) -#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) +#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 259ec5738c33..f80bcba83616 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -124,10 +124,10 @@ xfs_qm_dqpurge( int error = -EAGAIN; xfs_dqlock(dqp); - if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) + if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0) goto out_unlock; - dqp->dq_flags |= XFS_DQ_FREEING; + dqp->q_flags |= XFS_DQFLAG_FREEING; xfs_dqflock(dqp); @@ -148,7 +148,7 @@ xfs_qm_dqpurge( error = xfs_bwrite(bp); xfs_buf_relse(bp); } else if (error == -EAGAIN) { - dqp->dq_flags &= ~XFS_DQ_FREEING; + dqp->q_flags &= ~XFS_DQFLAG_FREEING; goto out_unlock; } xfs_dqflock(dqp); @@ -474,7 +474,7 @@ xfs_qm_dquot_isolate( /* * Prevent lookups now that we are past the point of no return. 
*/ - dqp->dq_flags |= XFS_DQ_FREEING; + dqp->q_flags |= XFS_DQFLAG_FREEING; xfs_dqunlock(dqp); ASSERT(dqp->q_nrefs == 0); @@ -1113,7 +1113,7 @@ xfs_qm_quotacheck_dqadjust( xfs_qm_adjust_dqtimers(mp, dqp); } - dqp->dq_flags |= XFS_DQ_DIRTY; + dqp->q_flags |= XFS_DQFLAG_DIRTY; xfs_qm_dqput(dqp); return 0; } @@ -1219,7 +1219,7 @@ xfs_qm_flush_one( int error = 0; xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) + if (dqp->q_flags & XFS_DQFLAG_FREEING) goto out_unlock; if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 35fad348e3a2..a9e4a7e1b9d9 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -598,7 +598,7 @@ xfs_qm_scall_setqlim( */ xfs_qm_adjust_dqtimers(mp, dqp); } - dqp->dq_flags |= XFS_DQ_DIRTY; + dqp->q_flags |= XFS_DQFLAG_DIRTY; xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 50c478374a31..97d8daf11816 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -877,7 +877,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, TP_fast_assign( __entry->dev = dqp->q_mount->m_super->s_dev; __entry->id = be32_to_cpu(dqp->q_core.d_id); - __entry->flags = dqp->dq_flags; + __entry->flags = dqp->dq_flags | dqp->q_flags; __entry->nrefs = dqp->q_nrefs; __entry->res_bcount = dqp->q_res_bcount; __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); @@ -896,7 +896,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->id, - __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), __entry->nrefs, __entry->res_bcount, __entry->bcount, diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index ed0ce8b301b4..8963cfac2a6a 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -391,7 +391,7 @@ xfs_trans_apply_dquot_deltas( xfs_qm_adjust_dqtimers(tp->t_mountp, dqp); } - dqp->dq_flags |= XFS_DQ_DIRTY; + dqp->q_flags |= XFS_DQFLAG_DIRTY; /* * add this to the list of items to get logged */ From cb64e1299364a51bf60e96f2a35df31f47aa2eee Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:13 -0700 Subject: [PATCH 060/117] xfs: make XFS_DQUOT_CLUSTER_SIZE_FSB part of the ondisk format Move the dquot cluster size #define to xfs_format.h. It is an important part of the ondisk format because the ondisk dquot record size is not an even power of two, which means that the buffer size we use is significant here because the kernel leaves slack space at the end of the buffer to avoid having to deal with a dquot record crossing a block boundary. This is also an excuse to fix one of the longstanding discrepancies between kernel and userspace libxfs headers. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R --- fs/xfs/libxfs/xfs_format.h | 16 ++++++++++++++++ fs/xfs/xfs_qm.h | 11 ----------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b42a52bfa1e9..4f665646ba7d 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1198,6 +1198,22 @@ typedef struct xfs_dqblk { #define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) +/* + * This defines the unit of allocation of dquots. + * + * Currently, it is just one file system block, and a 4K blk contains 30 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make + * this more dynamic. 
+ * + * However, if this number is changed, we have to make sure that we don't + * implicitly assume that we do allocations in chunks of a single filesystem + * block in the dquot/xqm code. + * + * This is part of the ondisk format because the structure size is not a power + * of two, which leaves slack at the end of the disk block. + */ +#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + /* * Remote symlink format and access functions. */ diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 7b0e771fcbce..2c8ca9df23af 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -30,17 +30,6 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; !dqp->q_core.d_rtbcount && \ !dqp->q_core.d_icount) -/* - * This defines the unit of allocation of dquots. - * Currently, it is just one file system block, and a 4K blk contains 30 - * (136 * 30 = 4080) dquots. It's probably not worth trying to make - * this more dynamic. - * XXXsup However, if this number is changed, we have to make sure that we don't - * implicitly assume that we do allocations in chunks of a single filesystem - * block in the dquot/xqm code. - */ -#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 - /* Defaults for each quota type: time limits, warn limits, usage limits */ struct xfs_def_quota { time64_t btimelimit; /* limit for blks timer */ From 0b0fa1d1d16794ce48188cc6434ec38f6e1f1b4b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:22 -0700 Subject: [PATCH 061/117] xfs: stop using q_core.d_flags in the quota code Use the incore dq_flags to figure out the dquot type. This is the first step towards removing xfs_disk_dquot from the incore dquot. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R --- fs/xfs/xfs_dquot.c | 36 ++++++++++++++++++++++++++++++++++-- fs/xfs/xfs_dquot.h | 2 ++ fs/xfs/xfs_dquot_item.c | 6 ++++-- 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 5408ec82e0d9..3d1d876b45fc 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -561,6 +561,15 @@ xfs_dquot_from_disk( return 0; } +/* Copy the in-core quota fields into the on-disk buffer. */ +void +xfs_dquot_to_disk( + struct xfs_disk_dquot *ddqp, + struct xfs_dquot *dqp) +{ + memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot)); +} + /* Allocate and initialize the dquot buffer for this in-core dquot. */ static int xfs_qm_dqread_alloc( @@ -1115,6 +1124,21 @@ xfs_dquot_done( } } +/* Check incore dquot for errors before we flush. */ +static xfs_failaddr_t +xfs_qm_dqflush_check( + struct xfs_dquot *dqp) +{ + __u8 type = dqp->dq_flags & XFS_DQ_ALLTYPES; + + if (type != XFS_DQ_USER && + type != XFS_DQ_GROUP && + type != XFS_DQ_PROJ) + return __this_address; + + return NULL; +} + /* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. @@ -1173,8 +1197,16 @@ xfs_qm_dqflush( goto out_abort; } - /* This is the only portion of data that needs to persist */ - memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot)); + fa = xfs_qm_dqflush_check(dqp); + if (fa) { + xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", + be32_to_cpu(dqp->q_core.d_id), fa); + xfs_buf_relse(bp); + error = -EFSCORRUPTED; + goto out_abort; + } + + xfs_dquot_to_disk(ddqp, dqp); /* * Clear the dirty field and remember the flush lsn for later use. 
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 6d43c48c67a1..944e43a0e202 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -145,6 +145,8 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) return false; } +void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); + #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index d7e4de7151d7..fc21e48c889c 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -45,6 +45,7 @@ xfs_qm_dquot_logitem_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { + struct xfs_disk_dquot ddq; struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); struct xfs_log_iovec *vecp = NULL; struct xfs_dq_logformat *qlf; @@ -58,8 +59,9 @@ xfs_qm_dquot_logitem_format( qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, - &qlip->qli_dquot->q_core, + xfs_dquot_to_disk(&ddq, qlip->qli_dquot); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &ddq, sizeof(struct xfs_disk_dquot)); } From c51df7334167e445f2cafc5511d6e2407a32e8f1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:30 -0700 Subject: [PATCH 062/117] xfs: stop using q_core.d_id in the quota code Add a dquot id field to the incore dquot, and use that instead of the one in qcore. This eliminates a bunch of endian conversions and will eventually allow us to remove qcore entirely. We also rearrange the start of xfs_dquot to remove padding holes, saving 8 bytes. Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig Reviewed-by: Allison Collins --- fs/xfs/scrub/quota.c | 23 ++++++++++++++--------- fs/xfs/xfs_dquot.c | 25 +++++++++++-------------- fs/xfs/xfs_dquot.h | 3 ++- fs/xfs/xfs_dquot_item.c | 2 +- fs/xfs/xfs_qm.c | 22 ++++++++++------------ fs/xfs/xfs_qm_syscalls.c | 4 ++-- fs/xfs/xfs_trace.h | 2 +- fs/xfs/xfs_trans_dquot.c | 8 +++----- 8 files changed, 44 insertions(+), 45 deletions(-) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 905a34558361..145f8710bac9 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -92,7 +92,6 @@ xchk_quota_item( unsigned long long icount; unsigned long long rcount; xfs_ino_t fs_icount; - xfs_dqid_t id = be32_to_cpu(d->d_id); int error = 0; if (xchk_should_terminate(sc, &error)) @@ -102,11 +101,11 @@ xchk_quota_item( * Except for the root dquot, the actual dquot we got must either have * the same or higher id as we saw before. */ - offset = id / qi->qi_dqperchunk; - if (id && id <= sqi->last_id) + offset = dq->q_id / qi->qi_dqperchunk; + if (dq->q_id && dq->q_id <= sqi->last_id) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - sqi->last_id = id; + sqi->last_id = dq->q_id; /* Did we get the dquot type we wanted? */ if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) @@ -175,13 +174,19 @@ xchk_quota_item( * lower limit than the actual usage. However, we flag it for * admin review. 
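 * (The id 0 dquot carries the default limits rather than a real
 * user's usage, which is why the rewritten checks below skip it.)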
*/ - if (id != 0 && bhard != 0 && bcount > bhard) - xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (id != 0 && ihard != 0 && icount > ihard) - xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (id != 0 && rhard != 0 && rcount > rhard) + if (dq->q_id == 0) + goto out; + + if (bhard != 0 && bcount > bhard) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (ihard != 0 && icount > ihard) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + + if (rhard != 0 && rcount > rhard) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + +out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -EFSCORRUPTED; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 3d1d876b45fc..a91a0c7e9103 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -75,7 +75,7 @@ xfs_qm_adjust_dqlimits( struct xfs_def_quota *defq; int prealloc = 0; - ASSERT(d->d_id); + ASSERT(dq->q_id); defq = xfs_get_defquota(q, xfs_dquot_type(dq)); if (defq->bsoftlimit && !d->d_blk_softlimit) { @@ -121,7 +121,7 @@ xfs_qm_adjust_dqtimers( struct xfs_disk_dquot *d = &dq->q_core; struct xfs_def_quota *defq; - ASSERT(d->d_id); + ASSERT(dq->q_id); defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); #ifdef DEBUG @@ -366,7 +366,7 @@ xfs_dquot_disk_alloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), + xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, dqp->dq_flags & XFS_DQ_ALLTYPES, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); @@ -479,7 +479,7 @@ xfs_dquot_alloc( dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0); dqp->dq_flags = type; - dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_id = id; dqp->q_mount = mp; INIT_LIST_HEAD(&dqp->q_lru); mutex_init(&dqp->q_qlock); @@ -537,10 +537,10 @@ xfs_dquot_from_disk( * Everything else was checked by the dquot buffer verifier. 
	 */
 	if ((ddqp->d_flags & XFS_DQ_ALLTYPES) != dqp->dq_flags ||
-	    ddqp->d_id != dqp->q_core.d_id) {
+	    be32_to_cpu(ddqp->d_id) != dqp->q_id) {
 		xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
 			"Metadata corruption detected at %pS, quota %u",
-			__this_address, be32_to_cpu(dqp->q_core.d_id));
+			__this_address, dqp->q_id);
 		xfs_alert(bp->b_mount, "Unmount and run xfs_repair");
 		return -EFSCORRUPTED;
 	}
@@ -1187,11 +1187,10 @@ xfs_qm_dqflush(
 	ddqp = &dqb->dd_diskdq;
 
 	/* sanity check the in-core structure before we flush */
-	fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(dqp->q_core.d_id),
-			      0);
+	fa = xfs_dquot_verify(mp, &dqp->q_core, dqp->q_id, 0);
 	if (fa) {
 		xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
-				be32_to_cpu(dqp->q_core.d_id), fa);
+				dqp->q_id, fa);
 		xfs_buf_relse(bp);
 		error = -EFSCORRUPTED;
 		goto out_abort;
@@ -1200,7 +1199,7 @@ xfs_qm_dqflush(
 	fa = xfs_qm_dqflush_check(dqp);
 	if (fa) {
 		xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
-				be32_to_cpu(dqp->q_core.d_id), fa);
+				dqp->q_id, fa);
 		xfs_buf_relse(bp);
 		error = -EFSCORRUPTED;
 		goto out_abort;
@@ -1273,8 +1272,7 @@ xfs_dqlock2(
 {
 	if (d1 && d2) {
 		ASSERT(d1 != d2);
-		if (be32_to_cpu(d1->q_core.d_id) >
-		    be32_to_cpu(d2->q_core.d_id)) {
+		if (d1->q_id > d2->q_id) {
 			mutex_lock(&d2->q_qlock);
 			mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
 		} else {
@@ -1342,9 +1340,8 @@ xfs_qm_dqiterate(
 			return error;
 
 		error = iter_fn(dq, dqtype, priv);
-		id = be32_to_cpu(dq->q_core.d_id);
+		id = dq->q_id + 1;
 		xfs_qm_dqput(dq);
-		id++;
 	} while (error == 0 && id != 0);
 
 	return error;
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 944e43a0e202..41664a1fc073 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -35,9 +35,10 @@ struct xfs_dquot {
 	struct xfs_mount	*q_mount;
 	uint8_t			dq_flags;
 	uint16_t		q_flags;
+	xfs_dqid_t		q_id;
 	uint			q_nrefs;
-	xfs_daddr_t		q_blkno;
 	int			q_bufoffset;
+	xfs_daddr_t		q_blkno;
 	xfs_fileoff_t		q_fileoffset;
 
 	struct xfs_disk_dquot	q_core;
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index fc21e48c889c..8c1fdf37ee8f 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -53,7 +53,7 @@ xfs_qm_dquot_logitem_format(
 	qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
 	qlf->qlf_type = XFS_LI_DQUOT;
 	qlf->qlf_size = 2;
-	qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
+	qlf->qlf_id = qlip->qli_dquot->q_id;
 	qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
 	qlf->qlf_len = 1;
 	qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f80bcba83616..a2a860ae1564 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -79,7 +79,7 @@ restart:
 		for (i = 0; i < nr_found; i++) {
 			struct xfs_dquot *dqp = batch[i];
 
-			next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
+			next_index = dqp->q_id + 1;
 
 			error = execute(batch[i], data);
 			if (error == -EAGAIN) {
@@ -161,8 +161,7 @@ xfs_qm_dqpurge(
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
 
-	radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
-			  be32_to_cpu(dqp->q_core.d_id));
+	radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), dqp->q_id);
 	qi->qi_dquots--;
 
 	/*
@@ -1108,7 +1107,7 @@ xfs_qm_quotacheck_dqadjust(
 	 *
 	 * There are no timers for the default values set in the root dquot.
*/ - if (dqp->q_core.d_id) { + if (dqp->q_id) { xfs_qm_adjust_dqlimits(mp, dqp); xfs_qm_adjust_dqtimers(mp, dqp); } @@ -1594,8 +1593,7 @@ xfs_qm_dqfree_one( struct xfs_quotainfo *qi = mp->m_quotainfo; mutex_lock(&qi->qi_tree_lock); - radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), - be32_to_cpu(dqp->q_core.d_id)); + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), dqp->q_id); qi->qi_dquots--; mutex_unlock(&qi->qi_tree_lock); @@ -1819,7 +1817,7 @@ xfs_qm_vop_chown_reserve( XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && - i_uid_read(VFS_I(ip)) != be32_to_cpu(udqp->q_core.d_id)) { + i_uid_read(VFS_I(ip)) != udqp->q_id) { udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to @@ -1832,7 +1830,7 @@ xfs_qm_vop_chown_reserve( } } if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - i_gid_read(VFS_I(ip)) != be32_to_cpu(gdqp->q_core.d_id)) { + i_gid_read(VFS_I(ip)) != gdqp->q_id) { gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); @@ -1841,7 +1839,7 @@ xfs_qm_vop_chown_reserve( } if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) { + ip->i_d.di_projid != pdqp->q_id) { pdq_delblks = pdqp; if (delblks) { ASSERT(ip->i_pdquot); @@ -1925,21 +1923,21 @@ xfs_qm_vop_create_dqattach( if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(i_uid_read(VFS_I(ip)) == be32_to_cpu(udqp->q_core.d_id)); + ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(i_gid_read(VFS_I(ip)) == be32_to_cpu(gdqp->q_core.d_id)); + ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id); ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id)); + ASSERT(ip->i_d.di_projid == pdqp->q_id); ip->i_pdquot = xfs_qm_dqhold(pdqp); xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index a9e4a7e1b9d9..8529eee3454c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -659,7 +659,7 @@ xfs_qm_scall_getquota_fill_qc( if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && - dqp->q_core.d_id != 0) { + dqp->q_id != 0) { if ((dst->d_space > dst->d_spc_softlimit) && (dst->d_spc_softlimit > 0)) { ASSERT(dst->d_spc_timer != 0); @@ -726,7 +726,7 @@ xfs_qm_scall_getquota_next( return error; /* Fill in the ID we actually read from disk */ - *id = be32_to_cpu(dqp->q_core.d_id); + *id = dqp->q_id; xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 97d8daf11816..39f82dbafa37 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -876,7 +876,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, ), \ TP_fast_assign( __entry->dev = dqp->q_mount->m_super->s_dev; - __entry->id = be32_to_cpu(dqp->q_core.d_id); + __entry->id = dqp->q_id; __entry->flags = dqp->dq_flags | dqp->q_flags; __entry->nrefs = dqp->q_nrefs; __entry->res_bcount = dqp->q_res_bcount; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 8963cfac2a6a..8dd6aa6e8fa3 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -386,7 +386,7 @@ 
xfs_trans_apply_dquot_deltas(
 			 * Get any default limits in use.
 			 * Start/reset the timer(s) if needed.
 			 */
-			if (d->d_id) {
+			if (dqp->q_id) {
 				xfs_qm_adjust_dqlimits(tp->t_mountp, dqp);
 				xfs_qm_adjust_dqtimers(tp->t_mountp, dqp);
 			}
@@ -558,8 +558,7 @@ xfs_quota_warn(
 	else
 		qtype = GRPQUOTA;
 
-	quota_send_warning(make_kqid(&init_user_ns, qtype,
-			   be32_to_cpu(dqp->q_core.d_id)),
+	quota_send_warning(make_kqid(&init_user_ns, qtype, dqp->q_id),
 			   mp->m_super->s_dev, type);
 }
 
@@ -618,8 +617,7 @@ xfs_trans_dqresv(
 		resbcountp = &dqp->q_res_rtbcount;
 	}
 
-	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
-	    dqp->q_core.d_id &&
+	if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id &&
 	    ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
 	     (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
 	     (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {

From 784e80f5640db9a925af1143a25e9bb98624350d Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Tue, 14 Jul 2020 10:37:30 -0700
Subject: [PATCH 063/117] xfs: use a per-resource struct for incore dquot data

Introduce a new struct xfs_dquot_res that we'll use to track all the
incore data for a particular resource type (block, inode, rt block).
This will help us (once we've eliminated q_core) to declutter quota
functions that currently open-code field access or pass fields around
explicitly.

Signed-off-by: Darrick J. Wong
Reviewed-by: Chandan Babu R
Reviewed-by: Christoph Hellwig
Reviewed-by: Allison Collins
---
 fs/xfs/xfs_dquot.c       |  6 +++---
 fs/xfs/xfs_dquot.h       | 18 +++++++++-------
 fs/xfs/xfs_iomap.c       |  6 +++---
 fs/xfs/xfs_qm.c          |  6 +++---
 fs/xfs/xfs_qm_bhv.c      |  8 ++++----
 fs/xfs/xfs_qm_syscalls.c |  6 +++---
 fs/xfs/xfs_trace.h       |  2 +-
 fs/xfs/xfs_trans_dquot.c | 44 ++++++++++++++++++++--------------------
 8 files changed, 50 insertions(+), 46 deletions(-)

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a91a0c7e9103..8fc7ae1f4653 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -552,9 +552,9 @@ xfs_dquot_from_disk(
 	 * Reservation counters are defined as reservation plus current usage
 	 * to avoid having to add every time.
 	 */
-	dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
-	dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
-	dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+	dqp->q_blk.reserved = be64_to_cpu(ddqp->d_bcount);
+	dqp->q_ino.reserved = be64_to_cpu(ddqp->d_icount);
+	dqp->q_rtb.reserved = be64_to_cpu(ddqp->d_rtbcount);
 
 	/* initialize the dquot speculative prealloc thresholds */
 	xfs_dquot_set_prealloc_limits(dqp);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 41664a1fc073..c2f841ed2661 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -27,6 +27,11 @@ enum {
 	XFS_QLOWSP_MAX
 };
 
+struct xfs_dquot_res {
+	/* Total resources allocated and reserved.
*/ + xfs_qcnt_t reserved; +}; + /* * The incore dquot structure */ @@ -41,14 +46,13 @@ struct xfs_dquot { xfs_daddr_t q_blkno; xfs_fileoff_t q_fileoffset; + struct xfs_dquot_res q_blk; /* regular blocks */ + struct xfs_dquot_res q_ino; /* inodes */ + struct xfs_dquot_res q_rtb; /* realtime blocks */ + struct xfs_disk_dquot q_core; struct xfs_dq_logitem q_logitem; - /* total regular nblks used+reserved */ - xfs_qcnt_t q_res_bcount; - /* total inos allocd+reserved */ - xfs_qcnt_t q_res_icount; - /* total realtime blks used+reserved */ - xfs_qcnt_t q_res_rtbcount; + xfs_qcnt_t q_prealloc_lo_wmark; xfs_qcnt_t q_prealloc_hi_wmark; int64_t q_low_space[XFS_QLOWSP_MAX]; @@ -139,7 +143,7 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) { int64_t freesp; - freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount; + freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_blk.reserved; if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) return true; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index b9a8c3798e08..f60a6e44363b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -307,7 +307,7 @@ xfs_quota_need_throttle( return false; /* under the lo watermark, no throttle */ - if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark) + if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark) return false; return true; @@ -326,13 +326,13 @@ xfs_quota_calc_throttle( struct xfs_dquot *dq = xfs_inode_dquot(ip, type); /* no dq, or over hi wmark, squash the prealloc completely */ - if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) { *qblocks = 0; *qfreesp = 0; return; } - freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount; + freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved; if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { shift = 2; if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a2a860ae1564..aed4a76b8d61 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1092,14 +1092,14 @@ xfs_qm_quotacheck_dqadjust( * resource usage. */ be64_add_cpu(&dqp->q_core.d_icount, 1); - dqp->q_res_icount++; + dqp->q_ino.reserved++; if (nblks) { be64_add_cpu(&dqp->q_core.d_bcount, nblks); - dqp->q_res_bcount += nblks; + dqp->q_blk.reserved += nblks; } if (rtblks) { be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); - dqp->q_res_rtbcount += rtblks; + dqp->q_rtb.reserved += rtblks; } /* diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index fc2fa418919f..94b2b4b0fc17 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -29,8 +29,8 @@ xfs_fill_statvfs_from_dquot( if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; statp->f_bfree = statp->f_bavail = - (statp->f_blocks > dqp->q_res_bcount) ? - (statp->f_blocks - dqp->q_res_bcount) : 0; + (statp->f_blocks > dqp->q_blk.reserved) ? + (statp->f_blocks - dqp->q_blk.reserved) : 0; } limit = dqp->q_core.d_ino_softlimit ? @@ -39,8 +39,8 @@ xfs_fill_statvfs_from_dquot( if (limit && statp->f_files > limit) { statp->f_files = limit; statp->f_ffree = - (statp->f_files > dqp->q_res_icount) ? - (statp->f_files - dqp->q_res_icount) : 0; + (statp->f_files > dqp->q_ino.reserved) ? 
+ (statp->f_files - dqp->q_ino.reserved) : 0; } } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 8529eee3454c..a65ea8b4d24d 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -625,8 +625,8 @@ xfs_qm_scall_getquota_fill_qc( XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); - dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount); - dst->d_ino_count = dqp->q_res_icount; + dst->d_space = XFS_FSB_TO_B(mp, dqp->q_blk.reserved); + dst->d_ino_count = dqp->q_ino.reserved; dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer); dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns); @@ -635,7 +635,7 @@ xfs_qm_scall_getquota_fill_qc( XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); - dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount); + dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved); dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 39f82dbafa37..b49c672d1674 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -879,7 +879,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, __entry->id = dqp->q_id; __entry->flags = dqp->dq_flags | dqp->q_flags; __entry->nrefs = dqp->q_nrefs; - __entry->res_bcount = dqp->q_res_bcount; + __entry->res_bcount = dqp->q_blk.reserved; __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); __entry->icount = be64_to_cpu(dqp->q_core.d_icount); __entry->blk_hardlimit = diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 8dd6aa6e8fa3..ed3928e9392f 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -409,11 +409,11 @@ xfs_trans_apply_dquot_deltas( if (qtrx->qt_blk_res != blk_res_used) { if (qtrx->qt_blk_res > blk_res_used) - dqp->q_res_bcount -= (xfs_qcnt_t) + dqp->q_blk.reserved -= (xfs_qcnt_t) (qtrx->qt_blk_res - blk_res_used); else - dqp->q_res_bcount -= (xfs_qcnt_t) + dqp->q_blk.reserved -= (xfs_qcnt_t) (blk_res_used - qtrx->qt_blk_res); } @@ -426,7 +426,7 @@ xfs_trans_apply_dquot_deltas( * deliberately skip quota reservations. 
*/ if (qtrx->qt_bcount_delta) { - dqp->q_res_bcount += + dqp->q_blk.reserved += (xfs_qcnt_t)qtrx->qt_bcount_delta; } } @@ -437,17 +437,17 @@ xfs_trans_apply_dquot_deltas( if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) { if (qtrx->qt_rtblk_res > qtrx->qt_rtblk_res_used) - dqp->q_res_rtbcount -= (xfs_qcnt_t) + dqp->q_rtb.reserved -= (xfs_qcnt_t) (qtrx->qt_rtblk_res - qtrx->qt_rtblk_res_used); else - dqp->q_res_rtbcount -= (xfs_qcnt_t) + dqp->q_rtb.reserved -= (xfs_qcnt_t) (qtrx->qt_rtblk_res_used - qtrx->qt_rtblk_res); } } else { if (qtrx->qt_rtbcount_delta) - dqp->q_res_rtbcount += + dqp->q_rtb.reserved += (xfs_qcnt_t)qtrx->qt_rtbcount_delta; } @@ -458,20 +458,20 @@ xfs_trans_apply_dquot_deltas( ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); if (qtrx->qt_ino_res > qtrx->qt_ino_res_used) - dqp->q_res_icount -= (xfs_qcnt_t) + dqp->q_ino.reserved -= (xfs_qcnt_t) (qtrx->qt_ino_res - qtrx->qt_ino_res_used); } else { if (qtrx->qt_icount_delta) - dqp->q_res_icount += + dqp->q_ino.reserved += (xfs_qcnt_t)qtrx->qt_icount_delta; } - ASSERT(dqp->q_res_bcount >= + ASSERT(dqp->q_blk.reserved >= be64_to_cpu(dqp->q_core.d_bcount)); - ASSERT(dqp->q_res_icount >= + ASSERT(dqp->q_ino.reserved >= be64_to_cpu(dqp->q_core.d_icount)); - ASSERT(dqp->q_res_rtbcount >= + ASSERT(dqp->q_rtb.reserved >= be64_to_cpu(dqp->q_core.d_rtbcount)); } } @@ -516,7 +516,7 @@ xfs_trans_unreserve_and_mod_dquots( if (qtrx->qt_blk_res) { xfs_dqlock(dqp); locked = true; - dqp->q_res_bcount -= + dqp->q_blk.reserved -= (xfs_qcnt_t)qtrx->qt_blk_res; } if (qtrx->qt_ino_res) { @@ -524,7 +524,7 @@ xfs_trans_unreserve_and_mod_dquots( xfs_dqlock(dqp); locked = true; } - dqp->q_res_icount -= + dqp->q_ino.reserved -= (xfs_qcnt_t)qtrx->qt_ino_res; } @@ -533,7 +533,7 @@ xfs_trans_unreserve_and_mod_dquots( xfs_dqlock(dqp); locked = true; } - dqp->q_res_rtbcount -= + dqp->q_rtb.reserved -= (xfs_qcnt_t)qtrx->qt_rtblk_res; } if (locked) @@ -602,7 +602,7 @@ xfs_trans_dqresv( timer = be32_to_cpu(dqp->q_core.d_btimer); warns = be16_to_cpu(dqp->q_core.d_bwarns); warnlimit = defq->bwarnlimit; - resbcountp = &dqp->q_res_bcount; + resbcountp = &dqp->q_blk.reserved; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit); @@ -614,7 +614,7 @@ xfs_trans_dqresv( timer = be32_to_cpu(dqp->q_core.d_rtbtimer); warns = be16_to_cpu(dqp->q_core.d_rtbwarns); warnlimit = defq->rtbwarnlimit; - resbcountp = &dqp->q_res_rtbcount; + resbcountp = &dqp->q_rtb.reserved; } if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id && @@ -645,7 +645,7 @@ xfs_trans_dqresv( } } if (ninos > 0) { - total_count = dqp->q_res_icount + ninos; + total_count = dqp->q_ino.reserved + ninos; timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); warnlimit = defq->iwarnlimit; @@ -675,11 +675,11 @@ xfs_trans_dqresv( /* * Change the reservation, but not the actual usage. 
- * Note that q_res_bcount = q_core.d_bcount + resv + * Note that q_blk.reserved = q_core.d_bcount + resv */ (*resbcountp) += (xfs_qcnt_t)nblks; if (ninos != 0) - dqp->q_res_icount += (xfs_qcnt_t)ninos; + dqp->q_ino.reserved += (xfs_qcnt_t)ninos; /* * note the reservation amt in the trans struct too, @@ -700,9 +700,9 @@ xfs_trans_dqresv( XFS_TRANS_DQ_RES_INOS, ninos); } - ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount)); - ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); - ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); + ASSERT(dqp->q_blk.reserved >= be64_to_cpu(dqp->q_core.d_bcount)); + ASSERT(dqp->q_rtb.reserved >= be64_to_cpu(dqp->q_core.d_rtbcount)); + ASSERT(dqp->q_ino.reserved >= be64_to_cpu(dqp->q_core.d_icount)); xfs_dqunlock(dqp); return 0; From d3537cf93e5e2f8b4e95cfe8bc8fa03b58c88e32 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:31 -0700 Subject: [PATCH 064/117] xfs: stop using q_core limits in the quota code Add limits fields in the incore dquot, and use that instead of the ones in qcore. This eliminates a bunch of endian conversions and will eventually allow us to remove qcore entirely. Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig Reviewed-by: Allison Collins --- fs/xfs/scrub/quota.c | 36 ++++------- fs/xfs/xfs_dquot.c | 135 ++++++++++++++++++++++----------------- fs/xfs/xfs_dquot.h | 6 +- fs/xfs/xfs_qm.c | 14 ++-- fs/xfs/xfs_qm.h | 12 ++-- fs/xfs/xfs_qm_bhv.c | 12 ++-- fs/xfs/xfs_qm_syscalls.c | 40 ++++++------ fs/xfs/xfs_trace.h | 12 ++-- fs/xfs/xfs_trans_dquot.c | 12 ++-- 9 files changed, 138 insertions(+), 141 deletions(-) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 145f8710bac9..e73e6c88e76a 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -82,12 +82,6 @@ xchk_quota_item( struct xfs_disk_dquot *d = &dq->q_core; struct xfs_quotainfo *qi = mp->m_quotainfo; xfs_fileoff_t offset; - unsigned long long bsoft; - unsigned long long isoft; - unsigned long long rsoft; - unsigned long long bhard; - unsigned long long ihard; - unsigned long long rhard; unsigned long long bcount; unsigned long long icount; unsigned long long rcount; @@ -114,15 +108,6 @@ xchk_quota_item( if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - /* Check the limits. */ - bhard = be64_to_cpu(d->d_blk_hardlimit); - ihard = be64_to_cpu(d->d_ino_hardlimit); - rhard = be64_to_cpu(d->d_rtb_hardlimit); - - bsoft = be64_to_cpu(d->d_blk_softlimit); - isoft = be64_to_cpu(d->d_ino_softlimit); - rsoft = be64_to_cpu(d->d_rtb_softlimit); - /* * Warn if the hard limits are larger than the fs. * Administrators can do this, though in production this seems @@ -131,19 +116,19 @@ xchk_quota_item( * Complain about corruption if the soft limit is greater than * the hard limit. 
*/ - if (bhard > mp->m_sb.sb_dblocks) + if (dq->q_blk.hardlimit > mp->m_sb.sb_dblocks) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (bsoft > bhard) + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - if (ihard > M_IGEO(mp)->maxicount) + if (dq->q_ino.hardlimit > M_IGEO(mp)->maxicount) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (isoft > ihard) + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - if (rhard > mp->m_sb.sb_rblocks) + if (dq->q_rtb.hardlimit > mp->m_sb.sb_rblocks) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (rsoft > rhard) + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* Check the resource counts. */ @@ -177,13 +162,16 @@ xchk_quota_item( if (dq->q_id == 0) goto out; - if (bhard != 0 && bcount > bhard) + if (dq->q_blk.hardlimit != 0 && + bcount > dq->q_blk.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (ihard != 0 && icount > ihard) + if (dq->q_ino.hardlimit != 0 && + icount > dq->q_ino.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); - if (rhard != 0 && rcount > rhard) + if (dq->q_rtb.hardlimit != 0 && + rcount > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); out: diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 8fc7ae1f4653..9116e6ad7e9e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -71,29 +71,28 @@ xfs_qm_adjust_dqlimits( struct xfs_dquot *dq) { struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_disk_dquot *d = &dq->q_core; struct xfs_def_quota *defq; int prealloc = 0; ASSERT(dq->q_id); defq = xfs_get_defquota(q, xfs_dquot_type(dq)); - if (defq->bsoftlimit && !d->d_blk_softlimit) { - d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit); + if (defq->bsoftlimit && !dq->q_blk.softlimit) { + dq->q_blk.softlimit = defq->bsoftlimit; prealloc = 1; } - if (defq->bhardlimit && !d->d_blk_hardlimit) { - d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit); + if (defq->bhardlimit && !dq->q_blk.hardlimit) { + dq->q_blk.hardlimit = defq->bhardlimit; prealloc = 1; } - if (defq->isoftlimit && !d->d_ino_softlimit) - d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit); - if (defq->ihardlimit && !d->d_ino_hardlimit) - d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit); - if (defq->rtbsoftlimit && !d->d_rtb_softlimit) - d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit); - if (defq->rtbhardlimit && !d->d_rtb_hardlimit) - d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit); + if (defq->isoftlimit && !dq->q_ino.softlimit) + dq->q_ino.softlimit = defq->isoftlimit; + if (defq->ihardlimit && !dq->q_ino.hardlimit) + dq->q_ino.hardlimit = defq->ihardlimit; + if (defq->rtbsoftlimit && !dq->q_rtb.softlimit) + dq->q_rtb.softlimit = defq->rtbsoftlimit; + if (defq->rtbhardlimit && !dq->q_rtb.hardlimit) + dq->q_rtb.hardlimit = defq->rtbhardlimit; if (prealloc) xfs_dquot_set_prealloc_limits(dq); @@ -125,82 +124,67 @@ xfs_qm_adjust_dqtimers( defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); #ifdef DEBUG - if (d->d_blk_hardlimit) - ASSERT(be64_to_cpu(d->d_blk_softlimit) <= - be64_to_cpu(d->d_blk_hardlimit)); - if (d->d_ino_hardlimit) - ASSERT(be64_to_cpu(d->d_ino_softlimit) <= - be64_to_cpu(d->d_ino_hardlimit)); - if (d->d_rtb_hardlimit) - ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= - be64_to_cpu(d->d_rtb_hardlimit)); + if (dq->q_blk.hardlimit) + ASSERT(dq->q_blk.softlimit <= dq->q_blk.hardlimit); + if (dq->q_ino.hardlimit) + 
ASSERT(dq->q_ino.softlimit <= dq->q_ino.hardlimit); + if (dq->q_rtb.hardlimit) + ASSERT(dq->q_rtb.softlimit <= dq->q_rtb.hardlimit); #endif if (!d->d_btimer) { - if ((d->d_blk_softlimit && - (be64_to_cpu(d->d_bcount) > - be64_to_cpu(d->d_blk_softlimit))) || - (d->d_blk_hardlimit && - (be64_to_cpu(d->d_bcount) > - be64_to_cpu(d->d_blk_hardlimit)))) { + if ((dq->q_blk.softlimit && + (be64_to_cpu(d->d_bcount) > dq->q_blk.softlimit)) || + (dq->q_blk.hardlimit && + (be64_to_cpu(d->d_bcount) > dq->q_blk.hardlimit))) { d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + defq->btimelimit); } else { d->d_bwarns = 0; } } else { - if ((!d->d_blk_softlimit || - (be64_to_cpu(d->d_bcount) <= - be64_to_cpu(d->d_blk_softlimit))) && - (!d->d_blk_hardlimit || - (be64_to_cpu(d->d_bcount) <= - be64_to_cpu(d->d_blk_hardlimit)))) { + if ((!dq->q_blk.softlimit || + (be64_to_cpu(d->d_bcount) <= dq->q_blk.softlimit)) && + (!dq->q_blk.hardlimit || + (be64_to_cpu(d->d_bcount) <= dq->q_blk.hardlimit))) { d->d_btimer = 0; } } if (!d->d_itimer) { - if ((d->d_ino_softlimit && - (be64_to_cpu(d->d_icount) > - be64_to_cpu(d->d_ino_softlimit))) || - (d->d_ino_hardlimit && - (be64_to_cpu(d->d_icount) > - be64_to_cpu(d->d_ino_hardlimit)))) { + if ((dq->q_ino.softlimit && + (be64_to_cpu(d->d_icount) > dq->q_ino.softlimit)) || + (dq->q_ino.hardlimit && + (be64_to_cpu(d->d_icount) > dq->q_ino.hardlimit))) { d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + defq->itimelimit); } else { d->d_iwarns = 0; } } else { - if ((!d->d_ino_softlimit || - (be64_to_cpu(d->d_icount) <= - be64_to_cpu(d->d_ino_softlimit))) && - (!d->d_ino_hardlimit || - (be64_to_cpu(d->d_icount) <= - be64_to_cpu(d->d_ino_hardlimit)))) { + if ((!dq->q_ino.softlimit || + (be64_to_cpu(d->d_icount) <= dq->q_ino.softlimit)) && + (!dq->q_ino.hardlimit || + (be64_to_cpu(d->d_icount) <= dq->q_ino.hardlimit))) { d->d_itimer = 0; } } if (!d->d_rtbtimer) { - if ((d->d_rtb_softlimit && - (be64_to_cpu(d->d_rtbcount) > - be64_to_cpu(d->d_rtb_softlimit))) || - (d->d_rtb_hardlimit && - (be64_to_cpu(d->d_rtbcount) > - be64_to_cpu(d->d_rtb_hardlimit)))) { + if ((dq->q_rtb.softlimit && + (be64_to_cpu(d->d_rtbcount) > dq->q_rtb.softlimit)) || + (dq->q_rtb.hardlimit && + (be64_to_cpu(d->d_rtbcount) > dq->q_rtb.hardlimit))) { d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + defq->rtbtimelimit); } else { d->d_rtbwarns = 0; } } else { - if ((!d->d_rtb_softlimit || - (be64_to_cpu(d->d_rtbcount) <= - be64_to_cpu(d->d_rtb_softlimit))) && - (!d->d_rtb_hardlimit || - (be64_to_cpu(d->d_rtbcount) <= - be64_to_cpu(d->d_rtb_hardlimit)))) { + if ((!dq->q_rtb.softlimit || + (be64_to_cpu(d->d_rtbcount) <= dq->q_rtb.softlimit)) && + (!dq->q_rtb.hardlimit || + (be64_to_cpu(d->d_rtbcount) <= dq->q_rtb.hardlimit))) { d->d_rtbtimer = 0; } } @@ -291,8 +275,8 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) { uint64_t space; - dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); - dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); + dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit; + dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit; if (!dqp->q_prealloc_lo_wmark) { dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; do_div(dqp->q_prealloc_lo_wmark, 100); @@ -547,6 +531,12 @@ xfs_dquot_from_disk( /* copy everything from disk dquot to the incore dquot */ memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot)); + dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit); + dqp->q_ino.hardlimit = 
be64_to_cpu(ddqp->d_ino_hardlimit); + dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit); + dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit); /* * Reservation counters are defined as reservation plus current usage @@ -568,6 +558,12 @@ xfs_dquot_to_disk( struct xfs_dquot *dqp) { memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot)); + ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit); + ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit); + ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit); + ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit); + ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit); + ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit); } /* Allocate and initialize the dquot buffer for this in-core dquot. */ @@ -1129,6 +1125,7 @@ static xfs_failaddr_t xfs_qm_dqflush_check( struct xfs_dquot *dqp) { + struct xfs_disk_dquot *ddq = &dqp->q_core; __u8 type = dqp->dq_flags & XFS_DQ_ALLTYPES; if (type != XFS_DQ_USER && @@ -1136,6 +1133,24 @@ xfs_qm_dqflush_check( type != XFS_DQ_PROJ) return __this_address; + if (dqp->q_id == 0) + return NULL; + + if (dqp->q_blk.softlimit && + be64_to_cpu(ddq->d_bcount) > dqp->q_blk.softlimit && + !ddq->d_btimer) + return __this_address; + + if (dqp->q_ino.softlimit && + be64_to_cpu(ddq->d_icount) > dqp->q_ino.softlimit && + !ddq->d_itimer) + return __this_address; + + if (dqp->q_rtb.softlimit && + be64_to_cpu(ddq->d_rtbcount) > dqp->q_rtb.softlimit && + !ddq->d_rtbtimer) + return __this_address; + return NULL; } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index c2f841ed2661..59790140fcb6 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -30,6 +30,10 @@ enum { struct xfs_dquot_res { /* Total resources allocated and reserved. */ xfs_qcnt_t reserved; + + /* Absolute and preferred limits. 
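+	 * The hard limit is an enforced ceiling; crossing the soft limit
+	 * only starts the grace-period timer (see xfs_qm_adjust_dqtimers).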
*/ + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; }; /* @@ -143,7 +147,7 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) { int64_t freesp; - freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_blk.reserved; + freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved; if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) return true; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index aed4a76b8d61..83ae59536b2b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -550,26 +550,24 @@ xfs_qm_set_defquota( { struct xfs_dquot *dqp; struct xfs_def_quota *defq; - struct xfs_disk_dquot *ddqp; int error; error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); if (error) return; - ddqp = &dqp->q_core; defq = xfs_get_defquota(qinf, xfs_dquot_type(dqp)); /* * Timers and warnings have been already set, let's just set the * default limits for this quota type */ - defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); - defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); - defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); - defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); - defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); - defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + defq->bhardlimit = dqp->q_blk.hardlimit; + defq->bsoftlimit = dqp->q_blk.softlimit; + defq->ihardlimit = dqp->q_ino.hardlimit; + defq->isoftlimit = dqp->q_ino.softlimit; + defq->rtbhardlimit = dqp->q_rtb.hardlimit; + defq->rtbsoftlimit = dqp->q_rtb.softlimit; xfs_qm_dqdestroy(dqp); } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 2c8ca9df23af..eb5fb7d9d995 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -20,12 +20,12 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; #define XFS_DQITER_MAP_SIZE 10 #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ - !dqp->q_core.d_blk_hardlimit && \ - !dqp->q_core.d_blk_softlimit && \ - !dqp->q_core.d_rtb_hardlimit && \ - !dqp->q_core.d_rtb_softlimit && \ - !dqp->q_core.d_ino_hardlimit && \ - !dqp->q_core.d_ino_softlimit && \ + !dqp->q_blk.hardlimit && \ + !dqp->q_blk.softlimit && \ + !dqp->q_rtb.hardlimit && \ + !dqp->q_rtb.softlimit && \ + !dqp->q_ino.hardlimit && \ + !dqp->q_ino.softlimit && \ !dqp->q_core.d_bcount && \ !dqp->q_core.d_rtbcount && \ !dqp->q_core.d_icount) diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 94b2b4b0fc17..0993217e5ac8 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -23,9 +23,9 @@ xfs_fill_statvfs_from_dquot( { uint64_t limit; - limit = dqp->q_core.d_blk_softlimit ? - be64_to_cpu(dqp->q_core.d_blk_softlimit) : - be64_to_cpu(dqp->q_core.d_blk_hardlimit); + limit = dqp->q_blk.softlimit ? + dqp->q_blk.softlimit : + dqp->q_blk.hardlimit; if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; statp->f_bfree = statp->f_bavail = @@ -33,9 +33,9 @@ xfs_fill_statvfs_from_dquot( (statp->f_blocks - dqp->q_blk.reserved) : 0; } - limit = dqp->q_core.d_ino_softlimit ? - be64_to_cpu(dqp->q_core.d_ino_softlimit) : - be64_to_cpu(dqp->q_core.d_ino_hardlimit); + limit = dqp->q_ino.softlimit ? + dqp->q_ino.softlimit : + dqp->q_ino.hardlimit; if (limit && statp->f_files > limit) { statp->f_files = limit; statp->f_ffree = diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index a65ea8b4d24d..7aab60ea67e2 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -495,13 +495,13 @@ xfs_qm_scall_setqlim( */ hard = (newlim->d_fieldmask & QC_SPC_HARD) ? 
(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) : - be64_to_cpu(ddq->d_blk_hardlimit); + dqp->q_blk.hardlimit; soft = (newlim->d_fieldmask & QC_SPC_SOFT) ? (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) : - be64_to_cpu(ddq->d_blk_softlimit); + dqp->q_blk.softlimit; if (hard == 0 || hard >= soft) { - ddq->d_blk_hardlimit = cpu_to_be64(hard); - ddq->d_blk_softlimit = cpu_to_be64(soft); + dqp->q_blk.hardlimit = hard; + dqp->q_blk.softlimit = soft; xfs_dquot_set_prealloc_limits(dqp); if (id == 0) { defq->bhardlimit = hard; @@ -512,13 +512,13 @@ xfs_qm_scall_setqlim( } hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) : - be64_to_cpu(ddq->d_rtb_hardlimit); + dqp->q_rtb.hardlimit; soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ? (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) : - be64_to_cpu(ddq->d_rtb_softlimit); + dqp->q_rtb.softlimit; if (hard == 0 || hard >= soft) { - ddq->d_rtb_hardlimit = cpu_to_be64(hard); - ddq->d_rtb_softlimit = cpu_to_be64(soft); + dqp->q_rtb.hardlimit = hard; + dqp->q_rtb.softlimit = soft; if (id == 0) { defq->rtbhardlimit = hard; defq->rtbsoftlimit = soft; @@ -529,13 +529,13 @@ xfs_qm_scall_setqlim( hard = (newlim->d_fieldmask & QC_INO_HARD) ? (xfs_qcnt_t) newlim->d_ino_hardlimit : - be64_to_cpu(ddq->d_ino_hardlimit); + dqp->q_ino.hardlimit; soft = (newlim->d_fieldmask & QC_INO_SOFT) ? (xfs_qcnt_t) newlim->d_ino_softlimit : - be64_to_cpu(ddq->d_ino_softlimit); + dqp->q_ino.softlimit; if (hard == 0 || hard >= soft) { - ddq->d_ino_hardlimit = cpu_to_be64(hard); - ddq->d_ino_softlimit = cpu_to_be64(soft); + dqp->q_ino.hardlimit = hard; + dqp->q_ino.softlimit = soft; if (id == 0) { defq->ihardlimit = hard; defq->isoftlimit = soft; @@ -619,10 +619,8 @@ xfs_qm_scall_getquota_fill_qc( struct qc_dqblk *dst) { memset(dst, 0, sizeof(*dst)); - dst->d_spc_hardlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); - dst->d_spc_softlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); + dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_blk.hardlimit); + dst->d_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_blk.softlimit); dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); dst->d_space = XFS_FSB_TO_B(mp, dqp->q_blk.reserved); @@ -631,10 +629,8 @@ xfs_qm_scall_getquota_fill_qc( dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns); dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns); - dst->d_rt_spc_hardlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); - dst->d_rt_spc_softlimit = - XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); + dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit); + dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit); dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved); dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns); @@ -664,8 +660,8 @@ xfs_qm_scall_getquota_fill_qc( (dst->d_spc_softlimit > 0)) { ASSERT(dst->d_spc_timer != 0); } - if ((dst->d_ino_count > dst->d_ino_softlimit) && - (dst->d_ino_softlimit > 0)) { + if ((dst->d_ino_count > dqp->q_ino.softlimit) && + (dqp->q_ino.softlimit > 0)) { ASSERT(dst->d_ino_timer != 0); } } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b49c672d1674..93fe31a22ce8 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -882,14 +882,10 @@ 
DECLARE_EVENT_CLASS(xfs_dquot_class, __entry->res_bcount = dqp->q_blk.reserved; __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); __entry->icount = be64_to_cpu(dqp->q_core.d_icount); - __entry->blk_hardlimit = - be64_to_cpu(dqp->q_core.d_blk_hardlimit); - __entry->blk_softlimit = - be64_to_cpu(dqp->q_core.d_blk_softlimit); - __entry->ino_hardlimit = - be64_to_cpu(dqp->q_core.d_ino_hardlimit); - __entry->ino_softlimit = - be64_to_cpu(dqp->q_core.d_ino_softlimit); + __entry->blk_hardlimit = dqp->q_blk.hardlimit; + __entry->blk_softlimit = dqp->q_blk.softlimit; + __entry->ino_hardlimit = dqp->q_ino.hardlimit; + __entry->ino_softlimit = dqp->q_ino.softlimit; ), TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index ed3928e9392f..0d6a69a81a58 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -593,10 +593,10 @@ xfs_trans_dqresv( defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); if (flags & XFS_TRANS_DQ_RES_BLKS) { - hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + hardlimit = dqp->q_blk.hardlimit; if (!hardlimit) hardlimit = defq->bhardlimit; - softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit); + softlimit = dqp->q_blk.softlimit; if (!softlimit) softlimit = defq->bsoftlimit; timer = be32_to_cpu(dqp->q_core.d_btimer); @@ -605,10 +605,10 @@ xfs_trans_dqresv( resbcountp = &dqp->q_blk.reserved; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); - hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit); + hardlimit = dqp->q_rtb.hardlimit; if (!hardlimit) hardlimit = defq->rtbhardlimit; - softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit); + softlimit = dqp->q_rtb.softlimit; if (!softlimit) softlimit = defq->rtbsoftlimit; timer = be32_to_cpu(dqp->q_core.d_rtbtimer); @@ -649,10 +649,10 @@ xfs_trans_dqresv( timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); warnlimit = defq->iwarnlimit; - hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); + hardlimit = dqp->q_ino.hardlimit; if (!hardlimit) hardlimit = defq->ihardlimit; - softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + softlimit = dqp->q_ino.softlimit; if (!softlimit) softlimit = defq->isoftlimit; From be37d40c1ba0b5484ea2f8c109a9eda13e4c690a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:31 -0700 Subject: [PATCH 065/117] xfs: stop using q_core counters in the quota code Add counter fields to the incore dquot, and use that instead of the ones in qcore. This eliminates a bunch of endian conversions and will eventually allow us to remove qcore entirely. Signed-off-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig Reviewed-by: Allison Collins --- fs/xfs/scrub/quota.c | 18 +++++---------- fs/xfs/xfs_dquot.c | 47 ++++++++++++++++++++++------------------ fs/xfs/xfs_dquot.h | 3 +++ fs/xfs/xfs_qm.c | 6 ++--- fs/xfs/xfs_qm.h | 6 ++--- fs/xfs/xfs_trace.h | 4 ++-- fs/xfs/xfs_trans_dquot.c | 34 +++++++++++------------------ 7 files changed, 56 insertions(+), 62 deletions(-) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index e73e6c88e76a..20bc763e88b4 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -82,9 +82,6 @@ xchk_quota_item( struct xfs_disk_dquot *d = &dq->q_core; struct xfs_quotainfo *qi = mp->m_quotainfo; xfs_fileoff_t offset; - unsigned long long bcount; - unsigned long long icount; - unsigned long long rcount; xfs_ino_t fs_icount; int error = 0; @@ -132,9 +129,6 @@ xchk_quota_item( xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* Check the resource counts. */ - bcount = be64_to_cpu(d->d_bcount); - icount = be64_to_cpu(d->d_icount); - rcount = be64_to_cpu(d->d_rtbcount); fs_icount = percpu_counter_sum(&mp->m_icount); /* @@ -143,15 +137,15 @@ xchk_quota_item( * if there are no quota limits. */ if (xfs_sb_version_hasreflink(&mp->m_sb)) { - if (mp->m_sb.sb_dblocks < bcount) + if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); } else { - if (mp->m_sb.sb_dblocks < bcount) + if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } - if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks) + if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* @@ -163,15 +157,15 @@ xchk_quota_item( goto out; if (dq->q_blk.hardlimit != 0 && - bcount > dq->q_blk.hardlimit) + dq->q_blk.count > dq->q_blk.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); if (dq->q_ino.hardlimit != 0 && - icount > dq->q_ino.hardlimit) + dq->q_ino.count > dq->q_ino.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); if (dq->q_rtb.hardlimit != 0 && - rcount > dq->q_rtb.hardlimit) + dq->q_rtb.count > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); out: diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9116e6ad7e9e..dc3bfce7f28b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -134,9 +134,9 @@ xfs_qm_adjust_dqtimers( if (!d->d_btimer) { if ((dq->q_blk.softlimit && - (be64_to_cpu(d->d_bcount) > dq->q_blk.softlimit)) || + (dq->q_blk.count > dq->q_blk.softlimit)) || (dq->q_blk.hardlimit && - (be64_to_cpu(d->d_bcount) > dq->q_blk.hardlimit))) { + (dq->q_blk.count > dq->q_blk.hardlimit))) { d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + defq->btimelimit); } else { @@ -144,18 +144,18 @@ xfs_qm_adjust_dqtimers( } } else { if ((!dq->q_blk.softlimit || - (be64_to_cpu(d->d_bcount) <= dq->q_blk.softlimit)) && + (dq->q_blk.count <= dq->q_blk.softlimit)) && (!dq->q_blk.hardlimit || - (be64_to_cpu(d->d_bcount) <= dq->q_blk.hardlimit))) { + (dq->q_blk.count <= dq->q_blk.hardlimit))) { d->d_btimer = 0; } } if (!d->d_itimer) { if ((dq->q_ino.softlimit && - (be64_to_cpu(d->d_icount) > dq->q_ino.softlimit)) || + (dq->q_ino.count > dq->q_ino.softlimit)) || (dq->q_ino.hardlimit && - (be64_to_cpu(d->d_icount) > dq->q_ino.hardlimit))) { + (dq->q_ino.count > dq->q_ino.hardlimit))) { d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + defq->itimelimit); } else { @@ -163,18 +163,18 @@ xfs_qm_adjust_dqtimers( } } else { if ((!dq->q_ino.softlimit || - 
(be64_to_cpu(d->d_icount) <= dq->q_ino.softlimit)) && + (dq->q_ino.count <= dq->q_ino.softlimit)) && (!dq->q_ino.hardlimit || - (be64_to_cpu(d->d_icount) <= dq->q_ino.hardlimit))) { + (dq->q_ino.count <= dq->q_ino.hardlimit))) { d->d_itimer = 0; } } if (!d->d_rtbtimer) { if ((dq->q_rtb.softlimit && - (be64_to_cpu(d->d_rtbcount) > dq->q_rtb.softlimit)) || + (dq->q_rtb.count > dq->q_rtb.softlimit)) || (dq->q_rtb.hardlimit && - (be64_to_cpu(d->d_rtbcount) > dq->q_rtb.hardlimit))) { + (dq->q_rtb.count > dq->q_rtb.hardlimit))) { d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + defq->rtbtimelimit); } else { @@ -182,9 +182,9 @@ xfs_qm_adjust_dqtimers( } } else { if ((!dq->q_rtb.softlimit || - (be64_to_cpu(d->d_rtbcount) <= dq->q_rtb.softlimit)) && + (dq->q_rtb.count <= dq->q_rtb.softlimit)) && (!dq->q_rtb.hardlimit || - (be64_to_cpu(d->d_rtbcount) <= dq->q_rtb.hardlimit))) { + (dq->q_rtb.count <= dq->q_rtb.hardlimit))) { d->d_rtbtimer = 0; } } @@ -538,13 +538,17 @@ xfs_dquot_from_disk( dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount); + dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); + dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); + /* * Reservation counters are defined as reservation plus current usage * to avoid having to add every time. */ - dqp->q_blk.reserved = be64_to_cpu(ddqp->d_bcount); - dqp->q_ino.reserved = be64_to_cpu(ddqp->d_icount); - dqp->q_rtb.reserved = be64_to_cpu(ddqp->d_rtbcount); + dqp->q_blk.reserved = dqp->q_blk.count; + dqp->q_ino.reserved = dqp->q_ino.count; + dqp->q_rtb.reserved = dqp->q_rtb.count; /* initialize the dquot speculative prealloc thresholds */ xfs_dquot_set_prealloc_limits(dqp); @@ -564,6 +568,10 @@ xfs_dquot_to_disk( ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit); ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit); ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit); + + ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count); + ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); + ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); } /* Allocate and initialize the dquot buffer for this in-core dquot. */ @@ -1136,18 +1144,15 @@ xfs_qm_dqflush_check( if (dqp->q_id == 0) return NULL; - if (dqp->q_blk.softlimit && - be64_to_cpu(ddq->d_bcount) > dqp->q_blk.softlimit && + if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit && !ddq->d_btimer) return __this_address; - if (dqp->q_ino.softlimit && - be64_to_cpu(ddq->d_icount) > dqp->q_ino.softlimit && + if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit && !ddq->d_itimer) return __this_address; - if (dqp->q_rtb.softlimit && - be64_to_cpu(ddq->d_rtbcount) > dqp->q_rtb.softlimit && + if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit && !ddq->d_rtbtimer) return __this_address; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 59790140fcb6..f5f0a15c0f7b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -31,6 +31,9 @@ struct xfs_dquot_res { /* Total resources allocated and reserved. */ xfs_qcnt_t reserved; + /* Total resources allocated. */ + xfs_qcnt_t count; + /* Absolute and preferred limits. */ xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 83ae59536b2b..d6df2a4ca4ca 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1089,14 +1089,14 @@ xfs_qm_quotacheck_dqadjust( * Adjust the inode count and the block count to reflect this inode's * resource usage. 
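 * (count and reserved are bumped in lockstep below: the incore
 * reserved counter is defined as usage plus outstanding reservations,
 * and quotacheck itself takes no quota reservations.)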
*/ - be64_add_cpu(&dqp->q_core.d_icount, 1); + dqp->q_ino.count++; dqp->q_ino.reserved++; if (nblks) { - be64_add_cpu(&dqp->q_core.d_bcount, nblks); + dqp->q_blk.count += nblks; dqp->q_blk.reserved += nblks; } if (rtblks) { - be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); + dqp->q_rtb.count += rtblks; dqp->q_rtb.reserved += rtblks; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index eb5fb7d9d995..57bddadbc051 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -26,9 +26,9 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; !dqp->q_rtb.softlimit && \ !dqp->q_ino.hardlimit && \ !dqp->q_ino.softlimit && \ - !dqp->q_core.d_bcount && \ - !dqp->q_core.d_rtbcount && \ - !dqp->q_core.d_icount) + !dqp->q_blk.count && \ + !dqp->q_rtb.count && \ + !dqp->q_ino.count) /* Defaults for each quota type: time limits, warn limits, usage limits */ struct xfs_def_quota { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 93fe31a22ce8..f0c2bce69a36 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -880,8 +880,8 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, __entry->flags = dqp->dq_flags | dqp->q_flags; __entry->nrefs = dqp->q_nrefs; __entry->res_bcount = dqp->q_blk.reserved; - __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); - __entry->icount = be64_to_cpu(dqp->q_core.d_icount); + __entry->bcount = dqp->q_blk.count; + __entry->icount = dqp->q_ino.count; __entry->blk_hardlimit = dqp->q_blk.hardlimit; __entry->blk_softlimit = dqp->q_blk.softlimit; __entry->ino_hardlimit = dqp->q_ino.hardlimit; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 0d6a69a81a58..1c2a45989aaf 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -309,7 +309,6 @@ xfs_trans_apply_dquot_deltas( int i, j; struct xfs_dquot *dqp; struct xfs_dqtrx *qtrx, *qa; - struct xfs_disk_dquot *d; int64_t totalbdelta; int64_t totalrtbdelta; @@ -341,7 +340,6 @@ xfs_trans_apply_dquot_deltas( /* * adjust the actual number of blocks used */ - d = &dqp->q_core; /* * The issue here is - sometimes we don't make a blkquota @@ -362,25 +360,22 @@ xfs_trans_apply_dquot_deltas( qtrx->qt_delrtb_delta; #ifdef DEBUG if (totalbdelta < 0) - ASSERT(be64_to_cpu(d->d_bcount) >= - -totalbdelta); + ASSERT(dqp->q_blk.count >= -totalbdelta); if (totalrtbdelta < 0) - ASSERT(be64_to_cpu(d->d_rtbcount) >= - -totalrtbdelta); + ASSERT(dqp->q_rtb.count >= -totalrtbdelta); if (qtrx->qt_icount_delta < 0) - ASSERT(be64_to_cpu(d->d_icount) >= - -qtrx->qt_icount_delta); + ASSERT(dqp->q_ino.count >= -qtrx->qt_icount_delta); #endif if (totalbdelta) - be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta); + dqp->q_blk.count += totalbdelta; if (qtrx->qt_icount_delta) - be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); + dqp->q_ino.count += qtrx->qt_icount_delta; if (totalrtbdelta) - be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); + dqp->q_rtb.count += totalrtbdelta; /* * Get any default limits in use. @@ -467,12 +462,9 @@ xfs_trans_apply_dquot_deltas( (xfs_qcnt_t)qtrx->qt_icount_delta; } - ASSERT(dqp->q_blk.reserved >= - be64_to_cpu(dqp->q_core.d_bcount)); - ASSERT(dqp->q_ino.reserved >= - be64_to_cpu(dqp->q_core.d_icount)); - ASSERT(dqp->q_rtb.reserved >= - be64_to_cpu(dqp->q_core.d_rtbcount)); + ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); + ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); + ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); } } } @@ -675,7 +667,7 @@ xfs_trans_dqresv( /* * Change the reservation, but not the actual usage. 
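 * (Worked example: with count = reserved = 100, reserving nblks = 8
 * moves reserved to 108 while count stays 100 until deltas are applied.)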
- * Note that q_blk.reserved = q_core.d_bcount + resv + * Note that q_blk.reserved = q_blk.count + resv */ (*resbcountp) += (xfs_qcnt_t)nblks; if (ninos != 0) @@ -700,9 +692,9 @@ xfs_trans_dqresv( XFS_TRANS_DQ_RES_INOS, ninos); } - ASSERT(dqp->q_blk.reserved >= be64_to_cpu(dqp->q_core.d_bcount)); - ASSERT(dqp->q_rtb.reserved >= be64_to_cpu(dqp->q_core.d_rtbcount)); - ASSERT(dqp->q_ino.reserved >= be64_to_cpu(dqp->q_core.d_icount)); + ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); + ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); + ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); xfs_dqunlock(dqp); return 0; From c8c45fb2f614e1d30526c5aa304352923ad76416 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:31 -0700 Subject: [PATCH 066/117] xfs: stop using q_core warning counters in the quota code Add warning counter fields to the incore dquot, and use that instead of the ones in qcore. This eliminates a bunch of endian conversions and will eventually allow us to remove qcore entirely. Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig Reviewed-by: Allison Collins --- fs/xfs/xfs_dquot.c | 14 +++++++++++--- fs/xfs/xfs_dquot.h | 8 ++++++++ fs/xfs/xfs_qm.c | 12 ++++++------ fs/xfs/xfs_qm_syscalls.c | 12 ++++++------ fs/xfs/xfs_trans_dquot.c | 6 +++--- 5 files changed, 34 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index dc3bfce7f28b..9ba21523a481 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -140,7 +140,7 @@ xfs_qm_adjust_dqtimers( d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + defq->btimelimit); } else { - d->d_bwarns = 0; + dq->q_blk.warnings = 0; } } else { if ((!dq->q_blk.softlimit || @@ -159,7 +159,7 @@ xfs_qm_adjust_dqtimers( d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + defq->itimelimit); } else { - d->d_iwarns = 0; + dq->q_ino.warnings = 0; } } else { if ((!dq->q_ino.softlimit || @@ -178,7 +178,7 @@ xfs_qm_adjust_dqtimers( d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + defq->rtbtimelimit); } else { - d->d_rtbwarns = 0; + dq->q_rtb.warnings = 0; } } else { if ((!dq->q_rtb.softlimit || @@ -542,6 +542,10 @@ xfs_dquot_from_disk( dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); + dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns); + dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); + dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); + /* * Reservation counters are defined as reservation plus current usage * to avoid having to add every time. @@ -572,6 +576,10 @@ xfs_dquot_to_disk( ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count); ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); + + ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings); + ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); + ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); } /* Allocate and initialize the dquot buffer for this in-core dquot. */ diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index f5f0a15c0f7b..afac622c0fde 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -37,6 +37,14 @@ struct xfs_dquot_res { /* Absolute and preferred limits. */ xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; + + /* + * For root dquots, this is the maximum number of warnings that will + * be issued for this quota type. Otherwise, this is the number of + * warnings issued against this quota. Note that none of this is + * implemented. 
+ */ + xfs_qwarncnt_t warnings; }; /* diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index d6df2a4ca4ca..4dc989551710 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -616,12 +616,12 @@ xfs_qm_init_timelimits( defq->itimelimit = be32_to_cpu(ddqp->d_itimer); if (ddqp->d_rtbtimer) defq->rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); - if (ddqp->d_bwarns) - defq->bwarnlimit = be16_to_cpu(ddqp->d_bwarns); - if (ddqp->d_iwarns) - defq->iwarnlimit = be16_to_cpu(ddqp->d_iwarns); - if (ddqp->d_rtbwarns) - defq->rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); + if (dqp->q_blk.warnings) + defq->bwarnlimit = dqp->q_blk.warnings; + if (dqp->q_ino.warnings) + defq->iwarnlimit = dqp->q_ino.warnings; + if (dqp->q_rtb.warnings) + defq->rtbwarnlimit = dqp->q_rtb.warnings; xfs_qm_dqdestroy(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 7aab60ea67e2..acea375c0d17 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -548,11 +548,11 @@ xfs_qm_scall_setqlim( * Update warnings counter(s) if requested */ if (newlim->d_fieldmask & QC_SPC_WARNS) - ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns); + dqp->q_blk.warnings = newlim->d_spc_warns; if (newlim->d_fieldmask & QC_INO_WARNS) - ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns); + dqp->q_ino.warnings = newlim->d_ino_warns; if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns); + dqp->q_rtb.warnings = newlim->d_rt_spc_warns; if (id == 0) { if (newlim->d_fieldmask & QC_SPC_WARNS) @@ -627,13 +627,13 @@ xfs_qm_scall_getquota_fill_qc( dst->d_ino_count = dqp->q_ino.reserved; dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer); dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); - dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns); - dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns); + dst->d_ino_warns = dqp->q_ino.warnings; + dst->d_spc_warns = dqp->q_blk.warnings; dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit); dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit); dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved); dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); - dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns); + dst->d_rt_spc_warns = dqp->q_rtb.warnings; /* * Internally, we don't reset all the timers when quota enforcement diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 1c2a45989aaf..58cbc8f216f1 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -592,7 +592,7 @@ xfs_trans_dqresv( if (!softlimit) softlimit = defq->bsoftlimit; timer = be32_to_cpu(dqp->q_core.d_btimer); - warns = be16_to_cpu(dqp->q_core.d_bwarns); + warns = dqp->q_blk.warnings; warnlimit = defq->bwarnlimit; resbcountp = &dqp->q_blk.reserved; } else { @@ -604,7 +604,7 @@ xfs_trans_dqresv( if (!softlimit) softlimit = defq->rtbsoftlimit; timer = be32_to_cpu(dqp->q_core.d_rtbtimer); - warns = be16_to_cpu(dqp->q_core.d_rtbwarns); + warns = dqp->q_rtb.warnings; warnlimit = defq->rtbwarnlimit; resbcountp = &dqp->q_rtb.reserved; } @@ -639,7 +639,7 @@ xfs_trans_dqresv( if (ninos > 0) { total_count = dqp->q_ino.reserved + ninos; timer = be32_to_cpu(dqp->q_core.d_itimer); - warns = be16_to_cpu(dqp->q_core.d_iwarns); + warns = dqp->q_ino.warnings; warnlimit = defq->iwarnlimit; hardlimit = dqp->q_ino.hardlimit; if (!hardlimit) From 19dce7eaef7f8fdecab965afacc7a7bf3eb4e0a1 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 14 Jul 2020 10:37:32 -0700 Subject: [PATCH 067/117] xfs: stop using q_core timers in the quota code Add timer fields to the incore dquot, and use them instead of the ones in qcore. This eliminates a bunch of endian conversions and will eventually allow us to remove qcore entirely. Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig Reviewed-by: Allison Collins --- fs/xfs/xfs_dquot.c | 40 +++++++++++++++++++++++----------------- fs/xfs/xfs_dquot.h | 7 +++++++ fs/xfs/xfs_qm.c | 15 ++++++--------- fs/xfs/xfs_qm_syscalls.c | 18 ++++++++---------- fs/xfs/xfs_trans_dquot.c | 6 +++--- 5 files changed, 47 insertions(+), 39 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9ba21523a481..b094fc495407 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -117,7 +117,6 @@ xfs_qm_adjust_dqtimers( struct xfs_dquot *dq) { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_disk_dquot *d = &dq->q_core; struct xfs_def_quota *defq; ASSERT(dq->q_id); @@ -132,13 +131,13 @@ xfs_qm_adjust_dqtimers( ASSERT(dq->q_rtb.softlimit <= dq->q_rtb.hardlimit); #endif - if (!d->d_btimer) { + if (!dq->q_blk.timer) { if ((dq->q_blk.softlimit && (dq->q_blk.count > dq->q_blk.softlimit)) || (dq->q_blk.hardlimit && (dq->q_blk.count > dq->q_blk.hardlimit))) { - d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + - defq->btimelimit); + dq->q_blk.timer = ktime_get_real_seconds() + + defq->btimelimit; } else { dq->q_blk.warnings = 0; } @@ -147,17 +146,17 @@ xfs_qm_adjust_dqtimers( (dq->q_blk.count <= dq->q_blk.softlimit)) && (!dq->q_blk.hardlimit || (dq->q_blk.count <= dq->q_blk.hardlimit))) { - d->d_btimer = 0; + dq->q_blk.timer = 0; } } - if (!d->d_itimer) { + if (!dq->q_ino.timer) { if ((dq->q_ino.softlimit && (dq->q_ino.count > dq->q_ino.softlimit)) || (dq->q_ino.hardlimit && (dq->q_ino.count > dq->q_ino.hardlimit))) { - d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + - defq->itimelimit); + dq->q_ino.timer = ktime_get_real_seconds() + + defq->itimelimit; } else { dq->q_ino.warnings = 0; } @@ -166,17 +165,17 @@ xfs_qm_adjust_dqtimers( (dq->q_ino.count <= dq->q_ino.softlimit)) && (!dq->q_ino.hardlimit || (dq->q_ino.count <= dq->q_ino.hardlimit))) { - d->d_itimer = 0; + dq->q_ino.timer = 0; } } - if (!d->d_rtbtimer) { + if (!dq->q_rtb.timer) { if ((dq->q_rtb.softlimit && (dq->q_rtb.count > dq->q_rtb.softlimit)) || (dq->q_rtb.hardlimit && (dq->q_rtb.count > dq->q_rtb.hardlimit))) { - d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + - defq->rtbtimelimit); + dq->q_rtb.timer = ktime_get_real_seconds() + + defq->rtbtimelimit; } else { dq->q_rtb.warnings = 0; } @@ -185,7 +184,7 @@ xfs_qm_adjust_dqtimers( (dq->q_rtb.count <= dq->q_rtb.softlimit)) && (!dq->q_rtb.hardlimit || (dq->q_rtb.count <= dq->q_rtb.hardlimit))) { - d->d_rtbtimer = 0; + dq->q_rtb.timer = 0; } } } @@ -546,6 +545,10 @@ xfs_dquot_from_disk( dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); + dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer); + dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer); + dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer); + /* * Reservation counters are defined as reservation plus current usage * to avoid having to add every time.
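The hunk that follows mirrors these conversions back to disk in xfs_dquot_to_disk(). The timer semantics the patch preserves are worth spelling out: a timer of zero means no grace period is armed; going over a soft or hard limit arms it at now plus the grace period, and dropping back under both limits disarms it. The stand-alone C sketch below is an editor's illustration of that state machine under those assumptions; the struct and function names are invented for the example and are not taken from the patch.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct res {
	uint64_t	count;		/* current usage */
	uint64_t	softlimit;	/* 0 means no limit */
	uint64_t	hardlimit;	/* 0 means no limit */
	int64_t		timer;		/* 0 = not armed, else expiry in epoch seconds */
	uint16_t	warnings;
};

static void adjust_timer(struct res *r, int64_t grace)
{
	int over = (r->softlimit && r->count > r->softlimit) ||
		   (r->hardlimit && r->count > r->hardlimit);

	if (!r->timer) {
		if (over)
			r->timer = (int64_t)time(NULL) + grace;	/* arm the grace period */
		else
			r->warnings = 0;	/* quiet again, clear the warning count */
	} else if (!over) {
		r->timer = 0;			/* back under both limits, disarm */
	}
}

int main(void)
{
	struct res blk = { .count = 120, .softlimit = 100 };

	adjust_timer(&blk, 7 * 24 * 3600);	/* over the soft limit: arms */
	printf("armed: %lld\n", (long long)blk.timer);

	blk.count = 80;
	adjust_timer(&blk, 7 * 24 * 3600);	/* under again: disarms */
	printf("disarmed: %lld\n", (long long)blk.timer);
	return 0;
}

Patch 071 later collapses the three per-resource copies of exactly this logic into a single helper, xfs_qm_adjust_res_timer().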
@@ -580,6 +583,10 @@ xfs_dquot_to_disk( ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings); ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); + + ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer); + ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer); + ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer); } /* Allocate and initialize the dquot buffer for this in-core dquot. */ @@ -1141,7 +1148,6 @@ static xfs_failaddr_t xfs_qm_dqflush_check( struct xfs_dquot *dqp) { - struct xfs_disk_dquot *ddq = &dqp->q_core; __u8 type = dqp->dq_flags & XFS_DQ_ALLTYPES; if (type != XFS_DQ_USER && @@ -1153,15 +1159,15 @@ xfs_qm_dqflush_check( return NULL; if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit && - !ddq->d_btimer) + !dqp->q_blk.timer) return __this_address; if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit && - !ddq->d_itimer) + !dqp->q_ino.timer) return __this_address; if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit && - !ddq->d_rtbtimer) + !dqp->q_rtb.timer) return __this_address; return NULL; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index afac622c0fde..69dbf3d95bf3 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -38,6 +38,13 @@ struct xfs_dquot_res { xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; + /* + * For root dquots, this is the default grace period, in seconds. + * Otherwise, this is when the quota grace period expires, + * in seconds since the Unix epoch. + */ + time64_t timer; + /* * For root dquots, this is the maximum number of warnings that will * be issued for this quota type. Otherwise, this is the number of diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 4dc989551710..73fa0b5d37b3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -579,7 +579,6 @@ xfs_qm_init_timelimits( { struct xfs_quotainfo *qinf = mp->m_quotainfo; struct xfs_def_quota *defq; - struct xfs_disk_dquot *ddqp; struct xfs_dquot *dqp; int error; @@ -603,19 +602,17 @@ xfs_qm_init_timelimits( if (error) return; - ddqp = &dqp->q_core; - /* * The warnings and timers set the grace period given to * a user or group before he or she can not perform any * more writing. If it is zero, a default is used. */ - if (ddqp->d_btimer) - defq->btimelimit = be32_to_cpu(ddqp->d_btimer); - if (ddqp->d_itimer) - defq->itimelimit = be32_to_cpu(ddqp->d_itimer); - if (ddqp->d_rtbtimer) - defq->rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); + if (dqp->q_blk.timer) + defq->btimelimit = dqp->q_blk.timer; + if (dqp->q_ino.timer) + defq->itimelimit = dqp->q_ino.timer; + if (dqp->q_rtb.timer) + defq->rtbtimelimit = dqp->q_rtb.timer; if (dqp->q_blk.warnings) defq->bwarnlimit = dqp->q_blk.warnings; if (dqp->q_ino.warnings) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index acea375c0d17..9e3507b083ce 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -447,7 +447,6 @@ xfs_qm_scall_setqlim( struct qc_dqblk *newlim) { struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_disk_dquot *ddq; struct xfs_dquot *dqp; struct xfs_trans *tp; struct xfs_def_quota *defq; @@ -488,7 +487,6 @@ xfs_qm_scall_setqlim( xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); - ddq = &dqp->q_core; /* * Make sure that hardlimits are >= soft limits before changing. @@ -573,11 +571,11 @@ xfs_qm_scall_setqlim( * the soft limit. 
*/ if (newlim->d_fieldmask & QC_SPC_TIMER) - ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); + dqp->q_blk.timer = newlim->d_spc_timer; if (newlim->d_fieldmask & QC_INO_TIMER) - ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); + dqp->q_ino.timer = newlim->d_ino_timer; if (newlim->d_fieldmask & QC_RT_SPC_TIMER) - ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); + dqp->q_rtb.timer = newlim->d_rt_spc_timer; if (id == 0) { if (newlim->d_fieldmask & QC_SPC_TIMER) @@ -621,18 +619,18 @@ xfs_qm_scall_getquota_fill_qc( memset(dst, 0, sizeof(*dst)); dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_blk.hardlimit); dst->d_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_blk.softlimit); - dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); - dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + dst->d_ino_hardlimit = dqp->q_ino.hardlimit; + dst->d_ino_softlimit = dqp->q_ino.softlimit; dst->d_space = XFS_FSB_TO_B(mp, dqp->q_blk.reserved); dst->d_ino_count = dqp->q_ino.reserved; - dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer); - dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); + dst->d_spc_timer = dqp->q_blk.timer; + dst->d_ino_timer = dqp->q_ino.timer; dst->d_ino_warns = dqp->q_ino.warnings; dst->d_spc_warns = dqp->q_blk.warnings; dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit); dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit); dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved); - dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + dst->d_rt_spc_timer = dqp->q_rtb.timer; dst->d_rt_spc_warns = dqp->q_rtb.warnings; /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 58cbc8f216f1..817c3d48b911 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -591,7 +591,7 @@ xfs_trans_dqresv( softlimit = dqp->q_blk.softlimit; if (!softlimit) softlimit = defq->bsoftlimit; - timer = be32_to_cpu(dqp->q_core.d_btimer); + timer = dqp->q_blk.timer; warns = dqp->q_blk.warnings; warnlimit = defq->bwarnlimit; resbcountp = &dqp->q_blk.reserved; @@ -603,7 +603,7 @@ xfs_trans_dqresv( softlimit = dqp->q_rtb.softlimit; if (!softlimit) softlimit = defq->rtbsoftlimit; - timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + timer = dqp->q_rtb.timer; warns = dqp->q_rtb.warnings; warnlimit = defq->rtbwarnlimit; resbcountp = &dqp->q_rtb.reserved; @@ -638,7 +638,7 @@ xfs_trans_dqresv( } if (ninos > 0) { total_count = dqp->q_ino.reserved + ninos; - timer = be32_to_cpu(dqp->q_core.d_itimer); + timer = dqp->q_ino.timer; warns = dqp->q_ino.warnings; warnlimit = defq->iwarnlimit; hardlimit = dqp->q_ino.hardlimit; From 51dbb1be52fedfe7f612854bfcba3400043a7a75 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:32 -0700 Subject: [PATCH 068/117] xfs: remove qcore from incore dquots Now that we've stopped using qcore entirely, drop it from the incore dquot. Signed-off-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_format.h | 7 +++---- fs/xfs/scrub/quota.c | 8 -------- fs/xfs/xfs_dquot.c | 36 +++++++++++++----------------------- fs/xfs/xfs_dquot.h | 1 - fs/xfs/xfs_qm.c | 4 ++-- fs/xfs/xfs_qm_syscalls.c | 9 +++------ 6 files changed, 21 insertions(+), 44 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 4f665646ba7d..a534ebee92b9 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1150,10 +1150,9 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ /* - * This is the main portion of the on-disk representation of quota - * information for a user. This is the q_core of the struct xfs_dquot that - * is kept in kernel memory. We pad this with some more expansion room - * to construct the on disk structure. + * This is the main portion of the on-disk representation of quota information + * for a user. We pad this with some more expansion room to construct the on + * disk structure. */ struct xfs_disk_dquot { __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 20bc763e88b4..f4aad5b00188 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -79,7 +79,6 @@ xchk_quota_item( struct xchk_quota_info *sqi = priv; struct xfs_scrub *sc = sqi->sc; struct xfs_mount *mp = sc->mp; - struct xfs_disk_dquot *d = &dq->q_core; struct xfs_quotainfo *qi = mp->m_quotainfo; xfs_fileoff_t offset; xfs_ino_t fs_icount; @@ -98,13 +97,6 @@ xchk_quota_item( sqi->last_id = dq->q_id; - /* Did we get the dquot type we wanted? */ - if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - - if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0)) - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - /* * Warn if the hard limits are larger than the fs. * Administrators can do this, though in production this seems diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index b094fc495407..6ed3cdeb67f0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -529,7 +529,6 @@ xfs_dquot_from_disk( } /* copy everything from disk dquot to the incore dquot */ - memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot)); dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit); dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); @@ -568,7 +567,13 @@ xfs_dquot_to_disk( struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp) { - memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot)); + ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + ddqp->d_version = XFS_DQUOT_VERSION; + ddqp->d_flags = dqp->dq_flags & XFS_DQ_ALLTYPES; + ddqp->d_id = cpu_to_be32(dqp->q_id); + ddqp->d_pad0 = 0; + ddqp->d_pad = 0; + ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit); ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit); ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit); @@ -1189,8 +1194,7 @@ xfs_qm_dqflush( struct xfs_mount *mp = dqp->q_mount; struct xfs_log_item *lip = &dqp->q_logitem.qli_item; struct xfs_buf *bp; - struct xfs_dqblk *dqb; - struct xfs_disk_dquot *ddqp; + struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; @@ -1214,22 +1218,6 @@ xfs_qm_dqflush( if (error) goto out_abort; - /* - * Calculate the location of the dquot inside the buffer. 
- */ - dqb = bp->b_addr + dqp->q_bufoffset; - ddqp = &dqb->dd_diskdq; - - /* sanity check the in-core structure before we flush */ - fa = xfs_dquot_verify(mp, &dqp->q_core, dqp->q_id, 0); - if (fa) { - xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", - dqp->q_id, fa); - xfs_buf_relse(bp); - error = -EFSCORRUPTED; - goto out_abort; - } - fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", @@ -1239,7 +1227,9 @@ xfs_qm_dqflush( goto out_abort; } - xfs_dquot_to_disk(ddqp, dqp); + /* Flush the incore dquot to the ondisk buffer. */ + dqblk = bp->b_addr + dqp->q_bufoffset; + xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp); /* * Clear the dirty field and remember the flush lsn for later use. @@ -1259,8 +1249,8 @@ xfs_qm_dqflush( * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { - dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); - xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 69dbf3d95bf3..48fb0f263981 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -72,7 +72,6 @@ struct xfs_dquot { struct xfs_dquot_res q_ino; /* inodes */ struct xfs_dquot_res q_rtb; /* realtime blocks */ - struct xfs_disk_dquot q_core; struct xfs_dq_logitem q_logitem; xfs_qcnt_t q_prealloc_lo_wmark; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 73fa0b5d37b3..1ef666314fa8 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -161,7 +161,7 @@ xfs_qm_dqpurge( xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), dqp->q_id); + radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id); qi->qi_dquots--; /* @@ -1588,7 +1588,7 @@ xfs_qm_dqfree_one( struct xfs_quotainfo *qi = mp->m_quotainfo; mutex_lock(&qi->qi_tree_lock); - radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), dqp->q_id); + radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id); qi->qi_dquots--; mutex_unlock(&qi->qi_tree_lock); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 9e3507b083ce..d05b16ea78c5 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -638,12 +638,9 @@ xfs_qm_scall_getquota_fill_qc( * gets turned off. No need to confuse the user level code, * so return zeroes in that case. */ - if ((!XFS_IS_UQUOTA_ENFORCED(mp) && - dqp->q_core.d_flags == XFS_DQ_USER) || - (!XFS_IS_GQUOTA_ENFORCED(mp) && - dqp->q_core.d_flags == XFS_DQ_GROUP) || - (!XFS_IS_PQUOTA_ENFORCED(mp) && - dqp->q_core.d_flags == XFS_DQ_PROJ)) { + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || + (!XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || + (!XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) { dst->d_spc_timer = 0; dst->d_ino_timer = 0; dst->d_rt_spc_timer = 0; From 438769e31e4410a20ea0df0ed1769516ae80740d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:32 -0700 Subject: [PATCH 069/117] xfs: refactor default quota limits by resource Now that we've split up the dquot resource fields into separate structs, do the same for the default limits to enable further refactoring. Signed-off-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 30 +++++++++++++++--------------- fs/xfs/xfs_qm.c | 36 ++++++++++++++++++------------------ fs/xfs/xfs_qm.h | 22 ++++++++++------------ fs/xfs/xfs_qm_syscalls.c | 24 ++++++++++++------------ fs/xfs/xfs_quotaops.c | 12 ++++++------ fs/xfs/xfs_trans_dquot.c | 18 +++++++++--------- 6 files changed, 70 insertions(+), 72 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 6ed3cdeb67f0..1827a1043faa 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -77,22 +77,22 @@ xfs_qm_adjust_dqlimits( ASSERT(dq->q_id); defq = xfs_get_defquota(q, xfs_dquot_type(dq)); - if (defq->bsoftlimit && !dq->q_blk.softlimit) { - dq->q_blk.softlimit = defq->bsoftlimit; + if (defq->blk.soft && !dq->q_blk.softlimit) { + dq->q_blk.softlimit = defq->blk.soft; prealloc = 1; } - if (defq->bhardlimit && !dq->q_blk.hardlimit) { - dq->q_blk.hardlimit = defq->bhardlimit; + if (defq->blk.hard && !dq->q_blk.hardlimit) { + dq->q_blk.hardlimit = defq->blk.hard; prealloc = 1; } - if (defq->isoftlimit && !dq->q_ino.softlimit) - dq->q_ino.softlimit = defq->isoftlimit; - if (defq->ihardlimit && !dq->q_ino.hardlimit) - dq->q_ino.hardlimit = defq->ihardlimit; - if (defq->rtbsoftlimit && !dq->q_rtb.softlimit) - dq->q_rtb.softlimit = defq->rtbsoftlimit; - if (defq->rtbhardlimit && !dq->q_rtb.hardlimit) - dq->q_rtb.hardlimit = defq->rtbhardlimit; + if (defq->ino.soft && !dq->q_ino.softlimit) + dq->q_ino.softlimit = defq->ino.soft; + if (defq->ino.hard && !dq->q_ino.hardlimit) + dq->q_ino.hardlimit = defq->ino.hard; + if (defq->rtb.soft && !dq->q_rtb.softlimit) + dq->q_rtb.softlimit = defq->rtb.soft; + if (defq->rtb.hard && !dq->q_rtb.hardlimit) + dq->q_rtb.hardlimit = defq->rtb.hard; if (prealloc) xfs_dquot_set_prealloc_limits(dq); @@ -137,7 +137,7 @@ xfs_qm_adjust_dqtimers( (dq->q_blk.hardlimit && (dq->q_blk.count > dq->q_blk.hardlimit))) { dq->q_blk.timer = ktime_get_real_seconds() + - defq->btimelimit; + defq->blk.time; } else { dq->q_blk.warnings = 0; } @@ -156,7 +156,7 @@ xfs_qm_adjust_dqtimers( (dq->q_ino.hardlimit && (dq->q_ino.count > dq->q_ino.hardlimit))) { dq->q_ino.timer = ktime_get_real_seconds() + - defq->itimelimit; + defq->ino.time; } else { dq->q_ino.warnings = 0; } @@ -175,7 +175,7 @@ xfs_qm_adjust_dqtimers( (dq->q_rtb.hardlimit && (dq->q_rtb.count > dq->q_rtb.hardlimit))) { dq->q_rtb.timer = ktime_get_real_seconds() + - defq->rtbtimelimit; + defq->rtb.time; } else { dq->q_rtb.warnings = 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 1ef666314fa8..8ab4dd4f842d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -562,12 +562,12 @@ xfs_qm_set_defquota( * Timers and warnings have been already set, let's just set the * default limits for this quota type */ - defq->bhardlimit = dqp->q_blk.hardlimit; - defq->bsoftlimit = dqp->q_blk.softlimit; - defq->ihardlimit = dqp->q_ino.hardlimit; - defq->isoftlimit = dqp->q_ino.softlimit; - defq->rtbhardlimit = dqp->q_rtb.hardlimit; - defq->rtbsoftlimit = dqp->q_rtb.softlimit; + defq->blk.hard = dqp->q_blk.hardlimit; + defq->blk.soft = dqp->q_blk.softlimit; + defq->ino.hard = dqp->q_ino.hardlimit; + defq->ino.soft = dqp->q_ino.softlimit; + defq->rtb.hard = dqp->q_rtb.hardlimit; + defq->rtb.soft = dqp->q_rtb.softlimit; xfs_qm_dqdestroy(dqp); } @@ -584,12 +584,12 @@ xfs_qm_init_timelimits( defq = xfs_get_defquota(qinf, type); - defq->btimelimit = XFS_QM_BTIMELIMIT; - defq->itimelimit = XFS_QM_ITIMELIMIT; - defq->rtbtimelimit = XFS_QM_RTBTIMELIMIT; - defq->bwarnlimit 
= XFS_QM_BWARNLIMIT; - defq->iwarnlimit = XFS_QM_IWARNLIMIT; - defq->rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + defq->blk.time = XFS_QM_BTIMELIMIT; + defq->ino.time = XFS_QM_ITIMELIMIT; + defq->rtb.time = XFS_QM_RTBTIMELIMIT; + defq->blk.warn = XFS_QM_BWARNLIMIT; + defq->ino.warn = XFS_QM_IWARNLIMIT; + defq->rtb.warn = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. @@ -608,17 +608,17 @@ xfs_qm_init_timelimits( * more writing. If it is zero, a default is used. */ if (dqp->q_blk.timer) - defq->btimelimit = dqp->q_blk.timer; + defq->blk.time = dqp->q_blk.timer; if (dqp->q_ino.timer) - defq->itimelimit = dqp->q_ino.timer; + defq->ino.time = dqp->q_ino.timer; if (dqp->q_rtb.timer) - defq->rtbtimelimit = dqp->q_rtb.timer; + defq->rtb.time = dqp->q_rtb.timer; if (dqp->q_blk.warnings) - defq->bwarnlimit = dqp->q_blk.warnings; + defq->blk.warn = dqp->q_blk.warnings; if (dqp->q_ino.warnings) - defq->iwarnlimit = dqp->q_ino.warnings; + defq->ino.warn = dqp->q_ino.warnings; if (dqp->q_rtb.warnings) - defq->rtbwarnlimit = dqp->q_rtb.warnings; + defq->rtb.warn = dqp->q_rtb.warnings; xfs_qm_dqdestroy(dqp); } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 57bddadbc051..11c28ff0298c 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -30,20 +30,18 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; !dqp->q_rtb.count && \ !dqp->q_ino.count) +struct xfs_quota_limits { + xfs_qcnt_t hard; /* default hard limit */ + xfs_qcnt_t soft; /* default soft limit */ + time64_t time; /* limit for timers */ + xfs_qwarncnt_t warn; /* limit for warnings */ +}; + /* Defaults for each quota type: time limits, warn limits, usage limits */ struct xfs_def_quota { - time64_t btimelimit; /* limit for blks timer */ - time64_t itimelimit; /* limit for inodes timer */ - time64_t rtbtimelimit; /* limit for rt blks timer */ - xfs_qwarncnt_t bwarnlimit; /* limit for blks warnings */ - xfs_qwarncnt_t iwarnlimit; /* limit for inodes warnings */ - xfs_qwarncnt_t rtbwarnlimit; /* limit for rt blks warnings */ - xfs_qcnt_t bhardlimit; /* default data blk hard limit */ - xfs_qcnt_t bsoftlimit; /* default data blk soft limit */ - xfs_qcnt_t ihardlimit; /* default inode count hard limit */ - xfs_qcnt_t isoftlimit; /* default inode count soft limit */ - xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */ - xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */ + struct xfs_quota_limits blk; + struct xfs_quota_limits ino; + struct xfs_quota_limits rtb; }; /* diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index d05b16ea78c5..51941c1b0492 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -502,8 +502,8 @@ xfs_qm_scall_setqlim( dqp->q_blk.softlimit = soft; xfs_dquot_set_prealloc_limits(dqp); if (id == 0) { - defq->bhardlimit = hard; - defq->bsoftlimit = soft; + defq->blk.hard = hard; + defq->blk.soft = soft; } } else { xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); @@ -518,8 +518,8 @@ xfs_qm_scall_setqlim( dqp->q_rtb.hardlimit = hard; dqp->q_rtb.softlimit = soft; if (id == 0) { - defq->rtbhardlimit = hard; - defq->rtbsoftlimit = soft; + defq->rtb.hard = hard; + defq->rtb.soft = soft; } } else { xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); @@ -535,8 +535,8 @@ xfs_qm_scall_setqlim( dqp->q_ino.hardlimit = hard; dqp->q_ino.softlimit = soft; if (id == 0) { - defq->ihardlimit = hard; - defq->isoftlimit = soft; + defq->ino.hard = hard; + defq->ino.soft = soft; } } else { xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); @@ -554,11 +554,11 @@ 
xfs_qm_scall_setqlim( if (id == 0) { if (newlim->d_fieldmask & QC_SPC_WARNS) - defq->bwarnlimit = newlim->d_spc_warns; + defq->blk.warn = newlim->d_spc_warns; if (newlim->d_fieldmask & QC_INO_WARNS) - defq->iwarnlimit = newlim->d_ino_warns; + defq->ino.warn = newlim->d_ino_warns; if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - defq->rtbwarnlimit = newlim->d_rt_spc_warns; + defq->rtb.warn = newlim->d_rt_spc_warns; } /* @@ -579,11 +579,11 @@ xfs_qm_scall_setqlim( if (id == 0) { if (newlim->d_fieldmask & QC_SPC_TIMER) - defq->btimelimit = newlim->d_spc_timer; + defq->blk.time = newlim->d_spc_timer; if (newlim->d_fieldmask & QC_INO_TIMER) - defq->itimelimit = newlim->d_ino_timer; + defq->ino.time = newlim->d_ino_timer; if (newlim->d_fieldmask & QC_RT_SPC_TIMER) - defq->rtbtimelimit = newlim->d_rt_spc_timer; + defq->rtb.time = newlim->d_rt_spc_timer; } if (id != 0) { diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 0868e6ee2219..299695a068f3 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -37,12 +37,12 @@ xfs_qm_fill_state( tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_d.di_nblocks; tstate->nextents = ip->i_df.if_nextents; - tstate->spc_timelimit = (u32)defq->btimelimit; - tstate->ino_timelimit = (u32)defq->itimelimit; - tstate->rt_spc_timelimit = (u32)defq->rtbtimelimit; - tstate->spc_warnlimit = defq->bwarnlimit; - tstate->ino_warnlimit = defq->iwarnlimit; - tstate->rt_spc_warnlimit = defq->rtbwarnlimit; + tstate->spc_timelimit = (u32)defq->blk.time; + tstate->ino_timelimit = (u32)defq->ino.time; + tstate->rt_spc_timelimit = (u32)defq->rtb.time; + tstate->spc_warnlimit = defq->blk.warn; + tstate->ino_warnlimit = defq->ino.warn; + tstate->rt_spc_warnlimit = defq->rtb.warn; if (tempqip) xfs_irele(ip); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 817c3d48b911..9adea8f6ba87 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -587,25 +587,25 @@ xfs_trans_dqresv( if (flags & XFS_TRANS_DQ_RES_BLKS) { hardlimit = dqp->q_blk.hardlimit; if (!hardlimit) - hardlimit = defq->bhardlimit; + hardlimit = defq->blk.hard; softlimit = dqp->q_blk.softlimit; if (!softlimit) - softlimit = defq->bsoftlimit; + softlimit = defq->blk.soft; timer = dqp->q_blk.timer; warns = dqp->q_blk.warnings; - warnlimit = defq->bwarnlimit; + warnlimit = defq->blk.warn; resbcountp = &dqp->q_blk.reserved; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); hardlimit = dqp->q_rtb.hardlimit; if (!hardlimit) - hardlimit = defq->rtbhardlimit; + hardlimit = defq->rtb.hard; softlimit = dqp->q_rtb.softlimit; if (!softlimit) - softlimit = defq->rtbsoftlimit; + softlimit = defq->rtb.soft; timer = dqp->q_rtb.timer; warns = dqp->q_rtb.warnings; - warnlimit = defq->rtbwarnlimit; + warnlimit = defq->rtb.warn; resbcountp = &dqp->q_rtb.reserved; } @@ -640,13 +640,13 @@ xfs_trans_dqresv( total_count = dqp->q_ino.reserved + ninos; timer = dqp->q_ino.timer; warns = dqp->q_ino.warnings; - warnlimit = defq->iwarnlimit; + warnlimit = defq->ino.warn; hardlimit = dqp->q_ino.hardlimit; if (!hardlimit) - hardlimit = defq->ihardlimit; + hardlimit = defq->ino.hard; softlimit = dqp->q_ino.softlimit; if (!softlimit) - softlimit = defq->isoftlimit; + softlimit = defq->ino.soft; if (hardlimit && total_count > hardlimit) { xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); From c8c753e19a76507a2b686d9c521d5266351c75d0 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 14 Jul 2020 10:37:33 -0700 Subject: [PATCH 070/117] xfs: remove unnecessary arguments from quota adjust functions struct xfs_dquot already has a pointer to the xfs mount, so remove the redundant parameter from xfs_qm_adjust_dq*. Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 4 ++-- fs/xfs/xfs_dquot.h | 9 ++++----- fs/xfs/xfs_qm.c | 4 ++-- fs/xfs/xfs_qm_syscalls.c | 2 +- fs/xfs/xfs_trans_dquot.c | 4 ++-- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1827a1043faa..c09d9257fad9 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -67,9 +67,9 @@ xfs_qm_dqdestroy( */ void xfs_qm_adjust_dqlimits( - struct xfs_mount *mp, struct xfs_dquot *dq) { + struct xfs_mount *mp = dq->q_mount; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_def_quota *defq; int prealloc = 0; @@ -113,9 +113,9 @@ xfs_qm_adjust_dqlimits( */ void xfs_qm_adjust_dqtimers( - struct xfs_mount *mp, struct xfs_dquot *dq) { + struct xfs_mount *mp = dq->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; struct xfs_def_quota *defq; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 48fb0f263981..9e44da522684 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -182,11 +182,10 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); void xfs_qm_dqdestroy(struct xfs_dquot *dqp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); -void xfs_qm_adjust_dqtimers(struct xfs_mount *mp, - struct xfs_dquot *d); -void xfs_qm_adjust_dqlimits(struct xfs_mount *mp, - struct xfs_dquot *d); -xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type); +void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); +void xfs_qm_adjust_dqlimits(struct xfs_dquot *d); +xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, + uint type); int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, uint type, bool can_alloc, struct xfs_dquot **dqpp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 8ab4dd4f842d..7d83c1623cb2 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1103,8 +1103,8 @@ xfs_qm_quotacheck_dqadjust( * There are no timers for the default values set in the root dquot. */ if (dqp->q_id) { - xfs_qm_adjust_dqlimits(mp, dqp); - xfs_qm_adjust_dqtimers(mp, dqp); + xfs_qm_adjust_dqlimits(dqp); + xfs_qm_adjust_dqtimers(dqp); } dqp->q_flags |= XFS_DQFLAG_DIRTY; diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 51941c1b0492..aeea59d6cf23 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -594,7 +594,7 @@ xfs_qm_scall_setqlim( * is on or off. We don't really want to bother with iterating * over all ondisk dquots and turning the timers on/off. */ - xfs_qm_adjust_dqtimers(mp, dqp); + xfs_qm_adjust_dqtimers(dqp); } dqp->q_flags |= XFS_DQFLAG_DIRTY; xfs_trans_log_dquot(tp, dqp); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 9adea8f6ba87..6f0bfd522116 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -382,8 +382,8 @@ xfs_trans_apply_dquot_deltas( * Start/reset the timer(s) if needed. */ if (dqp->q_id) { - xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); - xfs_qm_adjust_dqtimers(tp->t_mountp, dqp); + xfs_qm_adjust_dqlimits(dqp); + xfs_qm_adjust_dqtimers(dqp); } dqp->q_flags |= XFS_DQFLAG_DIRTY; From ea0cc6fa8f89a0089c561b65b909ceab48463338 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 14 Jul 2020 10:37:33 -0700 Subject: [PATCH 071/117] xfs: refactor quota exceeded test Refactor the open-coded test for whether or not we're over quota. Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 91 +++++++++++++--------------------------------- 1 file changed, 26 insertions(+), 65 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c09d9257fad9..98cb31f28aaf 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -98,6 +98,29 @@ xfs_qm_adjust_dqlimits( xfs_dquot_set_prealloc_limits(dq); } +/* + * Determine if this quota counter is over either limit and set the quota + * timers as appropriate. + */ +static inline void +xfs_qm_adjust_res_timer( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim) +{ + ASSERT(res->hardlimit == 0 || res->softlimit <= res->hardlimit); + + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (res->timer == 0) + res->timer = ktime_get_real_seconds() + qlim->time; + } else { + if (res->timer == 0) + res->warnings = 0; + else + res->timer = 0; + } +} + /* * Check the limits and timers of a dquot and start or reset timers * if necessary. @@ -122,71 +145,9 @@ xfs_qm_adjust_dqtimers( ASSERT(dq->q_id); defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); -#ifdef DEBUG - if (dq->q_blk.hardlimit) - ASSERT(dq->q_blk.softlimit <= dq->q_blk.hardlimit); - if (dq->q_ino.hardlimit) - ASSERT(dq->q_ino.softlimit <= dq->q_ino.hardlimit); - if (dq->q_rtb.hardlimit) - ASSERT(dq->q_rtb.softlimit <= dq->q_rtb.hardlimit); -#endif - - if (!dq->q_blk.timer) { - if ((dq->q_blk.softlimit && - (dq->q_blk.count > dq->q_blk.softlimit)) || - (dq->q_blk.hardlimit && - (dq->q_blk.count > dq->q_blk.hardlimit))) { - dq->q_blk.timer = ktime_get_real_seconds() + - defq->blk.time; - } else { - dq->q_blk.warnings = 0; - } - } else { - if ((!dq->q_blk.softlimit || - (dq->q_blk.count <= dq->q_blk.softlimit)) && - (!dq->q_blk.hardlimit || - (dq->q_blk.count <= dq->q_blk.hardlimit))) { - dq->q_blk.timer = 0; - } - } - - if (!dq->q_ino.timer) { - if ((dq->q_ino.softlimit && - (dq->q_ino.count > dq->q_ino.softlimit)) || - (dq->q_ino.hardlimit && - (dq->q_ino.count > dq->q_ino.hardlimit))) { - dq->q_ino.timer = ktime_get_real_seconds() + - defq->ino.time; - } else { - dq->q_ino.warnings = 0; - } - } else { - if ((!dq->q_ino.softlimit || - (dq->q_ino.count <= dq->q_ino.softlimit)) && - (!dq->q_ino.hardlimit || - (dq->q_ino.count <= dq->q_ino.hardlimit))) { - dq->q_ino.timer = 0; - } - } - - if (!dq->q_rtb.timer) { - if ((dq->q_rtb.softlimit && - (dq->q_rtb.count > dq->q_rtb.softlimit)) || - (dq->q_rtb.hardlimit && - (dq->q_rtb.count > dq->q_rtb.hardlimit))) { - dq->q_rtb.timer = ktime_get_real_seconds() + - defq->rtb.time; - } else { - dq->q_rtb.warnings = 0; - } - } else { - if ((!dq->q_rtb.softlimit || - (dq->q_rtb.count <= dq->q_rtb.softlimit)) && - (!dq->q_rtb.hardlimit || - (dq->q_rtb.count <= dq->q_rtb.hardlimit))) { - dq->q_rtb.timer = 0; - } - } + xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk); + xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino); + xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb); } /* From d1520deab039ca1bac50c53b941a3df3b3a680de Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:33 -0700 Subject: [PATCH 072/117] xfs: refactor xfs_qm_scall_setqlim Now that we can pass around quota resource and limit structures, clean up the open-coded field setting in xfs_qm_scall_setqlim. 
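The shape of that cleanup is easiest to see in miniature. The stand-alone C sketch below is an editor's illustration under stated assumptions, not code taken from the patch: one helper applies a resource's limits, and a NULL defaults pointer means "leave the defaults alone", so the id == 0 special case collapses into the call sites instead of being open-coded once per resource. The names set_limits and struct limits are invented for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct limits {
	uint64_t	hard;
	uint64_t	soft;
};

/* Apply new limits; reject them if a nonzero hard limit is below soft. */
static bool set_limits(struct limits *res, struct limits *defaults,
		       uint64_t hard, uint64_t soft)
{
	if (hard != 0 && hard < soft) {
		fprintf(stderr, "hard %llu < soft %llu, ignored\n",
			(unsigned long long)hard, (unsigned long long)soft);
		return false;
	}

	res->hard = hard;
	res->soft = soft;
	if (defaults) {		/* only the id == 0 caller passes non-NULL */
		defaults->hard = hard;
		defaults->soft = soft;
	}
	return true;
}

int main(void)
{
	struct limits blk = { 0 }, def_blk = { 0 };
	unsigned int id = 0;

	set_limits(&blk, id == 0 ? &def_blk : NULL, 1000, 800);
	set_limits(&blk, NULL, 10, 800);	/* rejected: hard < soft */
	printf("blk %llu/%llu, defaults %llu/%llu\n",
	       (unsigned long long)blk.hard, (unsigned long long)blk.soft,
	       (unsigned long long)def_blk.hard,
	       (unsigned long long)def_blk.soft);
	return 0;
}

The patch itself follows the same design: xfs_setqlim_limits(), xfs_setqlim_warns(), and xfs_setqlim_timer() below operate on struct xfs_dquot_res and struct xfs_quota_limits.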
Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Allison Collins Reviewed-by: Chandan Babu R --- fs/xfs/xfs_qm_syscalls.c | 194 ++++++++++++++++++++++----------------- 1 file changed, 108 insertions(+), 86 deletions(-) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index aeea59d6cf23..cbe352187d32 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -436,6 +436,58 @@ xfs_qm_scall_quotaon( #define XFS_QC_MASK \ (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) +/* + * Adjust limits of this quota, and the defaults if passed in. Returns true + * if the new limits made sense and were applied, false otherwise. + */ +static inline bool +xfs_setqlim_limits( + struct xfs_mount *mp, + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + xfs_qcnt_t hard, + xfs_qcnt_t soft, + const char *tag) +{ + /* The hard limit can't be less than the soft limit. */ + if (hard != 0 && hard < soft) { + xfs_debug(mp, "%shard %lld < %ssoft %lld", tag, hard, tag, + soft); + return false; + } + + res->hardlimit = hard; + res->softlimit = soft; + if (qlim) { + qlim->hard = hard; + qlim->soft = soft; + } + + return true; +} + +static inline void +xfs_setqlim_warns( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + int warns) +{ + res->warnings = warns; + if (qlim) + qlim->warn = warns; +} + +static inline void +xfs_setqlim_timer( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + s64 timer) +{ + res->timer = timer; + if (qlim) + qlim->time = timer; +} + /* * Adjust quota limits, and start/stop timers accordingly. */ @@ -450,6 +502,8 @@ xfs_qm_scall_setqlim( struct xfs_dquot *dqp; struct xfs_trans *tp; struct xfs_def_quota *defq; + struct xfs_dquot_res *res; + struct xfs_quota_limits *qlim; int error; xfs_qcnt_t hard, soft; @@ -489,79 +543,13 @@ xfs_qm_scall_setqlim( xfs_trans_dqjoin(tp, dqp); /* + * Update quota limits, warnings, and timers, and the defaults + * if we're touching id == 0. + * * Make sure that hardlimits are >= soft limits before changing. - */ - hard = (newlim->d_fieldmask & QC_SPC_HARD) ? - (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) : - dqp->q_blk.hardlimit; - soft = (newlim->d_fieldmask & QC_SPC_SOFT) ? - (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) : - dqp->q_blk.softlimit; - if (hard == 0 || hard >= soft) { - dqp->q_blk.hardlimit = hard; - dqp->q_blk.softlimit = soft; - xfs_dquot_set_prealloc_limits(dqp); - if (id == 0) { - defq->blk.hard = hard; - defq->blk.soft = soft; - } - } else { - xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); - } - hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? - (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) : - dqp->q_rtb.hardlimit; - soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ? - (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) : - dqp->q_rtb.softlimit; - if (hard == 0 || hard >= soft) { - dqp->q_rtb.hardlimit = hard; - dqp->q_rtb.softlimit = soft; - if (id == 0) { - defq->rtb.hard = hard; - defq->rtb.soft = soft; - } - } else { - xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); - } - - hard = (newlim->d_fieldmask & QC_INO_HARD) ? - (xfs_qcnt_t) newlim->d_ino_hardlimit : - dqp->q_ino.hardlimit; - soft = (newlim->d_fieldmask & QC_INO_SOFT) ? 
- (xfs_qcnt_t) newlim->d_ino_softlimit : - dqp->q_ino.softlimit; - if (hard == 0 || hard >= soft) { - dqp->q_ino.hardlimit = hard; - dqp->q_ino.softlimit = soft; - if (id == 0) { - defq->ino.hard = hard; - defq->ino.soft = soft; - } - } else { - xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); - } - - /* - * Update warnings counter(s) if requested - */ - if (newlim->d_fieldmask & QC_SPC_WARNS) - dqp->q_blk.warnings = newlim->d_spc_warns; - if (newlim->d_fieldmask & QC_INO_WARNS) - dqp->q_ino.warnings = newlim->d_ino_warns; - if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - dqp->q_rtb.warnings = newlim->d_rt_spc_warns; - - if (id == 0) { - if (newlim->d_fieldmask & QC_SPC_WARNS) - defq->blk.warn = newlim->d_spc_warns; - if (newlim->d_fieldmask & QC_INO_WARNS) - defq->ino.warn = newlim->d_ino_warns; - if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - defq->rtb.warn = newlim->d_rt_spc_warns; - } - - /* + * + * Update warnings counter(s) if requested. + * * Timelimits for the super user set the relative time the other users * can be over quota for this file system. If it is zero a default is * used. Ditto for the default soft and hard limit values (already @@ -570,21 +558,55 @@ xfs_qm_scall_setqlim( * For other IDs, userspace can bump out the grace period if over * the soft limit. */ - if (newlim->d_fieldmask & QC_SPC_TIMER) - dqp->q_blk.timer = newlim->d_spc_timer; - if (newlim->d_fieldmask & QC_INO_TIMER) - dqp->q_ino.timer = newlim->d_ino_timer; - if (newlim->d_fieldmask & QC_RT_SPC_TIMER) - dqp->q_rtb.timer = newlim->d_rt_spc_timer; - if (id == 0) { - if (newlim->d_fieldmask & QC_SPC_TIMER) - defq->blk.time = newlim->d_spc_timer; - if (newlim->d_fieldmask & QC_INO_TIMER) - defq->ino.time = newlim->d_ino_timer; - if (newlim->d_fieldmask & QC_RT_SPC_TIMER) - defq->rtb.time = newlim->d_rt_spc_timer; - } + /* Blocks on the data device. */ + hard = (newlim->d_fieldmask & QC_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) : + dqp->q_blk.hardlimit; + soft = (newlim->d_fieldmask & QC_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) : + dqp->q_blk.softlimit; + res = &dqp->q_blk; + qlim = id == 0 ? &defq->blk : NULL; + + if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk")) + xfs_dquot_set_prealloc_limits(dqp); + if (newlim->d_fieldmask & QC_SPC_WARNS) + xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); + if (newlim->d_fieldmask & QC_SPC_TIMER) + xfs_setqlim_timer(res, qlim, newlim->d_spc_timer); + + /* Blocks on the realtime device. */ + hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) : + dqp->q_rtb.hardlimit; + soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) : + dqp->q_rtb.softlimit; + res = &dqp->q_rtb; + qlim = id == 0 ? &defq->rtb : NULL; + + xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb"); + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) + xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer); + + /* Inodes */ + hard = (newlim->d_fieldmask & QC_INO_HARD) ? + (xfs_qcnt_t) newlim->d_ino_hardlimit : + dqp->q_ino.hardlimit; + soft = (newlim->d_fieldmask & QC_INO_SOFT) ? + (xfs_qcnt_t) newlim->d_ino_softlimit : + dqp->q_ino.softlimit; + res = &dqp->q_ino; + qlim = id == 0 ? 
&defq->ino : NULL; + + xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino"); + if (newlim->d_fieldmask & QC_INO_WARNS) + xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); + if (newlim->d_fieldmask & QC_INO_TIMER) + xfs_setqlim_timer(res, qlim, newlim->d_ino_timer); if (id != 0) { /* From 292b47b4fcfad2b8b2bc749d68c8baa63fc8bf36 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:34 -0700 Subject: [PATCH 073/117] xfs: refactor xfs_trans_dqresv Now that we've refactored the resource usage and limits into per-resource structures, we can refactor some of the open-coded reservation limit checking in xfs_trans_dqresv. Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R Reviewed-by: Allison Collins Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans_dquot.c | 153 ++++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 75 deletions(-) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 6f0bfd522116..9dc8dd9f852c 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -554,6 +554,58 @@ xfs_quota_warn( mp->m_super->s_dev, type); } +/* + * Decide if we can make an additional reservation against a quota resource. + * Returns an inode QUOTA_NL_ warning code and whether or not it's fatal. + * + * Note that we assume that the numeric difference between the inode and block + * warning codes will always be 3 since it's userspace ABI now, and will never + * decrease the quota reservation, so the *BELOW messages are irrelevant. + */ +static inline int +xfs_dqresv_check( + struct xfs_dquot_res *res, + struct xfs_quota_limits *qlim, + int64_t delta, + bool *fatal) +{ + xfs_qcnt_t hardlimit = res->hardlimit; + xfs_qcnt_t softlimit = res->softlimit; + xfs_qcnt_t total_count = res->reserved + delta; + + BUILD_BUG_ON(QUOTA_NL_BHARDWARN != QUOTA_NL_IHARDWARN + 3); + BUILD_BUG_ON(QUOTA_NL_BSOFTLONGWARN != QUOTA_NL_ISOFTLONGWARN + 3); + BUILD_BUG_ON(QUOTA_NL_BSOFTWARN != QUOTA_NL_ISOFTWARN + 3); + + *fatal = false; + if (delta <= 0) + return QUOTA_NL_NOWARN; + + if (!hardlimit) + hardlimit = qlim->hard; + if (!softlimit) + softlimit = qlim->soft; + + if (hardlimit && total_count > hardlimit) { + *fatal = true; + return QUOTA_NL_IHARDWARN; + } + + if (softlimit && total_count > softlimit) { + time64_t now = ktime_get_real_seconds(); + + if ((res->timer != 0 && now > res->timer) || + (res->warnings != 0 && res->warnings >= qlim->warn)) { + *fatal = true; + return QUOTA_NL_ISOFTLONGWARN; + } + + return QUOTA_NL_ISOFTWARN; + } + + return QUOTA_NL_NOWARN; +} + /* * This reserves disk blocks and inodes against a dquot. 
* Flags indicate if the dquot is to be locked here and also @@ -569,99 +621,51 @@ xfs_trans_dqresv( long ninos, uint flags) { - xfs_qcnt_t hardlimit; - xfs_qcnt_t softlimit; - time64_t timer; - xfs_qwarncnt_t warns; - xfs_qwarncnt_t warnlimit; - xfs_qcnt_t total_count; - xfs_qcnt_t *resbcountp; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_def_quota *defq; - + struct xfs_dquot_res *blkres; + struct xfs_quota_limits *qlim; xfs_dqlock(dqp); defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); if (flags & XFS_TRANS_DQ_RES_BLKS) { - hardlimit = dqp->q_blk.hardlimit; - if (!hardlimit) - hardlimit = defq->blk.hard; - softlimit = dqp->q_blk.softlimit; - if (!softlimit) - softlimit = defq->blk.soft; - timer = dqp->q_blk.timer; - warns = dqp->q_blk.warnings; - warnlimit = defq->blk.warn; - resbcountp = &dqp->q_blk.reserved; + blkres = &dqp->q_blk; + qlim = &defq->blk; } else { - ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); - hardlimit = dqp->q_rtb.hardlimit; - if (!hardlimit) - hardlimit = defq->rtb.hard; - softlimit = dqp->q_rtb.softlimit; - if (!softlimit) - softlimit = defq->rtb.soft; - timer = dqp->q_rtb.timer; - warns = dqp->q_rtb.warnings; - warnlimit = defq->rtb.warn; - resbcountp = &dqp->q_rtb.reserved; + blkres = &dqp->q_rtb; + qlim = &defq->rtb; } if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id && ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) || (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) { - if (nblks > 0) { + int quota_nl; + bool fatal; + + /* + * dquot is locked already. See if we'd go over the hardlimit + * or exceed the timelimit if we'd reserve resources. + */ + quota_nl = xfs_dqresv_check(blkres, qlim, nblks, &fatal); + if (quota_nl != QUOTA_NL_NOWARN) { /* - * dquot is locked already. See if we'd go over the - * hardlimit or exceed the timelimit if we allocate - * nblks. + * Quota block warning codes are 3 more than the inode + * codes, which we check above. */ - total_count = *resbcountp + nblks; - if (hardlimit && total_count > hardlimit) { - xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); + xfs_quota_warn(mp, dqp, quota_nl + 3); + if (fatal) goto error_return; - } - if (softlimit && total_count > softlimit) { - if ((timer != 0 && - ktime_get_real_seconds() > timer) || - (warns != 0 && warns >= warnlimit)) { - xfs_quota_warn(mp, dqp, - QUOTA_NL_BSOFTLONGWARN); - goto error_return; - } - - xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN); - } } - if (ninos > 0) { - total_count = dqp->q_ino.reserved + ninos; - timer = dqp->q_ino.timer; - warns = dqp->q_ino.warnings; - warnlimit = defq->ino.warn; - hardlimit = dqp->q_ino.hardlimit; - if (!hardlimit) - hardlimit = defq->ino.hard; - softlimit = dqp->q_ino.softlimit; - if (!softlimit) - softlimit = defq->ino.soft; - if (hardlimit && total_count > hardlimit) { - xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); + quota_nl = xfs_dqresv_check(&dqp->q_ino, &defq->ino, ninos, + &fatal); + if (quota_nl != QUOTA_NL_NOWARN) { + xfs_quota_warn(mp, dqp, quota_nl); + if (fatal) goto error_return; - } - if (softlimit && total_count > softlimit) { - if ((timer != 0 && - ktime_get_real_seconds() > timer) || - (warns != 0 && warns >= warnlimit)) { - xfs_quota_warn(mp, dqp, - QUOTA_NL_ISOFTLONGWARN); - goto error_return; - } - xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN); - } } } @@ -669,9 +673,8 @@ xfs_trans_dqresv( * Change the reservation, but not the actual usage. 
* Note that q_blk.reserved = q_blk.count + resv */ - (*resbcountp) += (xfs_qcnt_t)nblks; - if (ninos != 0) - dqp->q_ino.reserved += (xfs_qcnt_t)ninos; + blkres->reserved += (xfs_qcnt_t)nblks; + dqp->q_ino.reserved += (xfs_qcnt_t)ninos; /* * note the reservation amt in the trans struct too, From d92c881538c40a18d0a86193898fc0b6e2598aa0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:34 -0700 Subject: [PATCH 074/117] xfs: refactor xfs_trans_apply_dquot_deltas Hoist the code that applies the incore quota reservation count adjustments into a separate function, both to reduce the level of indentation and also to reduce the amount of open-coded logic. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Collins Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans_dquot.c | 103 +++++++++++++++++---------------- 1 file changed, 46 insertions(+), 57 deletions(-) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 9dc8dd9f852c..510b9c3164d6 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -293,6 +293,37 @@ xfs_trans_dqlockedjoin( } } +/* Apply dqtrx changes to the quota reservation counters. */ +static inline void +xfs_apply_quota_reservation_deltas( + struct xfs_dquot_res *res, + uint64_t reserved, + int64_t res_used, + int64_t count_delta) +{ + if (reserved != 0) { + /* + * Subtle math here: If reserved > res_used (the normal case), + * we're simply subtracting the unused transaction quota + * reservation from the dquot reservation. + * + * If, however, res_used > reserved, then we have allocated + * more quota blocks than were reserved for the transaction. + * We must add that excess to the dquot reservation since it + * tracks (usage + resv) and by definition we didn't reserve + * that excess. + */ + res->reserved -= abs(reserved - res_used); + } else if (count_delta != 0) { + /* + * These blks were never reserved, either inside a transaction + * or outside one (in a delayed allocation). Also, this isn't + * always a negative number since we sometimes deliberately + * skip quota reservations. + */ + res->reserved += count_delta; + } +} /* * Called by xfs_trans_commit() and similar in spirit to @@ -327,6 +358,8 @@ xfs_trans_apply_dquot_deltas( xfs_trans_dqlockedjoin(tp, qa); for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + uint64_t blk_res_used; + qtrx = &qa[i]; /* * The array of dquots is filled @@ -396,71 +429,27 @@ xfs_trans_apply_dquot_deltas( * In case of delayed allocations, there's no * reservation that a transaction structure knows of. */ - if (qtrx->qt_blk_res != 0) { - uint64_t blk_res_used = 0; + blk_res_used = max_t(int64_t, 0, qtrx->qt_bcount_delta); + xfs_apply_quota_reservation_deltas(&dqp->q_blk, + qtrx->qt_blk_res, blk_res_used, + qtrx->qt_bcount_delta); - if (qtrx->qt_bcount_delta > 0) - blk_res_used = qtrx->qt_bcount_delta; - - if (qtrx->qt_blk_res != blk_res_used) { - if (qtrx->qt_blk_res > blk_res_used) - dqp->q_blk.reserved -= (xfs_qcnt_t) - (qtrx->qt_blk_res - - blk_res_used); - else - dqp->q_blk.reserved -= (xfs_qcnt_t) - (blk_res_used - - qtrx->qt_blk_res); - } - } else { - /* - * These blks were never reserved, either inside - * a transaction or outside one (in a delayed - * allocation). Also, this isn't always a - * negative number since we sometimes - * deliberately skip quota reservations. - */ - if (qtrx->qt_bcount_delta) { - dqp->q_blk.reserved += - (xfs_qcnt_t)qtrx->qt_bcount_delta; - } - } /* * Adjust the RT reservation.
*/ - if (qtrx->qt_rtblk_res != 0) { - if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) { - if (qtrx->qt_rtblk_res > - qtrx->qt_rtblk_res_used) - dqp->q_rtb.reserved -= (xfs_qcnt_t) - (qtrx->qt_rtblk_res - - qtrx->qt_rtblk_res_used); - else - dqp->q_rtb.reserved -= (xfs_qcnt_t) - (qtrx->qt_rtblk_res_used - - qtrx->qt_rtblk_res); - } - } else { - if (qtrx->qt_rtbcount_delta) - dqp->q_rtb.reserved += - (xfs_qcnt_t)qtrx->qt_rtbcount_delta; - } + xfs_apply_quota_reservation_deltas(&dqp->q_rtb, + qtrx->qt_rtblk_res, + qtrx->qt_rtblk_res_used, + qtrx->qt_rtbcount_delta); /* * Adjust the inode reservation. */ - if (qtrx->qt_ino_res != 0) { - ASSERT(qtrx->qt_ino_res >= - qtrx->qt_ino_res_used); - if (qtrx->qt_ino_res > qtrx->qt_ino_res_used) - dqp->q_ino.reserved -= (xfs_qcnt_t) - (qtrx->qt_ino_res - - qtrx->qt_ino_res_used); - } else { - if (qtrx->qt_icount_delta) - dqp->q_ino.reserved += - (xfs_qcnt_t)qtrx->qt_icount_delta; - } + ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); + xfs_apply_quota_reservation_deltas(&dqp->q_ino, + qtrx->qt_ino_res, + qtrx->qt_ino_res_used, + qtrx->qt_icount_delta); ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); From 12d720fb864547922ad4bf53120de72a91807999 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:34 -0700 Subject: [PATCH 075/117] xfs: assume the default quota limits are always set in xfs_qm_adjust_dqlimits We always initialize the default quota limits to something nowadays, so we don't need to check that the defaults are set to something before using them. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R --- fs/xfs/xfs_dquot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 98cb31f28aaf..8e84623cc331 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -77,21 +77,21 @@ xfs_qm_adjust_dqlimits( ASSERT(dq->q_id); defq = xfs_get_defquota(q, xfs_dquot_type(dq)); - if (defq->blk.soft && !dq->q_blk.softlimit) { + if (!dq->q_blk.softlimit) { dq->q_blk.softlimit = defq->blk.soft; prealloc = 1; } - if (defq->blk.hard && !dq->q_blk.hardlimit) { + if (!dq->q_blk.hardlimit) { dq->q_blk.hardlimit = defq->blk.hard; prealloc = 1; } - if (defq->ino.soft && !dq->q_ino.softlimit) + if (!dq->q_ino.softlimit) dq->q_ino.softlimit = defq->ino.soft; - if (defq->ino.hard && !dq->q_ino.hardlimit) + if (!dq->q_ino.hardlimit) dq->q_ino.hardlimit = defq->ino.hard; - if (defq->rtb.soft && !dq->q_rtb.softlimit) + if (!dq->q_rtb.softlimit) dq->q_rtb.softlimit = defq->rtb.soft; - if (defq->rtb.hard && !dq->q_rtb.hardlimit) + if (!dq->q_rtb.hardlimit) dq->q_rtb.hardlimit = defq->rtb.hard; if (prealloc) From 4b8628d57b725b32616965e66975fcdebe008fe7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:35 -0700 Subject: [PATCH 076/117] xfs: actually bump warning counts when we send warnings Currently, xfs quotas have the ability to send netlink warnings when a user exceeds the limits. They also have all the support code necessary to convert softlimit warnings into failures if the number of warnings exceeds a limit set by the administrator. Unfortunately, we never actually increase the warning counter, so this never actually happens. Make it so we actually do something useful with the warning counts. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R --- fs/xfs/xfs_trans_dquot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 510b9c3164d6..6be6287da1ac 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -589,6 +589,7 @@ xfs_dqresv_check( return QUOTA_NL_ISOFTLONGWARN; } + res->warnings++; return QUOTA_NL_ISOFTWARN; } From 2cb91bab4fa4effe56da1c7fe2fc5723c4935db1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Jul 2020 10:37:35 -0700 Subject: [PATCH 077/117] xfs: add more dquot tracepoints Add all the xfs_dquot fields to the tracepoint for that type; add a new tracepoint type for the qtrx structure (dquot transaction deltas); and use our new tracepoints. This makes it easier for the author to trace changes to dquot counters for debugging. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Collins Reviewed-by: Chandan Babu R Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trace.h | 140 ++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_trans_dquot.c | 21 ++++++ 2 files changed, 159 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f0c2bce69a36..81534095f52b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -36,6 +36,7 @@ struct xfs_owner_info; struct xfs_trans_res; struct xfs_inobt_rec_incore; union xfs_btree_ptr; +struct xfs_dqtrx; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -867,37 +868,59 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, __field(unsigned, flags) __field(unsigned, nrefs) __field(unsigned long long, res_bcount) + __field(unsigned long long, res_rtbcount) + __field(unsigned long long, res_icount) + __field(unsigned long long, bcount) + __field(unsigned long long, rtbcount) __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) __field(unsigned long long, blk_softlimit) + __field(unsigned long long, rtb_hardlimit) + __field(unsigned long long, rtb_softlimit) __field(unsigned long long, ino_hardlimit) __field(unsigned long long, ino_softlimit) - ), \ + ), TP_fast_assign( __entry->dev = dqp->q_mount->m_super->s_dev; __entry->id = dqp->q_id; __entry->flags = dqp->dq_flags | dqp->q_flags; __entry->nrefs = dqp->q_nrefs; + __entry->res_bcount = dqp->q_blk.reserved; + __entry->res_rtbcount = dqp->q_rtb.reserved; + __entry->res_icount = dqp->q_ino.reserved; + __entry->bcount = dqp->q_blk.count; + __entry->rtbcount = dqp->q_rtb.count; __entry->icount = dqp->q_ino.count; + __entry->blk_hardlimit = dqp->q_blk.hardlimit; __entry->blk_softlimit = dqp->q_blk.softlimit; + __entry->rtb_hardlimit = dqp->q_rtb.hardlimit; + __entry->rtb_softlimit = dqp->q_rtb.softlimit; __entry->ino_hardlimit = dqp->q_ino.hardlimit; __entry->ino_softlimit = dqp->q_ino.softlimit; ), - TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + TP_printk("dev %d:%d id 0x%x flags %s nrefs %u " + "res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx " "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx " "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->id, __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), __entry->nrefs, __entry->res_bcount, + __entry->res_rtbcount, + __entry->res_icount, __entry->bcount, __entry->blk_hardlimit, __entry->blk_softlimit, + __entry->rtbcount, + __entry->rtb_hardlimit, + __entry->rtb_softlimit, __entry->icount, __entry->ino_hardlimit, __entry->ino_softlimit) @@ -928,6 +951,119 @@ 
DEFINE_DQUOT_EVENT(xfs_dqrele); DEFINE_DQUOT_EVENT(xfs_dqflush); DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); +DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before); +DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after); + +#define XFS_QMOPT_FLAGS \ + { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ + { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ + { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ + { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ + { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ + { XFS_QMOPT_INHERIT, "INHERIT" }, \ + { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ + { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ + { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ + { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ + { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ + { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ + { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ + { XFS_QMOPT_RES_INOS, "RES_INOS" } + +TRACE_EVENT(xfs_trans_mod_dquot, + TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp, + unsigned int field, int64_t delta), + TP_ARGS(tp, dqp, field, delta), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, flags) + __field(unsigned int, dqid) + __field(unsigned int, field) + __field(int64_t, delta) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->flags = dqp->dq_flags | dqp->q_flags; + __entry->dqid = dqp->q_id; + __entry->field = field; + __entry->delta = delta; + ), + TP_printk("dev %d:%d dquot id 0x%x flags %s field %s delta %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dqid, + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), + __print_flags(__entry->field, "|", XFS_QMOPT_FLAGS), + __entry->delta) +); + +DECLARE_EVENT_CLASS(xfs_dqtrx_class, + TP_PROTO(struct xfs_dqtrx *qtrx), + TP_ARGS(qtrx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, flags) + __field(u32, dqid) + + __field(uint64_t, blk_res) + __field(int64_t, bcount_delta) + __field(int64_t, delbcnt_delta) + + __field(uint64_t, rtblk_res) + __field(uint64_t, rtblk_res_used) + __field(int64_t, rtbcount_delta) + __field(int64_t, delrtb_delta) + + __field(uint64_t, ino_res) + __field(uint64_t, ino_res_used) + __field(int64_t, icount_delta) + ), + TP_fast_assign( + __entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev; + __entry->flags = qtrx->qt_dquot->dq_flags | qtrx->qt_dquot->q_flags; + __entry->dqid = qtrx->qt_dquot->q_id; + + __entry->blk_res = qtrx->qt_blk_res; + __entry->bcount_delta = qtrx->qt_bcount_delta; + __entry->delbcnt_delta = qtrx->qt_delbcnt_delta; + + __entry->rtblk_res = qtrx->qt_rtblk_res; + __entry->rtblk_res_used = qtrx->qt_rtblk_res_used; + __entry->rtbcount_delta = qtrx->qt_rtbcount_delta; + __entry->delrtb_delta = qtrx->qt_delrtb_delta; + + __entry->ino_res = qtrx->qt_ino_res; + __entry->ino_res_used = qtrx->qt_ino_res_used; + __entry->icount_delta = qtrx->qt_icount_delta; + ), + TP_printk("dev %d:%d dquot id 0x%x flags %s" + "blk_res %llu bcount_delta %lld delbcnt_delta %lld " + "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld " + "ino_res %llu ino_res_used %llu icount_delta %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dqid, + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), + + __entry->blk_res, + __entry->bcount_delta, + __entry->delbcnt_delta, + + __entry->rtblk_res, + __entry->rtblk_res_used, + __entry->rtbcount_delta, + __entry->delrtb_delta, + + __entry->ino_res, + __entry->ino_res_used, + __entry->icount_delta) +) + +#define DEFINE_DQTRX_EVENT(name) \ +DEFINE_EVENT(xfs_dqtrx_class, name, \ + TP_PROTO(struct xfs_dqtrx *qtrx), \ + 
	TP_ARGS(qtrx))
+DEFINE_DQTRX_EVENT(xfs_trans_apply_dquot_deltas);
+DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_before);
+DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_after);

 DECLARE_EVENT_CLASS(xfs_loggrant_class,
 	TP_PROTO(struct xlog *log, struct xlog_ticket *tic),
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 6be6287da1ac..a8f480e5401f 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -15,6 +15,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_quota.h"
 #include "xfs_qm.h"
+#include "xfs_trace.h"

 STATIC void	xfs_trans_alloc_dqinfo(xfs_trans_t *);

@@ -203,6 +204,11 @@ xfs_trans_mod_dquot(
 	if (qtrx->qt_dquot == NULL)
 		qtrx->qt_dquot = dqp;

+	if (delta) {
+		trace_xfs_trans_mod_dquot_before(qtrx);
+		trace_xfs_trans_mod_dquot(tp, dqp, field, delta);
+	}
+
 	switch (field) {

 	/*
@@ -266,6 +272,10 @@ xfs_trans_mod_dquot(
 	default:
 		ASSERT(0);
 	}
+
+	if (delta)
+		trace_xfs_trans_mod_dquot_after(qtrx);
+
 	tp->t_flags |= XFS_TRANS_DQ_DIRTY;
 }

@@ -391,6 +401,13 @@ xfs_trans_apply_dquot_deltas(
 				qtrx->qt_delbcnt_delta;
 			totalrtbdelta = qtrx->qt_rtbcount_delta +
 				qtrx->qt_delrtb_delta;
+
+			if (totalbdelta != 0 || totalrtbdelta != 0 ||
+			    qtrx->qt_icount_delta != 0) {
+				trace_xfs_trans_apply_dquot_deltas_before(dqp);
+				trace_xfs_trans_apply_dquot_deltas(qtrx);
+			}
+
 #ifdef DEBUG
 			if (totalbdelta < 0)
 				ASSERT(dqp->q_blk.count >= -totalbdelta);
@@ -410,6 +427,10 @@ xfs_trans_apply_dquot_deltas(
 			if (totalrtbdelta)
 				dqp->q_rtb.count += totalrtbdelta;

+			if (totalbdelta != 0 || totalrtbdelta != 0 ||
+			    qtrx->qt_icount_delta != 0)
+				trace_xfs_trans_apply_dquot_deltas_after(dqp);
+
 			/*
 			 * Get any default limits in use.
 			 * Start/reset the timer(s) if needed.

From f9751c4ad3d17fa93773c187732f10c8a49940e3 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Wed, 15 Jul 2020 17:41:24 -0700
Subject: [PATCH 078/117] xfs: drop the type parameter from xfs_dquot_verify

xfs_qm_reset_dqcounts (aka quotacheck) is the only xfs_dqblk_verify
caller that actually knows the specific quota type that it's looking
for. Since everything else just passes in type==0 (including the
buffer verifier), drop the parameter and open-code the check like
xfs_dquot_from_disk already does.
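As a standalone sketch of the check being open-coded (the constants and
helper names below are simplified stand-ins, not the kernel code): the
verifier keeps rejecting any record whose type is not exactly one of
the three known types, while quotacheck alone also compares the record
against the one type it expects to find.

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	#define DQ_USER		0x01	/* stand-ins for the on-disk type flags */
	#define DQ_PROJ		0x02
	#define DQ_GROUP	0x04
	#define DQ_ALLTYPES	(DQ_USER | DQ_PROJ | DQ_GROUP)

	/* Verifier rule: d_flags must be exactly one known type. */
	static bool dquot_type_ok(uint8_t d_flags)
	{
		return d_flags == DQ_USER || d_flags == DQ_PROJ ||
		       d_flags == DQ_GROUP;
	}

	/* Quotacheck rule: the record must also be of the expected type. */
	static bool dquot_type_matches(uint8_t d_flags, uint8_t expected)
	{
		return (d_flags & DQ_ALLTYPES) == expected;
	}

	int main(void)
	{
		assert(dquot_type_ok(DQ_GROUP));
		assert(!dquot_type_ok(DQ_USER | DQ_GROUP));	/* not one type */
		assert(!dquot_type_matches(DQ_USER, DQ_GROUP));	/* repair it */
		return 0;
	}

Signed-off-by: Darrick J.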
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dquot_buf.c | 12 ++++-------- fs/xfs/libxfs/xfs_quota_defs.h | 4 ++-- fs/xfs/xfs_buf_item_recover.c | 3 +-- fs/xfs/xfs_dquot_item_recover.c | 2 +- fs/xfs/xfs_qm.c | 5 ++--- 5 files changed, 10 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index bedc1e752b60..eb2412e13f30 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -37,8 +37,7 @@ xfs_failaddr_t xfs_dquot_verify( struct xfs_mount *mp, struct xfs_disk_dquot *ddq, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + xfs_dqid_t id) /* used only during quotacheck */ { /* * We can encounter an uninitialized dquot buffer for 2 reasons: @@ -60,8 +59,6 @@ xfs_dquot_verify( if (ddq->d_version != XFS_DQUOT_VERSION) return __this_address; - if (type && ddq->d_flags != type) - return __this_address; if (ddq->d_flags != XFS_DQ_USER && ddq->d_flags != XFS_DQ_PROJ && ddq->d_flags != XFS_DQ_GROUP) @@ -95,14 +92,13 @@ xfs_failaddr_t xfs_dqblk_verify( struct xfs_mount *mp, struct xfs_dqblk *dqb, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + xfs_dqid_t id) /* used only during quotacheck */ { if (xfs_sb_version_hascrc(&mp->m_sb) && !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type); + return xfs_dquot_verify(mp, &dqb->dd_diskdq, id); } /* @@ -205,7 +201,7 @@ xfs_dquot_buf_verify( if (i == 0) id = be32_to_cpu(ddq->d_id); - fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0); + fa = xfs_dqblk_verify(mp, &dqb[i], id + i); if (fa) { if (!readahead) xfs_buf_verifier_error(bp, -EFSCORRUPTED, diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index e2da08055e6b..d2245f375719 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -137,9 +137,9 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, - struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type); + struct xfs_disk_dquot *ddq, xfs_dqid_t id); extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, - struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); + struct xfs_dqblk *dqb, xfs_dqid_t id); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 74c851f60eee..8bee582cf66a 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -493,8 +493,7 @@ xlog_recover_do_reg_buffer( item->ri_buf[i].i_len, __func__); goto next; } - fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, - -1, 0); + fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1); if (fa) { xfs_alert(mp, "dquot corrupt at %pS trying to replay into block 0x%llx", diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index f9ea9f55aa7c..9f64162ca300 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -108,7 +108,7 @@ xlog_recover_dquot_commit_pass2( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", dq_f->qlf_id, fa); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 
7d83c1623cb2..bf94c1bbda16 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -830,7 +830,6 @@ xfs_qm_reset_dqcounts( { struct xfs_dqblk *dqb; int j; - xfs_failaddr_t fa; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -855,8 +854,8 @@ xfs_qm_reset_dqcounts( * find uninitialised dquot blks. See comment in * xfs_dquot_verify. */ - fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type); - if (fa) + if (xfs_dqblk_verify(mp, &dqb[j], id + j) || + (dqb[j].dd_diskdq.d_flags & XFS_DQ_ALLTYPES) != type) xfs_dqblk_repair(mp, &dqb[j], id + j, type); /* From 8cd4901da56caadc16b4e8d6b434291a8ce31d7c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 15 Jul 2020 17:42:36 -0700 Subject: [PATCH 079/117] xfs: rename XFS_DQ_{USER,GROUP,PROJ} to XFS_DQTYPE_* We're going to split up the incore dquot state flags from the ondisk dquot flags (eventually renaming this "type") so start by renaming the three flags and the bitmask that are going to participate in this. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dquot_buf.c | 6 ++-- fs/xfs/libxfs/xfs_format.h | 2 +- fs/xfs/libxfs/xfs_quota_defs.h | 16 +++++----- fs/xfs/scrub/quota.c | 6 ++-- fs/xfs/scrub/repair.c | 6 ++-- fs/xfs/xfs_buf_item_recover.c | 6 ++-- fs/xfs/xfs_dquot.c | 36 +++++++++++------------ fs/xfs/xfs_dquot.h | 22 +++++++------- fs/xfs/xfs_dquot_item_recover.c | 10 +++---- fs/xfs/xfs_icache.c | 4 +-- fs/xfs/xfs_iomap.c | 12 ++++---- fs/xfs/xfs_qm.c | 52 ++++++++++++++++----------------- fs/xfs/xfs_qm.h | 26 ++++++++--------- fs/xfs/xfs_qm_bhv.c | 2 +- fs/xfs/xfs_qm_syscalls.c | 12 ++++---- fs/xfs/xfs_quota.h | 6 ++-- fs/xfs/xfs_quotaops.c | 6 ++-- fs/xfs/xfs_trans_dquot.c | 4 +-- 18 files changed, 118 insertions(+), 116 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index eb2412e13f30..450147df3042 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -59,9 +59,9 @@ xfs_dquot_verify( if (ddq->d_version != XFS_DQUOT_VERSION) return __this_address; - if (ddq->d_flags != XFS_DQ_USER && - ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) + if (ddq->d_flags != XFS_DQTYPE_USER && + ddq->d_flags != XFS_DQTYPE_PROJ && + ddq->d_flags != XFS_DQTYPE_GROUP) return __this_address; if (id != -1 && id != be32_to_cpu(ddq->d_id)) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index a534ebee92b9..5d5e0f5eda97 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1157,7 +1157,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) struct xfs_disk_dquot { __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ __u8 d_version; /* dquot version */ - __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __u8 d_flags; /* XFS_DQTYPE_USER/PROJ/GROUP */ __be32 d_id; /* user,project,group id */ __be64 d_blk_hardlimit;/* absolute limit on disk blks */ __be64 d_blk_softlimit;/* preferred limit on disk blks */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index d2245f375719..baf6c4ad88af 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -21,18 +21,20 @@ typedef uint16_t xfs_qwarncnt_t; /* * flags for q_flags field in the dquot. 
*/ -#define XFS_DQ_USER 0x0001 /* a user quota */ -#define XFS_DQ_PROJ 0x0002 /* project quota */ -#define XFS_DQ_GROUP 0x0004 /* a group quota */ +#define XFS_DQTYPE_USER 0x0001 /* a user quota */ +#define XFS_DQTYPE_PROJ 0x0002 /* project quota */ +#define XFS_DQTYPE_GROUP 0x0004 /* a group quota */ #define XFS_DQFLAG_DIRTY 0x0008 /* dquot is dirty */ #define XFS_DQFLAG_FREEING 0x0010 /* dquot is being torn down */ -#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) +#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ + XFS_DQTYPE_PROJ | \ + XFS_DQTYPE_GROUP) #define XFS_DQFLAG_STRINGS \ - { XFS_DQ_USER, "USER" }, \ - { XFS_DQ_PROJ, "PROJ" }, \ - { XFS_DQ_GROUP, "GROUP" }, \ + { XFS_DQTYPE_USER, "USER" }, \ + { XFS_DQTYPE_PROJ, "PROJ" }, \ + { XFS_DQTYPE_GROUP, "GROUP" }, \ { XFS_DQFLAG_DIRTY, "DIRTY" }, \ { XFS_DQFLAG_FREEING, "FREEING" } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index f4aad5b00188..1db07485f148 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -24,11 +24,11 @@ xchk_quota_to_dqtype( { switch (sc->sm->sm_type) { case XFS_SCRUB_TYPE_UQUOTA: - return XFS_DQ_USER; + return XFS_DQTYPE_USER; case XFS_SCRUB_TYPE_GQUOTA: - return XFS_DQ_GROUP; + return XFS_DQTYPE_GROUP; case XFS_SCRUB_TYPE_PQUOTA: - return XFS_DQ_PROJ; + return XFS_DQTYPE_PROJ; default: return 0; } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index db3cfd12803d..074651896586 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -939,11 +939,11 @@ xrep_ino_dqattach( "inode %llu repair encountered quota error %d, quotacheck forced.", (unsigned long long)sc->ip->i_ino, error); if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot) - xrep_force_quotacheck(sc, XFS_DQ_USER); + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot) - xrep_force_quotacheck(sc, XFS_DQ_GROUP); + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) - xrep_force_quotacheck(sc, XFS_DQ_PROJ); + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); /* fall through */ case -ESRCH: error = 0; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 8bee582cf66a..d480f11e6b00 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -546,11 +546,11 @@ xlog_recover_do_dquot_buffer( type = 0; if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) - type |= XFS_DQ_USER; + type |= XFS_DQTYPE_USER; if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) - type |= XFS_DQ_PROJ; + type |= XFS_DQTYPE_PROJ; if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) - type |= XFS_DQ_GROUP; + type |= XFS_DQTYPE_GROUP; /* * This type of quotas was turned off, so ignore this buffer */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 8e84623cc331..4053e7e390f1 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -190,10 +190,10 @@ xfs_qm_init_dquot_blk( } } - if (type & XFS_DQ_USER) { + if (type & XFS_DQTYPE_USER) { qflag = XFS_UQUOTA_CHKD; blftype = XFS_BLF_UDQUOT_BUF; - } else if (type & XFS_DQ_PROJ) { + } else if (type & XFS_DQTYPE_PROJ) { qflag = XFS_PQUOTA_CHKD; blftype = XFS_BLF_PDQUOT_BUF; } else { @@ -311,7 +311,7 @@ xfs_dquot_disk_alloc( * the entire thing. */ xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, - dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + dqp->dq_flags & XFS_DQTYPE_REC_MASK, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* @@ -448,13 +448,13 @@ xfs_dquot_alloc( * quotas. 
*/ switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: /* uses the default lock class */ break; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); break; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); break; default: @@ -480,7 +480,7 @@ xfs_dquot_from_disk( * Ensure that we got the type and ID we were looking for. * Everything else was checked by the dquot buffer verifier. */ - if ((ddqp->d_flags & XFS_DQ_ALLTYPES) != dqp->dq_flags || + if ((ddqp->d_flags & XFS_DQTYPE_REC_MASK) != dqp->dq_flags || be32_to_cpu(ddqp->d_id) != dqp->q_id) { xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, quota %u", @@ -530,7 +530,7 @@ xfs_dquot_to_disk( { ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); ddqp->d_version = XFS_DQUOT_VERSION; - ddqp->d_flags = dqp->dq_flags & XFS_DQ_ALLTYPES; + ddqp->d_flags = dqp->dq_flags & XFS_DQTYPE_REC_MASK; ddqp->d_id = cpu_to_be32(dqp->q_id); ddqp->d_pad0 = 0; ddqp->d_pad = 0; @@ -779,15 +779,15 @@ xfs_qm_dqget_checks( return -ESRCH; switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: if (!XFS_IS_UQUOTA_ON(mp)) return -ESRCH; return 0; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: if (!XFS_IS_GQUOTA_ON(mp)) return -ESRCH; return 0; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: if (!XFS_IS_PQUOTA_ON(mp)) return -ESRCH; return 0; @@ -874,11 +874,11 @@ xfs_qm_id_for_quotatype( uint type) { switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: return i_uid_read(VFS_I(ip)); - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return i_gid_read(VFS_I(ip)); - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return ip->i_d.di_projid; } ASSERT(0); @@ -1114,11 +1114,11 @@ static xfs_failaddr_t xfs_qm_dqflush_check( struct xfs_dquot *dqp) { - __u8 type = dqp->dq_flags & XFS_DQ_ALLTYPES; + __u8 type = dqp->dq_flags & XFS_DQTYPE_REC_MASK; - if (type != XFS_DQ_USER && - type != XFS_DQ_GROUP && - type != XFS_DQ_PROJ) + if (type != XFS_DQTYPE_USER && + type != XFS_DQTYPE_GROUP && + type != XFS_DQTYPE_PROJ) return __this_address; if (dqp->q_id == 0) diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 9e44da522684..17a21677723f 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -130,12 +130,12 @@ static inline void xfs_dqunlock(struct xfs_dquot *dqp) static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) { - switch (type & XFS_DQ_ALLTYPES) { - case XFS_DQ_USER: + switch (type & XFS_DQTYPE_REC_MASK) { + case XFS_DQTYPE_USER: return XFS_IS_UQUOTA_ON(mp); - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return XFS_IS_GQUOTA_ON(mp); - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return XFS_IS_PQUOTA_ON(mp); default: return 0; @@ -144,12 +144,12 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type) { - switch (type & XFS_DQ_ALLTYPES) { - case XFS_DQ_USER: + switch (type & XFS_DQTYPE_REC_MASK) { + case XFS_DQTYPE_USER: return ip->i_udquot; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return ip->i_gdquot; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return ip->i_pdquot; default: return NULL; @@ -175,9 +175,9 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) -#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) -#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & 
XFS_DQ_PROJ) -#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) +#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQTYPE_USER) +#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQTYPE_PROJ) +#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQTYPE_GROUP) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 9f64162ca300..d7eb85c7d394 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -39,7 +39,7 @@ xlog_recover_dquot_ra_pass2( if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) return; - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + type = recddq->d_flags & (XFS_DQTYPE_USER | XFS_DQTYPE_PROJ | XFS_DQTYPE_GROUP); ASSERT(type); if (log->l_quotaoffs_flag & type) return; @@ -91,7 +91,7 @@ xlog_recover_dquot_commit_pass2( /* * This type of quotas was turned off, so ignore this record. */ - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + type = recddq->d_flags & (XFS_DQTYPE_USER | XFS_DQTYPE_PROJ | XFS_DQTYPE_GROUP); ASSERT(type); if (log->l_quotaoffs_flag & type) return 0; @@ -185,11 +185,11 @@ xlog_recover_quotaoff_commit_pass1( * group/project quotaoff or both. */ if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_USER; + log->l_quotaoffs_flag |= XFS_DQTYPE_USER; if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_PROJ; + log->l_quotaoffs_flag |= XFS_DQTYPE_PROJ; if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_GROUP; + log->l_quotaoffs_flag |= XFS_DQTYPE_GROUP; return 0; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 58a750ce689c..3c6e936d2f99 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1424,7 +1424,7 @@ __xfs_inode_free_quota_eofblocks( eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQ_USER); + dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); if (dq && xfs_dquot_lowsp(dq)) { eofb.eof_uid = VFS_I(ip)->i_uid; eofb.eof_flags |= XFS_EOF_FLAGS_UID; @@ -1433,7 +1433,7 @@ __xfs_inode_free_quota_eofblocks( } if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); + dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP); if (dq && xfs_dquot_lowsp(dq)) { eofb.eof_gid = VFS_I(ip)->i_gid; eofb.eof_flags |= XFS_EOF_FLAGS_GID; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f60a6e44363b..d3dc4106a35c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -450,14 +450,14 @@ xfs_iomap_prealloc_size( * Check each quota to cap the prealloc size, provide a shift value to * throttle with and adjust amount of available space. 
*/ - if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift, + if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift, &freesp); - if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift, + if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift, &freesp); - if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift, + if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift, &freesp); /* diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index bf94c1bbda16..47d4b6937c84 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -189,11 +189,11 @@ xfs_qm_dqpurge_all( uint flags) { if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_GQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL); } /* @@ -331,7 +331,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)), - XFS_DQ_USER, doalloc, &ip->i_udquot); + XFS_DQTYPE_USER, doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); @@ -339,14 +339,14 @@ xfs_qm_dqattach_locked( if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)), - XFS_DQ_GROUP, doalloc, &ip->i_gdquot); + XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, + error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQTYPE_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -664,16 +664,16 @@ xfs_qm_init_quotainfo( mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); - xfs_qm_init_timelimits(mp, XFS_DQ_USER); - xfs_qm_init_timelimits(mp, XFS_DQ_GROUP); - xfs_qm_init_timelimits(mp, XFS_DQ_PROJ); + xfs_qm_init_timelimits(mp, XFS_DQTYPE_USER); + xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP); + xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ); if (XFS_IS_UQUOTA_RUNNING(mp)) - xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); + xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf); if (XFS_IS_GQUOTA_RUNNING(mp)) - xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf); + xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf); if (XFS_IS_PQUOTA_RUNNING(mp)) - xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf); + xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf); qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; @@ -855,7 +855,7 @@ xfs_qm_reset_dqcounts( * xfs_dquot_verify. */ if (xfs_dqblk_verify(mp, &dqb[j], id + j) || - (dqb[j].dd_diskdq.d_flags & XFS_DQ_ALLTYPES) != type) + (dqb[j].dd_diskdq.d_flags & XFS_DQTYPE_REC_MASK) != type) xfs_dqblk_repair(mp, &dqb[j], id + j, type); /* @@ -1176,21 +1176,21 @@ xfs_qm_dqusage_adjust( * and quotaoffs don't race. (Quotachecks happen at mount time only). 
*/ if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks, + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_USER, nblks, rtblks); if (error) goto error0; } if (XFS_IS_GQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks, + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_GROUP, nblks, rtblks); if (error) goto error0; } if (XFS_IS_PQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks, + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_PROJ, nblks, rtblks); if (error) goto error0; @@ -1281,7 +1281,7 @@ xfs_qm_quotacheck( * We don't log our changes till later. */ if (uip) { - error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_DQ_USER, + error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_DQTYPE_USER, &buffer_list); if (error) goto error_return; @@ -1289,7 +1289,7 @@ xfs_qm_quotacheck( } if (gip) { - error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_DQ_GROUP, + error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_DQTYPE_GROUP, &buffer_list); if (error) goto error_return; @@ -1297,7 +1297,7 @@ xfs_qm_quotacheck( } if (pip) { - error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_DQ_PROJ, + error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_DQTYPE_PROJ, &buffer_list); if (error) goto error_return; @@ -1314,17 +1314,17 @@ xfs_qm_quotacheck( * down to disk buffers if everything was updated successfully. */ if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, + error = xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_flush_one, &buffer_list); } if (XFS_IS_GQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, + error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_flush_one, &buffer_list); if (!error) error = error2; } if (XFS_IS_PQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, + error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_flush_one, &buffer_list); if (!error) error = error2; @@ -1662,7 +1662,7 @@ xfs_qm_vop_dqalloc( */ xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kuid(user_ns, uid), - XFS_DQ_USER, true, &uq); + XFS_DQTYPE_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; @@ -1686,7 +1686,7 @@ xfs_qm_vop_dqalloc( if (!gid_eq(inode->i_gid, gid)) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), - XFS_DQ_GROUP, true, &gq); + XFS_DQTYPE_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1702,8 +1702,8 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (ip->i_d.di_projid != prid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, - true, &pq); + error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, + XFS_DQTYPE_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 11c28ff0298c..21bc67d4962c 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -73,11 +73,11 @@ xfs_dquot_tree( int type) { switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: return &qi->qi_uquota_tree; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return &qi->qi_gquota_tree; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return &qi->qi_pquota_tree; default: ASSERT(0); @@ -88,12 +88,12 @@ xfs_dquot_tree( static inline struct xfs_inode * xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) { - switch (dq_flags & XFS_DQ_ALLTYPES) { - case XFS_DQ_USER: + switch (dq_flags & XFS_DQTYPE_REC_MASK) { + case XFS_DQTYPE_USER: return mp->m_quotainfo->qi_uquotaip; - 
case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return mp->m_quotainfo->qi_gquotaip; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return mp->m_quotainfo->qi_pquotaip; default: ASSERT(0); @@ -105,11 +105,11 @@ static inline int xfs_dquot_type(struct xfs_dquot *dqp) { if (XFS_QM_ISUDQ(dqp)) - return XFS_DQ_USER; + return XFS_DQTYPE_USER; if (XFS_QM_ISGDQ(dqp)) - return XFS_DQ_GROUP; + return XFS_DQTYPE_GROUP; ASSERT(XFS_QM_ISPDQ(dqp)); - return XFS_DQ_PROJ; + return XFS_DQTYPE_PROJ; } extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp, @@ -166,11 +166,11 @@ static inline struct xfs_def_quota * xfs_get_defquota(struct xfs_quotainfo *qi, int type) { switch (type) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: return &qi->qi_usr_default; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return &qi->qi_grp_default; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return &qi->qi_prj_default; default: ASSERT(0); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 0993217e5ac8..639398091ad6 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -60,7 +60,7 @@ xfs_qm_statvfs( struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *dqp; - if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) { + if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQTYPE_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index cbe352187d32..119c3d7d5f51 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -660,18 +660,18 @@ xfs_qm_scall_getquota_fill_qc( * gets turned off. No need to confuse the user level code, * so return zeroes in that case. */ - if ((!XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || - (!XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || - (!XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) { + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_USER) || + (!XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_GROUP) || + (!XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_PROJ)) { dst->d_spc_timer = 0; dst->d_ino_timer = 0; dst->d_rt_spc_timer = 0; } #ifdef DEBUG - if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || - (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || - (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && + if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_USER) || + (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_GROUP) || + (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_PROJ)) && dqp->q_id != 0) { if ((dst->d_space > dst->d_spc_softlimit) && (dst->d_spc_softlimit > 0)) { diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index c92ae5e02ce8..0ae35fb5cb89 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -42,11 +42,11 @@ xfs_quota_chkd_flag( uint dqtype) { switch (dqtype) { - case XFS_DQ_USER: + case XFS_DQTYPE_USER: return XFS_UQUOTA_CHKD; - case XFS_DQ_GROUP: + case XFS_DQTYPE_GROUP: return XFS_GQUOTA_CHKD; - case XFS_DQ_PROJ: + case XFS_DQTYPE_PROJ: return XFS_PQUOTA_CHKD; default: return 0; diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 299695a068f3..ba69906edecf 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -90,11 +90,11 @@ xfs_quota_type(int type) { switch (type) { case USRQUOTA: - return XFS_DQ_USER; + return XFS_DQTYPE_USER; case GRPQUOTA: - return XFS_DQ_GROUP; + return XFS_DQTYPE_GROUP; default: - return XFS_DQ_PROJ; + return XFS_DQTYPE_PROJ; } } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index a8f480e5401f..ea61e279f831 
100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -553,9 +553,9 @@ xfs_quota_warn(
 {
 	enum quota_type qtype;

-	if (dqp->dq_flags & XFS_DQ_PROJ)
+	if (dqp->dq_flags & XFS_DQTYPE_PROJ)
 		qtype = PRJQUOTA;
-	else if (dqp->dq_flags & XFS_DQ_USER)
+	else if (dqp->dq_flags & XFS_DQTYPE_USER)
 		qtype = USRQUOTA;
 	else
 		qtype = GRPQUOTA;

From dbcbc7b90e8a43aa00412bfa601a2d110a29086a Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Wed, 15 Jul 2020 17:48:31 -0700
Subject: [PATCH 080/117] xfs: refactor testing if a particular dquot is being
 enforced

Create a small helper to test if enforcement is enabled for a given
incore dquot and replace the open-coded logic.

Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
Reviewed-by: Christoph Hellwig
---
 fs/xfs/xfs_dquot.h       | 17 +++++++++++++++++
 fs/xfs/xfs_qm_syscalls.c |  9 ++-------
 fs/xfs/xfs_trans_dquot.c |  4 +---
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 17a21677723f..fcf9bd676615 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -156,6 +156,23 @@ static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type)
 	}
 }

+/* Decide if the dquot's limits are actually being enforced. */
+static inline bool
+xfs_dquot_is_enforced(
+	const struct xfs_dquot	*dqp)
+{
+	switch (dqp->dq_flags & XFS_DQTYPE_REC_MASK) {
+	case XFS_DQTYPE_USER:
+		return XFS_IS_UQUOTA_ENFORCED(dqp->q_mount);
+	case XFS_DQTYPE_GROUP:
+		return XFS_IS_GQUOTA_ENFORCED(dqp->q_mount);
+	case XFS_DQTYPE_PROJ:
+		return XFS_IS_PQUOTA_ENFORCED(dqp->q_mount);
+	}
+	ASSERT(0);
+	return false;
+}
+
 /*
  * Check whether a dquot is under low free space conditions. We assume the quota
  * is enabled and enforced.
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 119c3d7d5f51..f7dbc702e4d6 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -660,19 +660,14 @@ xfs_qm_scall_getquota_fill_qc(
 	 * gets turned off. No need to confuse the user level code,
 	 * so return zeroes in that case.
 	 */
-	if ((!XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_USER) ||
-	    (!XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_GROUP) ||
-	    (!XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_PROJ)) {
+	if (!xfs_dquot_is_enforced(dqp)) {
 		dst->d_spc_timer = 0;
 		dst->d_ino_timer = 0;
 		dst->d_rt_spc_timer = 0;
 	}

 #ifdef DEBUG
-	if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_USER) ||
-	     (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_GROUP) ||
-	     (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQTYPE_PROJ)) &&
-	    dqp->q_id != 0) {
+	if (xfs_dquot_is_enforced(dqp) && dqp->q_id != 0) {
 		if ((dst->d_space > dst->d_spc_softlimit) &&
 		    (dst->d_spc_softlimit > 0)) {
 			ASSERT(dst->d_spc_timer != 0);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index ea61e279f831..d7d710d25bbd 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -650,9 +650,7 @@ xfs_trans_dqresv(
 	}
 	if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id &&
-	    ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
-	     (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
-	     (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
+	    xfs_dquot_is_enforced(dqp)) {
 		int		quota_nl;
 		bool		fatal;

From 00a342e4965ee91fff06e01949fa27db53c0bb1c Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Wed, 15 Jul 2020 17:47:13 -0700
Subject: [PATCH 081/117] xfs: remove the XFS_QM_IS[UGP]DQ macros

Remove these macros and use xfs_dquot_type() for everything.
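For illustration, a compact standalone sketch of the pattern this
enables (simplified types and names, not the kernel definitions):
instead of three boolean test macros, callers extract the type once
with an accessor and dispatch on it in a switch, so every user of the
type handles the three cases explicitly.

	#include <stdio.h>

	enum dqtype { DQ_USER, DQ_GROUP, DQ_PROJ };

	struct dquot { enum dqtype type; };

	/* One accessor replaces the ISUDQ/ISGDQ/ISPDQ macro trio. */
	static enum dqtype dquot_type(const struct dquot *dqp)
	{
		return dqp->type;
	}

	static const char *dquot_tree_name(const struct dquot *dqp)
	{
		switch (dquot_type(dqp)) {
		case DQ_USER:	return "uquota";
		case DQ_GROUP:	return "gquota";
		case DQ_PROJ:	return "pquota";
		}
		return "?";
	}

	int main(void)
	{
		struct dquot dq = { .type = DQ_PROJ };

		printf("%s\n", dquot_tree_name(&dq));	/* pquota */
		return 0;
	}

Signed-off-by: Darrick J.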
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.h | 9 ++++++--- fs/xfs/xfs_qm.h | 11 ----------- fs/xfs/xfs_trans_dquot.c | 15 ++++++++++----- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index fcf9bd676615..60bccb5f7435 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -128,6 +128,12 @@ static inline void xfs_dqunlock(struct xfs_dquot *dqp) mutex_unlock(&dqp->q_qlock); } +static inline int +xfs_dquot_type(const struct xfs_dquot *dqp) +{ + return dqp->dq_flags & XFS_DQTYPE_REC_MASK; +} + static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) { switch (type & XFS_DQTYPE_REC_MASK) { @@ -192,9 +198,6 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) -#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQTYPE_USER) -#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQTYPE_PROJ) -#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQTYPE_GROUP) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 21bc67d4962c..f04af35349d7 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -101,17 +101,6 @@ xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) return NULL; } -static inline int -xfs_dquot_type(struct xfs_dquot *dqp) -{ - if (XFS_QM_ISUDQ(dqp)) - return XFS_DQTYPE_USER; - if (XFS_QM_ISGDQ(dqp)) - return XFS_DQTYPE_GROUP; - ASSERT(XFS_QM_ISPDQ(dqp)); - return XFS_DQTYPE_PROJ; -} - extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp, uint field, int64_t delta); extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index d7d710d25bbd..19d3e283aafa 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -156,14 +156,19 @@ xfs_trans_get_dqtrx( int i; struct xfs_dqtrx *qa; - if (XFS_QM_ISUDQ(dqp)) + switch (xfs_dquot_type(dqp)) { + case XFS_DQTYPE_USER: qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR]; - else if (XFS_QM_ISGDQ(dqp)) + break; + case XFS_DQTYPE_GROUP: qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP]; - else if (XFS_QM_ISPDQ(dqp)) + break; + case XFS_DQTYPE_PROJ: qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ]; - else + break; + default: return NULL; + } for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (qa[i].qt_dquot == NULL || @@ -713,7 +718,7 @@ xfs_trans_dqresv( error_return: xfs_dqunlock(dqp); - if (XFS_QM_ISPDQ(dqp)) + if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ) return -ENOSPC; return -EDQUOT; } From e6eb603c7e4f39cbd41042c7ceb7141dffc08096 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 15 Jul 2020 17:50:57 -0700 Subject: [PATCH 082/117] xfs: refactor quota type testing Certain functions can only act upon one quota type, so refactor those functions to use switch statements, in keeping with all the other high level xfs quota api calls. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 29 ++++++++++++++++++----------- fs/xfs/xfs_trans_dquot.c | 15 +++++++++++---- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 4053e7e390f1..ce946d53bb61 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -171,6 +171,24 @@ xfs_qm_init_dquot_blk( ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); + switch (type) { + case XFS_DQTYPE_USER: + qflag = XFS_UQUOTA_CHKD; + blftype = XFS_BLF_UDQUOT_BUF; + break; + case XFS_DQTYPE_PROJ: + qflag = XFS_PQUOTA_CHKD; + blftype = XFS_BLF_PDQUOT_BUF; + break; + case XFS_DQTYPE_GROUP: + qflag = XFS_GQUOTA_CHKD; + blftype = XFS_BLF_GDQUOT_BUF; + break; + default: + ASSERT(0); + return; + } + d = bp->b_addr; /* @@ -190,17 +208,6 @@ xfs_qm_init_dquot_blk( } } - if (type & XFS_DQTYPE_USER) { - qflag = XFS_UQUOTA_CHKD; - blftype = XFS_BLF_UDQUOT_BUF; - } else if (type & XFS_DQTYPE_PROJ) { - qflag = XFS_PQUOTA_CHKD; - blftype = XFS_BLF_PDQUOT_BUF; - } else { - qflag = XFS_GQUOTA_CHKD; - blftype = XFS_BLF_GDQUOT_BUF; - } - xfs_trans_dquot_buf(tp, bp, blftype); /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 19d3e283aafa..518cf0347891 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -556,14 +556,21 @@ xfs_quota_warn( struct xfs_dquot *dqp, int type) { - enum quota_type qtype; + enum quota_type qtype; - if (dqp->dq_flags & XFS_DQTYPE_PROJ) + switch (xfs_dquot_type(dqp)) { + case XFS_DQTYPE_PROJ: qtype = PRJQUOTA; - else if (dqp->dq_flags & XFS_DQTYPE_USER) + break; + case XFS_DQTYPE_USER: qtype = USRQUOTA; - else + break; + case XFS_DQTYPE_GROUP: qtype = GRPQUOTA; + break; + default: + return; + } quota_send_warning(make_kqid(&init_user_ns, qtype, dqp->q_id), mp->m_super->s_dev, type); From 0b04dd5d7ca79ccb88e258641b328fe4f548272a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 15 Jul 2020 17:51:47 -0700 Subject: [PATCH 083/117] xfs: always use xfs_dquot_type when extracting type from a dquot Always use the xfs_dquot_type helper to extract the quota type from an incore dquot. This moves responsibility for filtering internal state information and whatnot to anybody passing around a struct xfs_dquot. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 15 ++++++++------- fs/xfs/xfs_dquot.h | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index ce946d53bb61..5a60238fcdba 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -273,14 +273,15 @@ xfs_dquot_disk_alloc( struct xfs_trans *tp = *tpp; struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp; - struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); + uint qtype = xfs_dquot_type(dqp); + struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); int nmaps = 1; int error; trace_xfs_dqalloc(dqp); xfs_ilock(quotip, XFS_ILOCK_EXCL); - if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + if (!xfs_this_quota_on(dqp->q_mount, qtype)) { /* * Return if this type of quotas is turned off while we didn't * have an inode lock @@ -317,8 +318,7 @@ xfs_dquot_disk_alloc( * Make a chunk of dquots out of this buffer and log * the entire thing. 
*/ - xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, - dqp->dq_flags & XFS_DQTYPE_REC_MASK, bp); + xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* @@ -365,13 +365,14 @@ xfs_dquot_disk_read( { struct xfs_bmbt_irec map; struct xfs_buf *bp; - struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); + uint qtype = xfs_dquot_type(dqp); + struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); uint lock_mode; int nmaps = 1; int error; lock_mode = xfs_ilock_data_map_shared(quotip); - if (!xfs_this_quota_on(mp, dqp->dq_flags)) { + if (!xfs_this_quota_on(mp, qtype)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. @@ -487,7 +488,7 @@ xfs_dquot_from_disk( * Ensure that we got the type and ID we were looking for. * Everything else was checked by the dquot buffer verifier. */ - if ((ddqp->d_flags & XFS_DQTYPE_REC_MASK) != dqp->dq_flags || + if ((ddqp->d_flags & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) || be32_to_cpu(ddqp->d_id) != dqp->q_id) { xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, quota %u", diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 60bccb5f7435..07e18ce33560 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -167,7 +167,7 @@ static inline bool xfs_dquot_is_enforced( const struct xfs_dquot *dqp) { - switch (dqp->dq_flags & XFS_DQTYPE_REC_MASK) { + switch (xfs_dquot_type(dqp)) { case XFS_DQTYPE_USER: return XFS_IS_UQUOTA_ENFORCED(dqp->q_mount); case XFS_DQTYPE_GROUP: From af1db8f12e2dc5a172f2e35cfcd8cc4cf57dbb6e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 15 Jul 2020 17:52:57 -0700 Subject: [PATCH 084/117] xfs: remove unnecessary quota type masking When XFS' quota functions take a parameter for the quota type, they only care about the three quota record types (user, group, project). Internal state flags and whatnot should never be passed by callers and are an error. Now that we've moved responsibility for filtering out internal state to the callers, we can drop the masking everywhere else. In other words, if you call a quota function, you must only pass in one of XFS_DQTYPE_{USER,GROUP,PROJ}. Signed-off-by: Darrick J. 
Wong
Reviewed-by: Dave Chinner
Reviewed-by: Christoph Hellwig
---
 fs/xfs/xfs_dquot.h | 4 ++--
 fs/xfs/xfs_qm.h    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 07e18ce33560..81ba614439bd 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -136,7 +136,7 @@ xfs_dquot_type(const struct xfs_dquot *dqp)
 static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
 {
-	switch (type & XFS_DQTYPE_REC_MASK) {
+	switch (type) {
 	case XFS_DQTYPE_USER:
 		return XFS_IS_UQUOTA_ON(mp);
 	case XFS_DQTYPE_GROUP:
@@ -150,7 +150,7 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)

 static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type)
 {
-	switch (type & XFS_DQTYPE_REC_MASK) {
+	switch (type) {
 	case XFS_DQTYPE_USER:
 		return ip->i_udquot;
 	case XFS_DQTYPE_GROUP:
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index f04af35349d7..fac6fa81f1fa 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -88,7 +88,7 @@ xfs_dquot_tree(
 static inline struct xfs_inode *
 xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
 {
-	switch (dq_flags & XFS_DQTYPE_REC_MASK) {
+	switch (dq_flags) {
 	case XFS_DQTYPE_USER:
 		return mp->m_quotainfo->qi_uquotaip;
 	case XFS_DQTYPE_GROUP:

From 74ddd6b3dd553a48c294d671a6cfe02e57cfa4c7 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Wed, 15 Jul 2020 17:53:18 -0700
Subject: [PATCH 085/117] xfs: replace a few open-coded XFS_DQTYPE_REC_MASK
 uses

Fix a few places where we open-coded this mask constant.

Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
Reviewed-by: Christoph Hellwig
---
 fs/xfs/xfs_dquot_item_recover.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
index d7eb85c7d394..93178341569a 100644
--- a/fs/xfs/xfs_dquot_item_recover.c
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -39,7 +39,7 @@ xlog_recover_dquot_ra_pass2(
 	if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
 		return;

-	type = recddq->d_flags & (XFS_DQTYPE_USER | XFS_DQTYPE_PROJ | XFS_DQTYPE_GROUP);
+	type = recddq->d_flags & XFS_DQTYPE_REC_MASK;
 	ASSERT(type);
 	if (log->l_quotaoffs_flag & type)
 		return;
@@ -91,7 +91,7 @@ xlog_recover_dquot_commit_pass2(
 	/*
 	 * This type of quotas was turned off, so ignore this record.
 	 */
-	type = recddq->d_flags & (XFS_DQTYPE_USER | XFS_DQTYPE_PROJ | XFS_DQTYPE_GROUP);
+	type = recddq->d_flags & XFS_DQTYPE_REC_MASK;
 	ASSERT(type);
 	if (log->l_quotaoffs_flag & type)
 		return 0;

From 1a7ed271653a4f418a6398465f861ee795d34468 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Wed, 15 Jul 2020 17:53:43 -0700
Subject: [PATCH 086/117] xfs: create xfs_dqtype_t to represent quota types

Create a new type (xfs_dqtype_t) to represent the type of an incore
dquot (user, group, project, or none). Rename the incore dquot's
dq_flags field to q_type.

This allows us to replace all the "uint type" arguments to the quota
functions with "xfs_dqtype_t type", to make it obvious when we're
passing a quota type argument into a function.
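A condensed sketch of the resulting split (simplified names, not the
full kernel definitions): the record type lives in a dedicated
xfs_dqtype_t-style field, the runtime-only state stays in a separate
flags word, and the type accessor masks so that callers never see the
state bits.

	#include <stdint.h>

	typedef uint8_t dqtype_t;	/* models xfs_dqtype_t */

	#define DQTYPE_USER	0x01
	#define DQTYPE_PROJ	0x02
	#define DQTYPE_GROUP	0x04
	#define DQTYPE_REC_MASK	(DQTYPE_USER | DQTYPE_PROJ | DQTYPE_GROUP)

	#define DQFLAG_DIRTY	(1 << 0)	/* incore-only state */
	#define DQFLAG_FREEING	(1 << 1)

	struct dquot {
		dqtype_t	q_type;		/* was dq_flags */
		uint16_t	q_flags;	/* DQFLAG_* only */
	};

	/* Masking here keeps state bits out of every type comparison. */
	static inline dqtype_t dquot_type(const struct dquot *dqp)
	{
		return dqp->q_type & DQTYPE_REC_MASK;
	}

	int main(void)
	{
		struct dquot dq = { .q_type = DQTYPE_GROUP,
				    .q_flags = DQFLAG_DIRTY };

		return dquot_type(&dq) == DQTYPE_GROUP ? 0 : 1;
	}

Signed-off-by: Darrick J.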
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dquot_buf.c | 2 +- fs/xfs/libxfs/xfs_format.h | 9 +++++++++ fs/xfs/libxfs/xfs_quota_defs.h | 23 +++++++++------------ fs/xfs/scrub/quota.c | 8 ++++---- fs/xfs/scrub/repair.c | 4 ++-- fs/xfs/scrub/repair.h | 4 +++- fs/xfs/xfs_dquot.c | 37 +++++++++++++++++----------------- fs/xfs/xfs_dquot.h | 33 +++++++++++++++--------------- fs/xfs/xfs_iomap.c | 24 +++++++++++----------- fs/xfs/xfs_qm.c | 22 ++++++++++---------- fs/xfs/xfs_qm.h | 26 +++++++++++++++--------- fs/xfs/xfs_qm_syscalls.c | 8 ++++---- fs/xfs/xfs_quota.h | 4 ++-- fs/xfs/xfs_quotaops.c | 2 +- fs/xfs/xfs_trace.h | 21 +++++++++++++------ 15 files changed, 126 insertions(+), 101 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 450147df3042..75c164ed141c 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -109,7 +109,7 @@ xfs_dqblk_repair( struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, - uint type) + xfs_dqtype_t type) { /* * Typically, a repair is only requested by quotacheck. diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 5d5e0f5eda97..0fa969f6202c 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1149,6 +1149,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ +#define XFS_DQTYPE_USER 0x01 /* user dquot record */ +#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ +#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ + +/* bitmask to determine if this is a user/group/project dquot */ +#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ + XFS_DQTYPE_PROJ | \ + XFS_DQTYPE_GROUP) + /* * This is the main portion of the on-disk representation of quota information * for a user. We pad this with some more expansion room to construct the on diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index baf6c4ad88af..076bdc7037ee 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -18,23 +18,20 @@ typedef uint64_t xfs_qcnt_t; typedef uint16_t xfs_qwarncnt_t; +typedef uint8_t xfs_dqtype_t; + +#define XFS_DQTYPE_STRINGS \ + { XFS_DQTYPE_USER, "USER" }, \ + { XFS_DQTYPE_PROJ, "PROJ" }, \ + { XFS_DQTYPE_GROUP, "GROUP" } + /* * flags for q_flags field in the dquot. 
 */
-#define XFS_DQTYPE_USER		0x0001		/* a user quota */
-#define XFS_DQTYPE_PROJ		0x0002		/* project quota */
-#define XFS_DQTYPE_GROUP	0x0004		/* a group quota */
-#define XFS_DQFLAG_DIRTY	0x0008		/* dquot is dirty */
-#define XFS_DQFLAG_FREEING	0x0010		/* dquot is being torn down */
-
-#define XFS_DQTYPE_REC_MASK	(XFS_DQTYPE_USER | \
-				 XFS_DQTYPE_PROJ | \
-				 XFS_DQTYPE_GROUP)
+#define XFS_DQFLAG_DIRTY	(1 << 0)	/* dquot is dirty */
+#define XFS_DQFLAG_FREEING	(1 << 1)	/* dquot is being torn down */

 #define XFS_DQFLAG_STRINGS \
-	{ XFS_DQTYPE_USER,	"USER" }, \
-	{ XFS_DQTYPE_PROJ,	"PROJ" }, \
-	{ XFS_DQTYPE_GROUP,	"GROUP" }, \
 	{ XFS_DQFLAG_DIRTY,	"DIRTY" }, \
 	{ XFS_DQFLAG_FREEING,	"FREEING" }
@@ -144,6 +141,6 @@ extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
 		struct xfs_dqblk *dqb, xfs_dqid_t id);
 extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
 extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
-		xfs_dqid_t id, uint type);
+		xfs_dqid_t id, xfs_dqtype_t type);

 #endif	/* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 1db07485f148..e34ca20ae8e4 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -18,7 +18,7 @@
 #include "scrub/common.h"

 /* Convert a scrub type code to a DQ flag, or return 0 if error. */
-static inline uint
+static inline xfs_dqtype_t
 xchk_quota_to_dqtype(
 	struct xfs_scrub	*sc)
 {
@@ -40,7 +40,7 @@ xchk_setup_quota(
 	struct xfs_scrub	*sc,
 	struct xfs_inode	*ip)
 {
-	uint			dqtype;
+	xfs_dqtype_t		dqtype;
 	int			error;

 	if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp))
@@ -73,7 +73,7 @@ struct xchk_quota_info {
 STATIC int
 xchk_quota_item(
 	struct xfs_dquot	*dq,
-	uint			dqtype,
+	xfs_dqtype_t		dqtype,
 	void			*priv)
 {
 	struct xchk_quota_info	*sqi = priv;
@@ -214,7 +214,7 @@ xchk_quota(
 	struct xchk_quota_info	sqi;
 	struct xfs_mount	*mp = sc->mp;
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	uint			dqtype;
+	xfs_dqtype_t		dqtype;
 	int			error = 0;

 	dqtype = xchk_quota_to_dqtype(sc);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 074651896586..25e86c71e7b9 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -899,11 +899,11 @@ xrep_find_ag_btree_roots(
 void
 xrep_force_quotacheck(
 	struct xfs_scrub	*sc,
-	uint			dqtype)
+	xfs_dqtype_t		type)
 {
 	uint			flag;

-	flag = xfs_quota_chkd_flag(dqtype);
+	flag = xfs_quota_chkd_flag(type);
 	if (!(flag & sc->mp->m_qflags))
 		return;
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 04a47d45605b..fe77de01abe0 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -6,6 +6,8 @@
 #ifndef __XFS_SCRUB_REPAIR_H__
 #define __XFS_SCRUB_REPAIR_H__

+#include "xfs_quota_defs.h"
+
 static inline int xrep_notsupported(struct xfs_scrub *sc)
 {
 	return -EOPNOTSUPP;
@@ -49,7 +51,7 @@ struct xrep_find_ag_btree {

 int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
 		struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
-void xrep_force_quotacheck(struct xfs_scrub *sc, uint dqtype);
+void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
 int xrep_ino_dqattach(struct xfs_scrub *sc);

 /* Metadata repairers */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 5a60238fcdba..30cfa0c87175 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -158,7 +158,7 @@ xfs_qm_init_dquot_blk(
 	struct xfs_trans	*tp,
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct xfs_buf		*bp)
 {
 	struct xfs_quotainfo	*q = mp->m_quotainfo;
@@ -273,7 +273,7 @@ xfs_dquot_disk_alloc(
 	struct xfs_trans	*tp = *tpp;
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_buf		*bp;
-	uint			qtype = xfs_dquot_type(dqp);
+	xfs_dqtype_t		qtype = xfs_dquot_type(dqp);
 	struct xfs_inode	*quotip = xfs_quota_inode(mp, qtype);
 	int			nmaps = 1;
 	int			error;
@@ -365,7 +365,7 @@ xfs_dquot_disk_read(
 {
 	struct xfs_bmbt_irec	map;
 	struct xfs_buf		*bp;
-	uint			qtype = xfs_dquot_type(dqp);
+	xfs_dqtype_t		qtype = xfs_dquot_type(dqp);
 	struct xfs_inode	*quotip = xfs_quota_inode(mp, qtype);
 	uint			lock_mode;
 	int			nmaps = 1;
@@ -424,13 +424,13 @@ STATIC struct xfs_dquot *
 xfs_dquot_alloc(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type)
+	xfs_dqtype_t		type)
 {
 	struct xfs_dquot	*dqp;

 	dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);

-	dqp->dq_flags = type;
+	dqp->q_type = type;
 	dqp->q_id = id;
 	dqp->q_mount = mp;
 	INIT_LIST_HEAD(&dqp->q_lru);
@@ -498,6 +498,7 @@ xfs_dquot_from_disk(
 	}

 	/* copy everything from disk dquot to the incore dquot */
+	dqp->q_type = ddqp->d_flags;
 	dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
 	dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit);
 	dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
@@ -538,7 +539,7 @@ xfs_dquot_to_disk(
 {
 	ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
 	ddqp->d_version = XFS_DQUOT_VERSION;
-	ddqp->d_flags = dqp->dq_flags & XFS_DQTYPE_REC_MASK;
+	ddqp->d_flags = dqp->q_type;
 	ddqp->d_id = cpu_to_be32(dqp->q_id);
 	ddqp->d_pad0 = 0;
 	ddqp->d_pad = 0;
@@ -609,7 +610,7 @@ static int
 xfs_qm_dqread(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	bool			can_alloc,
 	struct xfs_dquot	**dqpp)
 {
@@ -657,7 +658,7 @@ err:
 static int
 xfs_dq_get_next_id(
 	struct xfs_mount	*mp,
-	uint			type,
+	xfs_dqtype_t		type,
 	xfs_dqid_t		*id)
 {
 	struct xfs_inode	*quotip = xfs_quota_inode(mp, type);
@@ -781,7 +782,7 @@ xfs_qm_dqget_cache_insert(
 static int
 xfs_qm_dqget_checks(
 	struct xfs_mount	*mp,
-	uint			type)
+	xfs_dqtype_t		type)
 {
 	if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp)))
 		return -ESRCH;
@@ -813,7 +814,7 @@ int
 xfs_qm_dqget(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	bool			can_alloc,
 	struct xfs_dquot	**O_dqpp)
 {
@@ -863,7 +864,7 @@ int
 xfs_qm_dqget_uncached(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct xfs_dquot	**dqpp)
 {
 	int			error;
@@ -879,7 +880,7 @@ xfs_qm_dqget_uncached(
 xfs_dqid_t
 xfs_qm_id_for_quotatype(
 	struct xfs_inode	*ip,
-	uint			type)
+	xfs_dqtype_t		type)
 {
 	switch (type) {
 	case XFS_DQTYPE_USER:
@@ -901,7 +902,7 @@ xfs_qm_id_for_quotatype(
 int
 xfs_qm_dqget_inode(
 	struct xfs_inode	*ip,
-	uint			type,
+	xfs_dqtype_t		type,
 	bool			can_alloc,
 	struct xfs_dquot	**O_dqpp)
 {
@@ -987,7 +988,7 @@ int
 xfs_qm_dqget_next(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct xfs_dquot	**dqpp)
 {
 	struct xfs_dquot	*dqp;
@@ -1122,7 +1123,7 @@ static xfs_failaddr_t
 xfs_qm_dqflush_check(
 	struct xfs_dquot	*dqp)
 {
-	__u8			type = dqp->dq_flags & XFS_DQTYPE_REC_MASK;
+	xfs_dqtype_t		type = xfs_dquot_type(dqp);

 	if (type != XFS_DQTYPE_USER &&
 	    type != XFS_DQTYPE_GROUP &&
@@ -1317,7 +1318,7 @@ xfs_qm_exit(void)
 int
 xfs_qm_dqiterate(
 	struct xfs_mount	*mp,
-	uint			dqtype,
+	xfs_dqtype_t		type,
 	xfs_qm_dqiterate_fn	iter_fn,
 	void			*priv)
 {
@@ -1326,13 +1327,13 @@ xfs_qm_dqiterate(
 	int			error;

 	do {
-		error = xfs_qm_dqget_next(mp, id, dqtype, &dq);
+		error = xfs_qm_dqget_next(mp, id, type, &dq);
 		if (error == -ENOENT)
 			return 0;
 		if (error)
 			return error;

-		error = iter_fn(dq, dqtype, priv);
+		error = iter_fn(dq, type, priv);
 		id = dq->q_id;
 		xfs_qm_dqput(dq);
 	} while (error == 0 && id != 0);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 81ba614439bd..282a65da93c7 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -60,7 +60,7 @@ struct xfs_dquot_res {
 struct xfs_dquot {
 	struct list_head	q_lru;
 	struct xfs_mount	*q_mount;
-	uint8_t			dq_flags;
+	xfs_dqtype_t		q_type;
 	uint16_t		q_flags;
 	xfs_dqid_t		q_id;
 	uint			q_nrefs;
@@ -131,10 +131,10 @@ static inline void xfs_dqunlock(struct xfs_dquot *dqp)

 static inline int xfs_dquot_type(const struct xfs_dquot *dqp)
 {
-	return dqp->dq_flags & XFS_DQTYPE_REC_MASK;
+	return dqp->q_type & XFS_DQTYPE_REC_MASK;
 }

-static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+static inline int xfs_this_quota_on(struct xfs_mount *mp, xfs_dqtype_t type)
 {
 	switch (type) {
 	case XFS_DQTYPE_USER:
@@ -148,7 +148,9 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
 	}
 }

-static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type)
+static inline struct xfs_dquot *xfs_inode_dquot(
+	struct xfs_inode	*ip,
+	xfs_dqtype_t		type)
 {
 	switch (type) {
 	case XFS_DQTYPE_USER:
@@ -205,18 +207,17 @@ void		xfs_qm_dqunpin_wait(struct xfs_dquot *dqp);
 void		xfs_qm_adjust_dqtimers(struct xfs_dquot *d);
 void		xfs_qm_adjust_dqlimits(struct xfs_dquot *d);
 xfs_dqid_t	xfs_qm_id_for_quotatype(struct xfs_inode *ip,
-			uint type);
+			xfs_dqtype_t type);
 int		xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
-			uint type, bool can_alloc,
-			struct xfs_dquot **dqpp);
-int		xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
-			bool can_alloc,
-			struct xfs_dquot **dqpp);
+			xfs_dqtype_t type, bool can_alloc,
+			struct xfs_dquot **dqpp);
+int		xfs_qm_dqget_inode(struct xfs_inode *ip, xfs_dqtype_t type,
+			bool can_alloc, struct xfs_dquot **dqpp);
 int		xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
-			uint type, struct xfs_dquot **dqpp);
+			xfs_dqtype_t type, struct xfs_dquot **dqpp);
 int		xfs_qm_dqget_uncached(struct xfs_mount *mp,
-			xfs_dqid_t id, uint type,
-			struct xfs_dquot **dqpp);
+			xfs_dqid_t id, xfs_dqtype_t type,
+			struct xfs_dquot **dqpp);
 void		xfs_qm_dqput(struct xfs_dquot *dqp);

 void		xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
@@ -231,9 +232,9 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
 	return dqp;
 }

-typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype,
-		void *priv);
-int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype,
+typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq,
+		xfs_dqtype_t type, void *priv);
+int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type,
 		xfs_qm_dqiterate_fn iter_fn, void *priv);

 #endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d3dc4106a35c..0e3f62cde375 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -293,11 +293,11 @@ out_trans_cancel:

 STATIC bool
 xfs_quota_need_throttle(
-	struct xfs_inode *ip,
-	int type,
-	xfs_fsblock_t alloc_blocks)
+	struct xfs_inode	*ip,
+	xfs_dqtype_t		type,
+	xfs_fsblock_t		alloc_blocks)
 {
-	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+	struct xfs_dquot	*dq = xfs_inode_dquot(ip, type);

 	if (!dq || !xfs_this_quota_on(ip->i_mount, type))
 		return false;
@@ -315,15 +315,15 @@ xfs_quota_need_throttle(

 STATIC void
 xfs_quota_calc_throttle(
-	struct xfs_inode *ip,
-	int type,
-	xfs_fsblock_t *qblocks,
-	int *qshift,
-	int64_t	*qfreesp)
+	struct xfs_inode	*ip,
+	xfs_dqtype_t		type,
+	xfs_fsblock_t		*qblocks,
+	int			*qshift,
+	int64_t			*qfreesp)
 {
-	int64_t freesp;
-	int shift = 0;
-	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+	struct xfs_dquot	*dq = xfs_inode_dquot(ip, type);
+	int64_t			freesp;
+	int			shift = 0;

 	/* no dq, or over hi wmark, squash the prealloc completely */
 	if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 47d4b6937c84..123757717e21 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -47,7 +47,7 @@ STATIC void	xfs_qm_dqfree_one(struct xfs_dquot *dqp);
 STATIC int
 xfs_qm_dquot_walk(
 	struct xfs_mount	*mp,
-	int			type,
+	xfs_dqtype_t		type,
 	int			(*execute)(struct xfs_dquot *dqp, void *data),
 	void			*data)
 {
@@ -250,7 +250,7 @@ STATIC int
 xfs_qm_dqattach_one(
 	struct xfs_inode	*ip,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	bool			doalloc,
 	struct xfs_dquot	**IO_idqpp)
 {
@@ -545,7 +545,7 @@ xfs_qm_shrink_count(
 STATIC void
 xfs_qm_set_defquota(
 	struct xfs_mount	*mp,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct xfs_quotainfo	*qinf)
 {
 	struct xfs_dquot	*dqp;
@@ -575,7 +575,7 @@ xfs_qm_set_defquota(
 static void
 xfs_qm_init_timelimits(
 	struct xfs_mount	*mp,
-	uint			type)
+	xfs_dqtype_t		type)
 {
 	struct xfs_quotainfo	*qinf = mp->m_quotainfo;
 	struct xfs_def_quota	*defq;
@@ -823,10 +823,10 @@ xfs_qm_qino_alloc(
 STATIC void
 xfs_qm_reset_dqcounts(
-	xfs_mount_t	*mp,
-	xfs_buf_t	*bp,
-	xfs_dqid_t	id,
-	uint		type)
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	xfs_dqid_t		id,
+	xfs_dqtype_t		type)
 {
 	struct xfs_dqblk	*dqb;
 	int			j;
@@ -895,7 +895,7 @@ xfs_qm_reset_dqcounts_all(
 	xfs_dqid_t		firstid,
 	xfs_fsblock_t		bno,
 	xfs_filblks_t		blkcnt,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct list_head	*buffer_list)
 {
 	struct xfs_buf		*bp;
@@ -961,7 +961,7 @@ STATIC int
 xfs_qm_reset_dqcounts_buf(
 	struct xfs_mount	*mp,
 	struct xfs_inode	*qip,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct list_head	*buffer_list)
 {
 	struct xfs_bmbt_irec	*map;
@@ -1059,7 +1059,7 @@ out:
 STATIC int
 xfs_qm_quotacheck_dqadjust(
 	struct xfs_inode	*ip,
-	uint			type,
+	xfs_dqtype_t		type,
 	xfs_qcnt_t		nblks,
 	xfs_qcnt_t		rtblks)
 {
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index fac6fa81f1fa..9c078c35d924 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -70,7 +70,7 @@ struct xfs_quotainfo {
 static inline struct radix_tree_root *
 xfs_dquot_tree(
 	struct xfs_quotainfo	*qi,
-	int			type)
+	xfs_dqtype_t		type)
 {
 	switch (type) {
 	case XFS_DQTYPE_USER:
@@ -86,9 +86,9 @@ xfs_dquot_tree(
 }

 static inline struct xfs_inode *
-xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
+xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type)
 {
-	switch (dq_flags) {
+	switch (type) {
 	case XFS_DQTYPE_USER:
 		return mp->m_quotainfo->qi_uquotaip;
 	case XFS_DQTYPE_GROUP:
@@ -142,17 +142,23 @@ extern void	xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);

 /* quota ops */
 extern int		xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
-extern int		xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
-					uint, struct qc_dqblk *);
-extern int		xfs_qm_scall_getquota_next(struct xfs_mount *,
-					xfs_dqid_t *, uint, struct qc_dqblk *);
-extern int		xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
-					struct qc_dqblk *);
+extern int		xfs_qm_scall_getquota(struct xfs_mount *mp,
+					xfs_dqid_t id,
+					xfs_dqtype_t type,
+					struct qc_dqblk *dst);
+extern int		xfs_qm_scall_getquota_next(struct xfs_mount *mp,
+					xfs_dqid_t *id,
+					xfs_dqtype_t type,
+					struct qc_dqblk *dst);
+extern int		xfs_qm_scall_setqlim(struct xfs_mount *mp,
+					xfs_dqid_t id,
+					xfs_dqtype_t type,
+					struct qc_dqblk *newlim);
 extern int		xfs_qm_scall_quotaon(struct xfs_mount *, uint);
 extern int		xfs_qm_scall_quotaoff(struct xfs_mount *, uint);

 static inline struct xfs_def_quota *
-xfs_get_defquota(struct xfs_quotainfo *qi, int type)
+xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type)
 {
 	switch (type) {
 	case XFS_DQTYPE_USER:
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index f7dbc702e4d6..1c542b4a5220 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -495,7 +495,7 @@ int
 xfs_qm_scall_setqlim(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct qc_dqblk		*newlim)
 {
 	struct xfs_quotainfo	*q = mp->m_quotainfo;
@@ -634,7 +634,7 @@ out_unlock:
 static void
 xfs_qm_scall_getquota_fill_qc(
 	struct xfs_mount	*mp,
-	uint			type,
+	xfs_dqtype_t		type,
 	const struct xfs_dquot	*dqp,
 	struct qc_dqblk		*dst)
 {
@@ -685,7 +685,7 @@ int
 xfs_qm_scall_getquota(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct qc_dqblk		*dst)
 {
 	struct xfs_dquot	*dqp;
@@ -723,7 +723,7 @@ int
 xfs_qm_scall_getquota_next(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		*id,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct qc_dqblk		*dst)
 {
 	struct xfs_dquot	*dqp;
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 0ae35fb5cb89..06b22e35fc90 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -39,9 +39,9 @@ struct xfs_buf;

 static inline uint
 xfs_quota_chkd_flag(
-	uint		dqtype)
+	xfs_dqtype_t		type)
 {
-	switch (dqtype) {
+	switch (type) {
 	case XFS_DQTYPE_USER:
 		return XFS_UQUOTA_CHKD;
 	case XFS_DQTYPE_GROUP:
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index ba69906edecf..d27c0e852c0b 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -85,7 +85,7 @@ xfs_fs_get_quota_state(
 	return 0;
 }

-STATIC int
+STATIC xfs_dqtype_t
 xfs_quota_type(int type)
 {
 	switch (type) {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 81534095f52b..e9b2ce0948b6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -865,6 +865,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(u32, id)
+		__field(xfs_dqtype_t, type)
 		__field(unsigned, flags)
 		__field(unsigned, nrefs)
 		__field(unsigned long long, res_bcount)
@@ -885,7 +886,8 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
 	TP_fast_assign(
 		__entry->dev = dqp->q_mount->m_super->s_dev;
 		__entry->id = dqp->q_id;
-		__entry->flags = dqp->dq_flags | dqp->q_flags;
+		__entry->type = dqp->q_type;
+		__entry->flags = dqp->q_flags;
 		__entry->nrefs = dqp->q_nrefs;
 		__entry->res_bcount = dqp->q_blk.reserved;
@@ -903,13 +905,14 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
 		__entry->ino_hardlimit = dqp->q_ino.hardlimit;
 		__entry->ino_softlimit = dqp->q_ino.softlimit;
 	),
-	TP_printk("dev %d:%d id 0x%x flags %s nrefs %u "
+	TP_printk("dev %d:%d id 0x%x type %s flags %s nrefs %u "
		  "res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx "
		  "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
		  "rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx "
		  "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->id,
+		  __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
		  __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
		  __entry->nrefs,
		  __entry->res_bcount,
@@ -976,6 +979,7 @@ TRACE_EVENT(xfs_trans_mod_dquot,
 	TP_ARGS(tp, dqp, field, delta),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(xfs_dqtype_t, type)
 		__field(unsigned int, flags)
 		__field(unsigned int, dqid)
 		__field(unsigned int, field)
@@ -983,14 +987,16 @@ TRACE_EVENT(xfs_trans_mod_dquot,
 	),
 	TP_fast_assign(
 		__entry->dev = tp->t_mountp->m_super->s_dev;
-		__entry->flags = dqp->dq_flags | dqp->q_flags;
+		__entry->type = dqp->q_type;
+		__entry->flags = dqp->q_flags;
 		__entry->dqid = dqp->q_id;
 		__entry->field = field;
 		__entry->delta = delta;
 	),
-	TP_printk("dev %d:%d dquot id 0x%x flags %s field %s delta %lld",
+	TP_printk("dev %d:%d dquot id 0x%x type %s flags %s field %s delta %lld",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->dqid,
+		  __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
		  __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
		  __print_flags(__entry->field, "|", XFS_QMOPT_FLAGS),
		  __entry->delta)
@@ -1001,6 +1007,7 @@ DECLARE_EVENT_CLASS(xfs_dqtrx_class,
 	TP_ARGS(qtrx),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(xfs_dqtype_t, type)
 		__field(unsigned int, flags)
 		__field(u32, dqid)
@@ -1019,7 +1026,8 @@ DECLARE_EVENT_CLASS(xfs_dqtrx_class,
 	),
 	TP_fast_assign(
 		__entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev;
-		__entry->flags = qtrx->qt_dquot->dq_flags | qtrx->qt_dquot->q_flags;
+		__entry->type = qtrx->qt_dquot->q_type;
+		__entry->flags = qtrx->qt_dquot->q_flags;
 		__entry->dqid = qtrx->qt_dquot->q_id;
 		__entry->blk_res = qtrx->qt_blk_res;
@@ -1035,12 +1043,13 @@ DECLARE_EVENT_CLASS(xfs_dqtrx_class,
 		__entry->ino_res_used = qtrx->qt_ino_res_used;
 		__entry->icount_delta = qtrx->qt_icount_delta;
 	),
-	TP_printk("dev %d:%d dquot id 0x%x flags %s"
+	TP_printk("dev %d:%d dquot id 0x%x type %s flags %s"
		  "blk_res %llu bcount_delta %lld delbcnt_delta %lld "
		  "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld "
		  "ino_res %llu ino_res_used %llu icount_delta %lld",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->dqid,
+		  __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
		  __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
		  __entry->blk_res,

From a990f7a84edc9941956ea3c1dfb89733c80f9ad0 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Wed, 15 Jul 2020 18:39:51 -0700
Subject: [PATCH 087/117] xfs: improve ondisk dquot flags checking

Create an XFS_DQTYPE_ANY mask for ondisk dquot flags, and use that to
ensure that we never accept any garbage flags when we're loading dquots.
While we're at it, restructure the quota type flag checking to use the
proper masking.

Note that I plan to add y2038 support soon, which will require a new
xfs_dqtype_t flag for extended timestamp support, hence all the work to
make the type masking work correctly.
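For context, a minimal self-contained sketch (not part of the patch; the
constant values merely mirror the XFS_DQTYPE_* definitions added below)
of why mask-based checking rejects garbage bits while still demanding
exactly one known record type:

	#include <stdio.h>

	/* Mock-up constants mirroring the on-disk definitions. */
	#define DQTYPE_USER	0x01
	#define DQTYPE_PROJ	0x02
	#define DQTYPE_GROUP	0x04
	#define DQTYPE_REC_MASK	(DQTYPE_USER | DQTYPE_PROJ | DQTYPE_GROUP)
	#define DQTYPE_ANY	DQTYPE_REC_MASK

	static int dqtype_ok(unsigned char flags)
	{
		unsigned char type;

		if (flags & ~DQTYPE_ANY)	/* any unknown bit => corruption */
			return 0;
		type = flags & DQTYPE_REC_MASK;
		return type == DQTYPE_USER || type == DQTYPE_PROJ ||
		       type == DQTYPE_GROUP;	/* exactly one known type */
	}

	int main(void)
	{
		/* 0x01 is a valid user dquot; 0x81 carries a garbage bit. */
		printf("%d %d\n", dqtype_ok(0x01), dqtype_ok(0x81)); /* 1 0 */
		return 0;
	}

Note how a future flag (say, an extended-timestamp bit) only needs to be
added to the ANY mask for old kernels to reject it and new kernels to
accept it.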
Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
Reviewed-by: Christoph Hellwig
---
 fs/xfs/libxfs/xfs_dquot_buf.c | 11 ++++++++---
 fs/xfs/libxfs/xfs_format.h    |  2 ++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 75c164ed141c..39d64fbc6b87 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -39,6 +39,8 @@ xfs_dquot_verify(
 	struct xfs_disk_dquot	*ddq,
 	xfs_dqid_t		id)	/* used only during quotacheck */
 {
+	__u8			ddq_type;
+
 	/*
	 * We can encounter an uninitialized dquot buffer for 2 reasons:
	 * 1. If we crash while deleting the quotainode(s), and those blks got
@@ -59,9 +61,12 @@ xfs_dquot_verify(
 	if (ddq->d_version != XFS_DQUOT_VERSION)
 		return __this_address;

-	if (ddq->d_flags != XFS_DQTYPE_USER &&
-	    ddq->d_flags != XFS_DQTYPE_PROJ &&
-	    ddq->d_flags != XFS_DQTYPE_GROUP)
+	if (ddq->d_flags & ~XFS_DQTYPE_ANY)
+		return __this_address;
+	ddq_type = ddq->d_flags & XFS_DQTYPE_REC_MASK;
+	if (ddq_type != XFS_DQTYPE_USER &&
+	    ddq_type != XFS_DQTYPE_PROJ &&
+	    ddq_type != XFS_DQTYPE_GROUP)
 		return __this_address;

 	if (id != -1 && id != be32_to_cpu(ddq->d_id))
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 0fa969f6202c..29564bd32bef 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1158,6 +1158,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 					 XFS_DQTYPE_PROJ | \
 					 XFS_DQTYPE_GROUP)

+#define XFS_DQTYPE_ANY		(XFS_DQTYPE_REC_MASK)
+
 /*
  * This is the main portion of the on-disk representation of quota information
  * for a user. We pad this with some more expansion room to construct the on

From d8c1af0d6ad617df4563e78bbee70691f25ace58 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Wed, 15 Jul 2020 18:05:39 -0700
Subject: [PATCH 088/117] xfs: rename the ondisk dquot d_flags to d_type

The ondisk dquot stores the quota record type in the flags field.
Rename this field to d_type to make the _type relationship between the
ondisk and incore dquot more obvious.

Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
Reviewed-by: Christoph Hellwig
---
 fs/xfs/libxfs/xfs_dquot_buf.c   | 6 +++---
 fs/xfs/libxfs/xfs_format.h      | 2 +-
 fs/xfs/xfs_dquot.c              | 8 ++++----
 fs/xfs/xfs_dquot_item_recover.c | 4 ++--
 fs/xfs/xfs_qm.c                 | 4 ++--
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 39d64fbc6b87..5a2db00b9d5f 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -61,9 +61,9 @@ xfs_dquot_verify(
 	if (ddq->d_version != XFS_DQUOT_VERSION)
 		return __this_address;

-	if (ddq->d_flags & ~XFS_DQTYPE_ANY)
+	if (ddq->d_type & ~XFS_DQTYPE_ANY)
 		return __this_address;
-	ddq_type = ddq->d_flags & XFS_DQTYPE_REC_MASK;
+	ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK;
 	if (ddq_type != XFS_DQTYPE_USER &&
 	    ddq_type != XFS_DQTYPE_PROJ &&
 	    ddq_type != XFS_DQTYPE_GROUP)
@@ -124,7 +124,7 @@ xfs_dqblk_repair(
 	dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
 	dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
-	dqb->dd_diskdq.d_flags = type;
+	dqb->dd_diskdq.d_type = type;
 	dqb->dd_diskdq.d_id = cpu_to_be32(id);

 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 29564bd32bef..31b7ece985bb 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1168,7 +1168,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 struct xfs_disk_dquot {
 	__be16		d_magic;	/* dquot magic = XFS_DQUOT_MAGIC */
 	__u8		d_version;	/* dquot version */
-	__u8		d_flags;	/* XFS_DQTYPE_USER/PROJ/GROUP */
+	__u8		d_type;		/* XFS_DQTYPE_USER/PROJ/GROUP */
 	__be32		d_id;		/* user,project,group id */
 	__be64		d_blk_hardlimit;/* absolute limit on disk blks */
 	__be64		d_blk_softlimit;/* preferred limit on disk blks */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 30cfa0c87175..799b986975e8 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -200,7 +200,7 @@ xfs_qm_init_dquot_blk(
 		d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
 		d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
 		d->dd_diskdq.d_id = cpu_to_be32(curid);
-		d->dd_diskdq.d_flags = type;
+		d->dd_diskdq.d_type = type;
 		if (xfs_sb_version_hascrc(&mp->m_sb)) {
 			uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
 			xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
@@ -488,7 +488,7 @@ xfs_dquot_from_disk(
 	 * Ensure that we got the type and ID we were looking for.
	 * Everything else was checked by the dquot buffer verifier.
	 */
-	if ((ddqp->d_flags & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) ||
+	if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) ||
 	    be32_to_cpu(ddqp->d_id) != dqp->q_id) {
 		xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
			  "Metadata corruption detected at %pS, quota %u",
@@ -498,7 +498,7 @@ xfs_dquot_from_disk(
 	}

 	/* copy everything from disk dquot to the incore dquot */
-	dqp->q_type = ddqp->d_flags;
+	dqp->q_type = ddqp->d_type;
 	dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
 	dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit);
 	dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
@@ -539,7 +539,7 @@ xfs_dquot_to_disk(
 {
 	ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
 	ddqp->d_version = XFS_DQUOT_VERSION;
-	ddqp->d_flags = dqp->q_type;
+	ddqp->d_type = dqp->q_type;
 	ddqp->d_id = cpu_to_be32(dqp->q_id);
 	ddqp->d_pad0 = 0;
 	ddqp->d_pad = 0;
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
index 93178341569a..5875c7e1bd28 100644
--- a/fs/xfs/xfs_dquot_item_recover.c
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -39,7 +39,7 @@ xlog_recover_dquot_ra_pass2(
 	if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
 		return;

-	type = recddq->d_flags & XFS_DQTYPE_REC_MASK;
+	type = recddq->d_type & XFS_DQTYPE_REC_MASK;
 	ASSERT(type);
 	if (log->l_quotaoffs_flag & type)
 		return;
@@ -91,7 +91,7 @@ xlog_recover_dquot_commit_pass2(
 	/*
	 * This type of quotas was turned off, so ignore this record.
	 */
-	type = recddq->d_flags & XFS_DQTYPE_REC_MASK;
+	type = recddq->d_type & XFS_DQTYPE_REC_MASK;
 	ASSERT(type);
 	if (log->l_quotaoffs_flag & type)
 		return 0;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 123757717e21..be67570badf8 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -855,14 +855,14 @@ xfs_qm_reset_dqcounts(
 		 * xfs_dquot_verify.
		 */
 		if (xfs_dqblk_verify(mp, &dqb[j], id + j) ||
-		    (dqb[j].dd_diskdq.d_flags & XFS_DQTYPE_REC_MASK) != type)
+		    (dqb[j].dd_diskdq.d_type & XFS_DQTYPE_REC_MASK) != type)
 			xfs_dqblk_repair(mp, &dqb[j], id + j, type);

 		/*
		 * Reset type in case we are reusing group quota file for
		 * project quotas or vice versa
		 */
-		ddq->d_flags = type;
+		ddq->d_type = type;
 		ddq->d_bcount = 0;
 		ddq->d_icount = 0;
 		ddq->d_rtbcount = 0;

From 26270c9f4cf77815397e749f38a343732c28c4a5 Mon Sep 17 00:00:00 2001
From: Randy Dunlap
Date: Sun, 19 Jul 2020 18:31:27 -0700
Subject: [PATCH 089/117] xfs: xfs_btree_staging.h: delete duplicated words

Drop the repeated words "with" and "be" in comments.

Signed-off-by: Randy Dunlap
Cc: "Darrick J. Wong"
Cc: linux-xfs@vger.kernel.org
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
---
 fs/xfs/libxfs/xfs_btree_staging.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
index 643f0f9b2994..f0d2976050ae 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.h
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -18,7 +18,7 @@ struct xbtree_afakeroot {
 	unsigned int		af_blocks;
 };

-/* Cursor interactions with with fake roots for AG-rooted btrees. */
+/* Cursor interactions with fake roots for AG-rooted btrees. */
 void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
 		struct xbtree_afakeroot *afake);
 void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
@@ -45,7 +45,7 @@ struct xbtree_ifakeroot {
 	unsigned int		if_extents;
 };

-/* Cursor interactions with with fake roots for inode-rooted btrees. */
+/* Cursor interactions with fake roots for inode-rooted btrees. */
 void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
 		struct xbtree_ifakeroot *ifake,
 		struct xfs_btree_ops **new_ops);
@@ -90,7 +90,7 @@ struct xfs_btree_bload {
 	/*
	 * Number of free records to leave in each leaf block.  If the caller
-	 * sets this to -1, the slack value will be calculated to be be halfway
+	 * sets this to -1, the slack value will be calculated to be halfway
	 * between maxrecs and minrecs.  This typically leaves the block 75%
	 * full.  Note that slack values are not enforced on inode root blocks.
	 */

From 3050bd0bfe706381c36e4b48bf4de465b0ab94f7 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino
Date: Wed, 22 Jul 2020 09:23:04 -0700
Subject: [PATCH 090/117] xfs: Remove kmem_zone_alloc() usage

Use kmem_cache_alloc() directly.

All kmem_zone_alloc() users pass 0 as flags, which is translated into
GFP_KERNEL | __GFP_NOWARN, and kmem_zone_alloc() loops forever until the
allocation succeeds.

We can use __GFP_NOFAIL to tell the allocator to loop forever rather
than doing it ourselves, and because the allocation will never fail, we
do not need to use __GFP_NOWARN anymore.  Hence, all callers can be
converted to use GFP_KERNEL | __GFP_NOFAIL.
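The conversion pattern, as a hedged sketch distilled from the paragraph
above (here `zone` and `ptr` stand for any kmem cache and object
pointer; this is not a literal excerpt from the patch):

	/* Before (simplified): a hand-rolled retry loop around the alloc */
	do {
		ptr = kmem_cache_alloc(zone, GFP_KERNEL | __GFP_NOWARN);
	} while (!ptr);

	/*
	 * After: __GFP_NOFAIL makes the allocator retry internally, so the
	 * call never returns NULL and no allocation warning needs squelching.
	 */
	ptr = kmem_cache_alloc(zone, GFP_KERNEL | __GFP_NOFAIL);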
Signed-off-by: Carlos Maiolino
Reviewed-by: Darrick J. Wong
[darrick: add a comment back in about nofail]
Signed-off-by: Darrick J. Wong
Reviewed-by: Christoph Hellwig
Reviewed-by: Dave Chinner
---
 fs/xfs/libxfs/xfs_alloc.c |  3 ++-
 fs/xfs/libxfs/xfs_bmap.c  |  3 ++-
 fs/xfs/xfs_icache.c       | 10 ++++------
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index bf4d07e5c73f..852b536551b5 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2462,7 +2462,8 @@ xfs_defer_agfl_block(
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 	ASSERT(oinfo != NULL);

-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
+	new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+			       GFP_KERNEL | __GFP_NOFAIL);
 	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
 	new->xefi_blockcount = 1;
 	new->xefi_oinfo = *oinfo;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 667cdd0dfdf4..fd5c0d669d0d 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -553,7 +553,8 @@ __xfs_bmap_add_free(
 #endif
 	ASSERT(xfs_bmap_free_item_zone != NULL);

-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
+	new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+			       GFP_KERNEL | __GFP_NOFAIL);
 	new->xefi_startblock = bno;
 	new->xefi_blockcount = (xfs_extlen_t)len;
 	if (oinfo)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3c6e936d2f99..101028ebb571 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -37,13 +37,11 @@ xfs_inode_alloc(
 	struct xfs_inode	*ip;

 	/*
-	 * if this didn't occur in transactions, we could use
-	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
-	 * code up to do this anyway.
+	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
+	 * and return NULL here on ENOMEM.
	 */
-	ip = kmem_zone_alloc(xfs_inode_zone, 0);
-	if (!ip)
-		return NULL;
+	ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
+
 	if (inode_init_always(mp->m_super, VFS_I(ip))) {
 		kmem_cache_free(xfs_inode_zone, ip);
 		return NULL;

From 32a2b11f467642ea700bc0b01f4693e52ec0fabd Mon Sep 17 00:00:00 2001
From: Carlos Maiolino
Date: Wed, 22 Jul 2020 09:23:10 -0700
Subject: [PATCH 091/117] xfs: Remove kmem_zone_zalloc() usage

Use kmem_cache_zalloc() directly, with the exception of
xlog_ticket_alloc(), which will be dealt with in the next patch for
readability.

Reviewed-by: Christoph Hellwig
Signed-off-by: Carlos Maiolino
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
---
 fs/xfs/libxfs/xfs_alloc_btree.c    | 2 +-
 fs/xfs/libxfs/xfs_bmap.c           | 5 ++++-
 fs/xfs/libxfs/xfs_bmap_btree.c     | 2 +-
 fs/xfs/libxfs/xfs_da_btree.c       | 2 +-
 fs/xfs/libxfs/xfs_ialloc_btree.c   | 2 +-
 fs/xfs/libxfs/xfs_inode_fork.c     | 6 +++---
 fs/xfs/libxfs/xfs_refcount_btree.c | 2 +-
 fs/xfs/libxfs/xfs_rmap_btree.c     | 2 +-
 fs/xfs/xfs_bmap_item.c             | 4 ++--
 fs/xfs/xfs_buf.c                   | 4 +---
 fs/xfs/xfs_buf_item.c              | 2 +-
 fs/xfs/xfs_dquot.c                 | 2 +-
 fs/xfs/xfs_extfree_item.c          | 6 ++++--
 fs/xfs/xfs_icreate_item.c          | 2 +-
 fs/xfs/xfs_inode_item.c            | 3 ++-
 fs/xfs/xfs_refcount_item.c         | 5 +++--
 fs/xfs/xfs_rmap_item.c             | 5 +++--
 fs/xfs/xfs_trans.c                 | 4 ++--
 fs/xfs/xfs_trans_dquot.c           | 3 ++-
 19 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 3d1226aa2eb5..8e01231b308e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -480,7 +480,7 @@ xfs_allocbt_init_common(

 	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);

-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+	cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);

 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index fd5c0d669d0d..9c40d5971035 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1099,7 +1099,10 @@ xfs_bmap_add_attrfork(
 	if (error)
 		goto trans_cancel;
 	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
+
+	ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone,
+				      GFP_KERNEL | __GFP_NOFAIL);
+
 	ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS;
 	ip->i_afp->if_flags = XFS_IFEXTENTS;
 	logflags = 0;
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index d9c63f17d2de..ecec604e6e4d 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -552,7 +552,7 @@ xfs_bmbt_init_cursor(
 	struct xfs_btree_cur	*cur;

 	ASSERT(whichfork != XFS_COW_FORK);

-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+	cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);

 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 897749c41f36..a4e1f01daf3d 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -81,7 +81,7 @@ kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
 xfs_da_state_t *
 xfs_da_state_alloc(void)
 {
-	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+	return kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL);
 }

 /*
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index b2c122ad8f0e..3c8aebc36e64 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -411,7 +411,7 @@ xfs_inobt_init_common(
 {
 	struct xfs_btree_cur	*cur;

-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+	cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
 	cur->bc_btnum = btnum;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 28b366275ae0..0cf853d42d62 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -291,7 +291,7 @@ xfs_iformat_attr_fork(
 	 * Initialize the extent count early, as the per-format routines may
	 * depend on it.
	 */
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
+	ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL);
 	ip->i_afp->if_format = dip->di_aformat;
 	if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */
 		ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS;
@@ -673,8 +673,8 @@ xfs_ifork_init_cow(
 	if (ip->i_cowfp)
 		return;

-	ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
-				       KM_NOFS);
+	ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone,
+				       GFP_NOFS | __GFP_NOFAIL);
 	ip->i_cowfp->if_flags = XFS_IFEXTENTS;
 	ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
 }
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index c5296c124a4c..a6ac60ae9421 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -323,7 +323,7 @@ xfs_refcountbt_init_common(
 	ASSERT(agno != NULLAGNUMBER);
 	ASSERT(agno < mp->m_sb.sb_agcount);

-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+	cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
 	cur->bc_btnum = XFS_BTNUM_REFC;
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 94948a53569f..beb81c84a937 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -456,7 +456,7 @@ xfs_rmapbt_init_common(
 {
 	struct xfs_btree_cur	*cur;

-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+	cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
 	/* Overlapping btree; 2 keys per pointer. */
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 6736c5ab188f..ec3691372e7c 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -138,7 +138,7 @@ xfs_bui_init(
 {
 	struct xfs_bui_log_item		*buip;

-	buip = kmem_zone_zalloc(xfs_bui_zone, 0);
+	buip = kmem_cache_zalloc(xfs_bui_zone, GFP_KERNEL | __GFP_NOFAIL);

 	xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
 	buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -215,7 +215,7 @@ xfs_trans_get_bud(
 {
 	struct xfs_bud_log_item		*budp;

-	budp = kmem_zone_zalloc(xfs_bud_zone, 0);
+	budp = kmem_cache_zalloc(xfs_bud_zone, GFP_KERNEL | __GFP_NOFAIL);
 	xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
			  &xfs_bud_item_ops);
 	budp->bud_buip = buip;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index dda0c9445879..d4cdcb6fb2fe 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -214,9 +214,7 @@ _xfs_buf_alloc(
 	int			i;

 	*bpp = NULL;
-	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
-	if (unlikely(!bp))
-		return -ENOMEM;
+	bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL);

 	/*
	 * We don't want certain flags to appear in b_flags unless they are
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index ed1bf1d99483..5bb6f22cc11a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -738,7 +738,7 @@ xfs_buf_item_init(
 		return 0;
 	}

-	bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
+	bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL);
 	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
 	bip->bli_buf = bp;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 799b986975e8..04dc2be19c3a 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -428,7 +428,7 @@ xfs_dquot_alloc(
 {
 	struct xfs_dquot	*dqp;

-	dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
+	dqp = kmem_cache_zalloc(xfs_qm_dqzone, GFP_KERNEL | __GFP_NOFAIL);

 	dqp->q_type = type;
 	dqp->q_id = id;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index b9c333bae0a1..6cb8cd11072a 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -161,7 +161,8 @@ xfs_efi_init(
 				((nextents - 1) * sizeof(xfs_extent_t)));
 		efip = kmem_zalloc(size, 0);
 	} else {
-		efip = kmem_zone_zalloc(xfs_efi_zone, 0);
+		efip = kmem_cache_zalloc(xfs_efi_zone,
+					 GFP_KERNEL | __GFP_NOFAIL);
 	}

 	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -332,7 +333,8 @@ xfs_trans_get_efd(
 				(nextents - 1) * sizeof(struct xfs_extent),
				0);
 	} else {
-		efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
+		efdp = kmem_cache_zalloc(xfs_efd_zone,
+					 GFP_KERNEL | __GFP_NOFAIL);
 	}

 	xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 287a9e5c7d75..9b3994b9c716 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -97,7 +97,7 @@ xfs_icreate_log(
 {
 	struct xfs_icreate_item	*icp;

-	icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
+	icp = kmem_cache_zalloc(xfs_icreate_zone, GFP_KERNEL | __GFP_NOFAIL);

 	xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
			  &xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 3840117f8a5e..895f61b2b4f0 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -615,7 +615,8 @@ xfs_inode_item_init(
 	struct xfs_inode_log_item *iip;

 	ASSERT(ip->i_itemp == NULL);
-	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
+	iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone,
+					      GFP_KERNEL | __GFP_NOFAIL);

 	iip->ili_inode = ip;
 	spin_lock_init(&iip->ili_lock);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index c81639891e29..7b2c72bc2858 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -143,7 +143,8 @@ xfs_cui_init(
 		cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
				0);
 	else
-		cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
+		cuip = kmem_cache_zalloc(xfs_cui_zone,
+					 GFP_KERNEL | __GFP_NOFAIL);

 	xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
 	cuip->cui_format.cui_nextents = nextents;
@@ -220,7 +221,7 @@ xfs_trans_get_cud(
 {
 	struct xfs_cud_log_item		*cudp;

-	cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
+	cudp = kmem_cache_zalloc(xfs_cud_zone, GFP_KERNEL | __GFP_NOFAIL);
 	xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
			  &xfs_cud_item_ops);
 	cudp->cud_cuip = cuip;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index a86599db20a6..dc5b0753cd51 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -141,7 +141,8 @@ xfs_rui_init(
 	if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
 		ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
 	else
-		ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
+		ruip = kmem_cache_zalloc(xfs_rui_zone,
+					 GFP_KERNEL | __GFP_NOFAIL);

 	xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
 	ruip->rui_format.rui_nextents = nextents;
@@ -243,7 +244,7 @@ xfs_trans_get_rud(
 {
 	struct xfs_rud_log_item		*rudp;

-	rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
+	rudp = kmem_cache_zalloc(xfs_rud_zone, GFP_KERNEL | __GFP_NOFAIL);
 	xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
			  &xfs_rud_item_ops);
 	rudp->rud_ruip = ruip;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 0ad72a83edac..ed72867b1a19 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -90,7 +90,7 @@ xfs_trans_dup(

 	trace_xfs_trans_dup(tp, _RET_IP_);

-	ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
+	ntp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);

 	/*
	 * Initialize the new transaction structure.
@@ -263,7 +263,7 @@ xfs_trans_alloc(
 	 * GFP_NOFS allocation context so that we avoid lockdep false positives
	 * by doing GFP_KERNEL allocations inside sb_start_intwrite().
	 */
-	tp = kmem_zone_zalloc(xfs_trans_zone, 0);
+	tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);

 	if (!(flags & XFS_TRANS_NO_WRITECOUNT))
 		sb_start_intwrite(mp->m_super);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 518cf0347891..c6ba7ef18e06 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -874,7 +874,8 @@ STATIC void
 xfs_trans_alloc_dqinfo(
 	xfs_trans_t	*tp)
 {
-	tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
+	tp->t_dqinfo = kmem_cache_zalloc(xfs_qm_dqtrxzone,
+					 GFP_KERNEL | __GFP_NOFAIL);
 }

 void

From ca4f2589905954b155e920b543e13a370d578511 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino
Date: Wed, 22 Jul 2020 09:23:17 -0700
Subject: [PATCH 092/117] xfs: Modify xlog_ticket_alloc() to use kernel's MM API

xlog_ticket_alloc() is always called under NOFS context, except from the
unmount path, which either way is holding many FS locks, so there is no
need for its callers to keep passing allocation flags into it.

Change xlog_ticket_alloc() to use the default kmem_cache_zalloc(),
remove its alloc_flags argument, and always use GFP_NOFS | __GFP_NOFAIL
flags.
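A sketch of the resulting call-site simplification (illustrative only;
in the diff below the reservation path passed 0 and the CIL caller
passed KM_NOFS):

	/* Before: each caller chose allocation flags */
	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);

	/* After: the helper hard-codes GFP_NOFS | __GFP_NOFAIL internally */
	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);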
Reviewed-by: Christoph Hellwig
Signed-off-by: Carlos Maiolino
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
---
 fs/xfs/xfs_log.c      | 9 +++------
 fs/xfs/xfs_log_cil.c  | 3 +--
 fs/xfs/xfs_log_priv.h | 4 +---
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 00fda2e8e738..ad0c69ee8947 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -433,7 +433,7 @@ xfs_log_reserve(
 	XFS_STATS_INC(mp, xs_try_logspace);

 	ASSERT(*ticp == NULL);
-	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
+	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
 	*ticp = tic;

 	xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -3408,15 +3408,12 @@ xlog_ticket_alloc(
 	int			unit_bytes,
 	int			cnt,
 	char			client,
-	bool			permanent,
-	xfs_km_flags_t		alloc_flags)
+	bool			permanent)
 {
 	struct xlog_ticket	*tic;
 	int			unit_res;

-	tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
-	if (!tic)
-		return NULL;
+	tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);

 	unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9ed90368ab31..56c32eecffea 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -37,8 +37,7 @@ xlog_cil_ticket_alloc(
 {
 	struct xlog_ticket *tic;

-	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
-				KM_NOFS);
+	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0);

 	/*
	 * set the current reservation to zero so we know to steal the basic
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 75a62870b63a..1c6fdbf3d506 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -464,9 +464,7 @@ xlog_ticket_alloc(
 	int		unit_bytes,
 	int		count,
 	char		client,
-	bool		permanent,
-	xfs_km_flags_t	alloc_flags);
-
+	bool		permanent);

 static inline void
 xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)

From bae633a4a283afc8ceff5672c1f4f096d0558d70 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino
Date: Wed, 22 Jul 2020 09:23:17 -0700
Subject: [PATCH 093/117] xfs: remove xfs_zone_{alloc,zalloc} helpers

All their users have been converted to use the MM API directly; there is
no need to keep them around anymore.

Reviewed-by: Christoph Hellwig
Reviewed-by: Dave Chinner
Signed-off-by: Carlos Maiolino
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
---
 fs/xfs/kmem.c      | 21 ---------------------
 fs/xfs/kmem.h      |  8 --------
 fs/xfs/xfs_trace.h |  1 -
 3 files changed, 30 deletions(-)

diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index f1366475c389..e841ed781a25 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -115,24 +115,3 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
-
-void *
-kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
-	int	retries = 0;
-	gfp_t	lflags = kmem_flags_convert(flags);
-	void	*ptr;
-
-	trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
-	do {
-		ptr = kmem_cache_alloc(zone, lflags);
-		if (ptr || (flags & KM_MAYFAIL))
-			return ptr;
-		if (!(++retries % 100))
-			xfs_err(NULL,
-		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
-				current->comm, current->pid,
-				__func__, lflags);
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
-	} while (1);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 34cbcfde9228..8e8555817e6d 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -85,14 +85,6 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 #define kmem_zone	kmem_cache
 #define kmem_zone_t	struct kmem_cache

-extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
-
-static inline void *
-kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
-	return kmem_zone_alloc(zone, flags | KM_ZERO);
-}
-
 static inline struct page *
 kmem_to_page(void *addr)
 {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e9b2ce0948b6..abb1d859f226 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3677,7 +3677,6 @@ DEFINE_KMEM_EVENT(kmem_alloc);
 DEFINE_KMEM_EVENT(kmem_alloc_io);
 DEFINE_KMEM_EVENT(kmem_alloc_large);
 DEFINE_KMEM_EVENT(kmem_realloc);
-DEFINE_KMEM_EVENT(kmem_zone_alloc);

 TRACE_EVENT(xfs_check_new_dalign,
 	TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),

From 4491a3dd7192fb0db54117c42f785a8eb524607d Mon Sep 17 00:00:00 2001
From: Carlos Maiolino
Date: Wed, 22 Jul 2020 09:23:18 -0700
Subject: [PATCH 094/117] xfs: Refactor xfs_da_state_alloc() helper

Every call to xfs_da_state_alloc() also requires setting up state->args
and state->mp.

Change xfs_da_state_alloc() to receive an xfs_da_args_t as an argument
and return an xfs_da_state_t with both args and mp already set.
Signed-off-by: Carlos Maiolino
Reviewed-by: Darrick J. Wong
[darrick: reduce struct typedef usage]
Signed-off-by: Darrick J. Wong
Reviewed-by: Christoph Hellwig
Reviewed-by: Dave Chinner
---
 fs/xfs/libxfs/xfs_attr.c      | 19 +++++--------------
 fs/xfs/libxfs/xfs_da_btree.c  | 12 +++++++++---
 fs/xfs/libxfs/xfs_da_btree.h  |  2 +-
 fs/xfs/libxfs/xfs_dir2_node.c | 17 +++++------------
 fs/xfs/scrub/dabtree.c        |  4 +---
 5 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 3b1bd6e112f8..6bb114af4535 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -739,7 +739,6 @@ xfs_attr_node_addname(
 	struct xfs_da_state	*state;
 	struct xfs_da_state_blk	*blk;
 	struct xfs_inode	*dp;
-	struct xfs_mount	*mp;
 	int			retval, error;

 	trace_xfs_attr_node_addname(args);
@@ -748,11 +747,8 @@ xfs_attr_node_addname(
 	 * Fill in bucket of arguments/results/context to carry around.
	 */
 	dp = args->dp;
-	mp = dp->i_mount;
 restart:
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = mp;
+	state = xfs_da_state_alloc(args);

 	/*
	 * Search to see if name already exists, and get back a pointer
	 * to where it should go.
@@ -899,9 +895,8 @@ restart:
 		 * attr, not the "new" one.
		 */
 		args->attr_filter |= XFS_ATTR_INCOMPLETE;
-		state = xfs_da_state_alloc();
-		state->args = args;
-		state->mp = mp;
+		state = xfs_da_state_alloc(args);
+
 		state->inleaf = 0;
 		error = xfs_da3_node_lookup_int(state, &retval);
 		if (error)
@@ -975,9 +970,7 @@ xfs_attr_node_removename(
 	 * Tie a string around our finger to remind us where we are.
	 */
 	dp = args->dp;
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = dp->i_mount;
+	state = xfs_da_state_alloc(args);

 	/*
	 * Search to see if name exists, and get back a pointer to it.
@@ -1207,9 +1200,7 @@ xfs_attr_node_get(xfs_da_args_t *args)

 	trace_xfs_attr_node_get(args);

-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
+	state = xfs_da_state_alloc(args);

 	/*
	 * Search to see if name exists, and get back a pointer to it.
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index a4e1f01daf3d..e46bc03365db 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -78,10 +78,16 @@ kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
  * Allocate a dir-state structure.
  * We don't put them on the stack since they're large.
  */
-xfs_da_state_t *
-xfs_da_state_alloc(void)
+struct xfs_da_state *
+xfs_da_state_alloc(
+	struct xfs_da_args	*args)
 {
-	return kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL);
+	struct xfs_da_state	*state;
+
+	state = kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL);
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	return state;
 }

 /*
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 6e25de6621e4..ad5dd324631a 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -219,7 +219,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
 		const unsigned char *name, int len);

-xfs_da_state_t *xfs_da_state_alloc(void);
+struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
 void xfs_da_state_free(xfs_da_state_t *state);

 void	xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 6ac4aad98cd7..5d51265d29d6 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -2015,9 +2015,7 @@ xfs_dir2_node_addname(
 	/*
	 * Allocate and initialize the state (btree cursor).
	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
+	state = xfs_da_state_alloc(args);
 	/*
	 * Look up the name.  We're not supposed to find it, but
	 * this gives us the insertion point.
@@ -2086,9 +2084,8 @@ xfs_dir2_node_lookup(
 	/*
	 * Allocate and initialize the btree cursor.
	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
+	state = xfs_da_state_alloc(args);
+
 	/*
	 * Fill in the path to the entry in the cursor.
	 */
@@ -2139,9 +2136,7 @@ xfs_dir2_node_removename(
 	/*
	 * Allocate and initialize the btree cursor.
	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
+	state = xfs_da_state_alloc(args);

 	/* Look up the entry we're deleting, set up the cursor. */
 	error = xfs_da3_node_lookup_int(state, &rval);
@@ -2206,9 +2201,7 @@ xfs_dir2_node_replace(
 	/*
	 * Allocate and initialize the btree cursor.
	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
+	state = xfs_da_state_alloc(args);

 	/*
	 * We have to save new inode number and ftype since
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 44b15015021f..e56786f0a13c 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -476,9 +476,7 @@ xchk_da_btree(
 	ds.dargs.whichfork = whichfork;
 	ds.dargs.trans = sc->tp;
 	ds.dargs.op_flags = XFS_DA_OP_OKNOENT;
-	ds.state = xfs_da_state_alloc();
-	ds.state->args = &ds.dargs;
-	ds.state->mp = mp;
+	ds.state = xfs_da_state_alloc(&ds.dargs);
 	ds.sc = sc;
 	ds.private = private;
 	if (whichfork == XFS_ATTR_FORK) {

From 07120f1abdff80f3d1351f733661abe28d609535 Mon Sep 17 00:00:00 2001
From: Allison Collins
Date: Mon, 20 Jul 2020 21:47:22 -0700
Subject: [PATCH 095/117] xfs: Add xfs_has_attr and subroutines

This patch adds new functions to check for the existence of an
attribute.  Subroutines are also added to handle the cases of leaf
blocks, nodes or shortform.  Common code that appears in existing attr
add and remove functions has been factored out to help reduce the
appearance of duplicated code.  We will need these routines later for
delayed attributes since delayed operations cannot return error codes.
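A hedged sketch of how a caller might probe with the new helper before
committing to a delayed operation (illustrative only; not taken from
the patch, and error handling abbreviated):

	/* xfs_has_attr() returns -EEXIST if found, -ENOATTR if not */
	error = xfs_has_attr(args);
	if (error == -EEXIST) {
		/* attribute present: a remove or replace can be queued */
	} else if (error == -ENOATTR) {
		/* attribute absent: a create-style set cannot collide */
	} else {
		return error;	/* genuine failure, e.g. I/O error */
	}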
Signed-off-by: Allison Collins
Reviewed-by: Chandan Rajendra
Reviewed-by: Brian Foster
Reviewed-by: Darrick J. Wong
[darrick: fix a leak-on-error bug reported by Dan Carpenter]
[darrick: fix unused variable warning reported by 0day]
Signed-off-by: Darrick J. Wong
Acked-by: Dave Chinner
Reported-by: dan.carpenter@oracle.com
Reported-by: kernel test robot
---
 fs/xfs/libxfs/xfs_attr.c      | 184 +++++++++++++++++++++++-----------
 fs/xfs/libxfs/xfs_attr.h      |   1 +
 fs/xfs/libxfs/xfs_attr_leaf.c | 100 ++++++++++++------
 fs/xfs/libxfs/xfs_attr_leaf.h |   3 +
 4 files changed, 197 insertions(+), 91 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 6bb114af4535..f7f796ea11fd 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -46,6 +46,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);

 /*
  * Internal routines when attribute list is more than one block.
@@ -53,6 +54,8 @@ STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
 STATIC int xfs_attr_node_get(xfs_da_args_t *args);
 STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
+				 struct xfs_da_state **state);
 STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
 STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
@@ -260,6 +263,37 @@ xfs_attr_set_args(
 	return error;
 }

+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+int
+xfs_has_attr(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_buf		*bp = NULL;
+	int			error;
+
+	if (!xfs_inode_hasattr(dp))
+		return -ENOATTR;
+
+	if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) {
+		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+		return xfs_attr_sf_findname(args, NULL, NULL);
+	}
+
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_hasname(args, &bp);
+
+		if (bp)
+			xfs_trans_brelse(args->trans, bp);
+
+		return error;
+	}
+
+	return xfs_attr_node_hasname(args, NULL);
+}
+
 /*
  * Remove the attribute specified in @args.
  */
@@ -469,26 +503,19 @@ STATIC int
 xfs_attr_leaf_addname(
 	struct xfs_da_args	*args)
 {
-	struct xfs_inode	*dp;
 	struct xfs_buf		*bp;
 	int			retval, error, forkoff;
+	struct xfs_inode	*dp = args->dp;

 	trace_xfs_attr_leaf_addname(args);

-	/*
-	 * Read the (only) block in the attribute list in.
-	 */
-	dp = args->dp;
-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-	if (error)
-		return error;
-
 	/*
	 * Look up the given attribute in the leaf block.  Figure out if
	 * the given flags produce an error or call for an atomic rename.
	 */
-	retval = xfs_attr3_leaf_lookup_int(bp, args);
+	retval = xfs_attr_leaf_hasname(args, &bp);
+	if (retval != -ENOATTR && retval != -EEXIST)
+		return retval;
 	if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
 		goto out_brelse;
 	if (retval == -EEXIST) {
@@ -639,6 +666,27 @@ out_brelse:
 	return retval;
 }

+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+STATIC int
+xfs_attr_leaf_hasname(
+	struct xfs_da_args	*args,
+	struct xfs_buf		**bp)
+{
+	int			error = 0;
+
+	error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
+	if (error)
+		return error;
+
+	error = xfs_attr3_leaf_lookup_int(*bp, args);
+	if (error != -ENOATTR && error != -EEXIST)
+		xfs_trans_brelse(args->trans, *bp);
+
+	return error;
+}
+
 /*
  * Remove a name from the leaf attribute list structure
  *
@@ -659,16 +707,14 @@ xfs_attr_leaf_removename(
 	 * Remove the attribute.
	 */
 	dp = args->dp;
-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-	if (error)
-		return error;

-	error = xfs_attr3_leaf_lookup_int(bp, args);
+	error = xfs_attr_leaf_hasname(args, &bp);
+
 	if (error == -ENOATTR) {
 		xfs_trans_brelse(args->trans, bp);
 		return error;
-	}
+	} else if (error != -EEXIST)
+		return error;

 	xfs_attr3_leaf_remove(bp, args);
@@ -703,21 +749,53 @@ xfs_attr_leaf_get(xfs_da_args_t *args)

 	trace_xfs_attr_leaf_get(args);

-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-	if (error)
-		return error;
+	error = xfs_attr_leaf_hasname(args, &bp);

-	error = xfs_attr3_leaf_lookup_int(bp, args);
-	if (error != -EEXIST)  {
+	if (error == -ENOATTR)  {
 		xfs_trans_brelse(args->trans, bp);
 		return error;
-	}
+	} else if (error != -EEXIST)
+		return error;
+
+
 	error = xfs_attr3_leaf_getvalue(bp, args);
 	xfs_trans_brelse(args->trans, bp);
 	return error;
 }

+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ * statep: If not null is set to point at the found state.  Caller will
+ *         be responsible for freeing the state in this case.
+ */
+STATIC int
+xfs_attr_node_hasname(
+	struct xfs_da_args	*args,
+	struct xfs_da_state	**statep)
+{
+	struct xfs_da_state	*state;
+	int			retval, error;
+
+	state = xfs_da_state_alloc(args);
+	if (statep != NULL)
+		*statep = NULL;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da3_node_lookup_int(state, &retval);
+	if (error) {
+		xfs_da_state_free(state);
+		return error;
+	}
+
+	if (statep != NULL)
+		*statep = state;
+	else
+		xfs_da_state_free(state);
+	return retval;
+}
+
 /*========================================================================
  * External routines when attribute list size > geo->blksize
  *========================================================================*/
@@ -748,15 +826,14 @@ xfs_attr_node_addname(
 	 */
 	dp = args->dp;
 restart:
-	state = xfs_da_state_alloc(args);
-
 	/*
	 * Search to see if name already exists, and get back a pointer
	 * to where it should go.
	 */
-	error = xfs_da3_node_lookup_int(state, &retval);
-	if (error)
+	retval = xfs_attr_node_hasname(args, &state);
+	if (retval != -ENOATTR && retval != -EEXIST)
 		goto out;
+
 	blk = &state->path.blk[ state->path.active-1 ];
 	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
 	if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
@@ -960,27 +1037,15 @@ xfs_attr_node_removename(
 {
 	struct xfs_da_state	*state;
 	struct xfs_da_state_blk	*blk;
-	struct xfs_inode	*dp;
 	struct xfs_buf		*bp;
 	int			retval, error, forkoff;
+	struct xfs_inode	*dp = args->dp;

 	trace_xfs_attr_node_removename(args);

-	/*
-	 * Tie a string around our finger to remind us where we are.
-	 */
-	dp = args->dp;
-	state = xfs_da_state_alloc(args);
-
-	/*
-	 * Search to see if name exists, and get back a pointer to it.
-	 */
-	error = xfs_da3_node_lookup_int(state, &retval);
-	if (error || (retval != -EEXIST)) {
-		if (error == 0)
-			error = retval;
+	error = xfs_attr_node_hasname(args, &state);
+	if (error != -EEXIST)
 		goto out;
-	}

 	/*
	 * If there is an out-of-line value, de-allocate the blocks.
@@ -1075,7 +1140,8 @@ xfs_attr_node_removename(
 	error = 0;
 out:
-	xfs_da_state_free(state);
+	if (state)
+		xfs_da_state_free(state);
 	return error;
 }
@@ -1191,45 +1257,41 @@ xfs_attr_refillstate(xfs_da_state_t *state)
  * Returns 0 on successful retrieval, otherwise an error.
  */
 STATIC int
-xfs_attr_node_get(xfs_da_args_t *args)
+xfs_attr_node_get(
+	struct xfs_da_args	*args)
 {
-	xfs_da_state_t	*state;
-	xfs_da_state_blk_t *blk;
-	int		error, retval;
-	int		i;
+	struct xfs_da_state	*state;
+	struct xfs_da_state_blk	*blk;
+	int			i;
+	int			error;

 	trace_xfs_attr_node_get(args);

-	state = xfs_da_state_alloc(args);
-
 	/*
	 * Search to see if name exists, and get back a pointer to it.
	 */
-	error = xfs_da3_node_lookup_int(state, &retval);
-	if (error) {
-		retval = error;
-		goto out_release;
-	}
-	if (retval != -EEXIST)
+	error = xfs_attr_node_hasname(args, &state);
+	if (error != -EEXIST)
 		goto out_release;

 	/*
	 * Get the value, local or "remote"
	 */
 	blk = &state->path.blk[state->path.active - 1];
-	retval = xfs_attr3_leaf_getvalue(blk->bp, args);
+	error = xfs_attr3_leaf_getvalue(blk->bp, args);

 	/*
	 * If not in a transaction, we have to release all the buffers.
	 */
out_release:
-	for (i = 0; i < state->path.active; i++) {
+	for (i = 0; state != NULL && i < state->path.active; i++) {
 		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
 		state->path.blk[i].bp = NULL;
 	}

-	xfs_da_state_free(state);
-	return retval;
+	if (state)
+		xfs_da_state_free(state);
+	return error;
 }

 /* Returns true if the attribute entry name is valid. */
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index db4717657ca1..3e97a935e712 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -89,6 +89,7 @@ int xfs_attr_get_ilocked(struct xfs_da_args *args);
 int xfs_attr_get(struct xfs_da_args *args);
 int xfs_attr_set(struct xfs_da_args *args);
 int xfs_attr_set_args(struct xfs_da_args *args);
+int xfs_has_attr(struct xfs_da_args *args);
 int xfs_attr_remove_args(struct xfs_da_args *args);
 bool xfs_attr_namecheck(const void *name, size_t length);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2f7e89e4be3e..7ec757bb3b2d 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -659,19 +659,66 @@ xfs_attr_shortform_create(
 	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
 }

+/*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ * args:  args containing attribute name and namelen
+ * sfep:  If not null, pointer will be set to the last attr entry found on
+	  -EEXIST.  On -ENOATTR pointer is left at the last entry in the list
+ * basep: If not null, pointer is set to the byte offset of the entry in the
+ *	  list on -EEXIST.  On -ENOATTR, pointer is left at the byte offset of
+ *	  the last entry in the list
+ */
+int
+xfs_attr_sf_findname(
+	struct xfs_da_args		*args,
+	struct xfs_attr_sf_entry	**sfep,
+	unsigned int			*basep)
+{
+	struct xfs_attr_shortform	*sf;
+	struct xfs_attr_sf_entry	*sfe;
+	unsigned int			base = sizeof(struct xfs_attr_sf_hdr);
+	int				size = 0;
+	int				end;
+	int				i;
+
+	sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	end = sf->hdr.count;
+	for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+			     base += size, i++) {
+		size = XFS_ATTR_SF_ENTSIZE(sfe);
+		if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
+				    sfe->flags))
+			continue;
+		break;
+	}
+
+	if (sfep != NULL)
+		*sfep = sfe;
+
+	if (basep != NULL)
+		*basep = base;
+
+	if (i == end)
+		return -ENOATTR;
+	return -EEXIST;
+}
+
 /*
  * Add a name/value pair to the shortform attribute list.
  * Overflow from the inode has already been checked for.
  */
 void
-xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+xfs_attr_shortform_add(
+	struct xfs_da_args		*args,
+	int				forkoff)
 {
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int i, offset, size;
-	xfs_mount_t *mp;
-	xfs_inode_t *dp;
-	struct xfs_ifork *ifp;
+	struct xfs_attr_shortform	*sf;
+	struct xfs_attr_sf_entry	*sfe;
+	int				offset, size;
+	struct xfs_mount		*mp;
+	struct xfs_inode		*dp;
+	struct xfs_ifork		*ifp;

 	trace_xfs_attr_sf_add(args);

@@ -682,11 +729,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
 	ifp = dp->i_afp;
 	ASSERT(ifp->if_flags & XFS_IFINLINE);
 	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-	sfe = &sf->list[0];
-	for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-		ASSERT(!xfs_attr_match(args, sfe->namelen, sfe->nameval,
-			sfe->flags));
-	}
+	if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
+		ASSERT(0);

 	offset = (char *)sfe - (char *)sf;
 	size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
@@ -728,31 +772,27 @@ xfs_attr_fork_remove(
  * Remove an attribute from the shortform attribute list structure.
  */
 int
-xfs_attr_shortform_remove(xfs_da_args_t *args)
+xfs_attr_shortform_remove(
+	struct xfs_da_args		*args)
 {
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int base, size=0, end, totsize, i;
-	xfs_mount_t *mp;
-	xfs_inode_t *dp;
+	struct xfs_attr_shortform	*sf;
+	struct xfs_attr_sf_entry	*sfe;
+	int				size = 0, end, totsize;
+	unsigned int			base;
+	struct xfs_mount		*mp;
+	struct xfs_inode		*dp;
+	int				error;

 	trace_xfs_attr_sf_remove(args);

 	dp = args->dp;
 	mp = dp->i_mount;
-	base = sizeof(xfs_attr_sf_hdr_t);
 	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
-	sfe = &sf->list[0];
-	end = sf->hdr.count;
-	for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
-					base += size, i++) {
-		size = XFS_ATTR_SF_ENTSIZE(sfe);
-		if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
-			sfe->flags))
-			break;
-	}
-	if (i == end)
-		return -ENOATTR;
+
+	error = xfs_attr_sf_findname(args, &sfe, &base);
+	if (error != -EEXIST)
+		return error;
+	size = XFS_ATTR_SF_ENTSIZE(sfe);

 	/*
	 * Fix up the attribute fork data, covering the hole
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 5be6be309302..9b1c59f40a26 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -52,6 +52,9 @@ int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
 int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
 			struct xfs_buf **leaf_bp);
 int	xfs_attr_shortform_remove(struct xfs_da_args *args);
+int	xfs_attr_sf_findname(struct xfs_da_args *args,
+			     struct xfs_attr_sf_entry **sfep,
+			     unsigned int *basep);
 int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
 int	xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
 xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);

From deed9512872d094ad1eae4abd0ff1c674df251d5 Mon Sep 17 00:00:00 2001
From: Allison Collins
Date: Mon, 20 Jul 2020 21:47:23 -0700
Subject: [PATCH 096/117] xfs: Check for -ENOATTR or -EEXIST

Delayed operations cannot return error codes, so we must check for
these conditions first, before starting set or remove operations.

Signed-off-by: Allison Collins
Reviewed-by: Darrick J. Wong
Reviewed-by: Brian Foster
Signed-off-by: Darrick J.
Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index f7f796ea11fd..1618efda0332 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -404,6 +404,15 @@ xfs_attr_set( args->total, 0, quota_flags); if (error) goto out_trans_cancel; + + error = xfs_has_attr(args); + if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) + goto out_trans_cancel; + if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) + goto out_trans_cancel; + if (error != -ENOATTR && error != -EEXIST) + goto out_trans_cancel; + error = xfs_attr_set_args(args); if (error) goto out_trans_cancel; @@ -411,6 +420,10 @@ xfs_attr_set( if (!args->trans) goto out_unlock; } else { + error = xfs_has_attr(args); + if (error != -EEXIST) + goto out_trans_cancel; + error = xfs_attr_remove_args(args); if (error) goto out_trans_cancel; From 1a485fc1e96533587e2eab0e4d60798035528b7a Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:23 -0700 Subject: [PATCH 097/117] xfs: Factor out new helper functions xfs_attr_rmtval_set Break xfs_attr_rmtval_set into two helper functions xfs_attr_rmt_find_hole and xfs_attr_rmtval_set_value. xfs_attr_rmtval_set rolls the transaction between the helpers, but delayed operations cannot. We will use the helpers later when constructing new delayed attribute routines. Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Rajendra Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_remote.c | 149 ++++++++++++++++++++------------ 1 file changed, 92 insertions(+), 57 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 01ad7f353e08..f825eedc04b8 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -440,32 +440,23 @@ xfs_attr_rmtval_get( } /* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. + * Find a "hole" in the attribute address space large enough for us to drop the + * new attribute's value into */ -int -xfs_attr_rmtval_set( +STATIC int +xfs_attr_rmt_find_hole( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - xfs_fileoff_t lfileoff = 0; - uint8_t *src = args->value; - int blkcnt; - int valuelen; - int nmap; int error; - int offset = 0; - - trace_xfs_attr_rmtval_set(args); + int blkcnt; + xfs_fileoff_t lfileoff = 0; /* - * Find a "hole" in the attribute address space large enough for - * us to drop the new attribute's value into. Because CRC enable - * attributes have headers, we can't just do a straight byte to FSB - * conversion and have to take the header space into account. + * Because CRC enable attributes have headers, we can't just do a + * straight byte to FSB conversion and have to take the header space + * into account. */ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, @@ -473,48 +464,26 @@ xfs_attr_rmtval_set( if (error) return error; - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. 
- * - * Note that we have to consider this a data allocation as we - * write the remote attribute without logging the contents. - * Hence we must ensure that we aren't using blocks that are on - * the busy list so that we don't overwrite blocks which have - * recently been freed but their transactions are not yet - * committed to disk. If we overwrite the contents of a busy - * extent and then crash then the block may not contain the - * correct metadata after log recovery occurs. - */ - nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, - &nmap); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; + return 0; +} - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; - - /* - * Start the next trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; - } +STATIC int +xfs_attr_rmtval_set_value( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + uint8_t *src = args->value; + int blkcnt; + int valuelen; + int nmap; + int error; + int offset = 0; /* * Roll through the "value", copying the attribute value to the @@ -594,6 +563,72 @@ xfs_attr_rmtval_stale( return 0; } +/* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +int +xfs_attr_rmtval_set( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + int blkcnt; + int nmap; + int error; + + trace_xfs_attr_rmtval_set(args); + + error = xfs_attr_rmt_find_hole(args); + if (error) + return error; + + blkcnt = args->rmtblkcnt; + lblkno = (xfs_dablk_t)args->rmtblkno; + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + /* + * Allocate a single extent, up to the size of the value. + * + * Note that we have to consider this a data allocation as we + * write the remote attribute without logging the contents. + * Hence we must ensure that we aren't using blocks that are on + * the busy list so that we don't overwrite blocks which have + * recently been freed but their transactions are not yet + * committed to disk. If we overwrite the contents of a busy + * extent and then crash then the block may not contain the + * correct metadata after log recovery occurs. + */ + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, + &nmap); + if (error) + return error; + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) + return error; + } + + return xfs_attr_rmtval_set_value(args); +} + /* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. 
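The value of the split just shown is that xfs_attr_rmt_find_hole and xfs_attr_rmtval_set_value never roll a transaction themselves; only the caller advances the transaction between stages, which is what the later delayed-attribute work requires. The following minimal, stand-alone C sketch models that shape. Everything in it (struct fake_args, find_hole(), set_value(), rmtval_set()) is an invented user-space stand-in for illustration, not the real XFS API.

#include <stdio.h>

struct fake_args {
	int blk_needed;		/* blocks the value requires */
	int blk_mapped;		/* blocks allocated so far */
	int trans_rolls;	/* transaction rolls performed by the caller */
};

/* Stage one: choose where the value will live; performs no rolls. */
static int find_hole(struct fake_args *args)
{
	return args->blk_needed > 0 ? 0 : -1;
}

/* Stage three: copy the value into the mapped blocks; no rolls either. */
static int set_value(struct fake_args *args)
{
	return args->blk_mapped == args->blk_needed ? 0 : -1;
}

/* Only this caller rolls the transaction, once per allocated extent. */
static int rmtval_set(struct fake_args *args)
{
	int error = find_hole(args);

	if (error)
		return error;
	while (args->blk_mapped < args->blk_needed) {
		args->blk_mapped++;	/* stand-in for one xfs_bmapi_write() */
		args->trans_rolls++;	/* stand-in for xfs_trans_roll_inode() */
	}
	return set_value(args);
}

int main(void)
{
	struct fake_args args = { .blk_needed = 3 };

	printf("error=%d rolls=%d\n", rmtval_set(&args), args.trans_rolls);
	return 0;
}

Because the roll-free helpers own no transaction state, a future delayed operation can call them directly and let its own state machine supply the rolls.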
From e3be1272ddaf5f7482008578b467cf0ca8c35695 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:23 -0700 Subject: [PATCH 098/117] xfs: Pull up trans handling in xfs_attr3_leaf_flipflags Since delayed operations cannot roll transactions, pull up the transaction handling into the calling function Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Rajendra Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 14 ++++++++++++++ fs/xfs/libxfs/xfs_attr_leaf.c | 7 +------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 1618efda0332..7912e23ff901 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -620,6 +620,13 @@ xfs_attr_leaf_addname( * "old" attr and clear the incomplete flag on the "new" attr. */ error = xfs_attr3_leaf_flipflags(args); + if (error) + return error; + /* + * Commit the flag value change and start the next trans in + * series. + */ + error = xfs_trans_roll_inode(&args->trans, args->dp); if (error) return error; @@ -961,6 +968,13 @@ restart: * "old" attr and clear the incomplete flag on the "new" attr. */ error = xfs_attr3_leaf_flipflags(args); + if (error) + goto out; + /* + * Commit the flag value change and start the next trans in + * series + */ + error = xfs_trans_roll_inode(&args->trans, args->dp); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 7ec757bb3b2d..906049db57e4 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2951,10 +2951,5 @@ xfs_attr3_leaf_flipflags( XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - - return error; + return 0; } From 7c93d4a8fc39cf209c6a67ff26fc83646c9f7d61 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:24 -0700 Subject: [PATCH 099/117] xfs: Split apart xfs_attr_leaf_addname Split out new helper function xfs_attr_leaf_try_add from xfs_attr_leaf_addname. Because new delayed attribute routines cannot roll transactions, we split off the parts of xfs_attr_leaf_addname that we can use, and move the commit into the calling function. Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Chandan Rajendra Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 99 +++++++++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 7912e23ff901..2fbd3cbf8849 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -256,10 +256,31 @@ xfs_attr_set_args( } } - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { error = xfs_attr_leaf_addname(args); - else - error = xfs_attr_node_addname(args); + if (error != -ENOSPC) + return error; + + /* + * Finish any deferred work items and roll the transaction once + * more. The goal here is to call node_addname with the inode + * and transaction in the same state (inode locked and joined, + * transaction clean) no matter how we got to this step. + */ + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + /* + * Commit the current trans (including the inode) and + * start a new one. 
+ */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) + return error; + } + + error = xfs_attr_node_addname(args); return error; } @@ -507,20 +528,21 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) *========================================================================*/ /* - * Add a name to the leaf attribute list structure + * Tries to add an attribute to an inode in leaf form * - * This leaf block cannot have a "remote" value, we only call this routine - * if bmap_one_block() says there is only one block (ie: no remote blks). + * This function is meant to execute as part of a delayed operation and leaves + * the transaction handling to the caller. On success the attribute is added + * and the inode and transaction are left dirty. If there is not enough space, + * the attr data is converted to node format and -ENOSPC is returned. Caller is + * responsible for handling the dirty inode and transaction or adding the attr + * in node format. */ STATIC int -xfs_attr_leaf_addname( - struct xfs_da_args *args) +xfs_attr_leaf_try_add( + struct xfs_da_args *args, + struct xfs_buf *bp) { - struct xfs_buf *bp; - int retval, error, forkoff; - struct xfs_inode *dp = args->dp; - - trace_xfs_attr_leaf_addname(args); + int retval, error; /* * Look up the given attribute in the leaf block. Figure out if @@ -562,31 +584,39 @@ xfs_attr_leaf_addname( retval = xfs_attr3_leaf_add(bp, args); if (retval == -ENOSPC) { /* - * Promote the attribute list to the Btree format, then - * Commit that transaction so that the node_addname() call - * can manage its own transactions. + * Promote the attribute list to the Btree format. Unless an + * error occurs, retain the -ENOSPC retval */ error = xfs_attr3_leaf_to_node(args); if (error) return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - - /* - * Commit the current trans (including the inode) and start - * a new one. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; - - /* - * Fob the whole rest of the problem off on the Btree code. - */ - error = xfs_attr_node_addname(args); - return error; } + return retval; +out_brelse: + xfs_trans_brelse(args->trans, bp); + return retval; +} + + +/* + * Add a name to the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_addname( + struct xfs_da_args *args) +{ + int error, forkoff; + struct xfs_buf *bp = NULL; + struct xfs_inode *dp = args->dp; + + trace_xfs_attr_leaf_addname(args); + + error = xfs_attr_leaf_try_add(args, bp); + if (error) + return error; /* * Commit the transaction that added the attr name so that @@ -681,9 +711,6 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_clearflag(args); } return error; -out_brelse: - xfs_trans_brelse(args->trans, bp); - return retval; } /* From 6cc5b5f89840cfe85cbd14e20500f25353a7f241 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:24 -0700 Subject: [PATCH 100/117] xfs: Refactor xfs_attr_try_sf_addname To help pre-simplify xfs_attr_set_args, we need to hoist transaction handling up, while modularizing the adjacent code down into helpers. In this patch, hoist the commit in xfs_attr_try_sf_addname up into the calling function, and also pull the attr list creation down. Signed-off-by: Allison Collins Reviewed-by: Darrick J. 
Wong Reviewed-by: Amir Goldstein Reviewed-by: Brian Foster Reviewed-by: Chandan Rajendra Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2fbd3cbf8849..9e57409edc55 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -178,8 +178,13 @@ xfs_attr_try_sf_addname( struct xfs_da_args *args) { - struct xfs_mount *mp = dp->i_mount; - int error, error2; + int error; + + /* + * Build initial attribute list (if required). + */ + if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS) + xfs_attr_shortform_create(args); error = xfs_attr_shortform_addname(args); if (error == -ENOSPC) @@ -192,12 +197,10 @@ xfs_attr_try_sf_addname( if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (dp->i_mount->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args->trans); - error2 = xfs_trans_commit(args->trans); - args->trans = NULL; - return error ? error : error2; + return error; } /* @@ -209,7 +212,7 @@ xfs_attr_set_args( { struct xfs_inode *dp = args->dp; struct xfs_buf *leaf_bp = NULL; - int error; + int error, error2 = 0; /* * If the attribute list is non-existent or a shortform list, @@ -219,18 +222,15 @@ xfs_attr_set_args( (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && dp->i_afp->if_nextents == 0)) { - /* - * Build initial attribute list (if required). - */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS) - xfs_attr_shortform_create(args); - /* * Try to add the attr to the attribute list in the inode. */ error = xfs_attr_try_sf_addname(dp, args); - if (error != -ENOSPC) - return error; + if (error != -ENOSPC) { + error2 = xfs_trans_commit(args->trans); + args->trans = NULL; + return error ? error : error2; + } /* * It won't fit in the shortform, transform to a leaf block. From 0949d317aee051fcb7ad9c8c7ec5d60b5cc412eb Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:25 -0700 Subject: [PATCH 101/117] xfs: Pull up trans roll from xfs_attr3_leaf_setflag New delayed allocation routines cannot be handling transactions so pull them up into the calling functions Signed-off-by: Allison Collins Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Chandan Rajendra Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 5 +++++ fs/xfs/libxfs/xfs_attr_leaf.c | 5 +---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 9e57409edc55..d3ab32d0f6ce 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1126,6 +1126,11 @@ xfs_attr_node_removename( error = xfs_attr3_leaf_setflag(args); if (error) goto out; + + error = xfs_trans_roll_inode(&args->trans, args->dp); + if (error) + goto out; + error = xfs_attr_rmtval_remove(args); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 906049db57e4..7c72ff53f4bf 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2833,10 +2833,7 @@ xfs_attr3_leaf_setflag( XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. 
- */ - return xfs_trans_roll_inode(&args->trans, args->dp); + return 0; } /* From 795141099abc2ef4c801ca847190757880371390 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:25 -0700 Subject: [PATCH 102/117] xfs: Factor out xfs_attr_rmtval_invalidate Because new delayed attribute routines cannot roll transactions, we carve off the parts of xfs_attr_rmtval_remove that we can use. This will help to reduce repetitive code later when we introduce delayed attributes. Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Rajendra Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_remote.c | 26 +++++++++++++++++++++----- fs/xfs/libxfs/xfs_attr_remote.h | 2 +- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index f825eedc04b8..4d519691daaf 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -634,15 +634,12 @@ xfs_attr_rmtval_set( * out-of-line buffer that it is stored on. */ int -xfs_attr_rmtval_remove( +xfs_attr_rmtval_invalidate( struct xfs_da_args *args) { xfs_dablk_t lblkno; int blkcnt; int error; - int done; - - trace_xfs_attr_rmtval_remove(args); /* * Roll through the "value", invalidating the attribute value's blocks. @@ -670,13 +667,32 @@ xfs_attr_rmtval_remove( lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; } + return 0; +} +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +int +xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + xfs_dablk_t lblkno; + int blkcnt; + int error = 0; + int done = 0; + + trace_xfs_attr_rmtval_remove(args); + + error = xfs_attr_rmtval_invalidate(args); + if (error) + return error; /* * Keep de-allocating extents until the remote-value region is gone. */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; - done = 0; while (!done) { error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, &done); diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index e1144f22b005..3616e888f794 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -13,5 +13,5 @@ int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); - +int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); #endif /* __XFS_ATTR_REMOTE_H__ */ From 1fc618d76266750230f19e4053f5422a58b84141 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:26 -0700 Subject: [PATCH 103/117] xfs: Pull up trans roll in xfs_attr3_leaf_clearflag New delayed allocation routines cannot be handling transactions so pull them out into the calling functions Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Rajendra Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 16 ++++++++++++++++ fs/xfs/libxfs/xfs_attr_leaf.c | 5 +---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index d3ab32d0f6ce..2d84ab40b70b 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -709,6 +709,14 @@ xfs_attr_leaf_addname( * Added a "remote" value, just clear the incomplete flag. 
*/ error = xfs_attr3_leaf_clearflag(args); + if (error) + return error; + + /* + * Commit the flag value change and start the next trans in + * series. + */ + error = xfs_trans_roll_inode(&args->trans, args->dp); } return error; } @@ -1067,6 +1075,14 @@ restart: error = xfs_attr3_leaf_clearflag(args); if (error) goto out; + + /* + * Commit the flag value change and start the next trans in + * series. + */ + error = xfs_trans_roll_inode(&args->trans, args->dp); + if (error) + goto out; } retval = error = 0; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 7c72ff53f4bf..8623c815164a 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2782,10 +2782,7 @@ xfs_attr3_leaf_clearflag( XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - /* - * Commit the flag value change and start the next trans in series. - */ - return xfs_trans_roll_inode(&args->trans, args->dp); + return 0; } /* From 8b8e0cc0208befc5971d552a8798c8f5537afa45 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:26 -0700 Subject: [PATCH 104/117] xfs: Refactor xfs_attr_rmtval_remove Refactor xfs_attr_rmtval_remove to add helper function __xfs_attr_rmtval_remove. We will use this later when we introduce delayed attributes. This function will eventually replace xfs_attr_rmtval_remove Signed-off-by: Allison Collins Reviewed-by: Chandan Rajendra Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_remote.c | 52 +++++++++++++++++++++++---------- fs/xfs/libxfs/xfs_attr_remote.h | 1 + 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 4d519691daaf..3e7f6e075860 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -678,10 +678,8 @@ int xfs_attr_rmtval_remove( struct xfs_da_args *args) { - xfs_dablk_t lblkno; - int blkcnt; - int error = 0; - int done = 0; + int error; + int retval; trace_xfs_attr_rmtval_remove(args); @@ -691,16 +689,10 @@ xfs_attr_rmtval_remove( /* * Keep de-allocating extents until the remote-value region is gone. */ - lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; - while (!done) { - error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK, 1, &done); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; + do { + retval = __xfs_attr_rmtval_remove(args); + if (retval && retval != -EAGAIN) + return retval; /* * Close out trans and start the next one in the chain. @@ -708,6 +700,36 @@ xfs_attr_rmtval_remove( error = xfs_trans_roll_inode(&args->trans, args->dp); if (error) return error; - } + } while (retval == -EAGAIN); + return 0; } + +/* + * Remove the value associated with an attribute by deleting the out-of-line + * buffer that it is stored on. Returns EAGAIN for the caller to refresh the + * transaction and re-call the function + */ +int +__xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + int error, done; + + /* + * Unmap value blocks for this attr. 
 */ + error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno, + args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done); + if (error) + return error; + + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + if (!done) + return -EAGAIN; + + return error; +} diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 3616e888f794..9eee615da156 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -14,4 +14,5 @@ int xfs_attr_rmtval_remove(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); +int __xfs_attr_rmtval_remove(struct xfs_da_args *args); #endif /* __XFS_ATTR_REMOTE_H__ */ From d4034c4662af5d40de0655c641ddc6eccde0e8fc Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:26 -0700 Subject: [PATCH 105/117] xfs: Pull up xfs_attr_rmtval_invalidate This patch pulls xfs_attr_rmtval_invalidate out of xfs_attr_rmtval_remove and into the calling functions. Eventually __xfs_attr_rmtval_remove will replace xfs_attr_rmtval_remove when we introduce delayed attributes. These functions are expected to return -EAGAIN when they need a new transaction. Because the invalidate does not need a new transaction, we need to separate it from the rest of the function that does. This will enable __xfs_attr_rmtval_remove to smoothly replace xfs_attr_rmtval_remove later. Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 12 ++++++++++++ fs/xfs/libxfs/xfs_attr_remote.c | 3 --- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2d84ab40b70b..2379d5b1a1b9 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -670,6 +670,10 @@ xfs_attr_leaf_addname( args->rmtblkcnt = args->rmtblkcnt2; args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { + error = xfs_attr_rmtval_invalidate(args); + if (error) + return error; + error = xfs_attr_rmtval_remove(args); if (error) return error; @@ -1023,6 +1027,10 @@ restart: args->rmtblkcnt = args->rmtblkcnt2; args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { + error = xfs_attr_rmtval_invalidate(args); + if (error) + return error; + error = xfs_attr_rmtval_remove(args); if (error) return error; @@ -1147,6 +1155,10 @@ xfs_attr_node_removename( if (error) goto out; + error = xfs_attr_rmtval_invalidate(args); + if (error) + return error; + error = xfs_attr_rmtval_remove(args); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 3e7f6e075860..3f80cede7406 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -683,9 +683,6 @@ xfs_attr_rmtval_remove( trace_xfs_attr_rmtval_remove(args); - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; /* * Keep de-allocating extents until the remote-value region is gone. */ From 3f6e011ee2bfcf5b70c5f8978673cd0e5bc64a33 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:27 -0700 Subject: [PATCH 106/117] xfs: Add helper function xfs_attr_node_shrink This patch adds a new helper function xfs_attr_node_shrink used to shrink an attr name into an inode if it is small enough. This helps to modularize the greater calling function xfs_attr_node_removename.
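As a rough sketch of the decision the new helper encapsulates: if the remaining leaf contents fit back inside the inode, convert to shortform; otherwise release the buffer and leave the leaf alone. The stand-alone C fragment below models only that shape. Every name in it (struct fake_buf, allfit(), node_shrink()) is an invented stand-in for illustration, not the XFS API.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-in for a leaf buffer; not the real xfs_buf. */
struct fake_buf {
	int nbytes;		/* bytes the leaf contents would occupy */
};

/* Stand-in for xfs_attr_shortform_allfit(): nonzero fork offset if the
 * leaf contents fit back inside the inode, 0 if they do not. */
static int allfit(const struct fake_buf *bp, int inode_room)
{
	return bp->nbytes <= inode_room ? inode_room - bp->nbytes : 0;
}

/* Shape of the helper: convert to shortform when it fits, otherwise
 * just release the buffer and keep the leaf as-is. */
static int node_shrink(struct fake_buf *bp, int inode_room, bool *shrunk)
{
	int forkoff = allfit(bp, inode_room);

	*shrunk = (forkoff != 0);
	/* real code: xfs_attr3_leaf_to_shortform() or xfs_trans_brelse() */
	return 0;
}

int main(void)
{
	struct fake_buf small = { .nbytes = 80 };
	struct fake_buf large = { .nbytes = 4096 };
	bool s1, s2;

	node_shrink(&small, 120, &s1);
	node_shrink(&large, 120, &s2);
	printf("small shrunk=%d, large shrunk=%d\n", s1, s2);
	return 0;
}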
Signed-off-by: Allison Collins Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 68 +++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2379d5b1a1b9..b576f46d1d75 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1102,6 +1102,45 @@ out: return retval; } +/* + * Shrink an attribute from leaf to shortform + */ +STATIC int +xfs_attr_node_shrink( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + struct xfs_inode *dp = args->dp; + int error, forkoff; + struct xfs_buf *bp; + + /* + * Have to get rid of the copy of this dabuf in the state. + */ + ASSERT(state->path.active == 1); + ASSERT(state->path.blk[0].bp); + state->path.blk[0].bp = NULL; + + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; + + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) { + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (error) + return error; + + error = xfs_defer_finish(&args->trans); + if (error) + return error; + } else + xfs_trans_brelse(args->trans, bp); + + return 0; +} + /* * Remove a name from a B-tree attribute list. * @@ -1115,8 +1154,7 @@ xfs_attr_node_removename( { struct xfs_da_state *state; struct xfs_da_state_blk *blk; - struct xfs_buf *bp; - int retval, error, forkoff; + int retval, error; struct xfs_inode *dp = args->dp; trace_xfs_attr_node_removename(args); @@ -1201,30 +1239,8 @@ xfs_attr_node_removename( /* * If the result is small enough, push it all into the inode. */ - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - /* - * Have to get rid of the copy of this dabuf in the state. - */ - ASSERT(state->path.active == 1); - ASSERT(state->path.blk[0].bp); - state->path.blk[0].bp = NULL; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - goto out; - - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - if (error) - goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; - } else - xfs_trans_brelse(args->trans, bp); - } - error = 0; + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + error = xfs_attr_node_shrink(args, state); out: if (state) From 0feaef17db9ddf0d97676eaa258dc55ff504cfb6 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:27 -0700 Subject: [PATCH 107/117] xfs: Remove unneeded xfs_trans_roll_inode calls Some calls to xfs_trans_roll_inode and xfs_defer_finish routines are not needed. If they are the last operations executed in these functions, and no further changes are made, then higher level routines will roll or commit the transactions. Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 61 +++++----------------------------------- 1 file changed, 7 insertions(+), 54 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index b576f46d1d75..2daf3084f3e3 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -693,34 +693,15 @@ xfs_attr_leaf_addname( /* * If the result is small enough, shrink it all into the inode. 
*/ - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - } - - /* - * Commit the remove and start the next trans in series. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - } else if (args->rmtblkno > 0) { /* * Added a "remote" value, just clear the incomplete flag. */ error = xfs_attr3_leaf_clearflag(args); - if (error) - return error; - - /* - * Commit the flag value change and start the next trans in - * series. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); } return error; } @@ -780,15 +761,11 @@ xfs_attr_leaf_removename( /* * If the result is small enough, shrink it all into the inode. */ - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + return xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - } + return 0; } @@ -1064,18 +1041,8 @@ restart: error = xfs_da3_join(state); if (error) goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; } - /* - * Commit and start the next trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - } else if (args->rmtblkno > 0) { /* * Added a "remote" value, just clear the incomplete flag. @@ -1083,14 +1050,6 @@ restart: error = xfs_attr3_leaf_clearflag(args); if (error) goto out; - - /* - * Commit the flag value change and start the next trans in - * series. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - goto out; } retval = error = 0; @@ -1129,16 +1088,10 @@ xfs_attr_node_shrink( if (forkoff) { error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (error) - return error; - - error = xfs_defer_finish(&args->trans); - if (error) - return error; } else xfs_trans_brelse(args->trans, bp); - return 0; + return error; } /* From a237f2ddae4b79797fcb4290116acda38323da16 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:28 -0700 Subject: [PATCH 108/117] xfs: Remove xfs_trans_roll in xfs_attr_node_removename A transaction roll is not necessary immediately after setting the INCOMPLETE flag when removing a node xattr entry with remote value blocks. The remote block invalidation that immediately follows setting the flag is an in-core only change. The next step after that is to start unmapping the remote blocks from the attr fork, but the xattr remove transaction reservation includes reservation for full tree splits of the dabtree and bmap tree. The remote block unmap code will roll the transaction as extents are unmapped and freed. Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2daf3084f3e3..780af0ef6bc6 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1142,10 +1142,6 @@ xfs_attr_node_removename( if (error) goto out; - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - goto out; - error = xfs_attr_rmtval_invalidate(args); if (error) return error; From db1a28cc591c1abe5470bedca1a489bb165c4c7c Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:28 -0700 Subject: [PATCH 109/117] xfs: Add helpers xfs_attr_is_shortform and xfs_attr_set_shortform In this patch, we hoist code from xfs_attr_set_args into two new helpers xfs_attr_is_shortform and xfs_attr_set_shortform. These two will help to simplify xfs_attr_set_args when we get into delayed attrs later. Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Chandan Rajendra Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 107 ++++++++++++++++++++++++++------------- 1 file changed, 72 insertions(+), 35 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 780af0ef6bc6..93a1c55cceb2 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -203,6 +203,66 @@ xfs_attr_try_sf_addname( return error; } +/* + * Check to see if the attr should be upgraded from non-existent or shortform to + * single-leaf-block attribute list. + */ +static inline bool +xfs_attr_is_shortform( + struct xfs_inode *ip) +{ + return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || + (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0); +} + +/* + * Attempts to set an attr in shortform, or converts short form to leaf form if + * there is not enough room. If the attr is set, the transaction is committed + * and set to NULL. + */ +STATIC int +xfs_attr_set_shortform( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) +{ + struct xfs_inode *dp = args->dp; + int error, error2 = 0; + + /* + * Try to add the attr to the attribute list in the inode. + */ + error = xfs_attr_try_sf_addname(dp, args); + if (error != -ENOSPC) { + error2 = xfs_trans_commit(args->trans); + args->trans = NULL; + return error ? error : error2; + } + /* + * It won't fit in the shortform, transform to a leaf block. GROT: + * another possible req'mt for a double-split btree op. + */ + error = xfs_attr_shortform_to_leaf(args, leaf_bp); + if (error) + return error; + + /* + * Prevent the leaf buffer from being unlocked so that a concurrent AIL + * push cannot grab the half-baked leaf buffer and run into problems + * with the write verifier. Once we're done rolling the transaction we + * can release the hold and add the attr to the leaf. + */ + xfs_trans_bhold(args->trans, *leaf_bp); + error = xfs_defer_finish(&args->trans); + xfs_trans_bhold_release(args->trans, *leaf_bp); + if (error) { + xfs_trans_brelse(args->trans, *leaf_bp); + return error; + } + + return 0; +} + /* * Set the attribute specified in @args. */ @@ -212,48 +272,25 @@ xfs_attr_set_args( { struct xfs_inode *dp = args->dp; struct xfs_buf *leaf_bp = NULL; - int error, error2 = 0; + int error = 0; /* - * If the attribute list is non-existent or a shortform list, - * upgrade it to a single-leaf-block attribute list. + * If the attribute list is already in leaf format, jump straight to + * leaf handling. 
Otherwise, try to add the attribute to the shortform + * list; if there's no room then convert the list to leaf format and try + * again. */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL || - (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - dp->i_afp->if_nextents == 0)) { + if (xfs_attr_is_shortform(dp)) { /* - * Try to add the attr to the attribute list in the inode. + * If the attr was successfully set in shortform, the + * transaction is committed and set to NULL. Otherwise, it is + * converted from shortform to leaf, and the transaction is + * retained. */ - error = xfs_attr_try_sf_addname(dp, args); - if (error != -ENOSPC) { - error2 = xfs_trans_commit(args->trans); - args->trans = NULL; - return error ? error : error2; - } - - /* - * It won't fit in the shortform, transform to a leaf block. - * GROT: another possible req'mt for a double-split btree op. - */ - error = xfs_attr_shortform_to_leaf(args, &leaf_bp); - if (error) + error = xfs_attr_set_shortform(args, &leaf_bp); + if (error || !args->trans) return error; - - /* - * Prevent the leaf buffer from being unlocked so that a - * concurrent AIL push cannot grab the half-baked leaf - * buffer and run into problems with the write verifier. - * Once we're done rolling the transaction we can release - * the hold and add the attr to the leaf. - */ - xfs_trans_bhold(args->trans, leaf_bp); - error = xfs_defer_finish(&args->trans); - xfs_trans_bhold_release(args->trans, leaf_bp); - if (error) { - xfs_trans_brelse(args->trans, leaf_bp); - return error; - } } if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { From f44df68c82dc060b9b9942e204096447e1efc677 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:28 -0700 Subject: [PATCH 110/117] xfs: Add helper function xfs_attr_leaf_mark_incomplete This patch helps to simplify xfs_attr_node_removename by modularizing the code around the transactions into helper functions. This will make the function easier to follow when we introduce delayed attributes.
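The ordering inside the helper matters: the dabtree path must be recorded before the INCOMPLETE flag is set, so the flagged entry can still be found after later transaction commits release the buffers. A minimal user-space sketch of that two-step contract follows; struct fake_state, struct fake_entry, fillstate() and mark_incomplete() are invented stand-ins for illustration, not the XFS API.

#include <stdio.h>

#define INCOMPLETE_FLAG 0x1u

/* Hypothetical stand-ins; not the real XFS state tracking. */
struct fake_state {
	int blocks_recorded;	/* disk blocks noted for later re-reads */
};

struct fake_entry {
	unsigned int flags;
};

/* Stand-in for xfs_attr_fillstate(): record disk block numbers so the
 * dabtree path can be rebuilt after intervening transaction commits. */
static int fillstate(struct fake_state *state)
{
	state->blocks_recorded = 1;
	return 0;
}

/* Shape of the helper: record the path first, then flag the entry, so
 * the flagged entry can always be found again to finish the remove. */
static int mark_incomplete(struct fake_entry *entry, struct fake_state *state)
{
	int error = fillstate(state);

	if (error)
		return error;
	entry->flags |= INCOMPLETE_FLAG;  /* xfs_attr3_leaf_setflag() stand-in */
	return 0;
}

int main(void)
{
	struct fake_state state = { 0 };
	struct fake_entry entry = { 0 };

	printf("error=%d flags=%#x recorded=%d\n",
	       mark_incomplete(&entry, &state),
	       entry.flags, state.blocks_recorded);
	return 0;
}

Reversing the two steps would risk leaving an INCOMPLETE entry that a later transaction can no longer locate, which is why the helper bundles them in this order.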
- */ - error = xfs_attr_fillstate(state); - if (error) - goto out; - - /* - * Mark the attribute as INCOMPLETE, then bunmapi() the - * remote value. - */ - error = xfs_attr3_leaf_setflag(args); + error = xfs_attr_leaf_mark_incomplete(args, state); if (error) goto out; From 410c19885db5f7c4fca55b437e199e18252142b6 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:29 -0700 Subject: [PATCH 111/117] xfs: Add remote block helper functions This patch adds two new helper functions xfs_attr_store_rmt_blk and xfs_attr_restore_rmt_blk. These two helpers assist to remove redundant code associated with storing and retrieving remote blocks during the attr set operations. Signed-off-by: Allison Collins Reviewed-by: Chandan Rajendra Reviewed-by: Amir Goldstein Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 50 ++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 1420f6ebd496..37fda9f90b7f 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -564,6 +564,30 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * External routines when attribute list is one block *========================================================================*/ +/* Store info about a remote block */ +STATIC void +xfs_attr_save_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno2 = args->blkno; + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; +} + +/* Set stored info about a remote block */ +STATIC void +xfs_attr_restore_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno = args->blkno2; + args->index = args->index2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; +} + /* * Tries to add an attribute to an inode in leaf form * @@ -598,11 +622,7 @@ xfs_attr_leaf_try_add( /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ - args->blkno2 = args->blkno; /* set 2nd entry info*/ - args->index2 = args->index; - args->rmtblkno2 = args->rmtblkno; - args->rmtblkcnt2 = args->rmtblkcnt; - args->rmtvaluelen2 = args->rmtvaluelen; + xfs_attr_save_rmt_blk(args); /* * clear the remote attr state now that it is saved so that the @@ -701,11 +721,8 @@ xfs_attr_leaf_addname( * Dismantle the "old" attribute/value pair by removing * a "remote" value (if it exists). */ - args->index = args->index2; - args->blkno = args->blkno2; - args->rmtblkno = args->rmtblkno2; - args->rmtblkcnt = args->rmtblkcnt2; - args->rmtvaluelen = args->rmtvaluelen2; + xfs_attr_restore_rmt_blk(args); + if (args->rmtblkno) { error = xfs_attr_rmtval_invalidate(args); if (error) @@ -919,11 +936,7 @@ restart: /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ - args->blkno2 = args->blkno; /* set 2nd entry info*/ - args->index2 = args->index; - args->rmtblkno2 = args->rmtblkno; - args->rmtblkcnt2 = args->rmtblkcnt; - args->rmtvaluelen2 = args->rmtvaluelen; + xfs_attr_save_rmt_blk(args); /* * clear the remote attr state now that it is saved so that the @@ -1035,11 +1048,8 @@ restart: * Dismantle the "old" attribute/value pair by removing * a "remote" value (if it exists). 
*/ - args->index = args->index2; - args->blkno = args->blkno2; - args->rmtblkno = args->rmtblkno2; - args->rmtblkcnt = args->rmtblkcnt2; - args->rmtvaluelen = args->rmtvaluelen2; + xfs_attr_restore_rmt_blk(args); + if (args->rmtblkno) { error = xfs_attr_rmtval_invalidate(args); if (error) From 674eb548cf0ced1487ee229f96af2c7cf0099d2a Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:30 -0700 Subject: [PATCH 112/117] xfs: Add helper function xfs_attr_node_removename_setup This patch adds a new helper function xfs_attr_node_removename_setup. This will help modularize xfs_attr_node_removename when we add delay ready attributes later. Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Chandan Rajendra [darrick: fix unused variable complaints by 0day robot] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner Reported-by: kernel test robot --- fs/xfs/libxfs/xfs_attr.c | 46 ++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 37fda9f90b7f..bf91da55ef47 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1167,6 +1167,37 @@ xfs_attr_leaf_mark_incomplete( return xfs_attr3_leaf_setflag(args); } +/* + * Initial setup for xfs_attr_node_removename. Make sure the attr is there and + * the blocks are valid. Attr keys with remote blocks will be marked + * incomplete. + */ +STATIC +int xfs_attr_node_removename_setup( + struct xfs_da_args *args, + struct xfs_da_state **state) +{ + int error; + + error = xfs_attr_node_hasname(args, state); + if (error != -EEXIST) + return error; + + ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); + ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == + XFS_ATTR_LEAF_MAGIC); + + if (args->rmtblkno > 0) { + error = xfs_attr_leaf_mark_incomplete(args, *state); + if (error) + return error; + + return xfs_attr_rmtval_invalidate(args); + } + + return 0; +} + /* * Remove a name from a B-tree attribute list. * @@ -1185,8 +1216,8 @@ xfs_attr_node_removename( trace_xfs_attr_node_removename(args); - error = xfs_attr_node_hasname(args, &state); - if (error != -EEXIST) + error = xfs_attr_node_removename_setup(args, &state); + if (error) goto out; /* @@ -1194,18 +1225,7 @@ xfs_attr_node_removename( * This is done before we remove the attribute so that we don't * overflow the maximum size of a transaction and/or hit a deadlock. */ - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->bp != NULL); - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); if (args->rmtblkno > 0) { - error = xfs_attr_leaf_mark_incomplete(args, state); - if (error) - goto out; - - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - error = xfs_attr_rmtval_remove(args); if (error) goto out; From 72b97ea40d23b333175323251d6afecb565a0b53 Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:30 -0700 Subject: [PATCH 113/117] xfs: Add helper function xfs_attr_node_removename_rmt This patch adds another new helper function xfs_attr_node_removename_rmt. This will also help modularize xfs_attr_node_removename when we add delay ready attributes later. Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Chandan Rajendra Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index bf91da55ef47..82f7cc796c26 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1198,6 +1198,24 @@ int xfs_attr_node_removename_setup( return 0; } +STATIC int +xfs_attr_node_remove_rmt( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error = 0; + + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + + /* + * Refill the state structure with buffers, the prior calls released our + * buffers. + */ + return xfs_attr_refillstate(state); +} + /* * Remove a name from a B-tree attribute list. * @@ -1226,15 +1244,7 @@ xfs_attr_node_removename( * overflow the maximum size of a transaction and/or hit a deadlock. */ if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_remove(args); - if (error) - goto out; - - /* - * Refill the state structure with buffers, the prior calls - * released our buffers. - */ - error = xfs_attr_refillstate(state); + error = xfs_attr_node_remove_rmt(args, state); if (error) goto out; } From 5fdca0ad5c95464a60c5954f853c22127f8bb64f Mon Sep 17 00:00:00 2001 From: Allison Collins Date: Mon, 20 Jul 2020 21:47:31 -0700 Subject: [PATCH 114/117] xfs: Simplify xfs_attr_leaf_addname Invert the rename logic in xfs_attr_leaf_addname to simplify the delayed attr logic later. Signed-off-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Acked-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 119 ++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 58 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 82f7cc796c26..44288e4baea7 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -695,68 +695,71 @@ xfs_attr_leaf_addname( return error; } - /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - */ - if (args->op_flags & XFS_DA_OP_RENAME) { - /* - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. - */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - return error; - /* - * Commit the flag value change and start the next trans in - * series. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - return error; - - /* - * Dismantle the "old" attribute/value pair by removing - * a "remote" value (if it exists). - */ - xfs_attr_restore_rmt_blk(args); - - if (args->rmtblkno) { - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } - - /* - * Read in the block containing the "old" attr, then - * remove the "old" attr from that block (neat, huh!) - */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); - if (error) - return error; - - xfs_attr3_leaf_remove(bp, args); - - /* - * If the result is small enough, shrink it all into the inode. 
From 5fdca0ad5c95464a60c5954f853c22127f8bb64f Mon Sep 17 00:00:00 2001
From: Allison Collins
Date: Mon, 20 Jul 2020 21:47:31 -0700
Subject: [PATCH 114/117] xfs: Simplify xfs_attr_leaf_addname

Invert the rename logic in xfs_attr_leaf_addname to simplify the
delayed attr logic later.

Signed-off-by: Allison Collins
Reviewed-by: Brian Foster
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
Acked-by: Dave Chinner
---
 fs/xfs/libxfs/xfs_attr.c | 119 ++++++++++++++++++++------------------
 1 file changed, 61 insertions(+), 58 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 82f7cc796c26..44288e4baea7 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -695,68 +695,71 @@ xfs_attr_leaf_addname(
 		return error;
 	}
 
-	/*
-	 * If this is an atomic rename operation, we must "flip" the
-	 * incomplete flags on the "new" and "old" attribute/value pairs
-	 * so that one disappears and one appears atomically. Then we
-	 * must remove the "old" attribute/value pair.
-	 */
-	if (args->op_flags & XFS_DA_OP_RENAME) {
-		/*
-		 * In a separate transaction, set the incomplete flag on the
-		 * "old" attr and clear the incomplete flag on the "new" attr.
-		 */
-		error = xfs_attr3_leaf_flipflags(args);
-		if (error)
-			return error;
-		/*
-		 * Commit the flag value change and start the next trans in
-		 * series.
-		 */
-		error = xfs_trans_roll_inode(&args->trans, args->dp);
-		if (error)
-			return error;
-
-		/*
-		 * Dismantle the "old" attribute/value pair by removing
-		 * a "remote" value (if it exists).
-		 */
-		xfs_attr_restore_rmt_blk(args);
-
-		if (args->rmtblkno) {
-			error = xfs_attr_rmtval_invalidate(args);
-			if (error)
-				return error;
-
-			error = xfs_attr_rmtval_remove(args);
-			if (error)
-				return error;
-		}
-
-		/*
-		 * Read in the block containing the "old" attr, then
-		 * remove the "old" attr from that block (neat, huh!)
-		 */
-		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
-					   &bp);
-		if (error)
-			return error;
-
-		xfs_attr3_leaf_remove(bp, args);
-
-		/*
-		 * If the result is small enough, shrink it all into the inode.
-		 */
-		forkoff = xfs_attr_shortform_allfit(bp, dp);
-		if (forkoff)
-			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-			/* bp is gone due to xfs_da_shrink_inode */
-	} else if (args->rmtblkno > 0) {
+	if (!(args->op_flags & XFS_DA_OP_RENAME)) {
 		/*
 		 * Added a "remote" value, just clear the incomplete flag.
 		 */
-		error = xfs_attr3_leaf_clearflag(args);
+		if (args->rmtblkno > 0)
+			error = xfs_attr3_leaf_clearflag(args);
+
+		return error;
 	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the incomplete
+	 * flags on the "new" and "old" attribute/value pairs so that one
+	 * disappears and one appears atomically. Then we must remove the "old"
+	 * attribute/value pair.
+	 *
+	 * In a separate transaction, set the incomplete flag on the "old" attr
+	 * and clear the incomplete flag on the "new" attr.
+	 */
+	error = xfs_attr3_leaf_flipflags(args);
+	if (error)
+		return error;
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_trans_roll_inode(&args->trans, args->dp);
+	if (error)
+		return error;
+
+	/*
+	 * Dismantle the "old" attribute/value pair by removing a "remote"
+	 * value (if it exists).
+	 */
+	xfs_attr_restore_rmt_blk(args);
+
+	if (args->rmtblkno) {
+		error = xfs_attr_rmtval_invalidate(args);
+		if (error)
+			return error;
+
+		error = xfs_attr_rmtval_remove(args);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Read in the block containing the "old" attr, then remove the "old"
+	 * attr from that block (neat, huh!)
+	 */
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+			&bp);
+	if (error)
+		return error;
+
+	xfs_attr3_leaf_remove(bp, args);
+
+	/*
+	 * If the result is small enough, shrink it all into the inode.
+	 */
+	forkoff = xfs_attr_shortform_allfit(bp, dp);
+	if (forkoff)
+		error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+	/* bp is gone due to xfs_da_shrink_inode */
+
 	return error;
 }
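The mechanical change here is a guard-clause inversion: handle the simple non-rename case first and return, so the long rename sequence drops one indent level and can later be carved into discrete steps. A compilable toy sketch of the pattern with hypothetical names (not XFS code):

	#include <stdbool.h>

	/* Hypothetical stand-ins for the real XFS calls; illustration only. */
	static int clear_incomplete_flag(void) { return 0; }
	static int do_rename_steps(void) { return 0; }

	static int addname_finish(bool is_rename, bool has_remote_value)
	{
		int error = 0;

		/* Guard clause: the simple case bails out early... */
		if (!is_rename) {
			if (has_remote_value)
				error = clear_incomplete_flag();
			return error;
		}

		/* ...so the rename path now reads linearly at the top level. */
		return do_rename_steps();
	}

The same inversion is applied to xfs_attr_node_addname in the next patch.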
From bf4a5cfffee096624764d5aed82f786c708dfe90 Mon Sep 17 00:00:00 2001
From: Allison Collins
Date: Mon, 20 Jul 2020 21:47:31 -0700
Subject: [PATCH 115/117] xfs: Simplify xfs_attr_node_addname

Invert the rename logic in xfs_attr_node_addname to simplify the
delayed attr logic later.

Signed-off-by: Allison Collins
Reviewed-by: Brian Foster
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
Acked-by: Dave Chinner
---
 fs/xfs/libxfs/xfs_attr.c | 136 +++++++++++++++++++--------------------
 1 file changed, 66 insertions(+), 70 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 44288e4baea7..cdfc136b5bfb 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1025,79 +1025,75 @@ restart:
 		return error;
 	}
 
-	/*
-	 * If this is an atomic rename operation, we must "flip" the
-	 * incomplete flags on the "new" and "old" attribute/value pairs
-	 * so that one disappears and one appears atomically. Then we
-	 * must remove the "old" attribute/value pair.
-	 */
-	if (args->op_flags & XFS_DA_OP_RENAME) {
-		/*
-		 * In a separate transaction, set the incomplete flag on the
-		 * "old" attr and clear the incomplete flag on the "new" attr.
-		 */
-		error = xfs_attr3_leaf_flipflags(args);
-		if (error)
-			goto out;
-		/*
-		 * Commit the flag value change and start the next trans in
-		 * series
-		 */
-		error = xfs_trans_roll_inode(&args->trans, args->dp);
-		if (error)
-			goto out;
-
-		/*
-		 * Dismantle the "old" attribute/value pair by removing
-		 * a "remote" value (if it exists).
-		 */
-		xfs_attr_restore_rmt_blk(args);
-
-		if (args->rmtblkno) {
-			error = xfs_attr_rmtval_invalidate(args);
-			if (error)
-				return error;
-
-			error = xfs_attr_rmtval_remove(args);
-			if (error)
-				return error;
-		}
-
-		/*
-		 * Re-find the "old" attribute entry after any split ops.
-		 * The INCOMPLETE flag means that we will find the "old"
-		 * attr, not the "new" one.
-		 */
-		args->attr_filter |= XFS_ATTR_INCOMPLETE;
-		state = xfs_da_state_alloc(args);
-
-		state->inleaf = 0;
-		error = xfs_da3_node_lookup_int(state, &retval);
-		if (error)
-			goto out;
-
-		/*
-		 * Remove the name and update the hashvals in the tree.
-		 */
-		blk = &state->path.blk[ state->path.active-1 ];
-		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-		error = xfs_attr3_leaf_remove(blk->bp, args);
-		xfs_da3_fixhashpath(state, &state->path);
-
-		/*
-		 * Check to see if the tree needs to be collapsed.
-		 */
-		if (retval && (state->path.active > 1)) {
-			error = xfs_da3_join(state);
-			if (error)
-				goto out;
-		}
-
-	} else if (args->rmtblkno > 0) {
+	if (!(args->op_flags & XFS_DA_OP_RENAME)) {
 		/*
 		 * Added a "remote" value, just clear the incomplete flag.
 		 */
-		error = xfs_attr3_leaf_clearflag(args);
+		if (args->rmtblkno > 0)
+			error = xfs_attr3_leaf_clearflag(args);
+		retval = error;
+		goto out;
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the incomplete
+	 * flags on the "new" and "old" attribute/value pairs so that one
+	 * disappears and one appears atomically. Then we must remove the "old"
+	 * attribute/value pair.
+	 *
+	 * In a separate transaction, set the incomplete flag on the "old" attr
+	 * and clear the incomplete flag on the "new" attr.
+	 */
+	error = xfs_attr3_leaf_flipflags(args);
+	if (error)
+		goto out;
+	/*
+	 * Commit the flag value change and start the next trans in series
+	 */
+	error = xfs_trans_roll_inode(&args->trans, args->dp);
+	if (error)
+		goto out;
+
+	/*
+	 * Dismantle the "old" attribute/value pair by removing a "remote"
+	 * value (if it exists).
+	 */
+	xfs_attr_restore_rmt_blk(args);
+
+	if (args->rmtblkno) {
+		error = xfs_attr_rmtval_invalidate(args);
+		if (error)
+			return error;
+
+		error = xfs_attr_rmtval_remove(args);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Re-find the "old" attribute entry after any split ops. The
+	 * INCOMPLETE flag means that we will find the "old" attr, not the
+	 * "new" one.
+	 */
+	args->attr_filter |= XFS_ATTR_INCOMPLETE;
+	state = xfs_da_state_alloc(args);
+	state->inleaf = 0;
+	error = xfs_da3_node_lookup_int(state, &retval);
+	if (error)
+		goto out;
+
+	/*
+	 * Remove the name and update the hashvals in the tree.
+	 */
+	blk = &state->path.blk[state->path.active-1];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	error = xfs_attr3_leaf_remove(blk->bp, args);
+	xfs_da3_fixhashpath(state, &state->path);
+
+	/*
+	 * Check to see if the tree needs to be collapsed.
+	 */
+	if (retval && (state->path.active > 1)) {
+		error = xfs_da3_join(state);
 		if (error)
 			goto out;
 	}

From 0f89edcd8e8484dd1790ec474a650dd774c6b75e Mon Sep 17 00:00:00 2001
From: Allison Collins
Date: Mon, 20 Jul 2020 21:47:31 -0700
Subject: [PATCH 116/117] xfs: Lift -ENOSPC handler from xfs_attr_leaf_addname

Lift the -ENOSPC handler out of the leaf add path and up into
xfs_attr_set_args, so the promotion to Btree format happens in the
caller. This will help to reorganize transitions between the attr
forms later.

Signed-off-by: Allison Collins
Reviewed-by: Darrick J. Wong
Reviewed-by: Brian Foster
Signed-off-by: Darrick J. Wong
Acked-by: Dave Chinner
---
 fs/xfs/libxfs/xfs_attr.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index cdfc136b5bfb..2e055c079f39 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -298,6 +298,13 @@ xfs_attr_set_args(
 		if (error != -ENOSPC)
 			return error;
 
+		/*
+		 * Promote the attribute list to the Btree format.
+		 */
+		error = xfs_attr3_leaf_to_node(args);
+		if (error)
+			return error;
+
 		/*
 		 * Finish any deferred work items and roll the transaction once
 		 * more. The goal here is to call node_addname with the inode
@@ -603,7 +610,7 @@ xfs_attr_leaf_try_add(
 	struct xfs_da_args	*args,
 	struct xfs_buf		*bp)
 {
-	int			retval, error;
+	int			retval;
 
 	/*
 	 * Look up the given attribute in the leaf block. Figure out if
@@ -635,20 +642,10 @@ xfs_attr_leaf_try_add(
 	}
 
 	/*
-	 * Add the attribute to the leaf block, transitioning to a Btree
-	 * if required.
+	 * Add the attribute to the leaf block
 	 */
-	retval = xfs_attr3_leaf_add(bp, args);
-	if (retval == -ENOSPC) {
-		/*
-		 * Promote the attribute list to the Btree format. Unless an
-		 * error occurs, retain the -ENOSPC retval
-		 */
-		error = xfs_attr3_leaf_to_node(args);
-		if (error)
-			return error;
-	}
-	return retval;
+	return xfs_attr3_leaf_add(bp, args);
+
 out_brelse:
 	xfs_trans_brelse(args->trans, bp);
 	return retval;
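After the lift, xfs_attr_leaf_try_add only reports -ENOSPC, and the caller owns both the leaf-to-Btree promotion and the retry through the node path. A hedged sketch of the caller's resulting control flow; the _sketch suffixes mark these as stand-ins for the real functions, and the retry step is inferred from the "call node_addname" comment in the first hunk:

	static int xfs_attr_set_args_sketch(struct xfs_da_args *args)
	{
		int error;

		error = xfs_attr_leaf_try_add_sketch(args);	/* add only, no conversion */
		if (error != -ENOSPC)
			return error;			/* success, or a real failure */

		error = xfs_attr3_leaf_to_node_sketch(args);	/* promotion now lives here */
		if (error)
			return error;

		/* (deferred-work finish and transaction roll elided) */
		return xfs_attr_node_addname_sketch(args);	/* retry in node format */
	}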
From 818d5a91559ffe1e1f2095dcbbdb96c13fdb94ec Mon Sep 17 00:00:00 2001
From: Xiao Yang
Date: Tue, 28 Jul 2020 08:57:21 -0700
Subject: [PATCH 117/117] fs/xfs: Support setting/getting the inode DAX flag via ioctl(SETXFLAGS/GETXFLAGS)

1) FS_DAX_FL was introduced by commit b383a73f2b83.
2) In the future, the chattr/lsattr commands from e2fsprogs will be
   able to set/get the inode DAX flag on XFS by calling
   ioctl(SETXFLAGS/GETXFLAGS).

Signed-off-by: Xiao Yang
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
---
 fs/xfs/xfs_ioctl.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a190212ca85d..6f22a66777cd 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1075,13 +1075,18 @@ xfs_merge_ioc_xflags(
 		xflags |= FS_XFLAG_NODUMP;
 	else
 		xflags &= ~FS_XFLAG_NODUMP;
+	if (flags & FS_DAX_FL)
+		xflags |= FS_XFLAG_DAX;
+	else
+		xflags &= ~FS_XFLAG_DAX;
 
 	return xflags;
 }
 
 STATIC unsigned int
 xfs_di2lxflags(
-	uint16_t	di_flags)
+	uint16_t	di_flags,
+	uint64_t	di_flags2)
 {
 	unsigned int	flags = 0;
 
@@ -1095,6 +1100,9 @@ xfs_di2lxflags(
 		flags |= FS_NOATIME_FL;
 	if (di_flags & XFS_DIFLAG_NODUMP)
 		flags |= FS_NODUMP_FL;
+	if (di_flags2 & XFS_DIFLAG2_DAX) {
+		flags |= FS_DAX_FL;
+	}
 	return flags;
 }
 
@@ -1565,7 +1573,7 @@ xfs_ioc_getxflags(
 {
 	unsigned int		flags;
 
-	flags = xfs_di2lxflags(ip->i_d.di_flags);
+	flags = xfs_di2lxflags(ip->i_d.di_flags, ip->i_d.di_flags2);
 	if (copy_to_user(arg, &flags, sizeof(flags)))
 		return -EFAULT;
 	return 0;
@@ -1588,7 +1596,7 @@ xfs_ioc_setxflags(
 
 	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
 		      FS_NOATIME_FL | FS_NODUMP_FL | \
-		      FS_SYNC_FL))
+		      FS_SYNC_FL | FS_DAX_FL))
 		return -EOPNOTSUPP;
 
 	fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
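For completeness, here is how userspace exercises these handlers once the patch is applied; GETXFLAGS/SETXFLAGS correspond to FS_IOC_GETFLAGS/FS_IOC_SETFLAGS from <linux/fs.h>. This is a hedged usage sketch, not part of the series: it assumes a kernel and headers that define FS_DAX_FL, and keeps error handling minimal.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		int fd, flags;

		if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;

		/* GETXFLAGS: fetch the inode's current attribute flags. */
		if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
			perror("FS_IOC_GETFLAGS");
			return 1;
		}

		/* SETXFLAGS: request DAX on this inode (-EOPNOTSUPP before this patch). */
		flags |= FS_DAX_FL;
		if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0) {
			perror("FS_IOC_SETFLAGS");
			return 1;
		}

		return 0;
	}

This is the same pair of calls that chattr/lsattr would issue once e2fsprogs learns the new flag.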