OpenCloudOS-Kernel/fs/xfs/xfs_fsops.c

613 lines
15 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_fsops.h"
#include "xfs_trans_space.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_trace.h"
/*
* Write new AG headers to disk. Non-transactional, but need to be
* written and completed prior to the growfs transaction being logged.
* To do this, we use a delayed write buffer list and wait for
* submission and IO completion of the list as a whole. This allows the
* IO subsystem to merge all the AG headers in a single AG into a single
* IO and hide most of the latency of the IO from us.
*
* This also means that if we get an error whilst building the buffer
* list to write, we can cancel the entire list without having written
* anything.
*/
static int
xfs_resizefs_init_new_ags(
struct xfs_trans *tp,
struct aghdr_init_data *id,
xfs_agnumber_t oagcount,
xfs_agnumber_t nagcount,
xfs_rfsblock_t delta,
bool *lastag_extended)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_rfsblock_t nb = mp->m_sb.sb_dblocks + delta;
int error;
*lastag_extended = false;
INIT_LIST_HEAD(&id->buffer_list);
for (id->agno = nagcount - 1;
id->agno >= oagcount;
id->agno--, delta -= id->agsize) {
if (id->agno == nagcount - 1)
id->agsize = nb - (id->agno *
(xfs_rfsblock_t)mp->m_sb.sb_agblocks);
else
id->agsize = mp->m_sb.sb_agblocks;
error = xfs_ag_init_headers(mp, id);
if (error) {
xfs_buf_delwri_cancel(&id->buffer_list);
return error;
}
}
error = xfs_buf_delwri_submit(&id->buffer_list);
if (error)
return error;
if (delta) {
*lastag_extended = true;
error = xfs_ag_extend_space(mp, tp, id, delta);
}
return error;
}
/*
* growfs operations
*/
static int
xfs_growfs_data_private(
struct xfs_mount *mp, /* mount point for filesystem */
struct xfs_growfs_data *in) /* growfs data input struct */
{
struct xfs_buf *bp;
int error;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
xfs_rfsblock_t nb, nb_div, nb_mod;
int64_t delta;
bool lastag_extended;
xfs_agnumber_t oagcount;
struct xfs_trans *tp;
struct aghdr_init_data id = {};
nb = in->newblocks;
error = xfs_sb_validate_fsb_count(&mp->m_sb, nb);
if (error)
return error;
if (nb > mp->m_sb.sb_dblocks) {
error = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
if (error)
return error;
xfs_buf_relse(bp);
}
nb_div = nb;
nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
nagcount = nb_div + (nb_mod != 0);
if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
nagcount--;
nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
}
delta = nb - mp->m_sb.sb_dblocks;
/*
* Reject filesystems with a single AG because they are not
* supported, and reject a shrink operation that would cause a
* filesystem to become unsupported.
*/
if (delta < 0 && nagcount < 2)
return -EINVAL;
oagcount = mp->m_sb.sb_agcount;
xfs: Replace per-ag array with a radix tree The use of an array for the per-ag structures requires reallocation of the array when growing the filesystem. This requires locking access to the array to avoid use after free situations, and the locking is difficult to get right. To avoid needing to reallocate an array, change the per-ag structures to an allocated object per ag and index them using a tree structure. The AGs are always densely indexed (hence the use of an array), but the number supported is 2^32 and lookups tend to be random and hence indexing needs to scale. A simple choice is a radix tree - it works well with this sort of index. This change also removes another large contiguous allocation from the mount/growfs path in XFS. The growing process now needs to change to only initialise the new AGs required for the extra space, and as such only needs to exclusively lock the tree for inserts. The rest of the code only needs to lock the tree while doing lookups, and hence this will remove all the deadlocks that currently occur on the m_perag_lock as it is now an innermost lock. The lock is also changed to a spinlock from a read/write lock as the hold time is now extremely short. To complete the picture, the per-ag structures will need to be reference counted to ensure that we don't free/modify them while they are still in use. This will be done in subsequent patch. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2010-01-11 19:47:44 +08:00
/* allocate the new per-ag structures */
if (nagcount > oagcount) {
error = xfs_initialize_perag(mp, nagcount, &nagimax);
if (error)
return error;
} else if (nagcount < oagcount) {
/* TODO: shrinking the entire AGs hasn't yet completed */
return -EINVAL;
}
xfs: Replace per-ag array with a radix tree The use of an array for the per-ag structures requires reallocation of the array when growing the filesystem. This requires locking access to the array to avoid use after free situations, and the locking is difficult to get right. To avoid needing to reallocate an array, change the per-ag structures to an allocated object per ag and index them using a tree structure. The AGs are always densely indexed (hence the use of an array), but the number supported is 2^32 and lookups tend to be random and hence indexing needs to scale. A simple choice is a radix tree - it works well with this sort of index. This change also removes another large contiguous allocation from the mount/growfs path in XFS. The growing process now needs to change to only initialise the new AGs required for the extra space, and as such only needs to exclusively lock the tree for inserts. The rest of the code only needs to lock the tree while doing lookups, and hence this will remove all the deadlocks that currently occur on the m_perag_lock as it is now an innermost lock. The lock is also changed to a spinlock from a read/write lock as the hold time is now extremely short. To complete the picture, the per-ag structures will need to be reference counted to ensure that we don't free/modify them while they are still in use. This will be done in subsequent patch. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2010-01-11 19:47:44 +08:00
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
(delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0,
XFS_TRANS_RESERVE, &tp);
if (error)
return error;
if (delta > 0) {
error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
delta, &lastag_extended);
} else {
static struct ratelimit_state shrink_warning = \
RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1);
ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE);
if (__ratelimit(&shrink_warning))
xfs_alert(mp,
"EXPERIMENTAL online shrink feature in use. Use at your own risk!");
error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta);
}
if (error)
goto out_trans_cancel;
xfs: Replace per-ag array with a radix tree The use of an array for the per-ag structures requires reallocation of the array when growing the filesystem. This requires locking access to the array to avoid use after free situations, and the locking is difficult to get right. To avoid needing to reallocate an array, change the per-ag structures to an allocated object per ag and index them using a tree structure. The AGs are always densely indexed (hence the use of an array), but the number supported is 2^32 and lookups tend to be random and hence indexing needs to scale. A simple choice is a radix tree - it works well with this sort of index. This change also removes another large contiguous allocation from the mount/growfs path in XFS. The growing process now needs to change to only initialise the new AGs required for the extra space, and as such only needs to exclusively lock the tree for inserts. The rest of the code only needs to lock the tree while doing lookups, and hence this will remove all the deadlocks that currently occur on the m_perag_lock as it is now an innermost lock. The lock is also changed to a spinlock from a read/write lock as the hold time is now extremely short. To complete the picture, the per-ag structures will need to be reference counted to ensure that we don't free/modify them while they are still in use. This will be done in subsequent patch. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2010-01-11 19:47:44 +08:00
/*
* Update changed superblock fields transactionally. These are not
* seen by the rest of the world until the transaction commit applies
* them atomically to the superblock.
*/
if (nagcount > oagcount)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
if (delta)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta);
if (id.nfree)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
/*
* Sync sb counters now to reflect the updated values. This is
* particularly important for shrink because the write verifier
* will fail if sb_fdblocks is ever larger than sb_dblocks.
*/
if (xfs_has_lazysbcount(mp))
xfs_log_sb(tp);
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
xfs: Replace per-ag array with a radix tree The use of an array for the per-ag structures requires reallocation of the array when growing the filesystem. This requires locking access to the array to avoid use after free situations, and the locking is difficult to get right. To avoid needing to reallocate an array, change the per-ag structures to an allocated object per ag and index them using a tree structure. The AGs are always densely indexed (hence the use of an array), but the number supported is 2^32 and lookups tend to be random and hence indexing needs to scale. A simple choice is a radix tree - it works well with this sort of index. This change also removes another large contiguous allocation from the mount/growfs path in XFS. The growing process now needs to change to only initialise the new AGs required for the extra space, and as such only needs to exclusively lock the tree for inserts. The rest of the code only needs to lock the tree while doing lookups, and hence this will remove all the deadlocks that currently occur on the m_perag_lock as it is now an innermost lock. The lock is also changed to a spinlock from a read/write lock as the hold time is now extremely short. To complete the picture, the per-ag structures will need to be reference counted to ensure that we don't free/modify them while they are still in use. This will be done in subsequent patch. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2010-01-11 19:47:44 +08:00
if (error)
return error;
xfs: Replace per-ag array with a radix tree The use of an array for the per-ag structures requires reallocation of the array when growing the filesystem. This requires locking access to the array to avoid use after free situations, and the locking is difficult to get right. To avoid needing to reallocate an array, change the per-ag structures to an allocated object per ag and index them using a tree structure. The AGs are always densely indexed (hence the use of an array), but the number supported is 2^32 and lookups tend to be random and hence indexing needs to scale. A simple choice is a radix tree - it works well with this sort of index. This change also removes another large contiguous allocation from the mount/growfs path in XFS. The growing process now needs to change to only initialise the new AGs required for the extra space, and as such only needs to exclusively lock the tree for inserts. The rest of the code only needs to lock the tree while doing lookups, and hence this will remove all the deadlocks that currently occur on the m_perag_lock as it is now an innermost lock. The lock is also changed to a spinlock from a read/write lock as the hold time is now extremely short. To complete the picture, the per-ag structures will need to be reference counted to ensure that we don't free/modify them while they are still in use. This will be done in subsequent patch. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2010-01-11 19:47:44 +08:00
/* New allocation groups fully initialized, so update mount struct */
if (nagimax)
mp->m_maxagi = nagimax;
xfs: dynamic speculative EOF preallocation Currently the size of the speculative preallocation during delayed allocation is fixed by either the allocsize mount option of a default size. We are seeing a lot of cases where we need to recommend using the allocsize mount option to prevent fragmentation when buffered writes land in the same AG. Rather than using a fixed preallocation size by default (up to 64k), make it dynamic by basing it on the current inode size. That way the EOF preallocation will increase as the file size increases. Hence for streaming writes we are much more likely to get large preallocations exactly when we need it to reduce fragementation. For default settings, the size of the initial extents is determined by the number of parallel writers and the amount of memory in the machine. For 4GB RAM and 4 concurrent 32GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..1048575]: 1048672..2097247 0 (1048672..2097247) 1048576 1: [1048576..2097151]: 5242976..6291551 0 (5242976..6291551) 1048576 2: [2097152..4194303]: 12583008..14680159 0 (12583008..14680159) 2097152 3: [4194304..8388607]: 25165920..29360223 0 (25165920..29360223) 4194304 4: [8388608..16777215]: 58720352..67108959 0 (58720352..67108959) 8388608 5: [16777216..33554423]: 117440584..134217791 0 (117440584..134217791) 16777208 6: [33554424..50331511]: 184549056..201326143 0 (184549056..201326143) 16777088 7: [50331512..67108599]: 251657408..268434495 0 (251657408..268434495) 16777088 and for 16 concurrent 16GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..262143]: 2490472..2752615 0 (2490472..2752615) 262144 1: [262144..524287]: 6291560..6553703 0 (6291560..6553703) 262144 2: [524288..1048575]: 13631592..14155879 0 (13631592..14155879) 524288 3: [1048576..2097151]: 30408808..31457383 0 (30408808..31457383) 1048576 4: [2097152..4194303]: 52428904..54526055 0 (52428904..54526055) 2097152 5: [4194304..8388607]: 104857704..109052007 0 (104857704..109052007) 4194304 6: [8388608..16777215]: 209715304..218103911 0 (209715304..218103911) 8388608 7: [16777216..33554423]: 452984848..469762055 0 (452984848..469762055) 16777208 Because it is hard to take back specualtive preallocation, cases where there are large slow growing log files on a nearly full filesystem may cause premature ENOSPC. Hence as the filesystem nears full, the maximum dynamic prealloc size іs reduced according to this table (based on 4k block size): freespace max prealloc size >5% full extent (8GB) 4-5% 2GB (8GB >> 2) 3-4% 1GB (8GB >> 3) 2-3% 512MB (8GB >> 4) 1-2% 256MB (8GB >> 5) <1% 128MB (8GB >> 6) This should reduce the amount of space held in speculative preallocation for such cases. The allocsize mount option turns off the dynamic behaviour and fixes the prealloc size to whatever the mount option specifies. i.e. the behaviour is unchanged. Signed-off-by: Dave Chinner <dchinner@redhat.com>
2011-01-04 08:35:03 +08:00
xfs_set_low_space_thresholds(mp);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
xfs: Replace per-ag array with a radix tree The use of an array for the per-ag structures requires reallocation of the array when growing the filesystem. This requires locking access to the array to avoid use after free situations, and the locking is difficult to get right. To avoid needing to reallocate an array, change the per-ag structures to an allocated object per ag and index them using a tree structure. The AGs are always densely indexed (hence the use of an array), but the number supported is 2^32 and lookups tend to be random and hence indexing needs to scale. A simple choice is a radix tree - it works well with this sort of index. This change also removes another large contiguous allocation from the mount/growfs path in XFS. The growing process now needs to change to only initialise the new AGs required for the extra space, and as such only needs to exclusively lock the tree for inserts. The rest of the code only needs to lock the tree while doing lookups, and hence this will remove all the deadlocks that currently occur on the m_perag_lock as it is now an innermost lock. The lock is also changed to a spinlock from a read/write lock as the hold time is now extremely short. To complete the picture, the per-ag structures will need to be reference counted to ensure that we don't free/modify them while they are still in use. This will be done in subsequent patch. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2010-01-11 19:47:44 +08:00
if (delta > 0) {
/*
* If we expanded the last AG, free the per-AG reservation
* so we can reinitialize it with the new size.
*/
if (lastag_extended) {
struct xfs_perag *pag;
pag = xfs_perag_get(mp, id.agno);
error = xfs_ag_resv_free(pag);
xfs_perag_put(pag);
if (error)
return error;
}
/*
* Reserve AG metadata blocks. ENOSPC here does not mean there
* was a growfs failure, just that there still isn't space for
* new user data after the grow has been run.
*/
error = xfs_fs_reserve_ag_blocks(mp);
if (error == -ENOSPC)
error = 0;
}
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
static int
xfs_growfs_log_private(
struct xfs_mount *mp, /* mount point for filesystem */
struct xfs_growfs_log *in) /* growfs log input struct */
{
xfs_extlen_t nb;
nb = in->newblocks;
if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
return -EINVAL;
if (nb == mp->m_sb.sb_logblocks &&
in->isint == (mp->m_sb.sb_logstart != 0))
return -EINVAL;
/*
* Moving the log is hard, need new interfaces to sync
* the log first, hold off all activity while moving it.
* Can have shorter or longer log in the same space,
* or transform internal to external log or vice versa.
*/
return -ENOSYS;
}
static int
xfs_growfs_imaxpct(
struct xfs_mount *mp,
__u32 imaxpct)
{
struct xfs_trans *tp;
int dpct;
int error;
if (imaxpct > 100)
return -EINVAL;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
dpct = imaxpct - mp->m_sb.sb_imax_pct;
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
xfs_trans_set_sync(tp);
return xfs_trans_commit(tp);
}
/*
* protected versions of growfs function acquire and release locks on the mount
* point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
* XFS_IOC_FSGROWFSRT
*/
int
xfs_growfs_data(
struct xfs_mount *mp,
struct xfs_growfs_data *in)
{
int error = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
/* update imaxpct separately to the physical grow of the filesystem */
if (in->imaxpct != mp->m_sb.sb_imax_pct) {
error = xfs_growfs_imaxpct(mp, in->imaxpct);
if (error)
goto out_error;
}
if (in->newblocks != mp->m_sb.sb_dblocks) {
error = xfs_growfs_data_private(mp, in);
if (error)
goto out_error;
}
/* Post growfs calculations needed to reflect new state in operations */
if (mp->m_sb.sb_imax_pct) {
uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
do_div(icount, 100);
M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount);
} else
M_IGEO(mp)->maxicount = 0;
/* Update secondary superblocks now the physical grow has completed */
error = xfs_update_secondary_sbs(mp);
out_error:
/*
* Increment the generation unconditionally, the error could be from
* updating the secondary superblocks, in which case the new size
* is live already.
*/
mp->m_generation++;
mutex_unlock(&mp->m_growlock);
return error;
}
int
xfs_growfs_log(
xfs_mount_t *mp,
struct xfs_growfs_log *in)
{
int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
error = xfs_growfs_log_private(mp, in);
mutex_unlock(&mp->m_growlock);
return error;
}
/*
* exported through ioctl XFS_IOC_FSCOUNTS
*/
void
xfs_fs_counts(
xfs_mount_t *mp,
xfs_fsop_counts_t *cnt)
{
cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
mp->m_alloc_set_aside;
spin_lock(&mp->m_sb_lock);
cnt->freertx = mp->m_sb.sb_frextents;
spin_unlock(&mp->m_sb_lock);
}
/*
* exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
*
* xfs_reserve_blocks is called to set m_resblks
* in the in-core mount table. The number of unused reserved blocks
* is kept in m_resblks_avail.
*
* Reserve the requested number of blocks if available. Otherwise return
* as many as possible to satisfy the request. The actual number
* reserved are returned in outval
*
* A null inval pointer indicates that only the current reserved blocks
* available should be returned no settings are changed.
*/
int
xfs_reserve_blocks(
xfs_mount_t *mp,
uint64_t *inval,
xfs_fsop_resblks_t *outval)
{
int64_t lcounter, delta;
int64_t fdblks_delta = 0;
uint64_t request;
int64_t free;
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
int error = 0;
/* If inval is null, report current values and return */
if (inval == (uint64_t *)NULL) {
if (!outval)
return -EINVAL;
outval->resblks = mp->m_resblks;
outval->resblks_avail = mp->m_resblks_avail;
return 0;
}
request = *inval;
/*
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
* With per-cpu counters, this becomes an interesting problem. we need
* to work out if we are freeing or allocation blocks first, then we can
* do the modification as necessary.
*
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
* We do this under the m_sb_lock so that if we are near ENOSPC, we will
* hold out any changes while we work out what to do. This means that
* the amount of free space can change while we do this, so we need to
* retry if we end up trying to reserve more space than is available.
*/
spin_lock(&mp->m_sb_lock);
/*
* If our previous reservation was larger than the current value,
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
* then move any unused blocks back to the free pool. Modify the resblks
* counters directly since we shouldn't have any problems unreserving
* space.
*/
if (mp->m_resblks > request) {
lcounter = mp->m_resblks_avail - request;
if (lcounter > 0) { /* release unused blocks */
fdblks_delta = lcounter;
mp->m_resblks_avail -= lcounter;
}
mp->m_resblks = request;
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
if (fdblks_delta) {
spin_unlock(&mp->m_sb_lock);
error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
spin_lock(&mp->m_sb_lock);
}
goto out;
}
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
/*
* If the request is larger than the current reservation, reserve the
* blocks before we update the reserve counters. Sample m_fdblocks and
* perform a partial reservation if the request exceeds free space.
*/
error = -ENOSPC;
do {
free = percpu_counter_sum(&mp->m_fdblocks) -
mp->m_alloc_set_aside;
if (free <= 0)
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
break;
delta = request - mp->m_resblks;
lcounter = free - delta;
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
if (lcounter < 0)
/* We can't satisfy the request, just get what we can */
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
fdblks_delta = free;
else
fdblks_delta = delta;
/*
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
* We'll either succeed in getting space from the free block
* count or we'll get an ENOSPC. If we get a ENOSPC, it means
* things changed while we were calculating fdblks_delta and so
* we should try again to see if there is anything left to
* reserve.
*
* Don't set the reserved flag here - we don't want to reserve
* the extra reserve blocks from the reserve.....
*/
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
spin_unlock(&mp->m_sb_lock);
error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
spin_lock(&mp->m_sb_lock);
} while (error == -ENOSPC);
/*
* Update the reserve counters if blocks have been successfully
* allocated.
*/
if (!error && fdblks_delta) {
mp->m_resblks += fdblks_delta;
mp->m_resblks_avail += fdblks_delta;
}
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
out:
if (outval) {
outval->resblks = mp->m_resblks;
outval->resblks_avail = mp->m_resblks_avail;
}
spin_unlock(&mp->m_sb_lock);
return error;
}
int
xfs_fs_goingdown(
xfs_mount_t *mp,
uint32_t inflags)
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
if (!freeze_bdev(mp->m_super->s_bdev)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
thaw_bdev(mp->m_super->s_bdev);
}
break;
}
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
break;
case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
xfs_force_shutdown(mp,
SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
break;
default:
return -EINVAL;
}
return 0;
}
/*
* Force a shutdown of the filesystem instantly while keeping the filesystem
* consistent. We don't do an unmount here; just shutdown the shop, make sure
* that absolutely nothing persistent happens to this filesystem after this
* point.
xfs: make forced shutdown processing atomic The running of a forced shutdown is a bit of a mess. It does racy checks for XFS_MOUNT_SHUTDOWN in xfs_do_force_shutdown(), then does more racy checks in xfs_log_force_unmount() before finally setting XFS_MOUNT_SHUTDOWN and XLOG_IO_ERROR under the log->icloglock. Move the checking and setting of XFS_MOUNT_SHUTDOWN into xfs_do_force_shutdown() so we only process a shutdown once and once only. Serialise this with the mp->m_sb_lock spinlock so that the state change is atomic and won't race. Move all the mount specific shutdown state changes from xfs_log_force_unmount() to xfs_do_force_shutdown() so they are done atomically with setting XFS_MOUNT_SHUTDOWN. Then get rid of the racy xlog_is_shutdown() check from xlog_force_shutdown(), and gate the log shutdown on the test_and_set_bit(XLOG_IO_ERROR) test under the icloglock. This means that the log is shutdown once and once only, and code that needs to prevent races with shutdown can do so by holding the icloglock and checking the return value of xlog_is_shutdown(). This results in a predictable shutdown execution process - we set the shutdown flags once and process the shutdown once rather than the current "as many concurrent shutdowns as can race to the flag setting" situation we have now. Also, now that shutdown is atomic, alway emit a stack trace when the error level for the filesystem is high enough. This means that we always get a stack trace when trying to diagnose the cause of shutdowns in the field, rather than just for SHUTDOWN_CORRUPT_INCORE cases. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-11 09:00:39 +08:00
*
* The shutdown state change is atomic, resulting in the first and only the
* first shutdown call processing the shutdown. This means we only shutdown the
* log once as it requires, and we don't spam the logs when multiple concurrent
* shutdowns race to set the shutdown flags.
*/
void
xfs_do_force_shutdown(
struct xfs_mount *mp,
int flags,
char *fname,
int lnnum)
{
xfs: make forced shutdown processing atomic The running of a forced shutdown is a bit of a mess. It does racy checks for XFS_MOUNT_SHUTDOWN in xfs_do_force_shutdown(), then does more racy checks in xfs_log_force_unmount() before finally setting XFS_MOUNT_SHUTDOWN and XLOG_IO_ERROR under the log->icloglock. Move the checking and setting of XFS_MOUNT_SHUTDOWN into xfs_do_force_shutdown() so we only process a shutdown once and once only. Serialise this with the mp->m_sb_lock spinlock so that the state change is atomic and won't race. Move all the mount specific shutdown state changes from xfs_log_force_unmount() to xfs_do_force_shutdown() so they are done atomically with setting XFS_MOUNT_SHUTDOWN. Then get rid of the racy xlog_is_shutdown() check from xlog_force_shutdown(), and gate the log shutdown on the test_and_set_bit(XLOG_IO_ERROR) test under the icloglock. This means that the log is shutdown once and once only, and code that needs to prevent races with shutdown can do so by holding the icloglock and checking the return value of xlog_is_shutdown(). This results in a predictable shutdown execution process - we set the shutdown flags once and process the shutdown once rather than the current "as many concurrent shutdowns as can race to the flag setting" situation we have now. Also, now that shutdown is atomic, alway emit a stack trace when the error level for the filesystem is high enough. This means that we always get a stack trace when trying to diagnose the cause of shutdowns in the field, rather than just for SHUTDOWN_CORRUPT_INCORE cases. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-11 09:00:39 +08:00
int tag;
const char *why;
if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate))
return;
xfs: make forced shutdown processing atomic The running of a forced shutdown is a bit of a mess. It does racy checks for XFS_MOUNT_SHUTDOWN in xfs_do_force_shutdown(), then does more racy checks in xfs_log_force_unmount() before finally setting XFS_MOUNT_SHUTDOWN and XLOG_IO_ERROR under the log->icloglock. Move the checking and setting of XFS_MOUNT_SHUTDOWN into xfs_do_force_shutdown() so we only process a shutdown once and once only. Serialise this with the mp->m_sb_lock spinlock so that the state change is atomic and won't race. Move all the mount specific shutdown state changes from xfs_log_force_unmount() to xfs_do_force_shutdown() so they are done atomically with setting XFS_MOUNT_SHUTDOWN. Then get rid of the racy xlog_is_shutdown() check from xlog_force_shutdown(), and gate the log shutdown on the test_and_set_bit(XLOG_IO_ERROR) test under the icloglock. This means that the log is shutdown once and once only, and code that needs to prevent races with shutdown can do so by holding the icloglock and checking the return value of xlog_is_shutdown(). This results in a predictable shutdown execution process - we set the shutdown flags once and process the shutdown once rather than the current "as many concurrent shutdowns as can race to the flag setting" situation we have now. Also, now that shutdown is atomic, alway emit a stack trace when the error level for the filesystem is high enough. This means that we always get a stack trace when trying to diagnose the cause of shutdowns in the field, rather than just for SHUTDOWN_CORRUPT_INCORE cases. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-11 09:00:39 +08:00
if (mp->m_sb_bp)
mp->m_sb_bp->b_flags |= XBF_DONE;
if (flags & SHUTDOWN_FORCE_UMOUNT)
xfs_alert(mp, "User initiated shutdown received.");
xfs: make forced shutdown processing atomic The running of a forced shutdown is a bit of a mess. It does racy checks for XFS_MOUNT_SHUTDOWN in xfs_do_force_shutdown(), then does more racy checks in xfs_log_force_unmount() before finally setting XFS_MOUNT_SHUTDOWN and XLOG_IO_ERROR under the log->icloglock. Move the checking and setting of XFS_MOUNT_SHUTDOWN into xfs_do_force_shutdown() so we only process a shutdown once and once only. Serialise this with the mp->m_sb_lock spinlock so that the state change is atomic and won't race. Move all the mount specific shutdown state changes from xfs_log_force_unmount() to xfs_do_force_shutdown() so they are done atomically with setting XFS_MOUNT_SHUTDOWN. Then get rid of the racy xlog_is_shutdown() check from xlog_force_shutdown(), and gate the log shutdown on the test_and_set_bit(XLOG_IO_ERROR) test under the icloglock. This means that the log is shutdown once and once only, and code that needs to prevent races with shutdown can do so by holding the icloglock and checking the return value of xlog_is_shutdown(). This results in a predictable shutdown execution process - we set the shutdown flags once and process the shutdown once rather than the current "as many concurrent shutdowns as can race to the flag setting" situation we have now. Also, now that shutdown is atomic, alway emit a stack trace when the error level for the filesystem is high enough. This means that we always get a stack trace when trying to diagnose the cause of shutdowns in the field, rather than just for SHUTDOWN_CORRUPT_INCORE cases. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-11 09:00:39 +08:00
if (xlog_force_shutdown(mp->m_log, flags)) {
tag = XFS_PTAG_SHUTDOWN_LOGERROR;
why = "Log I/O Error";
} else if (flags & SHUTDOWN_CORRUPT_INCORE) {
tag = XFS_PTAG_SHUTDOWN_CORRUPT;
why = "Corruption of in-memory data";
} else {
xfs: make forced shutdown processing atomic The running of a forced shutdown is a bit of a mess. It does racy checks for XFS_MOUNT_SHUTDOWN in xfs_do_force_shutdown(), then does more racy checks in xfs_log_force_unmount() before finally setting XFS_MOUNT_SHUTDOWN and XLOG_IO_ERROR under the log->icloglock. Move the checking and setting of XFS_MOUNT_SHUTDOWN into xfs_do_force_shutdown() so we only process a shutdown once and once only. Serialise this with the mp->m_sb_lock spinlock so that the state change is atomic and won't race. Move all the mount specific shutdown state changes from xfs_log_force_unmount() to xfs_do_force_shutdown() so they are done atomically with setting XFS_MOUNT_SHUTDOWN. Then get rid of the racy xlog_is_shutdown() check from xlog_force_shutdown(), and gate the log shutdown on the test_and_set_bit(XLOG_IO_ERROR) test under the icloglock. This means that the log is shutdown once and once only, and code that needs to prevent races with shutdown can do so by holding the icloglock and checking the return value of xlog_is_shutdown(). This results in a predictable shutdown execution process - we set the shutdown flags once and process the shutdown once rather than the current "as many concurrent shutdowns as can race to the flag setting" situation we have now. Also, now that shutdown is atomic, alway emit a stack trace when the error level for the filesystem is high enough. This means that we always get a stack trace when trying to diagnose the cause of shutdowns in the field, rather than just for SHUTDOWN_CORRUPT_INCORE cases. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-11 09:00:39 +08:00
tag = XFS_PTAG_SHUTDOWN_IOERROR;
why = "Metadata I/O Error";
}
trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum);
xfs: make forced shutdown processing atomic The running of a forced shutdown is a bit of a mess. It does racy checks for XFS_MOUNT_SHUTDOWN in xfs_do_force_shutdown(), then does more racy checks in xfs_log_force_unmount() before finally setting XFS_MOUNT_SHUTDOWN and XLOG_IO_ERROR under the log->icloglock. Move the checking and setting of XFS_MOUNT_SHUTDOWN into xfs_do_force_shutdown() so we only process a shutdown once and once only. Serialise this with the mp->m_sb_lock spinlock so that the state change is atomic and won't race. Move all the mount specific shutdown state changes from xfs_log_force_unmount() to xfs_do_force_shutdown() so they are done atomically with setting XFS_MOUNT_SHUTDOWN. Then get rid of the racy xlog_is_shutdown() check from xlog_force_shutdown(), and gate the log shutdown on the test_and_set_bit(XLOG_IO_ERROR) test under the icloglock. This means that the log is shutdown once and once only, and code that needs to prevent races with shutdown can do so by holding the icloglock and checking the return value of xlog_is_shutdown(). This results in a predictable shutdown execution process - we set the shutdown flags once and process the shutdown once rather than the current "as many concurrent shutdowns as can race to the flag setting" situation we have now. Also, now that shutdown is atomic, alway emit a stack trace when the error level for the filesystem is high enough. This means that we always get a stack trace when trying to diagnose the cause of shutdowns in the field, rather than just for SHUTDOWN_CORRUPT_INCORE cases. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-11 09:00:39 +08:00
xfs_alert_tag(mp, tag,
"%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem.",
why, flags, __return_address, fname, lnnum);
xfs_alert(mp,
"Please unmount the filesystem and rectify the problem(s)");
xfs: make forced shutdown processing atomic The running of a forced shutdown is a bit of a mess. It does racy checks for XFS_MOUNT_SHUTDOWN in xfs_do_force_shutdown(), then does more racy checks in xfs_log_force_unmount() before finally setting XFS_MOUNT_SHUTDOWN and XLOG_IO_ERROR under the log->icloglock. Move the checking and setting of XFS_MOUNT_SHUTDOWN into xfs_do_force_shutdown() so we only process a shutdown once and once only. Serialise this with the mp->m_sb_lock spinlock so that the state change is atomic and won't race. Move all the mount specific shutdown state changes from xfs_log_force_unmount() to xfs_do_force_shutdown() so they are done atomically with setting XFS_MOUNT_SHUTDOWN. Then get rid of the racy xlog_is_shutdown() check from xlog_force_shutdown(), and gate the log shutdown on the test_and_set_bit(XLOG_IO_ERROR) test under the icloglock. This means that the log is shutdown once and once only, and code that needs to prevent races with shutdown can do so by holding the icloglock and checking the return value of xlog_is_shutdown(). This results in a predictable shutdown execution process - we set the shutdown flags once and process the shutdown once rather than the current "as many concurrent shutdowns as can race to the flag setting" situation we have now. Also, now that shutdown is atomic, alway emit a stack trace when the error level for the filesystem is high enough. This means that we always get a stack trace when trying to diagnose the cause of shutdowns in the field, rather than just for SHUTDOWN_CORRUPT_INCORE cases. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-11 09:00:39 +08:00
if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
xfs_stack_trace();
}
/*
* Reserve free space for per-AG metadata.
*/
int
xfs_fs_reserve_ag_blocks(
struct xfs_mount *mp)
{
xfs_agnumber_t agno;
struct xfs_perag *pag;
int error = 0;
int err2;
mp->m_finobt_nores = false;
for_each_perag(mp, agno, pag) {
err2 = xfs_ag_resv_init(pag, NULL);
if (err2 && !error)
error = err2;
}
if (error && error != -ENOSPC) {
xfs_warn(mp,
"Error %d reserving per-AG metadata reserve pool.", error);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
return error;
}
/*
* Free space reserved for per-AG metadata.
*/
int
xfs_fs_unreserve_ag_blocks(
struct xfs_mount *mp)
{
xfs_agnumber_t agno;
struct xfs_perag *pag;
int error = 0;
int err2;
for_each_perag(mp, agno, pag) {
err2 = xfs_ag_resv_free(pag);
if (err2 && !error)
error = err2;
}
if (error)
xfs_warn(mp,
"Error %d freeing per-AG metadata reserve pool.", error);
return error;
}