From 5400da7dc0862d73523691038c044535f518a57f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 24 Jul 2014 19:40:22 +1000 Subject: [PATCH 1/4] xfs: add scan owner field to xfs_eofblocks From: Brian Foster The scan owner field represents an optional inode number that is responsible for the current scan. The purpose is to identify that an inode is under iolock and as such, the iolock shouldn't be attempted when trimming eofblocks. This is an internal only field. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 13 ++++++++++++- fs/xfs/xfs_icache.h | 2 ++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 08ba4c6e1359..6e522ff5a006 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1211,6 +1211,9 @@ xfs_inode_free_eofblocks( { int ret; struct xfs_eofblocks *eofb = args; + bool need_iolock = true; + + ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ @@ -1235,9 +1238,17 @@ xfs_inode_free_eofblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; + + /* + * A scan owner implies we already hold the iolock. Skip it in + * xfs_free_eofblocks() to avoid deadlock. This also eliminates + * the possibility of EAGAIN being returned. 
+ */ + if (eofb->eof_scan_owner == ip->i_ino) + need_iolock = false; } - ret = xfs_free_eofblocks(ip->i_mount, ip, true); + ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); /* don't revisit the inode if we're not waiting */ if (ret == -EAGAIN && !(flags & SYNC_WAIT)) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 6250430d609c..98687af6a99d 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -27,6 +27,7 @@ struct xfs_eofblocks { kgid_t eof_gid; prid_t eof_prid; __u64 eof_min_file_size; + xfs_ino_t eof_scan_owner; }; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ @@ -84,6 +85,7 @@ xfs_fs_eofblocks_from_user( dst->eof_flags = src->eof_flags; dst->eof_prid = src->eof_prid; dst->eof_min_file_size = src->eof_min_file_size; + dst->eof_scan_owner = NULLFSINO; dst->eof_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { From f4526397928fff052f795713748f376a2bba1b5e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 24 Jul 2014 19:44:28 +1000 Subject: [PATCH 2/4] xfs: support a union-based filter for eofblocks scans From: Brian Foster The eofblocks scan inode filter uses intersection logic by default. E.g., specifying both user and group quota ids filters out inodes that are not covered by both the specified user and group quotas. This is suitable for behavior exposed to userspace. Scans that are initiated from within the kernel might require more broad semantics, such as scanning all inodes under each quota associated with an inode to alleviate low free space conditions in each. Create the XFS_EOF_FLAGS_UNION flag to support a conditional union-based filtering algorithm for eofblocks scans. This flag is intentionally left out of the valid mask as it is not supported for scans initiated from userspace. 
Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_fs.h | 3 +++ fs/xfs/xfs_icache.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index d34703dbcb42..ffdabc687852 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -375,6 +375,9 @@ struct xfs_fs_eofblocks { #define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ #define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ #define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ +#define XFS_EOF_FLAGS_UNION (1 << 5) /* union filter algorithm; + * kernel only, not included in + * valid mask */ #define XFS_EOF_FLAGS_VALID \ (XFS_EOF_FLAGS_SYNC | \ XFS_EOF_FLAGS_UID | \ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 6e522ff5a006..43452081b705 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1203,6 +1203,30 @@ xfs_inode_match_id( return 1; } +/* + * A union-based inode filtering algorithm. Process the inode if any of the + * criteria match. This is for global/internal scans only. 
+ */ +STATIC int +xfs_inode_match_id_union( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + return 1; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + return 1; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + xfs_get_projid(ip) == eofb->eof_prid) + return 1; + + return 0; +} + STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, @@ -1212,6 +1236,7 @@ xfs_inode_free_eofblocks( int ret; struct xfs_eofblocks *eofb = args; bool need_iolock = true; + int match; ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); @@ -1231,7 +1256,11 @@ xfs_inode_free_eofblocks( return 0; if (eofb) { - if (!xfs_inode_match_id(ip, eofb)) + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) return 0; /* skip the inode if the file size is too small */ From dc06f398f00059707236d456d954a3a9d2a829db Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 24 Jul 2014 19:49:28 +1000 Subject: [PATCH 3/4] xfs: run an eofblocks scan on ENOSPC/EDQUOT From: Brian Foster Speculative preallocation and the associated throttling metrics assume we're working with large files on large filesystems. Users have reported inefficiencies in these mechanisms when we happen to be dealing with large files on smaller filesystems. This can occur because while prealloc throttling is aggressive under low free space conditions, it is not active until we reach 5% free space or less. For example, a 40GB filesystem has enough space for several files large enough to have multi-GB preallocations at any given time. If those files are slow growing, they might reserve preallocation for long periods of time as well as avoid the background scanner due to frequent modification. 
If a new file is written under these conditions, said file has no access to this already reserved space and premature ENOSPC is imminent. To handle this scenario, modify the buffered write ENOSPC handling and retry sequence to invoke an eofblocks scan. In the smaller filesystem scenario, the eofblocks scan resets the usage of preallocation such that when the 5% free space threshold is met, throttling effectively takes over to provide fair and efficient preallocation until legitimate ENOSPC. The eofblocks scan is selective based on the nature of the failure. For example, an EDQUOT failure in a particular quota will use a filtered scan for that quota. Because we don't know which quota might have caused an allocation failure at any given time, we include each applicable quota determined to be under low free space conditions in the scan. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_dquot.h | 15 +++++++++++++ fs/xfs/xfs_file.c | 23 ++++++++++++++++---- fs/xfs/xfs_icache.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 1 + 4 files changed, 87 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 68a68f704837..c24c67e22a2a 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -139,6 +139,21 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) } } +/* + * Check whether a dquot is under low free space conditions. We assume the quota + * is enabled and enforced. 
+ */ +static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) +{ + int64_t freesp; + + freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount; + if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) + return true; + + return false; +} + #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 181605da08e4..fcf91a22f5d8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,6 +38,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_dinode.h" +#include "xfs_icache.h" #include #include @@ -689,14 +690,28 @@ write_retry: ret = generic_perform_write(file, from, pos); if (likely(ret >= 0)) iocb->ki_pos = pos + ret; + /* - * If we just got an ENOSPC, try to write back all dirty inodes to - * convert delalloc space to free up some of the excess reserved - * metadata space. + * If we hit a space limit, try to free up some lingering preallocated + * space before returning an error. In the case of ENOSPC, first try to + * write back all dirty inodes to free up some of the excess reserved + * metadata space. This reduces the chances that the eofblocks scan + * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this + * also behaves as a filter to prevent too many eofblocks scans from + * running at the same time. 
*/ - if (ret == -ENOSPC && !enospc) { + if (ret == -EDQUOT && !enospc) { + enospc = xfs_inode_free_quota_eofblocks(ip); + if (enospc) + goto write_retry; + } else if (ret == -ENOSPC && !enospc) { + struct xfs_eofblocks eofb = {0}; + enospc = 1; xfs_flush_inodes(ip->i_mount); + eofb.eof_scan_owner = ip->i_ino; /* for locking */ + eofb.eof_flags = XFS_EOF_FLAGS_SYNC; + xfs_icache_free_eofblocks(ip->i_mount, &eofb); goto write_retry; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 43452081b705..981b2cf51985 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -33,6 +33,9 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_bmap_util.h" +#include "xfs_quota.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" #include #include @@ -1300,6 +1303,55 @@ xfs_icache_free_eofblocks( eofb, XFS_ICI_EOFBLOCKS_TAG); } +/* + * Run eofblocks scans on the quotas applicable to the inode. For inodes with + * multiple quotas, we don't know exactly which quota caused an allocation + * failure. We make a best effort by including each quota under low free space + * conditions (less than 1% free space) in the scan. + */ +int +xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip) +{ + int scan = 0; + struct xfs_eofblocks eofb = {0}; + struct xfs_dquot *dq; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + /* + * Set the scan owner to avoid a potential livelock. Otherwise, the scan + * can repeatedly trylock on the inode we're currently processing. We + * run a sync scan to increase effectiveness and use the union filter to + * cover all applicable quotas in a single scan. 
+ */ + eofb.eof_scan_owner = ip->i_ino; + eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; + + if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQ_USER); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_uid = VFS_I(ip)->i_uid; + eofb.eof_flags |= XFS_EOF_FLAGS_UID; + scan = 1; + } + } + + if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_gid = VFS_I(ip)->i_gid; + eofb.eof_flags |= XFS_EOF_FLAGS_GID; + scan = 1; + } + } + + if (scan) + xfs_icache_free_eofblocks(ip->i_mount, &eofb); + + return scan; +} + void xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 98687af6a99d..46748b86b12f 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -58,6 +58,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); +int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); int xfs_inode_ag_iterator(struct xfs_mount *mp, From f074051ff550f9f1f1a8ab4868277d049a7fd7aa Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 24 Jul 2014 19:56:08 +1000 Subject: [PATCH 4/4] xfs: squash prealloc while over quota free space as well From: Brian Foster Commit 4d559a3b introduced heavy prealloc squashing to catch the case of requesting too large a prealloc on smaller filesystems, leading to repeated flush and retry cycles that occur on ENOSPC. Now that we issue eofblocks scans on EDQUOT/ENOSPC, squash the prealloc against the minimum available free space across all applicable quotas as well to avoid a similar problem of repeated eofblocks scans. 
Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_iomap.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 923c044bd26f..783b3b1b0684 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -397,7 +397,8 @@ xfs_quota_calc_throttle( struct xfs_inode *ip, int type, xfs_fsblock_t *qblocks, - int *qshift) + int *qshift, + int64_t *qfreesp) { int64_t freesp; int shift = 0; @@ -406,6 +407,7 @@ xfs_quota_calc_throttle( /* over hi wmark, squash the prealloc completely */ if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { *qblocks = 0; + *qfreesp = 0; return; } @@ -418,6 +420,9 @@ xfs_quota_calc_throttle( shift += 2; } + if (freesp < *qfreesp) + *qfreesp = freesp; + /* only overwrite the throttle values if we are more aggressive */ if ((freesp >> shift) < (*qblocks >> *qshift)) { *qblocks = freesp; @@ -476,15 +481,18 @@ xfs_iomap_prealloc_size( } /* - * Check each quota to cap the prealloc size and provide a shift - * value to throttle with. + * Check each quota to cap the prealloc size, provide a shift value to + * throttle with and adjust amount of available space. */ if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift); + xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift, + &freesp); if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift); + xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift, + &freesp); if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) - xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift); + xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift, + &freesp); /* * The final prealloc size is set to the minimum of free space available