From 04da0c8196ac0b12fb6b84f4b7a51ad2fa56d869 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Feb 2012 13:57:20 +0000 Subject: [PATCH] xfs: use a normal shrinker for the dquot freelist Stop reusing dquots from the freelist when allocating new ones directly, and implement a shrinker that actually follows the specifications for the interface. The shrinker implementation is still highly suboptimal at this point, but we can gradually work on it. This also fixes an bug in the previous lock ordering, where we would take the hash and dqlist locks inside of the freelist lock against the normal lock ordering. This is only solvable by introducing the dispose list, and thus not when using direct reclaim of unused dquots for new allocations. As a side-effect the quota upper bound and used to free ratio values in /proc/fs/xfs/xqm are set to 0 as these values don't make any sense in the new world order. Signed-off-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/kmem.h | 6 - fs/xfs/xfs_dquot.c | 103 ++++----------- fs/xfs/xfs_qm.c | 297 ++++++++++++++++-------------------------- fs/xfs/xfs_qm.h | 14 -- fs/xfs/xfs_qm_stats.c | 4 +- fs/xfs/xfs_trace.h | 5 +- 6 files changed, 144 insertions(+), 285 deletions(-) diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 292eff198030..ab7c53fe346e 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -110,10 +110,4 @@ kmem_zone_destroy(kmem_zone_t *zone) extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); -static inline int -kmem_shake_allow(gfp_t gfp_mask) -{ - return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)); -} - #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index b4ff40b5f918..cbcb7bea38e2 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -62,82 +62,6 @@ int xfs_dqerror_mod = 33; static struct lock_class_key xfs_dquot_other_class; -/* - * Allocate and initialize a dquot. We don't always allocate fresh memory; - * we try to reclaim a free dquot if the number of incore dquots are above - * a threshold. - * The only field inside the core that gets initialized at this point - * is the d_id field. The idea is to fill in the entire q_core - * when we read in the on disk dquot. - */ -STATIC xfs_dquot_t * -xfs_qm_dqinit( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type) -{ - xfs_dquot_t *dqp; - boolean_t brandnewdquot; - - brandnewdquot = xfs_qm_dqalloc_incore(&dqp); - dqp->dq_flags = type; - dqp->q_core.d_id = cpu_to_be32(id); - dqp->q_mount = mp; - - /* - * No need to re-initialize these if this is a reclaimed dquot. - */ - if (brandnewdquot) { - INIT_LIST_HEAD(&dqp->q_freelist); - mutex_init(&dqp->q_qlock); - init_waitqueue_head(&dqp->q_pinwait); - - /* - * Because we want to use a counting completion, complete - * the flush completion once to allow a single access to - * the flush completion without blocking. - */ - init_completion(&dqp->q_flush); - complete(&dqp->q_flush); - - trace_xfs_dqinit(dqp); - } else { - /* - * Only the q_core portion was zeroed in dqreclaim_one(). - * So, we need to reset others. - */ - dqp->q_nrefs = 0; - dqp->q_blkno = 0; - INIT_LIST_HEAD(&dqp->q_mplist); - INIT_LIST_HEAD(&dqp->q_hashlist); - dqp->q_bufoffset = 0; - dqp->q_fileoffset = 0; - dqp->q_transp = NULL; - dqp->q_gdquot = NULL; - dqp->q_res_bcount = 0; - dqp->q_res_icount = 0; - dqp->q_res_rtbcount = 0; - atomic_set(&dqp->q_pincount, 0); - dqp->q_hash = NULL; - ASSERT(list_empty(&dqp->q_freelist)); - - trace_xfs_dqreuse(dqp); - } - - /* - * In either case we need to make sure group quotas have a different - * lock class than user quotas, to make sure lockdep knows we can - * locks of one of each at the same time. - */ - if (!(type & XFS_DQ_USER)) - lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); - - /* - * log item gets initialized later - */ - return (dqp); -} - /* * This is called to free all the memory associated with a dquot */ @@ -567,7 +491,32 @@ xfs_qm_dqread( int error; int cancelflags = 0; - dqp = xfs_qm_dqinit(mp, id, type); + + dqp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); + + dqp->dq_flags = type; + dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_mount = mp; + INIT_LIST_HEAD(&dqp->q_freelist); + mutex_init(&dqp->q_qlock); + init_waitqueue_head(&dqp->q_pinwait); + + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + + /* + * Make sure group quotas have a different lock class than user + * quotas. + */ + if (!(type & XFS_DQ_USER)) + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); + + atomic_inc(&xfs_Gqm->qm_totaldquots); trace_xfs_dqread(dqp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 671f37eae1c7..c436def733bf 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -50,7 +50,6 @@ */ struct mutex xfs_Gqm_lock; struct xfs_qm *xfs_Gqm; -uint ndquot; kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; @@ -93,7 +92,6 @@ xfs_Gqm_init(void) goto out_free_udqhash; hsize /= sizeof(xfs_dqhash_t); - ndquot = hsize << 8; xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); xqm->qm_dqhashmask = hsize - 1; @@ -137,7 +135,6 @@ xfs_Gqm_init(void) xqm->qm_dqtrxzone = qm_dqtrxzone; atomic_set(&xqm->qm_totaldquots, 0); - xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; xqm->qm_nrefs = 0; return xqm; @@ -1600,216 +1597,150 @@ xfs_qm_init_quotainos( return 0; } - - -/* - * Pop the least recently used dquot off the freelist and recycle it. - */ -STATIC struct xfs_dquot * -xfs_qm_dqreclaim_one(void) +STATIC void +xfs_qm_dqfree_one( + struct xfs_dquot *dqp) { - struct xfs_dquot *dqp; - int restarts = 0; + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); -restart: - list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { - struct xfs_mount *mp = dqp->q_mount; + mutex_lock(&dqp->q_hash->qh_lock); + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + mutex_unlock(&dqp->q_hash->qh_lock); - if (!xfs_dqlock_nowait(dqp)) - continue; + mutex_lock(&qi->qi_dqlist_lock); + list_del_init(&dqp->q_mplist); + qi->qi_dquots--; + qi->qi_dqreclaims++; + mutex_unlock(&qi->qi_dqlist_lock); - /* - * This dquot has already been grabbed by dqlookup. - * Remove it from the freelist and try again. - */ - if (dqp->q_nrefs) { - trace_xfs_dqreclaim_want(dqp); - XQM_STATS_INC(xqmstats.xs_qm_dqwants); + xfs_qm_dqdestroy(dqp); +} - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - restarts++; - goto dqunlock; - } +STATIC void +xfs_qm_dqreclaim_one( + struct xfs_dquot *dqp, + struct list_head *dispose_list) +{ + struct xfs_mount *mp = dqp->q_mount; + int error; - ASSERT(dqp->q_hash); - ASSERT(!list_empty(&dqp->q_mplist)); + if (!xfs_dqlock_nowait(dqp)) + goto out_busy; - /* - * Try to grab the flush lock. If this dquot is in the process - * of getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) - goto dqunlock; - - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - int error; - - trace_xfs_dqreclaim_dirty(dqp); - - /* - * We flush it delayed write, so don't bother - * releasing the freelist lock. - */ - error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK); - if (error) { - xfs_warn(mp, "%s: dquot %p flush failed", - __func__, dqp); - } - goto dqunlock; - } - xfs_dqfunlock(dqp); - - /* - * Prevent lookup now that we are going to reclaim the dquot. - * Once XFS_DQ_FREEING is set lookup won't touch the dquot, - * thus we can drop the lock now. - */ - dqp->dq_flags |= XFS_DQ_FREEING; + /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { xfs_dqunlock(dqp); - mutex_lock(&dqp->q_hash->qh_lock); - list_del_init(&dqp->q_hashlist); - dqp->q_hash->qh_version++; - mutex_unlock(&dqp->q_hash->qh_lock); + trace_xfs_dqreclaim_want(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqwants); - mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); - list_del_init(&dqp->q_mplist); - mp->m_quotainfo->qi_dquots--; - mp->m_quotainfo->qi_dqreclaims++; - mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); - - ASSERT(dqp->q_nrefs == 0); list_del_init(&dqp->q_freelist); xfs_Gqm->qm_dqfrlist_cnt--; - - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - return dqp; -dqunlock: - xfs_dqunlock(dqp); - if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - break; - goto restart; + return; } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - return NULL; -} + ASSERT(dqp->q_hash); + ASSERT(!list_empty(&dqp->q_mplist)); -/* - * Traverse the freelist of dquots and attempt to reclaim a maximum of - * 'howmany' dquots. This operation races with dqlookup(), and attempts to - * favor the lookup function ... - */ -STATIC int -xfs_qm_shake_freelist( - int howmany) -{ - int nreclaimed = 0; - xfs_dquot_t *dqp; + /* + * Try to grab the flush lock. If this dquot is in the process of + * getting flushed to disk, we don't want to reclaim it. + */ + if (!xfs_dqflock_nowait(dqp)) + goto out_busy; - if (howmany <= 0) - return 0; + /* + * We have the flush lock so we know that this is not in the + * process of being flushed. So, if this is dirty, flush it + * DELWRI so that we don't get a freelist infested with + * dirty dquots. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + trace_xfs_dqreclaim_dirty(dqp); - while (nreclaimed < howmany) { - dqp = xfs_qm_dqreclaim_one(); - if (!dqp) - return nreclaimed; - xfs_qm_dqdestroy(dqp); - nreclaimed++; + /* + * We flush it delayed write, so don't bother releasing the + * freelist lock. + */ + error = xfs_qm_dqflush(dqp, 0); + if (error) { + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); + } + + /* + * Give the dquot another try on the freelist, as the + * flushing will take some time. + */ + goto out_busy; } - return nreclaimed; + xfs_dqfunlock(dqp); + + /* + * Prevent lookups now that we are past the point of no return. + */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); + + ASSERT(dqp->q_nrefs == 0); + list_move_tail(&dqp->q_freelist, dispose_list); + xfs_Gqm->qm_dqfrlist_cnt--; + + trace_xfs_dqreclaim_done(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); + return; + +out_busy: + xfs_dqunlock(dqp); + + /* + * Move the dquot to the tail of the list so that we don't spin on it. + */ + list_move_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + + trace_xfs_dqreclaim_busy(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); } -/* - * The kmem_shake interface is invoked when memory is running low. - */ -/* ARGSUSED */ STATIC int xfs_qm_shake( - struct shrinker *shrink, - struct shrink_control *sc) + struct shrinker *shrink, + struct shrink_control *sc) { - int ndqused, nfree, n; - gfp_t gfp_mask = sc->gfp_mask; + int nr_to_scan = sc->nr_to_scan; + LIST_HEAD (dispose_list); + struct xfs_dquot *dqp; - if (!kmem_shake_allow(gfp_mask)) - return 0; - if (!xfs_Gqm) + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; + if (!nr_to_scan) + goto out; - nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */ - /* incore dquots in all f/s's */ - ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; - - ASSERT(ndqused >= 0); - - if (nfree <= ndqused && nfree < ndquot) - return 0; - - ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */ - n = nfree - ndqused - ndquot; /* # over target */ - - return xfs_qm_shake_freelist(MAX(nfree, n)); -} - - -/*------------------------------------------------------------------*/ - -/* - * Return a new incore dquot. Depending on the number of - * dquots in the system, we either allocate a new one on the kernel heap, - * or reclaim a free one. - * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed - * to reclaim an existing one from the freelist. - */ -boolean_t -xfs_qm_dqalloc_incore( - xfs_dquot_t **O_dqpp) -{ - xfs_dquot_t *dqp; - - /* - * Check against high water mark to see if we want to pop - * a nincompoop dquot off the freelist. - */ - if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) { - /* - * Try to recycle a dquot from the freelist. - */ - if ((dqp = xfs_qm_dqreclaim_one())) { - XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); - /* - * Just zero the core here. The rest will get - * reinitialized by caller. XXX we shouldn't even - * do this zero ... - */ - memset(&dqp->q_core, 0, sizeof(dqp->q_core)); - *O_dqpp = dqp; - return B_FALSE; - } - XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + while (!list_empty(&xfs_Gqm->qm_dqfrlist)) { + if (nr_to_scan-- <= 0) + break; + dqp = list_first_entry(&xfs_Gqm->qm_dqfrlist, struct xfs_dquot, + q_freelist); + xfs_qm_dqreclaim_one(dqp, &dispose_list); } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - /* - * Allocate a brand new dquot on the kernel heap and return it - * to the caller to initialize. - */ - ASSERT(xfs_Gqm->qm_dqzone != NULL); - *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); - atomic_inc(&xfs_Gqm->qm_totaldquots); - - return B_TRUE; + while (!list_empty(&dispose_list)) { + dqp = list_first_entry(&dispose_list, struct xfs_dquot, + q_freelist); + list_del_init(&dqp->q_freelist); + xfs_qm_dqfree_one(dqp); + } +out: + return (xfs_Gqm->qm_dqfrlist_cnt / 100) * sysctl_vfs_cache_pressure; } - /* * Start a transaction and write the incore superblock changes to * disk. flags parameter indicates which fields have changed. diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9b4f3adefbc5..9a9b997e1a0a 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -26,23 +26,11 @@ struct xfs_qm; struct xfs_inode; -extern uint ndquot; extern struct mutex xfs_Gqm_lock; extern struct xfs_qm *xfs_Gqm; extern kmem_zone_t *qm_dqzone; extern kmem_zone_t *qm_dqtrxzone; -/* - * Ditto, for xfs_qm_dqreclaim_one. - */ -#define XFS_QM_RECLAIM_MAX_RESTARTS 4 - -/* - * Ideal ratio of free to in use dquots. Quota manager makes an attempt - * to keep this balance. - */ -#define XFS_QM_DQFREE_RATIO 2 - /* * Dquot hashtable constants/threshold values. */ @@ -74,7 +62,6 @@ typedef struct xfs_qm { int qm_dqfrlist_cnt; atomic_t qm_totaldquots; /* total incore dquots */ uint qm_nrefs; /* file systems with quota on */ - int qm_dqfree_ratio;/* ratio of free to inuse dquots */ kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */ kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */ } xfs_qm_t; @@ -143,7 +130,6 @@ extern int xfs_qm_quotacheck(xfs_mount_t *); extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); /* dquot stuff */ -extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c index 8671a0b32644..5729ba570877 100644 --- a/fs/xfs/xfs_qm_stats.c +++ b/fs/xfs/xfs_qm_stats.c @@ -42,9 +42,9 @@ static int xqm_proc_show(struct seq_file *m, void *v) { /* maximum; incore; ratio free to inuse; freelist */ seq_printf(m, "%d\t%d\t%d\t%u\n", - ndquot, + 0, xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, - xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, + 0, xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6b6df5802e95..bb134a819930 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -733,11 +733,10 @@ DEFINE_EVENT(xfs_dquot_class, name, \ DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); -DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); DEFINE_DQUOT_EVENT(xfs_dqattach_found); DEFINE_DQUOT_EVENT(xfs_dqattach_get); -DEFINE_DQUOT_EVENT(xfs_dqinit); -DEFINE_DQUOT_EVENT(xfs_dqreuse); DEFINE_DQUOT_EVENT(xfs_dqalloc); DEFINE_DQUOT_EVENT(xfs_dqtobp_read); DEFINE_DQUOT_EVENT(xfs_dqread);