xfs: use rhashtable to track buffer cache

On filesystems with a lot of metadata and in metadata intensive workloads
xfs_buf_find() is showing up at the top of the CPU cycles trace. Most of
the CPU time is spent on CPU cache misses while traversing the rbtree.

As the buffer cache does not need any kind of ordering, but fast lookups
a hashtable is the natural data structure to use. The rhashtable
infrastructure provides a self-scaling hashtable implementation and
allows lookups to proceed while the table is going through a resize
operation.

This reduces the CPU-time spent for the lookups to 1/3 even for small
filesystems with a relatively small number of cached buffers, with
possibly much larger gains on higher loaded filesystems.

[dchinner: reduce minimum hash size to an acceptable size for large
	   filesystems with many AGs with no active use.]
[dchinner: remove stale rbtree asserts.]
[dchinner: use xfs_buf_map for compare function argument.]
[dchinner: make functions static.]
[dchinner: remove redundant comments.]

Signed-off-by: Lucas Stach <dev@lynxeye.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
This commit is contained in:
Lucas Stach 2016-12-07 17:36:36 +11:00 committed by Dave Chinner
parent cae028df53
commit 6031e73a5b
5 changed files with 85 additions and 52 deletions

View File

@ -219,7 +219,6 @@ _xfs_buf_alloc(
init_completion(&bp->b_iowait); init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_lru);
INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_list);
RB_CLEAR_NODE(&bp->b_rbnode);
sema_init(&bp->b_sema, 0); /* held, no waiters */ sema_init(&bp->b_sema, 0); /* held, no waiters */
spin_lock_init(&bp->b_lock); spin_lock_init(&bp->b_lock);
XB_SET_OWNER(bp); XB_SET_OWNER(bp);
@ -473,6 +472,62 @@ _xfs_buf_map_pages(
/* /*
* Finding and Reading Buffers * Finding and Reading Buffers
*/ */
static int
_xfs_buf_obj_cmp(
struct rhashtable_compare_arg *arg,
const void *obj)
{
const struct xfs_buf_map *map = arg->key;
const struct xfs_buf *bp = obj;
/*
* The key hashing in the lookup path depends on the key being the
* first element of the compare_arg, make sure to assert this.
*/
BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
if (bp->b_bn != map->bm_bn)
return 1;
if (unlikely(bp->b_length != map->bm_len)) {
/*
* found a block number match. If the range doesn't
* match, the only way this is allowed is if the buffer
* in the cache is stale and the transaction that made
* it stale has not yet committed. i.e. we are
* reallocating a busy extent. Skip this buffer and
* continue searching for an exact match.
*/
ASSERT(bp->b_flags & XBF_STALE);
return 1;
}
return 0;
}
static const struct rhashtable_params xfs_buf_hash_params = {
.min_size = 32, /* empty AGs have minimal footprint */
.nelem_hint = 16,
.key_len = sizeof(xfs_daddr_t),
.key_offset = offsetof(struct xfs_buf, b_bn),
.head_offset = offsetof(struct xfs_buf, b_rhash_head),
.automatic_shrinking = true,
.obj_cmpfn = _xfs_buf_obj_cmp,
};
int
xfs_buf_hash_init(
struct xfs_perag *pag)
{
spin_lock_init(&pag->pag_buf_lock);
return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
}
void
xfs_buf_hash_destroy(
struct xfs_perag *pag)
{
rhashtable_destroy(&pag->pag_buf_hash);
}
/* /*
* Look up, and creates if absent, a lockable buffer for * Look up, and creates if absent, a lockable buffer for
@ -488,27 +543,24 @@ _xfs_buf_find(
xfs_buf_t *new_bp) xfs_buf_t *new_bp)
{ {
struct xfs_perag *pag; struct xfs_perag *pag;
struct rb_node **rbp;
struct rb_node *parent;
xfs_buf_t *bp; xfs_buf_t *bp;
xfs_daddr_t blkno = map[0].bm_bn; struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
xfs_daddr_t eofs; xfs_daddr_t eofs;
int numblks = 0;
int i; int i;
for (i = 0; i < nmaps; i++) for (i = 0; i < nmaps; i++)
numblks += map[i].bm_len; cmap.bm_len += map[i].bm_len;
/* Check for IOs smaller than the sector size / not sector aligned */ /* Check for IOs smaller than the sector size / not sector aligned */
ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
/* /*
* Corrupted block numbers can get through to here, unfortunately, so we * Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds. * have to check that the buffer falls within the filesystem bounds.
*/ */
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
if (blkno < 0 || blkno >= eofs) { if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
/* /*
* XXX (dgc): we should really be returning -EFSCORRUPTED here, * XXX (dgc): we should really be returning -EFSCORRUPTED here,
* but none of the higher level infrastructure supports * but none of the higher level infrastructure supports
@ -516,53 +568,29 @@ _xfs_buf_find(
*/ */
xfs_alert(btp->bt_mount, xfs_alert(btp->bt_mount,
"%s: Block out of range: block 0x%llx, EOFS 0x%llx ", "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
__func__, blkno, eofs); __func__, cmap.bm_bn, eofs);
WARN_ON(1); WARN_ON(1);
return NULL; return NULL;
} }
/* get tree root */
pag = xfs_perag_get(btp->bt_mount, pag = xfs_perag_get(btp->bt_mount,
xfs_daddr_to_agno(btp->bt_mount, blkno)); xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
/* walk tree */
spin_lock(&pag->pag_buf_lock); spin_lock(&pag->pag_buf_lock);
rbp = &pag->pag_buf_tree.rb_node; bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
parent = NULL; xfs_buf_hash_params);
bp = NULL; if (bp) {
while (*rbp) {
parent = *rbp;
bp = rb_entry(parent, struct xfs_buf, b_rbnode);
if (blkno < bp->b_bn)
rbp = &(*rbp)->rb_left;
else if (blkno > bp->b_bn)
rbp = &(*rbp)->rb_right;
else {
/*
* found a block number match. If the range doesn't
* match, the only way this is allowed is if the buffer
* in the cache is stale and the transaction that made
* it stale has not yet committed. i.e. we are
* reallocating a busy extent. Skip this buffer and
* continue searching to the right for an exact match.
*/
if (bp->b_length != numblks) {
ASSERT(bp->b_flags & XBF_STALE);
rbp = &(*rbp)->rb_right;
continue;
}
atomic_inc(&bp->b_hold); atomic_inc(&bp->b_hold);
goto found; goto found;
} }
}
/* No match found */ /* No match found */
if (new_bp) { if (new_bp) {
rb_link_node(&new_bp->b_rbnode, parent, rbp);
rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
/* the buffer keeps the perag reference until it is freed */ /* the buffer keeps the perag reference until it is freed */
new_bp->b_pag = pag; new_bp->b_pag = pag;
rhashtable_insert_fast(&pag->pag_buf_hash,
&new_bp->b_rhash_head,
xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock); spin_unlock(&pag->pag_buf_lock);
} else { } else {
XFS_STATS_INC(btp->bt_mount, xb_miss_locked); XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
@ -930,7 +958,6 @@ xfs_buf_rele(
if (!pag) { if (!pag) {
ASSERT(list_empty(&bp->b_lru)); ASSERT(list_empty(&bp->b_lru));
ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold)) { if (atomic_dec_and_test(&bp->b_hold)) {
xfs_buf_ioacct_dec(bp); xfs_buf_ioacct_dec(bp);
xfs_buf_free(bp); xfs_buf_free(bp);
@ -938,8 +965,6 @@ xfs_buf_rele(
return; return;
} }
ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
ASSERT(atomic_read(&bp->b_hold) > 0); ASSERT(atomic_read(&bp->b_hold) > 0);
release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
@ -983,7 +1008,8 @@ xfs_buf_rele(
} }
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock); spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag); xfs_perag_put(pag);
freebuf = true; freebuf = true;

View File

@ -151,7 +151,7 @@ typedef struct xfs_buf {
* which is the only bit that is touched if we hit the semaphore * which is the only bit that is touched if we hit the semaphore
* fast-path on locking. * fast-path on locking.
*/ */
struct rb_node b_rbnode; /* rbtree node */ struct rhash_head b_rhash_head; /* pag buffer hash node */
xfs_daddr_t b_bn; /* block number of buffer */ xfs_daddr_t b_bn; /* block number of buffer */
int b_length; /* size of buffer in BBs */ int b_length; /* size of buffer in BBs */
atomic_t b_hold; /* reference count */ atomic_t b_hold; /* reference count */

View File

@ -78,6 +78,7 @@ typedef __u32 xfs_nlink_t;
#include <linux/freezer.h> #include <linux/freezer.h>
#include <linux/list_sort.h> #include <linux/list_sort.h>
#include <linux/ratelimit.h> #include <linux/ratelimit.h>
#include <linux/rhashtable.h>
#include <asm/page.h> #include <asm/page.h>
#include <asm/div64.h> #include <asm/div64.h>

View File

@ -157,6 +157,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock); spin_unlock(&mp->m_perag_lock);
ASSERT(pag); ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0); ASSERT(atomic_read(&pag->pag_ref) == 0);
xfs_buf_hash_destroy(pag);
call_rcu(&pag->rcu_head, __xfs_free_perag); call_rcu(&pag->rcu_head, __xfs_free_perag);
} }
} }
@ -212,8 +213,8 @@ xfs_initialize_perag(
spin_lock_init(&pag->pag_ici_lock); spin_lock_init(&pag->pag_ici_lock);
mutex_init(&pag->pag_ici_reclaim_lock); mutex_init(&pag->pag_ici_reclaim_lock);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
spin_lock_init(&pag->pag_buf_lock); if (xfs_buf_hash_init(pag))
pag->pag_buf_tree = RB_ROOT; goto out_unwind;
if (radix_tree_preload(GFP_NOFS)) if (radix_tree_preload(GFP_NOFS))
goto out_unwind; goto out_unwind;
@ -239,9 +240,11 @@ xfs_initialize_perag(
return 0; return 0;
out_unwind: out_unwind:
xfs_buf_hash_destroy(pag);
kmem_free(pag); kmem_free(pag);
for (; index > first_initialised; index--) { for (; index > first_initialised; index--) {
pag = radix_tree_delete(&mp->m_perag_tree, index); pag = radix_tree_delete(&mp->m_perag_tree, index);
xfs_buf_hash_destroy(pag);
kmem_free(pag); kmem_free(pag);
} }
return error; return error;

View File

@ -393,8 +393,8 @@ typedef struct xfs_perag {
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
/* buffer cache index */ /* buffer cache index */
spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
struct rb_root pag_buf_tree; /* ordered tree of active buffers */ struct rhashtable pag_buf_hash;
/* for rcu-safe freeing */ /* for rcu-safe freeing */
struct rcu_head rcu_head; struct rcu_head rcu_head;
@ -424,6 +424,9 @@ xfs_perag_resv(
} }
} }
int xfs_buf_hash_init(xfs_perag_t *pag);
void xfs_buf_hash_destroy(xfs_perag_t *pag);
extern void xfs_uuid_table_free(void); extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *); extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);