Btrfs: use percpu counter for dirty metadata count
->dirty_metadata_bytes is accessed very frequently, so use a percpu counter instead of the u64 variable to reduce lock contention. This patch also fixes the problem that it was accessed without lock protection in __btrfs_btree_balance_dirty(), which could cause us to skip flushing the dirty pages. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com>
This commit is contained in:
parent
c018daecea
commit
e2d845211e
|
@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
|
||||||
/* ioprio of readahead is set to idle */
|
/* ioprio of readahead is set to idle */
|
||||||
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
|
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
|
||||||
|
|
||||||
|
#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The key defines the order in the tree, and so it also defines (optimal)
|
* The key defines the order in the tree, and so it also defines (optimal)
|
||||||
* block layout.
|
* block layout.
|
||||||
|
@ -1448,10 +1450,9 @@ struct btrfs_fs_info {
|
||||||
|
|
||||||
u64 total_pinned;
|
u64 total_pinned;
|
||||||
|
|
||||||
/* protected by the delalloc lock, used to keep from writing
|
/* used to keep from writing metadata until there is a nice batch */
|
||||||
* metadata until there is a nice batch
|
struct percpu_counter dirty_metadata_bytes;
|
||||||
*/
|
s32 dirty_metadata_batch;
|
||||||
u64 dirty_metadata_bytes;
|
|
||||||
struct list_head dirty_cowonly_roots;
|
struct list_head dirty_cowonly_roots;
|
||||||
|
|
||||||
struct btrfs_fs_devices *fs_devices;
|
struct btrfs_fs_devices *fs_devices;
|
||||||
|
|
|
@ -946,18 +946,20 @@ static int btree_writepages(struct address_space *mapping,
|
||||||
struct writeback_control *wbc)
|
struct writeback_control *wbc)
|
||||||
{
|
{
|
||||||
struct extent_io_tree *tree;
|
struct extent_io_tree *tree;
|
||||||
|
struct btrfs_fs_info *fs_info;
|
||||||
|
int ret;
|
||||||
|
|
||||||
tree = &BTRFS_I(mapping->host)->io_tree;
|
tree = &BTRFS_I(mapping->host)->io_tree;
|
||||||
if (wbc->sync_mode == WB_SYNC_NONE) {
|
if (wbc->sync_mode == WB_SYNC_NONE) {
|
||||||
struct btrfs_root *root = BTRFS_I(mapping->host)->root;
|
|
||||||
u64 num_dirty;
|
|
||||||
unsigned long thresh = 32 * 1024 * 1024;
|
|
||||||
|
|
||||||
if (wbc->for_kupdate)
|
if (wbc->for_kupdate)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
fs_info = BTRFS_I(mapping->host)->root->fs_info;
|
||||||
/* this is a bit racy, but that's ok */
|
/* this is a bit racy, but that's ok */
|
||||||
num_dirty = root->fs_info->dirty_metadata_bytes;
|
ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
|
||||||
if (num_dirty < thresh)
|
BTRFS_DIRTY_METADATA_THRESH);
|
||||||
|
if (ret < 0)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return btree_write_cache_pages(mapping, wbc);
|
return btree_write_cache_pages(mapping, wbc);
|
||||||
|
@ -1125,24 +1127,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
|
||||||
void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
||||||
struct extent_buffer *buf)
|
struct extent_buffer *buf)
|
||||||
{
|
{
|
||||||
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||||
|
|
||||||
if (btrfs_header_generation(buf) ==
|
if (btrfs_header_generation(buf) ==
|
||||||
root->fs_info->running_transaction->transid) {
|
fs_info->running_transaction->transid) {
|
||||||
btrfs_assert_tree_locked(buf);
|
btrfs_assert_tree_locked(buf);
|
||||||
|
|
||||||
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
|
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
|
||||||
spin_lock(&root->fs_info->delalloc_lock);
|
__percpu_counter_add(&fs_info->dirty_metadata_bytes,
|
||||||
if (root->fs_info->dirty_metadata_bytes >= buf->len)
|
-buf->len,
|
||||||
root->fs_info->dirty_metadata_bytes -= buf->len;
|
fs_info->dirty_metadata_batch);
|
||||||
else {
|
|
||||||
spin_unlock(&root->fs_info->delalloc_lock);
|
|
||||||
btrfs_panic(root->fs_info, -EOVERFLOW,
|
|
||||||
"Can't clear %lu bytes from "
|
|
||||||
" dirty_mdatadata_bytes (%llu)",
|
|
||||||
buf->len,
|
|
||||||
root->fs_info->dirty_metadata_bytes);
|
|
||||||
}
|
|
||||||
spin_unlock(&root->fs_info->delalloc_lock);
|
|
||||||
|
|
||||||
/* ugh, clear_extent_buffer_dirty needs to lock the page */
|
/* ugh, clear_extent_buffer_dirty needs to lock the page */
|
||||||
btrfs_set_lock_blocking(buf);
|
btrfs_set_lock_blocking(buf);
|
||||||
clear_extent_buffer_dirty(buf);
|
clear_extent_buffer_dirty(buf);
|
||||||
|
@ -2008,10 +2002,18 @@ int open_ctree(struct super_block *sb,
|
||||||
goto fail_srcu;
|
goto fail_srcu;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
|
||||||
|
if (ret) {
|
||||||
|
err = ret;
|
||||||
|
goto fail_bdi;
|
||||||
|
}
|
||||||
|
fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
|
||||||
|
(1 + ilog2(nr_cpu_ids));
|
||||||
|
|
||||||
fs_info->btree_inode = new_inode(sb);
|
fs_info->btree_inode = new_inode(sb);
|
||||||
if (!fs_info->btree_inode) {
|
if (!fs_info->btree_inode) {
|
||||||
err = -ENOMEM;
|
err = -ENOMEM;
|
||||||
goto fail_bdi;
|
goto fail_dirty_metadata_bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
|
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
|
||||||
|
@ -2266,6 +2268,7 @@ int open_ctree(struct super_block *sb,
|
||||||
leafsize = btrfs_super_leafsize(disk_super);
|
leafsize = btrfs_super_leafsize(disk_super);
|
||||||
sectorsize = btrfs_super_sectorsize(disk_super);
|
sectorsize = btrfs_super_sectorsize(disk_super);
|
||||||
stripesize = btrfs_super_stripesize(disk_super);
|
stripesize = btrfs_super_stripesize(disk_super);
|
||||||
|
fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* mixed block groups end up with duplicate but slightly offset
|
* mixed block groups end up with duplicate but slightly offset
|
||||||
|
@ -2728,6 +2731,8 @@ fail_iput:
|
||||||
|
|
||||||
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
|
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
|
||||||
iput(fs_info->btree_inode);
|
iput(fs_info->btree_inode);
|
||||||
|
fail_dirty_metadata_bytes:
|
||||||
|
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
|
||||||
fail_bdi:
|
fail_bdi:
|
||||||
bdi_destroy(&fs_info->bdi);
|
bdi_destroy(&fs_info->bdi);
|
||||||
fail_srcu:
|
fail_srcu:
|
||||||
|
@ -3406,6 +3411,7 @@ int close_ctree(struct btrfs_root *root)
|
||||||
btrfs_close_devices(fs_info->fs_devices);
|
btrfs_close_devices(fs_info->fs_devices);
|
||||||
btrfs_mapping_tree_free(&fs_info->mapping_tree);
|
btrfs_mapping_tree_free(&fs_info->mapping_tree);
|
||||||
|
|
||||||
|
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
|
||||||
bdi_destroy(&fs_info->bdi);
|
bdi_destroy(&fs_info->bdi);
|
||||||
cleanup_srcu_struct(&fs_info->subvol_srcu);
|
cleanup_srcu_struct(&fs_info->subvol_srcu);
|
||||||
|
|
||||||
|
@ -3448,11 +3454,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
|
||||||
(unsigned long long)transid,
|
(unsigned long long)transid,
|
||||||
(unsigned long long)root->fs_info->generation);
|
(unsigned long long)root->fs_info->generation);
|
||||||
was_dirty = set_extent_buffer_dirty(buf);
|
was_dirty = set_extent_buffer_dirty(buf);
|
||||||
if (!was_dirty) {
|
if (!was_dirty)
|
||||||
spin_lock(&root->fs_info->delalloc_lock);
|
__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
|
||||||
root->fs_info->dirty_metadata_bytes += buf->len;
|
buf->len,
|
||||||
spin_unlock(&root->fs_info->delalloc_lock);
|
root->fs_info->dirty_metadata_batch);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
|
static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
|
||||||
|
@ -3462,8 +3467,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
|
||||||
* looks as though older kernels can get into trouble with
|
* looks as though older kernels can get into trouble with
|
||||||
* this code, they end up stuck in balance_dirty_pages forever
|
* this code, they end up stuck in balance_dirty_pages forever
|
||||||
*/
|
*/
|
||||||
u64 num_dirty;
|
int ret;
|
||||||
unsigned long thresh = 32 * 1024 * 1024;
|
|
||||||
|
|
||||||
if (current->flags & PF_MEMALLOC)
|
if (current->flags & PF_MEMALLOC)
|
||||||
return;
|
return;
|
||||||
|
@ -3471,9 +3475,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
|
||||||
if (flush_delayed)
|
if (flush_delayed)
|
||||||
btrfs_balance_delayed_items(root);
|
btrfs_balance_delayed_items(root);
|
||||||
|
|
||||||
num_dirty = root->fs_info->dirty_metadata_bytes;
|
ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
|
||||||
|
BTRFS_DIRTY_METADATA_THRESH);
|
||||||
if (num_dirty > thresh) {
|
if (ret > 0) {
|
||||||
balance_dirty_pages_ratelimited_nr(
|
balance_dirty_pages_ratelimited_nr(
|
||||||
root->fs_info->btree_inode->i_mapping, 1);
|
root->fs_info->btree_inode->i_mapping, 1);
|
||||||
}
|
}
|
||||||
|
|
|
@ -3122,12 +3122,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
|
||||||
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
|
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
|
||||||
spin_unlock(&eb->refs_lock);
|
spin_unlock(&eb->refs_lock);
|
||||||
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
|
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
|
||||||
spin_lock(&fs_info->delalloc_lock);
|
__percpu_counter_add(&fs_info->dirty_metadata_bytes,
|
||||||
if (fs_info->dirty_metadata_bytes >= eb->len)
|
-eb->len,
|
||||||
fs_info->dirty_metadata_bytes -= eb->len;
|
fs_info->dirty_metadata_batch);
|
||||||
else
|
|
||||||
WARN_ON(1);
|
|
||||||
spin_unlock(&fs_info->delalloc_lock);
|
|
||||||
ret = 1;
|
ret = 1;
|
||||||
} else {
|
} else {
|
||||||
spin_unlock(&eb->refs_lock);
|
spin_unlock(&eb->refs_lock);
|
||||||
|
|
Loading…
Reference in New Issue