Btrfs: extent_map and data=ordered fixes for space balancing

* Add an EXTENT_BOUNDARY state bit to keep the writepage code
from merging data extents that are in the process of being
relocated.  This allows us to do accounting for them properly.

* The balancing code relocates data extents indepdent of the underlying
inode.  The extent_map code was modified to properly account for
things moving around (invalidating extent_map caches in the inode).

* Don't take the drop_mutex in the create_subvol ioctl.  It isn't
required.

* Fix walking of the ordered extent list to avoid races with sys_unlink

* Change the lock ordering rules.  Transaction start goes outside
the drop_mutex.  This allows btrfs_commit_transaction to directly
drop the relocation trees.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
This commit is contained in:
Zheng Yan 2008-09-26 10:05:38 -04:00 committed by Chris Mason
parent e465768938
commit 5b21f2ed3f
10 changed files with 108 additions and 49 deletions

View File

@ -290,7 +290,6 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret, u64 prealloc_dest)
{
u64 search_start;
u64 header_trans;
int ret;
if (trans->transaction != root->fs_info->running_transaction) {
@ -304,9 +303,9 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(1);
}
header_trans = btrfs_header_generation(buf);
spin_lock(&root->fs_info->hash_lock);
if (header_trans == trans->transid &&
if (btrfs_header_generation(buf) == trans->transid &&
btrfs_header_owner(buf) == root->root_key.objectid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
*cow_ret = buf;
spin_unlock(&root->fs_info->hash_lock);
@ -1300,6 +1299,7 @@ again:
/* is a cow on this block not required */
spin_lock(&root->fs_info->hash_lock);
if (btrfs_header_generation(b) == trans->transid &&
btrfs_header_owner(b) == root->root_key.objectid &&
!btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
spin_unlock(&root->fs_info->hash_lock);
goto cow_done;
@ -1396,7 +1396,8 @@ cow_done:
/* this is only true while dropping a snapshot */
if (level == lowest_level) {
break;
ret = 0;
goto done;
}
blocknr = btrfs_node_blockptr(b, slot);

View File

@ -1486,6 +1486,9 @@ static inline struct dentry *fdentry(struct file *file)
/* extent-tree.c */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 num_bytes, u32 *refs);
int btrfs_update_pinned_extents(struct btrfs_root *root,
u64 bytenr, u64 num, int pin);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@ -1812,6 +1815,8 @@ void btrfs_destroy_inode(struct inode *inode);
int btrfs_init_cachep(void);
void btrfs_destroy_cachep(void);
long btrfs_ioctl_trans_end(struct file *file);
struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
struct btrfs_root *root, int wait);
struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
struct btrfs_root *root);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
@ -1824,13 +1829,17 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* file.c */
int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
extern struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,

View File

@ -292,7 +292,7 @@ static int merge_state(struct extent_io_tree *tree,
struct extent_state *other;
struct rb_node *other_node;
if (state->state & EXTENT_IOBITS)
if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
return 0;
other_node = rb_prev(&state->rb_node);
@ -1070,7 +1070,8 @@ search_again:
while(1) {
state = rb_entry(node, struct extent_state, rb_node);
if (found && state->start != cur_start) {
if (found && (state->start != cur_start ||
(state->state & EXTENT_BOUNDARY))) {
goto out;
}
if (!(state->state & EXTENT_DELALLOC)) {
@ -1078,7 +1079,7 @@ search_again:
*end = state->end;
goto out;
}
if (!found) {
if (!found && !(state->state & EXTENT_BOUNDARY)) {
struct extent_state *prev_state;
struct rb_node *prev_node = node;
while(1) {
@ -1088,7 +1089,11 @@ search_again:
prev_state = rb_entry(prev_node,
struct extent_state,
rb_node);
if (!(prev_state->state & EXTENT_DELALLOC))
if ((prev_state->end + 1 != state->start) ||
!(prev_state->state & EXTENT_DELALLOC))
break;
if ((cur_start - prev_state->start) * 2 >
max_bytes)
break;
state = prev_state;
node = prev_node;

View File

@ -15,6 +15,7 @@
#define EXTENT_BUFFER_FILLED (1 << 8)
#define EXTENT_ORDERED (1 << 9)
#define EXTENT_ORDERED_METADATA (1 << 10)
#define EXTENT_BOUNDARY (1 << 11)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
/*

View File

@ -294,7 +294,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
last_pos_in_file,
0, 0, hole_size, 0);
btrfs_drop_extent_cache(inode, last_pos_in_file,
last_pos_in_file + hole_size -1);
last_pos_in_file + hole_size - 1, 0);
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
btrfs_check_file(root, inode);
}
@ -337,7 +337,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
inline_size -= start_pos;
err = insert_inline_extent(trans, root, inode, start_pos,
inline_size, pages, 0, num_pages);
btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1);
btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
BUG_ON(err);
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@ -362,7 +362,8 @@ out_unlock:
return err;
}
int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned)
{
struct extent_map *em;
struct extent_map *split = NULL;
@ -371,6 +372,7 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
u64 len = end - start + 1;
int ret;
int testend = 1;
unsigned long flags;
WARN_ON(end < start);
if (end == (u64)-1) {
@ -389,6 +391,23 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
spin_unlock(&em_tree->lock);
break;
}
flags = em->flags;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
spin_unlock(&em_tree->lock);
if (em->start <= start &&
(!testend || em->start + em->len >= start + len)) {
free_extent_map(em);
break;
}
if (start < em->start) {
len = em->start - start;
} else {
len = start + len - (em->start + em->len);
start = em->start + em->len;
}
free_extent_map(em);
continue;
}
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
remove_extent_mapping(em_tree, em);
@ -398,7 +417,7 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
split->len = start - em->start;
split->block_start = em->block_start;
split->bdev = em->bdev;
split->flags = em->flags;
split->flags = flags;
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret);
free_extent_map(split);
@ -412,7 +431,7 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
split->start = start + len;
split->len = em->start + em->len - (start + len);
split->bdev = em->bdev;
split->flags = em->flags;
split->flags = flags;
split->block_start = em->block_start + diff;
@ -541,7 +560,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
int recow;
int ret;
btrfs_drop_extent_cache(inode, start, end - 1);
btrfs_drop_extent_cache(inode, start, end - 1, 0);
path = btrfs_alloc_path();
if (!path)

View File

@ -117,10 +117,14 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
*objectid = last_ino;
goto found;
}
} else if (key.objectid > search_start) {
*objectid = search_start;
goto found;
}
}
if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
break;
start_found = 1;
last_ino = key.objectid + 1;
path->slots[0]++;

View File

@ -135,7 +135,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
mutex_lock(&BTRFS_I(inode)->extent_mutex);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
while(num_bytes > 0) {
@ -163,7 +163,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
break;
}
btrfs_drop_extent_cache(inode, start,
start + ins.offset - 1);
start + ins.offset - 1, 0);
}
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@ -587,7 +587,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
ordered_extent->file_offset +
ordered_extent->len - 1);
ordered_extent->len - 1, 0);
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
ins.objectid = ordered_extent->start;
@ -880,7 +880,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
int ret = 0, nr_unlink = 0, nr_truncate = 0;
/* don't do orphan cleanup if the fs is readonly. */
if (root->inode->i_sb->s_flags & MS_RDONLY)
if (root->fs_info->sb->s_flags & MS_RDONLY)
return;
path = btrfs_alloc_path();
@ -892,8 +892,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
key.offset = (u64)-1;
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, root->inode);
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@ -933,7 +931,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
* crossing root thing. we store the inode number in the
* offset of the orphan item.
*/
inode = btrfs_iget_locked(root->inode->i_sb,
inode = btrfs_iget_locked(root->fs_info->sb,
found_key.offset, root);
if (!inode)
break;
@ -965,7 +963,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
* do a destroy_inode
*/
if (is_bad_inode(inode)) {
trans = btrfs_start_transaction(root, 1);
btrfs_orphan_del(trans, inode);
btrfs_end_transaction(trans, root);
iput(inode);
continue;
}
@ -988,7 +988,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
}
void btrfs_read_locked_inode(struct inode *inode)
@ -1343,8 +1342,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 mask = root->sectorsize - 1;
if (root->ref_cows)
btrfs_drop_extent_cache(inode,
new_size & (~mask), (u64)-1);
btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
path = btrfs_alloc_path();
path->reada = -1;
BUG_ON(!path);
@ -1677,7 +1675,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
hole_start, 0, 0,
hole_size, 0);
btrfs_drop_extent_cache(inode, hole_start,
(u64)-1);
(u64)-1, 0);
btrfs_check_file(root, inode);
}
mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@ -1843,6 +1841,24 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
args->root == BTRFS_I(inode)->root);
}
struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
struct btrfs_root *root, int wait)
{
struct inode *inode;
struct btrfs_iget_args args;
args.ino = objectid;
args.root = root;
if (wait) {
inode = ilookup5(s, objectid, btrfs_find_actor,
(void *)&args);
} else {
inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
(void *)&args);
}
return inode;
}
struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
struct btrfs_root *root)
{
@ -3266,7 +3282,7 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
btrfs_drop_extent_cache(inode, 0, (u64)-1);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@ -3412,16 +3428,22 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
{
struct list_head *head = &root->fs_info->delalloc_inodes;
struct btrfs_inode *binode;
struct inode *inode;
unsigned long flags;
spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
while(!list_empty(head)) {
binode = list_entry(head->next, struct btrfs_inode,
delalloc_inodes);
atomic_inc(&binode->vfs_inode.i_count);
inode = igrab(&binode->vfs_inode);
if (!inode)
list_del_init(&binode->delalloc_inodes);
spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
filemap_write_and_wait(binode->vfs_inode.i_mapping);
iput(&binode->vfs_inode);
if (inode) {
filemap_write_and_wait(inode->i_mapping);
iput(inode);
}
cond_resched();
spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
}
spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);

View File

@ -444,12 +444,10 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
goto out;
}
mutex_lock(&root->fs_info->drop_mutex);
if (root == root->fs_info->tree_root)
ret = create_subvol(root, vol_args->name, namelen);
else
ret = create_snapshot(root, vol_args->name, namelen);
mutex_unlock(&root->fs_info->drop_mutex);
out:
kfree(vol_args);
return ret;

View File

@ -309,7 +309,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
{
struct list_head splice;
struct list_head *cur;
struct list_head *tmp;
struct btrfs_ordered_extent *ordered;
struct inode *inode;
@ -317,37 +316,38 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
spin_lock(&root->fs_info->ordered_extent_lock);
list_splice_init(&root->fs_info->ordered_extents, &splice);
list_for_each_safe(cur, tmp, &splice) {
while (!list_empty(&splice)) {
cur = splice.next;
ordered = list_entry(cur, struct btrfs_ordered_extent,
root_extent_list);
if (nocow_only &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
list_move(&ordered->root_extent_list,
&root->fs_info->ordered_extents);
cond_resched_lock(&root->fs_info->ordered_extent_lock);
continue;
}
list_del_init(&ordered->root_extent_list);
atomic_inc(&ordered->refs);
inode = ordered->inode;
/*
* the inode can't go away until all the pages are gone
* and the pages won't go away while there is still
* an ordered extent and the ordered extent won't go
* away until it is off this list. So, we can safely
* increment i_count here and call iput later
* the inode may be getting freed (in sys_unlink path).
*/
atomic_inc(&inode->i_count);
inode = igrab(ordered->inode);
spin_unlock(&root->fs_info->ordered_extent_lock);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
iput(inode);
if (inode) {
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
iput(inode);
} else {
btrfs_put_ordered_extent(ordered);
}
spin_lock(&root->fs_info->ordered_extent_lock);
}
list_splice_init(&splice, &root->fs_info->ordered_extents);
spin_unlock(&root->fs_info->ordered_extent_lock);
return 0;
}

View File

@ -109,6 +109,7 @@ noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
spin_lock_init(&dirty->root->node_lock);
spin_lock_init(&dirty->root->list_lock);
mutex_init(&dirty->root->objectid_mutex);
mutex_init(&dirty->root->log_mutex);
INIT_LIST_HEAD(&dirty->root->dead_list);
dirty->root->node = root->commit_root;
dirty->root->commit_root = NULL;
@ -590,13 +591,14 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
root = dirty->latest_root;
atomic_inc(&root->fs_info->throttles);
mutex_lock(&root->fs_info->drop_mutex);
while(1) {
trans = btrfs_start_transaction(tree_root, 1);
mutex_lock(&root->fs_info->drop_mutex);
ret = btrfs_drop_snapshot(trans, dirty->root);
if (ret != -EAGAIN) {
break;
}
mutex_unlock(&root->fs_info->drop_mutex);
err = btrfs_update_root(trans,
tree_root,
@ -608,10 +610,8 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
ret = btrfs_end_transaction(trans, tree_root);
BUG_ON(ret);
mutex_unlock(&root->fs_info->drop_mutex);
btrfs_btree_balance_dirty(tree_root, nr);
cond_resched();
mutex_lock(&root->fs_info->drop_mutex);
}
BUG_ON(ret);
atomic_dec(&root->fs_info->throttles);
@ -689,7 +689,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
key.objectid = objectid;
key.offset = 1;
key.offset = trans->transid;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
old = btrfs_lock_root_node(root);