diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0d1e2ba47a9c..f37e09e724f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1343,7 +1343,7 @@ again: unlock_extent(io_tree, page_start, page_end, GFP_NOFS); unlock_page(page); page_cache_release(page); - btrfs_wait_ordered_extent(inode, ordered); + btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -2660,6 +2660,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) ordered = btrfs_lookup_ordered_extent(page->mapping->host, page_offset(page)); if (ordered) { + /* + * IO on this page will never be started, so we need + * to account for any ordered extents now + */ clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_LOCKED, 1, 0, GFP_NOFS); @@ -2732,11 +2736,15 @@ again: lock_extent(io_tree, page_start, page_end, GFP_NOFS); set_page_extent_mapped(page); + /* + * we can't set the delalloc bits if there are pending ordered + * extents. Drop our locks and wait for them to finish + */ ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { unlock_extent(io_tree, page_start, page_end, GFP_NOFS); unlock_page(page); - btrfs_wait_ordered_extent(inode, ordered); + btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b739e3abebb9..230fd3ca6b2c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -136,6 +136,19 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } +/* allocate and add a new ordered_extent into the per-inode tree. + * file_offset is the logical offset in the file + * + * start is the disk block number of an extent already reserved in the + * extent allocation tree + * + * len is the length of the extent + * + * This also sets the EXTENT_ORDERED bit on the range in the inode. + * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len) { @@ -152,7 +165,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; - entry->inode = inode; /* one ref for the tree */ atomic_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -167,12 +179,15 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, entry_end(entry) - 1, GFP_NOFS); - set_bit(BTRFS_ORDERED_START, &entry->flags); mutex_unlock(&tree->mutex); BUG_ON(node); return 0; } +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. + */ int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum) { struct btrfs_ordered_inode_tree *tree; @@ -182,29 +197,25 @@ int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum) tree = &BTRFS_I(inode)->ordered_tree; mutex_lock(&tree->mutex); node = tree_search(tree, sum->file_offset); - if (!node) { -search_fail: -printk("add ordered sum failed to find a node for inode %lu offset %Lu\n", inode->i_ino, sum->file_offset); - node = rb_first(&tree->tree); - while(node) { - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - printk("entry %Lu %Lu %Lu\n", entry->file_offset, entry->file_offset + entry->len, entry->start); - node = rb_next(node); - } - BUG(); - } BUG_ON(!node); entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, sum->file_offset)) { - goto search_fail; - } + BUG_ON(!offset_in_entry(entry, sum->file_offset)); list_add_tail(&sum->list, &entry->list); mutex_unlock(&tree->mutex); return 0; } +/* + * this is used to account for finished IO across a given range + * of the file. The IO should not span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + */ int btrfs_dec_test_ordered_pending(struct inode *inode, u64 file_offset, u64 io_size) { @@ -233,9 +244,6 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, ret = test_range_bit(io_tree, entry->file_offset, entry->file_offset + entry->len - 1, EXTENT_ORDERED, 0); - if (!test_bit(BTRFS_ORDERED_START, &entry->flags)) { -printk("inode %lu not ready yet for extent %Lu %Lu\n", inode->i_ino, entry->file_offset, entry_end(entry)); - } if (ret == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); out: @@ -243,6 +251,10 @@ out: return ret == 0; } +/* + * used to drop a reference on an ordered extent. This will free + * the extent if the last reference is dropped + */ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) { struct list_head *cur; @@ -260,6 +272,10 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) return 0; } +/* + * remove an ordered extent from the tree. No references are dropped + * but, anyone waiting on this extent is woken up. + */ int btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry) { @@ -277,27 +293,25 @@ int btrfs_remove_ordered_extent(struct inode *inode, return 0; } -void btrfs_wait_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) -{ - u64 start = entry->file_offset; - u64 end = start + entry->len - 1; -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) - do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE); -#else - do_sync_mapping_range(inode->i_mapping, start, end, - SYNC_FILE_RANGE_WRITE); -#endif - wait_event(entry->wait, - test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); -} - -static void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, int wait) +/* + * Used to start IO or wait for a given ordered extent to finish. + * + * If wait is one, this effectively waits on page writeback for all the pages + * in the extent, and it waits on the io completion code to insert + * metadata into the btree corresponding to the extent + */ +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, + int wait) { u64 start = entry->file_offset; u64 end = start + entry->len - 1; + /* + * pages in the range can be dirty, clean or writeback. We + * start IO on any dirty ones so the wait doesn't stall waiting + * for pdflush to find them + */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE); #else @@ -309,6 +323,9 @@ static void btrfs_start_ordered_extent(struct inode *inode, &entry->flags)); } +/* + * Used to wait on ordered extents across a large range of bytes. + */ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { u64 end; @@ -349,31 +366,11 @@ again: } } -int btrfs_add_ordered_pending(struct inode *inode, - struct btrfs_ordered_extent *ordered, - u64 start, u64 len) -{ - WARN_ON(1); - return 0; -#if 0 - int ret; - struct btrfs_ordered_inode_tree *tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - - tree = &BTRFS_I(inode)->ordered_tree; - mutex_lock(&tree->mutex); - if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) { - ret = -EAGAIN; - goto out; - } - set_extent_ordered(io_tree, start, start + len - 1, GFP_NOFS); - ret = 0; -out: - mutex_unlock(&tree->mutex); - return ret; -#endif -} +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset) { @@ -397,6 +394,10 @@ out: return entry; } +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset) { @@ -417,6 +418,10 @@ out: return entry; } +/* + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. + */ int btrfs_ordered_update_i_size(struct inode *inode, struct btrfs_ordered_extent *ordered) { @@ -507,6 +512,11 @@ out: return 0; } +/* + * search the ordered extents for one corresponding to 'offset' and + * try to find a checksum. This is used because we allow pages to + * be reclaimed before their checksum is actually put into the btree + */ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum) { struct btrfs_ordered_sum *ordered_sum; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 33f0d9e91b11..98f491d1022b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -19,12 +19,19 @@ #ifndef __BTRFS_ORDERED_DATA__ #define __BTRFS_ORDERED_DATA__ +/* one of these per inode */ struct btrfs_ordered_inode_tree { struct mutex mutex; struct rb_root tree; struct rb_node *last; }; +/* + * these are used to collect checksums done just before bios submission. + * They are attached via a list into the ordered extent, and + * checksum items are inserted into the tree after all the blocks in + * the ordered extent are on disk + */ struct btrfs_sector_sum { u64 offset; u32 sum; @@ -34,27 +41,56 @@ struct btrfs_ordered_sum { u64 file_offset; u64 len; struct list_head list; + /* last field is a variable length array of btrfs_sector_sums */ struct btrfs_sector_sum sums; }; -/* bits for the flags field */ +/* + * bits for the flags field: + * + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. + * It is used to make sure metadata is inserted into the tree only once + * per extent. + * + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the + * rbtree, just before waking any waiters. It is used to indicate the + * IO is done and any metadata is inserted into the tree. + */ #define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ + #define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ -#define BTRFS_ORDERED_START 2 /* set when tree setup */ struct btrfs_ordered_extent { + /* logical offset in the file */ u64 file_offset; + + /* disk byte number */ u64 start; + + /* length of the extent in bytes */ u64 len; + + /* flags (described above) */ unsigned long flags; + + /* reference count */ atomic_t refs; + + /* list of checksums for insertion when the extent io is done */ struct list_head list; - struct inode *inode; + + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; + + /* our friendly rbtree entry */ struct rb_node rb_node; }; +/* + * calculates the total size you need to allocate for an ordered sum + * structure spanning 'bytes' in the file + */ static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes) { unsigned long num_sectors = (bytes + root->sectorsize - 1) / @@ -81,14 +117,11 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset); -void btrfs_wait_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry); +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, int wait); void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); -int btrfs_add_ordered_pending(struct inode *inode, - struct btrfs_ordered_extent *ordered, - u64 start, u64 len); int btrfs_ordered_update_i_size(struct inode *inode, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);