fs: move i_wb_list out from under inode_lock
Protect the inode writeback list with a new global lock inode_wb_list_lock and use it to protect the list manipulations and traversals. This lock replaces the inode_lock as the inodes on the list can be validity checked while holding the inode->i_lock and hence the inode_lock is no longer needed to protect the list. Signed-off-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
This commit is contained in:
parent
55fa6091d8
commit
a66979abad
|
@ -55,13 +55,13 @@ EXPORT_SYMBOL(I_BDEV);
|
|||
static void bdev_inode_switch_bdi(struct inode *inode,
|
||||
struct backing_dev_info *dst)
|
||||
{
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
spin_lock(&inode->i_lock);
|
||||
inode->i_data.backing_dev_info = dst;
|
||||
if (inode->i_state & I_DIRTY)
|
||||
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
|
||||
spin_unlock(&inode->i_lock);
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
}
|
||||
|
||||
static sector_t max_block(struct block_device *bdev)
|
||||
|
|
|
@ -175,6 +175,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
|
|||
spin_unlock_bh(&bdi->wb_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove the inode from the writeback list it is on.
|
||||
*/
|
||||
void inode_wb_list_del(struct inode *inode)
|
||||
{
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
list_del_init(&inode->i_wb_list);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Redirty an inode: set its when-it-was dirtied timestamp and move it to the
|
||||
* furthest end of its superblock's dirty-inode list.
|
||||
|
@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode)
|
|||
{
|
||||
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
|
||||
|
||||
assert_spin_locked(&inode_wb_list_lock);
|
||||
if (!list_empty(&wb->b_dirty)) {
|
||||
struct inode *tail;
|
||||
|
||||
|
@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode)
|
|||
{
|
||||
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
|
||||
|
||||
assert_spin_locked(&inode_wb_list_lock);
|
||||
list_move(&inode->i_wb_list, &wb->b_more_io);
|
||||
}
|
||||
|
||||
static void inode_sync_complete(struct inode *inode)
|
||||
{
|
||||
/*
|
||||
* Prevent speculative execution through spin_unlock(&inode_lock);
|
||||
* Prevent speculative execution through
|
||||
* spin_unlock(&inode_wb_list_lock);
|
||||
*/
|
||||
|
||||
smp_mb();
|
||||
wake_up_bit(&inode->i_state, __I_SYNC);
|
||||
}
|
||||
|
@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
|
|||
*/
|
||||
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
|
||||
{
|
||||
assert_spin_locked(&inode_wb_list_lock);
|
||||
list_splice_init(&wb->b_more_io, &wb->b_io);
|
||||
move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
|
||||
}
|
||||
|
@ -308,25 +324,23 @@ static void inode_wait_for_writeback(struct inode *inode)
|
|||
wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
|
||||
while (inode->i_state & I_SYNC) {
|
||||
spin_unlock(&inode->i_lock);
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
spin_lock(&inode->i_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out an inode's dirty pages. Called under inode_lock. Either the
|
||||
* caller has ref on the inode (either via __iget or via syscall against an fd)
|
||||
* or the inode has I_WILL_FREE set (via generic_forget_inode)
|
||||
* Write out an inode's dirty pages. Called under inode_wb_list_lock. Either
|
||||
* the caller has an active reference on the inode or the inode has I_WILL_FREE
|
||||
* set.
|
||||
*
|
||||
* If `wait' is set, wait on the writeout.
|
||||
*
|
||||
* The whole writeout design is quite complex and fragile. We want to avoid
|
||||
* starvation of particular inodes when others are being redirtied, prevent
|
||||
* livelocks, etc.
|
||||
*
|
||||
* Called under inode_lock.
|
||||
*/
|
||||
static int
|
||||
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
|
||||
|
@ -368,7 +382,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
|
|||
inode->i_state |= I_SYNC;
|
||||
inode->i_state &= ~I_DIRTY_PAGES;
|
||||
spin_unlock(&inode->i_lock);
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
|
||||
ret = do_writepages(mapping, wbc);
|
||||
|
||||
|
@ -388,12 +402,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
|
|||
* due to delalloc, clear dirty metadata flags right before
|
||||
* write_inode()
|
||||
*/
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode->i_lock);
|
||||
dirty = inode->i_state & I_DIRTY;
|
||||
inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
|
||||
spin_unlock(&inode->i_lock);
|
||||
spin_unlock(&inode_lock);
|
||||
/* Don't write the inode if only I_DIRTY_PAGES was set */
|
||||
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
|
||||
int err = write_inode(inode, wbc);
|
||||
|
@ -401,7 +413,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
|
|||
ret = err;
|
||||
}
|
||||
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
spin_lock(&inode->i_lock);
|
||||
inode->i_state &= ~I_SYNC;
|
||||
if (!(inode->i_state & I_FREEING)) {
|
||||
|
@ -543,10 +555,10 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
|
|||
*/
|
||||
redirty_tail(inode);
|
||||
}
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
iput(inode);
|
||||
cond_resched();
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
if (wbc->nr_to_write <= 0) {
|
||||
wbc->more_io = 1;
|
||||
return 1;
|
||||
|
@ -565,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
|
|||
|
||||
if (!wbc->wb_start)
|
||||
wbc->wb_start = jiffies; /* livelock avoidance */
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
if (!wbc->for_kupdate || list_empty(&wb->b_io))
|
||||
queue_io(wb, wbc->older_than_this);
|
||||
|
||||
|
@ -583,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
|
|||
if (ret)
|
||||
break;
|
||||
}
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
/* Leave any unwritten inodes on b_io */
|
||||
}
|
||||
|
||||
|
@ -592,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
|
|||
{
|
||||
WARN_ON(!rwsem_is_locked(&sb->s_umount));
|
||||
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
if (!wbc->for_kupdate || list_empty(&wb->b_io))
|
||||
queue_io(wb, wbc->older_than_this);
|
||||
writeback_sb_inodes(sb, wb, wbc, true);
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -735,7 +747,7 @@ static long wb_writeback(struct bdi_writeback *wb,
|
|||
* become available for writeback. Otherwise
|
||||
* we'll just busyloop.
|
||||
*/
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
if (!list_empty(&wb->b_more_io)) {
|
||||
inode = wb_inode(wb->b_more_io.prev);
|
||||
trace_wbc_writeback_wait(&wbc, wb->bdi);
|
||||
|
@ -743,7 +755,7 @@ static long wb_writeback(struct bdi_writeback *wb,
|
|||
inode_wait_for_writeback(inode);
|
||||
spin_unlock(&inode->i_lock);
|
||||
}
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
}
|
||||
|
||||
return wrote;
|
||||
|
@ -1009,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
|
|||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct backing_dev_info *bdi = NULL;
|
||||
bool wakeup_bdi = false;
|
||||
|
||||
/*
|
||||
* Don't do this for I_DIRTY_PAGES - that doesn't actually
|
||||
|
@ -1033,7 +1044,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
|
|||
if (unlikely(block_dump))
|
||||
block_dump___mark_inode_dirty(inode);
|
||||
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode->i_lock);
|
||||
if ((inode->i_state & flags) != flags) {
|
||||
const int was_dirty = inode->i_state & I_DIRTY;
|
||||
|
@ -1059,12 +1069,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
|
|||
if (inode->i_state & I_FREEING)
|
||||
goto out_unlock_inode;
|
||||
|
||||
spin_unlock(&inode->i_lock);
|
||||
/*
|
||||
* If the inode was already on b_dirty/b_io/b_more_io, don't
|
||||
* reposition it (that would break b_dirty time-ordering).
|
||||
*/
|
||||
if (!was_dirty) {
|
||||
bool wakeup_bdi = false;
|
||||
bdi = inode_to_bdi(inode);
|
||||
|
||||
if (bdi_cap_writeback_dirty(bdi)) {
|
||||
|
@ -1081,18 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
|
|||
wakeup_bdi = true;
|
||||
}
|
||||
|
||||
spin_unlock(&inode->i_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
inode->dirtied_when = jiffies;
|
||||
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
|
||||
if (wakeup_bdi)
|
||||
bdi_wakeup_thread_delayed(bdi);
|
||||
return;
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
out_unlock_inode:
|
||||
spin_unlock(&inode->i_lock);
|
||||
out:
|
||||
spin_unlock(&inode_lock);
|
||||
|
||||
if (wakeup_bdi)
|
||||
bdi_wakeup_thread_delayed(bdi);
|
||||
}
|
||||
EXPORT_SYMBOL(__mark_inode_dirty);
|
||||
|
||||
|
@ -1296,9 +1308,9 @@ int write_inode_now(struct inode *inode, int sync)
|
|||
wbc.nr_to_write = 0;
|
||||
|
||||
might_sleep();
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
ret = writeback_single_inode(inode, &wbc);
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
if (sync)
|
||||
inode_sync_wait(inode);
|
||||
return ret;
|
||||
|
@ -1320,9 +1332,9 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
|
|||
{
|
||||
int ret;
|
||||
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
ret = writeback_single_inode(inode, wbc);
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(sync_inode);
|
||||
|
|
12
fs/inode.c
12
fs/inode.c
|
@ -26,6 +26,7 @@
|
|||
#include <linux/posix_acl.h>
|
||||
#include <linux/ima.h>
|
||||
#include <linux/cred.h>
|
||||
#include "internal.h"
|
||||
|
||||
/*
|
||||
* inode locking rules.
|
||||
|
@ -36,6 +37,8 @@
|
|||
* inode_lru, inode->i_lru
|
||||
* inode_sb_list_lock protects:
|
||||
* sb->s_inodes, inode->i_sb_list
|
||||
* inode_wb_list_lock protects:
|
||||
* bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
|
||||
*
|
||||
* Lock ordering:
|
||||
* inode_lock
|
||||
|
@ -44,6 +47,9 @@
|
|||
* inode_sb_list_lock
|
||||
* inode->i_lock
|
||||
* inode_lru_lock
|
||||
*
|
||||
* inode_wb_list_lock
|
||||
* inode->i_lock
|
||||
*/
|
||||
|
||||
/*
|
||||
|
@ -105,6 +111,7 @@ static struct hlist_head *inode_hashtable __read_mostly;
|
|||
DEFINE_SPINLOCK(inode_lock);
|
||||
|
||||
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
|
||||
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
|
||||
|
||||
/*
|
||||
* iprune_sem provides exclusion between the icache shrinking and the
|
||||
|
@ -483,10 +490,7 @@ static void evict(struct inode *inode)
|
|||
BUG_ON(!(inode->i_state & I_FREEING));
|
||||
BUG_ON(!list_empty(&inode->i_lru));
|
||||
|
||||
spin_lock(&inode_lock);
|
||||
list_del_init(&inode->i_wb_list);
|
||||
spin_unlock(&inode_lock);
|
||||
|
||||
inode_wb_list_del(inode);
|
||||
inode_sb_list_del(inode);
|
||||
|
||||
if (op->evict_inode) {
|
||||
|
|
|
@ -127,6 +127,11 @@ extern long do_handle_open(int mountdirfd,
|
|||
*/
|
||||
extern spinlock_t inode_sb_list_lock;
|
||||
|
||||
/*
|
||||
* fs-writeback.c
|
||||
*/
|
||||
extern void inode_wb_list_del(struct inode *inode);
|
||||
|
||||
extern int get_nr_dirty_inodes(void);
|
||||
extern void evict_inodes(struct super_block *);
|
||||
extern int invalidate_inodes(struct super_block *, bool);
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
struct backing_dev_info;
|
||||
|
||||
extern spinlock_t inode_lock;
|
||||
extern spinlock_t inode_wb_list_lock;
|
||||
|
||||
/*
|
||||
* fs/fs-writeback.c
|
||||
|
|
|
@ -73,14 +73,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
|
|||
struct inode *inode;
|
||||
|
||||
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
|
||||
nr_dirty++;
|
||||
list_for_each_entry(inode, &wb->b_io, i_wb_list)
|
||||
nr_io++;
|
||||
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
|
||||
nr_more_io++;
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
|
||||
global_dirty_limits(&background_thresh, &dirty_thresh);
|
||||
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
|
||||
|
@ -682,11 +682,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
|
|||
if (bdi_has_dirty_io(bdi)) {
|
||||
struct bdi_writeback *dst = &default_backing_dev_info.wb;
|
||||
|
||||
spin_lock(&inode_lock);
|
||||
spin_lock(&inode_wb_list_lock);
|
||||
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
|
||||
list_splice(&bdi->wb.b_io, &dst->b_io);
|
||||
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
|
||||
spin_unlock(&inode_lock);
|
||||
spin_unlock(&inode_wb_list_lock);
|
||||
}
|
||||
|
||||
bdi_unregister(bdi);
|
||||
|
|
|
@ -80,8 +80,8 @@
|
|||
* ->i_mutex
|
||||
* ->i_alloc_sem (various)
|
||||
*
|
||||
* ->inode_lock
|
||||
* ->sb_lock (fs/fs-writeback.c)
|
||||
* inode_wb_list_lock
|
||||
* sb_lock (fs/fs-writeback.c)
|
||||
* ->mapping->tree_lock (__sync_single_inode)
|
||||
*
|
||||
* ->i_mmap_lock
|
||||
|
@ -98,9 +98,9 @@
|
|||
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
|
||||
* ->private_lock (page_remove_rmap->set_page_dirty)
|
||||
* ->tree_lock (page_remove_rmap->set_page_dirty)
|
||||
* ->inode_lock (page_remove_rmap->set_page_dirty)
|
||||
* inode_wb_list_lock (page_remove_rmap->set_page_dirty)
|
||||
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
|
||||
* ->inode_lock (zap_pte_range->set_page_dirty)
|
||||
* inode_wb_list_lock (zap_pte_range->set_page_dirty)
|
||||
* ->inode->i_lock (zap_pte_range->set_page_dirty)
|
||||
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
|
||||
*
|
||||
|
|
|
@ -31,12 +31,12 @@
|
|||
* swap_lock (in swap_duplicate, swap_info_get)
|
||||
* mmlist_lock (in mmput, drain_mmlist and others)
|
||||
* mapping->private_lock (in __set_page_dirty_buffers)
|
||||
* inode_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* sb_lock (within inode_lock in fs/fs-writeback.c)
|
||||
* mapping->tree_lock (widely used, in set_page_dirty,
|
||||
* in arch-dependent flush_dcache_mmap_lock,
|
||||
* within inode_lock in __sync_single_inode)
|
||||
* within inode_wb_list_lock in __sync_single_inode)
|
||||
*
|
||||
* (code doesn't rely on that order so it could be switched around)
|
||||
* ->tasklist_lock
|
||||
|
|
Loading…
Reference in New Issue