From 2c96ce9f2084c1e04d02883e622f74a537a63aea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Sep 2009 09:43:56 +0200 Subject: [PATCH 01/16] fs: remove bdev->bd_inode_backing_dev_info It has been unused since it was introduced in: commit 520808bf20e90fdbdb320264ba7dd5cf9d47dcac Author: Andrew Morton Date: Fri May 21 00:46:17 2004 -0700 [PATCH] block device layer: separate backing_dev_info infrastructure So lets just kill it. Acked-by: Jan Kara Signed-off-by: Jens Axboe --- fs/block_dev.c | 1 - fs/inode.c | 4 +--- fs/nilfs2/the_nilfs.c | 4 +--- include/linux/fs.h | 1 - 4 files changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 3581a4e53942..71e7e03ac343 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode) { struct bdev_inode *bdi = BDEV_I(inode); - bdi->bdev.bd_inode_backing_dev_info = NULL; kmem_cache_free(bdev_cachep, bdi); } diff --git a/fs/inode.c b/fs/inode.c index ae7b67e48661..b2ba83d2c4e1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -182,9 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) if (sb->s_bdev) { struct backing_dev_info *bdi; - bdi = sb->s_bdev->bd_inode_backing_dev_info; - if (!bdi) - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; mapping->backing_dev_info = bdi; } inode->i_private = NULL; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d4168e269c5d..ad391a8c3e7e 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -591,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); - bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; - if (!bdi) - bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; + bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; nilfs->ns_bdi = bdi ? : &default_backing_dev_info; /* Finding last segment */ diff --git a/include/linux/fs.h b/include/linux/fs.h index b21cf6b9c80b..db29588874ac 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -655,7 +655,6 @@ struct block_device { int bd_invalidated; struct gendisk * bd_disk; struct list_head bd_list; - struct backing_dev_info *bd_inode_backing_dev_info; /* * Private data. You must have bd_claim'ed the block_device * to use this. NOTE: bd_claim allows an owner to claim From 1fe06ad89255c211fe100d7f690d10b161398df8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Sep 2009 15:10:20 +0200 Subject: [PATCH 02/16] writeback: get rid of wbc->for_writepages It's only set, it's never checked. Kill it. Acked-by: Jan Kara Signed-off-by: Jens Axboe --- fs/afs/write.c | 1 - fs/btrfs/ordered-data.c | 1 - fs/jbd2/commit.c | 1 - fs/nfs/write.c | 1 - include/linux/writeback.h | 1 - include/trace/events/ext4.h | 6 ++---- mm/page-writeback.c | 2 -- 7 files changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index c2e7a7ff0080..c63a3c8beb73 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode) .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .for_writepages = 1, .range_cyclic = 1, }; int ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d6f0806c682f..7b2f401e604e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, .nr_to_write = mapping->nrpages * 2, .range_start = start, .range_end = end, - .for_writepages = 1, }; return btrfs_writepages(mapping, &wbc); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7b4088b2364d..0df600e9162d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -220,7 +220,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping) .nr_to_write = mapping->nrpages * 2, .range_start = 0, .range_end = i_size_read(mapping->host), - .for_writepages = 1, }; ret = generic_writepages(mapping, &wbc); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 120acadc6a84..53eb26c16b50 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1490,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how) .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, - .for_writepages = 1, }; return __nfs_write_mapping(mapping, &wbc, how); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d347632f1861..48a054e2b716 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -50,7 +50,6 @@ struct writeback_control { unsigned encountered_congestion:1; /* An output: a queue is full */ unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_reclaim:1; /* Invoked from the page allocator */ - unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ /* diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 7d8b5bc74185..8d433c4e3709 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -227,7 +227,6 @@ TRACE_EVENT(ext4_da_writepages, __field( char, nonblocking ) __field( char, for_kupdate ) __field( char, for_reclaim ) - __field( char, for_writepages ) __field( char, range_cyclic ) ), @@ -241,16 +240,15 @@ TRACE_EVENT(ext4_da_writepages, __entry->nonblocking = wbc->nonblocking; __entry->for_kupdate = wbc->for_kupdate; __entry->for_reclaim = wbc->for_reclaim; - __entry->for_writepages = wbc->for_writepages; __entry->range_cyclic = wbc->range_cyclic; ), - TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d", + TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->nonblocking, __entry->for_kupdate, __entry->for_reclaim, - __entry->for_writepages, __entry->range_cyclic) + __entry->range_cyclic) ); TRACE_EVENT(ext4_da_writepages_result, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dd73d29c15a8..abc648f5de00 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1020,12 +1020,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) if (wbc->nr_to_write <= 0) return 0; - wbc->for_writepages = 1; if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); else ret = generic_writepages(mapping, wbc); - wbc->for_writepages = 0; return ret; } From f0fad8a530e7cbad5f686dbca3079d1a626a3882 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Sep 2009 09:47:56 +0200 Subject: [PATCH 03/16] writeback: merely wakeup flusher thread if work allocation fails for WB_SYNC_NONE Since it's an opportunistic writeback and not a data integrity action, don't punt to blocking writeback. Just wakeup the thread and it will flush old data. Acked-by: Jan Kara Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 48 +++++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 628235cf44b5..783ed44c7cfe 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -75,13 +75,6 @@ static inline void bdi_work_init(struct bdi_work *work, work->state = WS_USED; } -static inline void bdi_work_init_on_stack(struct bdi_work *work, - struct writeback_control *wbc) -{ - bdi_work_init(work, wbc); - work->state |= WS_ONSTACK; -} - /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -207,34 +200,23 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc) void bdi_start_writeback(struct writeback_control *wbc) { - const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; - struct bdi_work work_stack, *work = NULL; - - if (!must_wait) - work = bdi_alloc_work(wbc); - - if (!work) { - work = &work_stack; - bdi_work_init_on_stack(work, wbc); - } - - bdi_queue_work(wbc->bdi, work); - /* - * If the sync mode is WB_SYNC_ALL, block waiting for the work to - * complete. If not, we only need to wait for the work to be started, - * if we allocated it on-stack. We use the same mechanism, if the - * wait bit is set in the bdi_work struct, then threads will not - * clear pending until after they are done. - * - * Note that work == &work_stack if must_wait is true, so we don't - * need to do call_rcu() here ever, since the completion path will - * have done that for us. + * WB_SYNC_NONE is opportunistic writeback. If this allocation fails, + * bdi_queue_work() will wake up the thread and flush old data. This + * should ensure some amount of progress in freeing memory. */ - if (must_wait || work == &work_stack) { - bdi_wait_on_work_clear(work); - if (work != &work_stack) - call_rcu(&work->rcu_head, bdi_work_free); + if (wbc->sync_mode != WB_SYNC_ALL) { + struct bdi_work *w = bdi_alloc_work(wbc); + + bdi_queue_work(wbc->bdi, w); + } else { + struct bdi_work work; + + bdi_work_init(&work, wbc); + work.state |= WS_ONSTACK; + + bdi_queue_work(wbc->bdi, &work); + bdi_wait_on_work_clear(&work); } } From c4a77a6c7dcff04a2abc7fe4b6b2ae605be41c5b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Sep 2009 15:18:25 +0200 Subject: [PATCH 04/16] writeback: make wb_writeback() take an argument structure We need to be able to pass in range_cyclic as well, so instead of growing yet another argument, split the arguments into a struct wb_writeback_args structure that we can use internally. Also makes it easier to just copy all members to an on-stack struct, since we can't access work after clearing the pending bit. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 78 +++++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 783ed44c7cfe..c5e91225501d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -34,6 +34,17 @@ */ int nr_pdflush_threads; +/* + * Passed into wb_writeback(), essentially a subset of writeback_control + */ +struct wb_writeback_args { + long nr_pages; + struct super_block *sb; + enum writeback_sync_modes sync_mode; + int for_kupdate; + int range_cyclic; +}; + /* * Work items for the bdi_writeback threads */ @@ -45,9 +56,7 @@ struct bdi_work { unsigned long seen; atomic_t pending; - struct super_block *sb; - unsigned long nr_pages; - enum writeback_sync_modes sync_mode; + struct wb_writeback_args args; unsigned long state; }; @@ -69,9 +78,11 @@ static inline void bdi_work_init(struct bdi_work *work, struct writeback_control *wbc) { INIT_RCU_HEAD(&work->rcu_head); - work->sb = wbc->sb; - work->nr_pages = wbc->nr_to_write; - work->sync_mode = wbc->sync_mode; + work->args.sb = wbc->sb; + work->args.nr_pages = wbc->nr_to_write; + work->args.sync_mode = wbc->sync_mode; + work->args.range_cyclic = wbc->range_cyclic; + work->args.for_kupdate = 0; work->state = WS_USED; } @@ -106,7 +117,7 @@ static void bdi_work_free(struct rcu_head *head) static void wb_work_complete(struct bdi_work *work) { - const enum writeback_sync_modes sync_mode = work->sync_mode; + const enum writeback_sync_modes sync_mode = work->args.sync_mode; /* * For allocated work, we can clear the done/seen bit right here. @@ -653,17 +664,16 @@ static inline bool over_bground_thresh(void) * older_than_this takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. */ -static long wb_writeback(struct bdi_writeback *wb, long nr_pages, - struct super_block *sb, - enum writeback_sync_modes sync_mode, int for_kupdate) +static long wb_writeback(struct bdi_writeback *wb, + struct wb_writeback_args *args) { struct writeback_control wbc = { .bdi = wb->bdi, - .sb = sb, - .sync_mode = sync_mode, + .sb = args->sb, + .sync_mode = args->sync_mode, .older_than_this = NULL, - .for_kupdate = for_kupdate, - .range_cyclic = 1, + .for_kupdate = args->for_kupdate, + .range_cyclic = args->range_cyclic, }; unsigned long oldest_jif; long wrote = 0; @@ -673,13 +683,18 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages, oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } + if (!wbc.range_cyclic) { + wbc.range_start = 0; + wbc.range_end = LLONG_MAX; + } for (;;) { /* * Don't flush anything for non-integrity writeback where * no nr_pages was given */ - if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE) + if (!args->for_kupdate && args->nr_pages <= 0 && + args->sync_mode == WB_SYNC_NONE) break; /* @@ -687,7 +702,8 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages, * periodic background writeout and we are below the * background dirty threshold, don't do anything */ - if (for_kupdate && nr_pages <= 0 && !over_bground_thresh()) + if (args->for_kupdate && args->nr_pages <= 0 && + !over_bground_thresh()) break; wbc.more_io = 0; @@ -695,7 +711,7 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages, wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; writeback_inodes_wb(wb, &wbc); - nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; /* @@ -749,8 +765,16 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); - if (nr_pages) - return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1); + if (nr_pages) { + struct wb_writeback_args args = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .for_kupdate = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &args); + } return 0; } @@ -762,35 +786,31 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) { struct backing_dev_info *bdi = wb->bdi; struct bdi_work *work; - long nr_pages, wrote = 0; + long wrote = 0; while ((work = get_next_work_item(bdi, wb)) != NULL) { - enum writeback_sync_modes sync_mode; - - nr_pages = work->nr_pages; + struct wb_writeback_args args = work->args; /* * Override sync mode, in case we must wait for completion */ if (force_wait) - work->sync_mode = sync_mode = WB_SYNC_ALL; - else - sync_mode = work->sync_mode; + work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; /* * If this isn't a data integrity operation, just notify * that we have seen this work and we are now starting it. */ - if (sync_mode == WB_SYNC_NONE) + if (args.sync_mode == WB_SYNC_NONE) wb_clear_pending(wb, work); - wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0); + wrote += wb_writeback(wb, &args); /* * This is a data integrity writeback, so only do the * notification when we have completed the work. */ - if (sync_mode == WB_SYNC_ALL) + if (args.sync_mode == WB_SYNC_ALL) wb_clear_pending(wb, work); } From 32a88aa1b6dfb901cec64e1898cac78d0f25028a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Sep 2009 15:02:33 +0200 Subject: [PATCH 05/16] fs: Assign bdi in super_block We do this automatically in get_sb_bdev() from the set_bdev_super() callback. Filesystems that have their own private backing_dev_info must assign that in ->fill_super(). Note that ->s_bdi assignment is required for proper writeback! Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/btrfs/disk-io.c | 1 + fs/fuse/inode.c | 2 ++ fs/nfs/super.c | 2 ++ fs/super.c | 6 ++++++ fs/sync.c | 9 ++++++++- fs/ubifs/super.c | 1 + include/linux/fs.h | 1 + 7 files changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 15831d5c7367..8b8192790011 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1600,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); + sb->s_bdi = &fs_info->bdi; /* * we set the i_size on the btree inode to the max possible int. diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4567db6f9430..e5dbecd87b0f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -894,6 +894,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (err) goto err_put_conn; + sb->s_bdi = &fc->bdi; + /* Handle umasking inside the fuse code */ if (sb->s_flags & MS_POSIXACL) fc->dont_mask = 1; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 867f70504531..de935692d40d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1918,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb) if (server->flags & NFS_MOUNT_NOAC) sb->s_flags |= MS_SYNCHRONOUS; + sb->s_bdi = &server->backing_dev_info; + nfs_super_set_maxbytes(sb, server->maxfilesize); } diff --git a/fs/super.c b/fs/super.c index 9cda337ddae2..b03fea8fbfb6 100644 --- a/fs/super.c +++ b/fs/super.c @@ -707,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } diff --git a/fs/sync.c b/fs/sync.c index 192340930bb4..c08467a5d7cb 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -27,6 +27,13 @@ */ static int __sync_filesystem(struct super_block *sb, int wait) { + /* + * This should be safe, as we require bdi backing to actually + * write out data in the first place + */ + if (!sb->s_bdi) + return 0; + /* Avoid doing twice syncing and cache pruning for quota sync */ if (!wait) { writeout_quota_sb(sb, -1); @@ -101,7 +108,7 @@ restart: spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY) && sb->s_root) + if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi) __sync_filesystem(sb, wait); up_read(&sb->s_umount); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 51763aa8f4de..c4af069df1ad 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1980,6 +1980,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_bdi; + sb->s_bdi = &c->bdi; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; diff --git a/include/linux/fs.h b/include/linux/fs.h index db29588874ac..90162fb3bf04 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1342,6 +1342,7 @@ struct super_block { int s_nr_dentry_unused; /* # of dentry on lru */ struct block_device *s_bdev; + struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct list_head s_instances; struct quota_info s_dquot; /* Diskquota specific options */ From f11fcae8401a3175f528e2f7917362645d570111 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Sep 2009 09:53:35 +0200 Subject: [PATCH 06/16] writeback: only use bdi_writeback_all() for WB_SYNC_NONE writeout Data integrity writeback must use bdi_start_writeback() and ensure that wbc->sb and wbc->bdi are set. Acked-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 70 ++++++++++------------------------------------- 1 file changed, 14 insertions(+), 56 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c5e91225501d..14f06b459197 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -50,7 +50,6 @@ struct wb_writeback_args { */ struct bdi_work { struct list_head list; - struct list_head wait_list; struct rcu_head rcu_head; unsigned long seen; @@ -198,7 +197,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work) TASK_UNINTERRUPTIBLE); } -static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc) +static void bdi_alloc_queue_work(struct backing_dev_info *bdi, + struct writeback_control *wbc) { struct bdi_work *work; @@ -206,7 +206,7 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc) if (work) bdi_work_init(work, wbc); - return work; + bdi_queue_work(bdi, work); } void bdi_start_writeback(struct writeback_control *wbc) @@ -216,11 +216,9 @@ void bdi_start_writeback(struct writeback_control *wbc) * bdi_queue_work() will wake up the thread and flush old data. This * should ensure some amount of progress in freeing memory. */ - if (wbc->sync_mode != WB_SYNC_ALL) { - struct bdi_work *w = bdi_alloc_work(wbc); - - bdi_queue_work(wbc->bdi, w); - } else { + if (wbc->sync_mode != WB_SYNC_ALL) + bdi_alloc_queue_work(wbc->bdi, wbc); + else { struct bdi_work work; bdi_work_init(&work, wbc); @@ -860,67 +858,26 @@ int bdi_writeback_task(struct bdi_writeback *wb) } /* - * Schedule writeback for all backing devices. Expensive! If this is a data - * integrity operation, writeback will be complete when this returns. If - * we are simply called for WB_SYNC_NONE, then writeback will merely be - * scheduled to run. + * Schedule writeback for all backing devices. Can only be used for + * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback() + * and pass in the superblock. */ static void bdi_writeback_all(struct writeback_control *wbc) { - const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; struct backing_dev_info *bdi; - struct bdi_work *work; - LIST_HEAD(list); -restart: + WARN_ON(wbc->sync_mode == WB_SYNC_ALL); + spin_lock(&bdi_lock); list_for_each_entry(bdi, &bdi_list, bdi_list) { - struct bdi_work *work; - if (!bdi_has_dirty_io(bdi)) continue; - /* - * If work allocation fails, do the writes inline. We drop - * the lock and restart the list writeout. This should be OK, - * since this happens rarely and because the writeout should - * eventually make more free memory available. - */ - work = bdi_alloc_work(wbc); - if (!work) { - struct writeback_control __wbc; - - /* - * Not a data integrity writeout, just continue - */ - if (!must_wait) - continue; - - spin_unlock(&bdi_lock); - __wbc = *wbc; - __wbc.bdi = bdi; - writeback_inodes_wbc(&__wbc); - goto restart; - } - if (must_wait) - list_add_tail(&work->wait_list, &list); - - bdi_queue_work(bdi, work); + bdi_alloc_queue_work(bdi, wbc); } spin_unlock(&bdi_lock); - - /* - * If this is for WB_SYNC_ALL, wait for pending work to complete - * before returning. - */ - while (!list_empty(&list)) { - work = list_entry(list.next, struct bdi_work, wait_list); - list_del(&work->wait_list); - bdi_wait_on_work_clear(work); - call_rcu(&work->rcu_head, bdi_work_free); - } } /* @@ -1177,6 +1134,7 @@ long sync_inodes_sb(struct super_block *sb) { struct writeback_control wbc = { .sb = sb, + .bdi = sb->s_bdi, .sync_mode = WB_SYNC_ALL, .range_start = 0, .range_end = LLONG_MAX, @@ -1184,7 +1142,7 @@ long sync_inodes_sb(struct super_block *sb) long nr_to_write = LONG_MAX; /* doesn't actually matter */ wbc.nr_to_write = nr_to_write; - bdi_writeback_all(&wbc); + bdi_start_writeback(&wbc); wait_sb_inodes(&wbc); return nr_to_write - wbc.nr_to_write; } From cfc4ba5365449cb6b5c9f68d755a142f17da1e47 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Sep 2009 13:12:40 +0200 Subject: [PATCH 07/16] writeback: use RCU to protect bdi_list Now that bdi_writeback_all() no longer handles integrity writeback, it doesn't have to block anymore. This means that we can switch bdi_list reader side protection to RCU. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 +-- include/linux/backing-dev.h | 1 + mm/backing-dev.c | 76 +++++++++++++++++++++++++++---------- mm/page-writeback.c | 8 ++-- 4 files changed, 63 insertions(+), 28 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 14f06b459197..f8cd7a97f5b7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -868,16 +868,16 @@ static void bdi_writeback_all(struct writeback_control *wbc) WARN_ON(wbc->sync_mode == WB_SYNC_ALL); - spin_lock(&bdi_lock); + rcu_read_lock(); - list_for_each_entry(bdi, &bdi_list, bdi_list) { + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; bdi_alloc_queue_work(bdi, wbc); } - spin_unlock(&bdi_lock); + rcu_read_unlock(); } /* diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f169bcb90b58..859e797f4576 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -59,6 +59,7 @@ struct bdi_writeback { struct backing_dev_info { struct list_head bdi_list; + struct rcu_head rcu_head; unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ unsigned long state; /* Always use atomic bitops on this */ unsigned int capabilities; /* Device capabilities */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d3ca0dac1111..fd93566345b6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -26,6 +26,12 @@ struct backing_dev_info default_backing_dev_info = { EXPORT_SYMBOL_GPL(default_backing_dev_info); static struct class *bdi_class; + +/* + * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as + * reader side protection for bdi_pending_list. bdi_list has RCU reader side + * locking. + */ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); LIST_HEAD(bdi_pending_list); @@ -284,9 +290,9 @@ static int bdi_start_fn(void *ptr) /* * Add us to the active bdi_list */ - spin_lock(&bdi_lock); - list_add(&bdi->bdi_list, &bdi_list); - spin_unlock(&bdi_lock); + spin_lock_bh(&bdi_lock); + list_add_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); bdi_task_init(bdi, wb); @@ -389,7 +395,7 @@ static int bdi_forker_task(void *ptr) if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) wb_do_writeback(me, 0); - spin_lock(&bdi_lock); + spin_lock_bh(&bdi_lock); /* * Check if any existing bdi's have dirty data without @@ -410,7 +416,7 @@ static int bdi_forker_task(void *ptr) if (list_empty(&bdi_pending_list)) { unsigned long wait; - spin_unlock(&bdi_lock); + spin_unlock_bh(&bdi_lock); wait = msecs_to_jiffies(dirty_writeback_interval * 10); schedule_timeout(wait); try_to_freeze(); @@ -426,7 +432,7 @@ static int bdi_forker_task(void *ptr) bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, bdi_list); list_del_init(&bdi->bdi_list); - spin_unlock(&bdi_lock); + spin_unlock_bh(&bdi_lock); wb = &bdi->wb; wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", @@ -445,9 +451,9 @@ static int bdi_forker_task(void *ptr) * a chance to flush other bdi's to free * memory. */ - spin_lock(&bdi_lock); + spin_lock_bh(&bdi_lock); list_add_tail(&bdi->bdi_list, &bdi_pending_list); - spin_unlock(&bdi_lock); + spin_unlock_bh(&bdi_lock); bdi_flush_io(bdi); } @@ -456,6 +462,24 @@ static int bdi_forker_task(void *ptr) return 0; } +static void bdi_add_to_pending(struct rcu_head *head) +{ + struct backing_dev_info *bdi; + + bdi = container_of(head, struct backing_dev_info, rcu_head); + INIT_LIST_HEAD(&bdi->bdi_list); + + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_pending_list); + spin_unlock(&bdi_lock); + + /* + * We are now on the pending list, wake up bdi_forker_task() + * to finish the job and add us back to the active bdi_list + */ + wake_up_process(default_backing_dev_info.wb.task); +} + /* * Add the default flusher task that gets created for any bdi * that has dirty data pending writeout @@ -478,16 +502,29 @@ void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) * waiting for previous additions to finish. */ if (!test_and_set_bit(BDI_pending, &bdi->state)) { - list_move_tail(&bdi->bdi_list, &bdi_pending_list); + list_del_rcu(&bdi->bdi_list); /* - * We are now on the pending list, wake up bdi_forker_task() - * to finish the job and add us back to the active bdi_list + * We must wait for the current RCU period to end before + * moving to the pending list. So schedule that operation + * from an RCU callback. */ - wake_up_process(default_backing_dev_info.wb.task); + call_rcu(&bdi->rcu_head, bdi_add_to_pending); } } +/* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ +static void bdi_remove_from_list(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); + + synchronize_rcu(); +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { @@ -506,9 +543,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, goto exit; } - spin_lock(&bdi_lock); - list_add_tail(&bdi->bdi_list, &bdi_list); - spin_unlock(&bdi_lock); + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); bdi->dev = dev; @@ -526,9 +563,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, wb->task = NULL; ret = -ENOMEM; - spin_lock(&bdi_lock); - list_del(&bdi->bdi_list); - spin_unlock(&bdi_lock); + bdi_remove_from_list(bdi); goto exit; } } @@ -565,9 +600,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) /* * Make sure nobody finds us on the bdi_list anymore */ - spin_lock(&bdi_lock); - list_del(&bdi->bdi_list); - spin_unlock(&bdi_lock); + bdi_remove_from_list(bdi); /* * Finally, kill the kernel threads. We don't need to be RCU @@ -599,6 +632,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; spin_lock_init(&bdi->wb_lock); + INIT_RCU_HEAD(&bdi->rcu_head); INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); INIT_LIST_HEAD(&bdi->work_list); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index abc648f5de00..12c3d843ce93 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -315,7 +315,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; - spin_lock(&bdi_lock); + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { @@ -327,7 +327,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) ret = -EINVAL; } } - spin_unlock(&bdi_lock); + spin_unlock_bh(&bdi_lock); return ret; } @@ -339,14 +339,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) if (max_ratio > 100) return -EINVAL; - spin_lock(&bdi_lock); + spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; } - spin_unlock(&bdi_lock); + spin_unlock_bh(&bdi_lock); return ret; } From bcddc3f01c9122882c8b9f12ab94a934e55aef97 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 13 Sep 2009 20:07:36 +0200 Subject: [PATCH 08/16] writeback: inline allocation failure handling in bdi_alloc_queue_work() This gets rid of work == NULL in bdi_queue_work() and puts the OOM handling where it belongs. Acked-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 49 ++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f8cd7a97f5b7..59b3ee63b624 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -149,21 +149,19 @@ static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) { - if (work) { - work->seen = bdi->wb_mask; - BUG_ON(!work->seen); - atomic_set(&work->pending, bdi->wb_cnt); - BUG_ON(!bdi->wb_cnt); + work->seen = bdi->wb_mask; + BUG_ON(!work->seen); + atomic_set(&work->pending, bdi->wb_cnt); + BUG_ON(!bdi->wb_cnt); - /* - * Make sure stores are seen before it appears on the list - */ - smp_mb(); + /* + * Make sure stores are seen before it appears on the list + */ + smp_mb(); - spin_lock(&bdi->wb_lock); - list_add_tail_rcu(&work->list, &bdi->work_list); - spin_unlock(&bdi->wb_lock); - } + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&work->list, &bdi->work_list); + spin_unlock(&bdi->wb_lock); /* * If the default thread isn't there, make sure we add it. When @@ -175,14 +173,12 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) struct bdi_writeback *wb = &bdi->wb; /* - * If we failed allocating the bdi work item, wake up the wb - * thread always. As a safety precaution, it'll flush out - * everything + * End work now if this wb has no dirty IO pending. Otherwise + * wakeup the handling thread */ - if (!wb_has_dirty_io(wb)) { - if (work) - wb_clear_pending(wb, work); - } else if (wb->task) + if (!wb_has_dirty_io(wb)) + wb_clear_pending(wb, work); + else if (wb->task) wake_up_process(wb->task); } } @@ -202,11 +198,20 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, { struct bdi_work *work; + /* + * This is WB_SYNC_NONE writeback, so if allocation fails just + * wakeup the thread for old dirty data writeback + */ work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) + if (work) { bdi_work_init(work, wbc); + bdi_queue_work(bdi, work); + } else { + struct bdi_writeback *wb = &bdi->wb; - bdi_queue_work(bdi, work); + if (wb->task) + wake_up_process(wb->task); + } } void bdi_start_writeback(struct writeback_control *wbc) From b6e51316daede0633e9274e1e30391cfa4747877 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Sep 2009 15:13:54 +0200 Subject: [PATCH 09/16] writeback: separate starting of sync vs opportunistic writeback bdi_start_writeback() is currently split into two paths, one for WB_SYNC_NONE and one for WB_SYNC_ALL. Add bdi_sync_writeback() for WB_SYNC_ALL writeback and let bdi_start_writeback() handle only WB_SYNC_NONE. Push down the writeback_control allocation and only accept the parameters that make sense for each function. This cleans up the API considerably. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 132 ++++++++++++++++++------------------ fs/ubifs/budget.c | 20 +----- include/linux/backing-dev.h | 2 +- include/linux/writeback.h | 4 +- mm/page-writeback.c | 12 +--- 5 files changed, 75 insertions(+), 95 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 59b3ee63b624..5887328b5a06 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -74,14 +74,10 @@ static inline bool bdi_work_on_stack(struct bdi_work *work) } static inline void bdi_work_init(struct bdi_work *work, - struct writeback_control *wbc) + struct wb_writeback_args *args) { INIT_RCU_HEAD(&work->rcu_head); - work->args.sb = wbc->sb; - work->args.nr_pages = wbc->nr_to_write; - work->args.sync_mode = wbc->sync_mode; - work->args.range_cyclic = wbc->range_cyclic; - work->args.for_kupdate = 0; + work->args = *args; work->state = WS_USED; } @@ -194,7 +190,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work) } static void bdi_alloc_queue_work(struct backing_dev_info *bdi, - struct writeback_control *wbc) + struct wb_writeback_args *args) { struct bdi_work *work; @@ -204,7 +200,7 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, */ work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { - bdi_work_init(work, wbc); + bdi_work_init(work, args); bdi_queue_work(bdi, work); } else { struct bdi_writeback *wb = &bdi->wb; @@ -214,24 +210,54 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, } } -void bdi_start_writeback(struct writeback_control *wbc) +/** + * bdi_sync_writeback - start and wait for writeback + * @bdi: the backing device to write from + * @sb: write inodes from this super_block + * + * Description: + * This does WB_SYNC_ALL data integrity writeback and waits for the + * IO to complete. Callers must hold the sb s_umount semaphore for + * reading, to avoid having the super disappear before we are done. + */ +static void bdi_sync_writeback(struct backing_dev_info *bdi, + struct super_block *sb) { - /* - * WB_SYNC_NONE is opportunistic writeback. If this allocation fails, - * bdi_queue_work() will wake up the thread and flush old data. This - * should ensure some amount of progress in freeing memory. - */ - if (wbc->sync_mode != WB_SYNC_ALL) - bdi_alloc_queue_work(wbc->bdi, wbc); - else { - struct bdi_work work; + struct wb_writeback_args args = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .nr_pages = LONG_MAX, + .range_cyclic = 0, + }; + struct bdi_work work; - bdi_work_init(&work, wbc); - work.state |= WS_ONSTACK; + bdi_work_init(&work, &args); + work.state |= WS_ONSTACK; - bdi_queue_work(wbc->bdi, &work); - bdi_wait_on_work_clear(&work); - } + bdi_queue_work(bdi, &work); + bdi_wait_on_work_clear(&work); +} + +/** + * bdi_start_writeback - start writeback + * @bdi: the backing device to write from + * @nr_pages: the number of pages to write + * + * Description: + * This does WB_SYNC_NONE opportunistic writeback. The IO is only + * started when this function returns, we make no guarentees on + * completion. Caller need not hold sb s_umount semaphore. + * + */ +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) +{ + struct wb_writeback_args args = { + .sync_mode = WB_SYNC_NONE, + .nr_pages = nr_pages, + .range_cyclic = 1, + }; + + bdi_alloc_queue_work(bdi, &args); } /* @@ -863,23 +889,25 @@ int bdi_writeback_task(struct bdi_writeback *wb) } /* - * Schedule writeback for all backing devices. Can only be used for - * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback() - * and pass in the superblock. + * Schedule writeback for all backing devices. This does WB_SYNC_NONE + * writeback, for integrity writeback see bdi_sync_writeback(). */ -static void bdi_writeback_all(struct writeback_control *wbc) +static void bdi_writeback_all(struct super_block *sb, long nr_pages) { + struct wb_writeback_args args = { + .sb = sb, + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + }; struct backing_dev_info *bdi; - WARN_ON(wbc->sync_mode == WB_SYNC_ALL); - rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - bdi_alloc_queue_work(bdi, wbc); + bdi_alloc_queue_work(bdi, &args); } rcu_read_unlock(); @@ -891,17 +919,10 @@ static void bdi_writeback_all(struct writeback_control *wbc) */ void wakeup_flusher_threads(long nr_pages) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .range_cyclic = 1, - }; - if (nr_pages == 0) nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); - wbc.nr_to_write = nr_pages; - bdi_writeback_all(&wbc); + bdi_writeback_all(NULL, nr_pages); } static noinline void block_dump___mark_inode_dirty(struct inode *inode) @@ -1048,7 +1069,7 @@ EXPORT_SYMBOL(__mark_inode_dirty); * on the writer throttling path, and we get decent balancing between many * throttled threads: we don't want them all piling up on inode_sync_wait. */ -static void wait_sb_inodes(struct writeback_control *wbc) +static void wait_sb_inodes(struct super_block *sb) { struct inode *inode, *old_inode = NULL; @@ -1056,7 +1077,7 @@ static void wait_sb_inodes(struct writeback_control *wbc) * We need to be protected against the filesystem going from * r/o to r/w or vice versa. */ - WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount)); + WARN_ON(!rwsem_is_locked(&sb->s_umount)); spin_lock(&inode_lock); @@ -1067,7 +1088,7 @@ static void wait_sb_inodes(struct writeback_control *wbc) * In which case, the inode may not be on the dirty list, but * we still have to wait for that writeout. */ - list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) { + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct address_space *mapping; if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) @@ -1107,14 +1128,8 @@ static void wait_sb_inodes(struct writeback_control *wbc) * for IO completion of submitted IO. The number of pages submitted is * returned. */ -long writeback_inodes_sb(struct super_block *sb) +void writeback_inodes_sb(struct super_block *sb) { - struct writeback_control wbc = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); long nr_to_write; @@ -1122,9 +1137,7 @@ long writeback_inodes_sb(struct super_block *sb) nr_to_write = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); - wbc.nr_to_write = nr_to_write; - bdi_writeback_all(&wbc); - return nr_to_write - wbc.nr_to_write; + bdi_writeback_all(sb, nr_to_write); } EXPORT_SYMBOL(writeback_inodes_sb); @@ -1135,21 +1148,10 @@ EXPORT_SYMBOL(writeback_inodes_sb); * This function writes and waits on any dirty inode belonging to this * super_block. The number of pages synced is returned. */ -long sync_inodes_sb(struct super_block *sb) +void sync_inodes_sb(struct super_block *sb) { - struct writeback_control wbc = { - .sb = sb, - .bdi = sb->s_bdi, - .sync_mode = WB_SYNC_ALL, - .range_start = 0, - .range_end = LLONG_MAX, - }; - long nr_to_write = LONG_MAX; /* doesn't actually matter */ - - wbc.nr_to_write = nr_to_write; - bdi_start_writeback(&wbc); - wait_sb_inodes(&wbc); - return nr_to_write - wbc.nr_to_write; + bdi_sync_writeback(sb->s_bdi, sb); + wait_sb_inodes(sb); } EXPORT_SYMBOL(sync_inodes_sb); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 1c8991b0db13..ee1ce68fd98b 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -54,29 +54,15 @@ * @nr_to_write: how many dirty pages to write-back * * This function shrinks UBIFS liability by means of writing back some amount - * of dirty inodes and their pages. Returns the amount of pages which were - * written back. The returned value does not include dirty inodes which were - * synchronized. + * of dirty inodes and their pages. * * Note, this function synchronizes even VFS inodes which are locked * (@i_mutex) by the caller of the budgeting function, because write-back does * not touch @i_mutex. */ -static int shrink_liability(struct ubifs_info *c, int nr_to_write) +static void shrink_liability(struct ubifs_info *c, int nr_to_write) { - int nr_written; - - nr_written = writeback_inodes_sb(c->vfs_sb); - if (!nr_written) { - /* - * Re-try again but wait on pages/inodes which are being - * written-back concurrently (e.g., by pdflush). - */ - nr_written = sync_inodes_sb(c->vfs_sb); - } - - dbg_budg("%d pages were written back", nr_written); - return nr_written; + writeback_inodes_sb(c->vfs_sb); } /** diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 859e797f4576..0ee33c2e6129 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -101,7 +101,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); -void bdi_start_writeback(struct writeback_control *wbc); +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); int bdi_writeback_task(struct bdi_writeback *wb); int bdi_has_dirty_io(struct backing_dev_info *bdi); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 48a054e2b716..75cf58666ff9 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -68,8 +68,8 @@ struct writeback_control { */ struct bdi_writeback; int inode_wait(void *); -long writeback_inodes_sb(struct super_block *); -long sync_inodes_sb(struct super_block *); +void writeback_inodes_sb(struct super_block *); +void sync_inodes_sb(struct super_block *); void writeback_inodes_wbc(struct writeback_control *wbc); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); void wakeup_flusher_threads(long nr_pages); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 12c3d843ce93..1eea4fa0d410 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -582,16 +582,8 @@ static void balance_dirty_pages(struct address_space *mapping) if ((laptop_mode && pages_written) || (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS)) - > background_thresh))) { - struct writeback_control wbc = { - .bdi = bdi, - .sync_mode = WB_SYNC_NONE, - .nr_to_write = nr_writeback, - }; - - - bdi_start_writeback(&wbc); - } + > background_thresh))) + bdi_start_writeback(bdi, nr_writeback); } void set_page_dirty_balance(struct page *page, int page_mkwrite) From ce5f8e7795195edb6f84f74aa9d72e739df38486 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Sep 2009 12:57:56 +0200 Subject: [PATCH 10/16] writeback: splice dirty inode entries to default bdi on bdi_destroy() We cannot safely ensure that the inodes are all gone at this point in time, and we must not destroy this bdi with inodes having off it. So just splice our entries to the default bdi since that one will always persist. Signed-off-by: Jens Axboe --- mm/backing-dev.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index fd93566345b6..3d3accb1f800 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -668,7 +668,19 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; - WARN_ON(bdi_has_dirty_io(bdi)); + /* + * Splice our entries to the default_backing_dev_info, if this + * bdi disappears + */ + if (bdi_has_dirty_io(bdi)) { + struct bdi_writeback *dst = &default_backing_dev_info.wb; + + spin_lock(&inode_lock); + list_splice(&bdi->wb.b_dirty, &dst->b_dirty); + list_splice(&bdi->wb.b_io, &dst->b_io); + list_splice(&bdi->wb.b_more_io, &dst->b_more_io); + spin_unlock(&inode_lock); + } bdi_unregister(bdi); From 8010c3b6349b407f8f11b3f4d7e9f94cb00fe528 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Sep 2009 20:04:57 +0200 Subject: [PATCH 11/16] writeback: add comments to bdi_work structure And document its retriever, get_next_work_item(). Acked-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5887328b5a06..55f0d4e51b59 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -49,15 +49,15 @@ struct wb_writeback_args { * Work items for the bdi_writeback threads */ struct bdi_work { - struct list_head list; - struct rcu_head rcu_head; + struct list_head list; /* pending work list */ + struct rcu_head rcu_head; /* for RCU free/clear of work */ - unsigned long seen; - atomic_t pending; + unsigned long seen; /* threads that have seen this work */ + atomic_t pending; /* number of threads still to do work */ - struct wb_writeback_args args; + struct wb_writeback_args args; /* writeback arguments */ - unsigned long state; + unsigned long state; /* flag bits, see WS_* */ }; enum { @@ -758,7 +758,11 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Return the next bdi_work struct that hasn't been processed by this - * wb thread yet + * wb thread yet. ->seen is initially set for each thread that exists + * for this device, when a thread first notices a piece of work it + * clears its bit. Depending on writeback type, the thread will notify + * completion on either receiving the work (WB_SYNC_NONE) or after + * it is done (WB_SYNC_ALL). */ static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) From 49db041430e8a856dbc3af15430bf068f1c74655 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Sep 2009 21:27:40 +0200 Subject: [PATCH 12/16] writeback: use schedule_timeout_interruptible() Gets rid of a manual set_current_state(). Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 55f0d4e51b59..34757758511a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -884,8 +884,7 @@ int bdi_writeback_task(struct bdi_writeback *wb) } wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(wait_jiffies); + schedule_timeout_interruptible(wait_jiffies); try_to_freeze(); } From deed62edffe600bc5b379c872d3004116e001b66 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 15 Sep 2009 21:32:58 +0200 Subject: [PATCH 13/16] writeback: remove smp_mb(), it's not needed with list_add_tail_rcu() list_add_tail_rcu contains required barriers. Signed-off-by: Nick Piggin Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 34757758511a..59c99e729187 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -151,10 +151,10 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) BUG_ON(!bdi->wb_cnt); /* - * Make sure stores are seen before it appears on the list + * list_add_tail_rcu() contains the necessary barriers to + * make sure the above stores are seen before the item is + * noticed on the list */ - smp_mb(); - spin_lock(&bdi->wb_lock); list_add_tail_rcu(&work->list, &bdi->work_list); spin_unlock(&bdi->wb_lock); From 77fad5e625e56eb31a343ae1d489979fdc61a2aa Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 15 Sep 2009 21:34:12 +0200 Subject: [PATCH 14/16] writeback: improve scalability of bdi writeback work queues If you're going to do an atomic RMW on each list entry, there's not much point in all the RCU complexities of the list walking. This is only going to help the multi-thread case I guess, but it doesn't hurt to do now. Signed-off-by: Nick Piggin Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 59c99e729187..6bca6f8176f0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -772,8 +772,9 @@ static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, rcu_read_lock(); list_for_each_entry_rcu(work, &bdi->work_list, list) { - if (!test_and_clear_bit(wb->nr, &work->seen)) + if (!test_bit(wb->nr, &work->seen)) continue; + clear_bit(wb->nr, &work->seen); ret = work; break; From 77b9d059cb3ddb8b1246d5878e81d52926550b23 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 15 Sep 2009 21:34:51 +0200 Subject: [PATCH 15/16] writeback: Fix bdi use after free in wb_work_complete() By the time bdi_work_on_stack gets evaluated again in bdi_work_free, it can already have been deallocated and used for something else in the !on stack case, giving a false positive in this test and causing corruption. Signed-off-by: Nick Piggin Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6bca6f8176f0..7eba7326b9b9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -113,6 +113,7 @@ static void bdi_work_free(struct rcu_head *head) static void wb_work_complete(struct bdi_work *work) { const enum writeback_sync_modes sync_mode = work->args.sync_mode; + int onstack = bdi_work_on_stack(work); /* * For allocated work, we can clear the done/seen bit right here. @@ -120,9 +121,9 @@ static void wb_work_complete(struct bdi_work *work) * to after the RCU grace period, since the stack could be invalidated * as soon as bdi_work_clear() has done the wakeup. */ - if (!bdi_work_on_stack(work)) + if (!onstack) bdi_work_clear(work); - if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work)) + if (sync_mode == WB_SYNC_NONE || onstack) call_rcu(&work->rcu_head, bdi_work_free); } From 1ef7d9aa32a8ee054c4d4fdcd2ea537c04d61b2f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 15 Sep 2009 21:37:55 +0200 Subject: [PATCH 16/16] writeback: fix possible bdi writeback refcounting problem wb_clear_pending AFAIKS should not be called after the item has been put on the list, except by the worker threads. It could lead to the situation where the refcount is decremented below 0 and cause lots of problems. Presumably the !wb_has_dirty_io case is not a common one, so it can be discovered when the thread wakes up to check? Also add a comment in bdi_work_clear. Signed-off-by: Nick Piggin Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7eba7326b9b9..8e1e5e19d21e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -97,6 +97,11 @@ static void bdi_work_clear(struct bdi_work *work) { clear_bit(WS_USED_B, &work->state); smp_mb__after_clear_bit(); + /* + * work can have disappeared at this point. bit waitq functions + * should be able to tolerate this, provided bdi_sched_wait does + * not dereference it's pointer argument. + */ wake_up_bit(&work->state, WS_USED_B); } @@ -169,13 +174,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) else { struct bdi_writeback *wb = &bdi->wb; - /* - * End work now if this wb has no dirty IO pending. Otherwise - * wakeup the handling thread - */ - if (!wb_has_dirty_io(wb)) - wb_clear_pending(wb, work); - else if (wb->task) + if (wb->task) wake_up_process(wb->task); } }