vfs: Make sys_sync() use fsync_super() (version 4)
It is unnecessarily fragile to have two places (fsync_super() and do_sync()) doing a data-integrity sync of the filesystem. Alter __fsync_super() to accommodate the needs of both callers and use it. So after this patch __fsync_super() is the only place where we gather all the calls needed to properly send all data on a filesystem to disk. A nice bonus is that we get complete livelock avoidance, and write_supers() is now only used for periodic writeback of superblocks. sync_blockdevs(), introduced a couple of patches ago, is gone now. [build fixes folded] Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
This commit is contained in:
parent
429479f031
commit
5cee5815d1
|
@ -176,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
|
||||||
iov, offset, nr_segs, blkdev_get_blocks, NULL);
|
iov, offset, nr_segs, blkdev_get_blocks, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int __sync_blockdev(struct block_device *bdev, int wait)
|
||||||
|
{
|
||||||
|
if (!bdev)
|
||||||
|
return 0;
|
||||||
|
if (!wait)
|
||||||
|
return filemap_flush(bdev->bd_inode->i_mapping);
|
||||||
|
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Write out and wait upon all the dirty data associated with a block
|
* Write out and wait upon all the dirty data associated with a block
|
||||||
* device via its mapping. Does not take the superblock lock.
|
* device via its mapping. Does not take the superblock lock.
|
||||||
*/
|
*/
|
||||||
int sync_blockdev(struct block_device *bdev)
|
int sync_blockdev(struct block_device *bdev)
|
||||||
{
|
{
|
||||||
int ret = 0;
|
return __sync_blockdev(bdev, 1);
|
||||||
|
|
||||||
if (bdev)
|
|
||||||
ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(sync_blockdev);
|
EXPORT_SYMBOL(sync_blockdev);
|
||||||
|
|
||||||
|
|
|
@ -678,55 +678,6 @@ void sync_inodes_sb(struct super_block *sb, int wait)
|
||||||
sync_sb_inodes(sb, &wbc);
|
sync_sb_inodes(sb, &wbc);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* sync_inodes - writes all inodes to disk
|
|
||||||
* @wait: wait for completion
|
|
||||||
*
|
|
||||||
* sync_inodes() goes through each super block's dirty inode list, writes the
|
|
||||||
* inodes out, waits on the writeout and puts the inodes back on the normal
|
|
||||||
* list.
|
|
||||||
*
|
|
||||||
* This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle
|
|
||||||
* part of the sync functions is that the blockdev "superblock" is processed
|
|
||||||
* last. This is because the write_inode() function of a typical fs will
|
|
||||||
* perform no I/O, but will mark buffers in the blockdev mapping as dirty.
|
|
||||||
* What we want to do is to perform all that dirtying first, and then write
|
|
||||||
* back all those inode blocks via the blockdev mapping in one sweep. So the
|
|
||||||
* additional (somewhat redundant) sync_blockdev() calls here are to make
|
|
||||||
* sure that really happens. Because if we call sync_inodes_sb(wait=1) with
|
|
||||||
* outstanding dirty inodes, the writeback goes block-at-a-time within the
|
|
||||||
* filesystem's write_inode(). This is extremely slow.
|
|
||||||
*/
|
|
||||||
static void __sync_inodes(int wait)
|
|
||||||
{
|
|
||||||
struct super_block *sb;
|
|
||||||
|
|
||||||
spin_lock(&sb_lock);
|
|
||||||
restart:
|
|
||||||
list_for_each_entry(sb, &super_blocks, s_list) {
|
|
||||||
sb->s_count++;
|
|
||||||
spin_unlock(&sb_lock);
|
|
||||||
down_read(&sb->s_umount);
|
|
||||||
if (sb->s_root) {
|
|
||||||
sync_inodes_sb(sb, wait);
|
|
||||||
sync_blockdev(sb->s_bdev);
|
|
||||||
}
|
|
||||||
up_read(&sb->s_umount);
|
|
||||||
spin_lock(&sb_lock);
|
|
||||||
if (__put_super_and_need_restart(sb))
|
|
||||||
goto restart;
|
|
||||||
}
|
|
||||||
spin_unlock(&sb_lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
void sync_inodes(int wait)
|
|
||||||
{
|
|
||||||
__sync_inodes(0);
|
|
||||||
|
|
||||||
if (wait)
|
|
||||||
__sync_inodes(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* write_inode_now - write an inode to disk
|
* write_inode_now - write an inode to disk
|
||||||
* @inode: inode to write to disk
|
* @inode: inode to write to disk
|
||||||
|
|
|
@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
|
||||||
return sb == blockdev_superblock;
|
return sb == blockdev_superblock;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern int __sync_blockdev(struct block_device *bdev, int wait);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
static inline void bdev_cache_init(void)
|
static inline void bdev_cache_init(void)
|
||||||
{
|
{
|
||||||
|
@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int __sync_blockdev(struct block_device *bdev, int wait)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -71,12 +78,3 @@ extern void chroot_fs_refs(struct path *, struct path *);
|
||||||
* file_table.c
|
* file_table.c
|
||||||
*/
|
*/
|
||||||
extern void mark_files_ro(struct super_block *);
|
extern void mark_files_ro(struct super_block *);
|
||||||
|
|
||||||
/*
|
|
||||||
* super.c
|
|
||||||
*/
|
|
||||||
#ifdef CONFIG_BLOCK
|
|
||||||
extern void sync_blockdevs(void);
|
|
||||||
#else
|
|
||||||
static inline void sync_blockdevs(void) { }
|
|
||||||
#endif
|
|
||||||
|
|
72
fs/super.c
72
fs/super.c
|
@ -284,23 +284,23 @@ EXPORT_SYMBOL(lock_super);
|
||||||
EXPORT_SYMBOL(unlock_super);
|
EXPORT_SYMBOL(unlock_super);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Write out and wait upon all dirty data associated with this
|
* Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
|
||||||
* superblock. Filesystem data as well as the underlying block
|
* just dirties buffers with inodes so we have to submit IO for these buffers
|
||||||
* device. Takes the superblock lock. Requires a second blkdev
|
* via __sync_blockdev(). This also speeds up the wait == 1 case since in that
|
||||||
* flush by the caller to complete the operation.
|
* case write_inode() functions do sync_dirty_buffer() and thus effectively
|
||||||
|
* write one block at a time.
|
||||||
*/
|
*/
|
||||||
static int __fsync_super(struct super_block *sb)
|
static int __fsync_super(struct super_block *sb, int wait)
|
||||||
{
|
{
|
||||||
sync_inodes_sb(sb, 0);
|
|
||||||
vfs_dq_sync(sb);
|
vfs_dq_sync(sb);
|
||||||
sync_inodes_sb(sb, 1);
|
sync_inodes_sb(sb, wait);
|
||||||
lock_super(sb);
|
lock_super(sb);
|
||||||
if (sb->s_dirt && sb->s_op->write_super)
|
if (sb->s_dirt && sb->s_op->write_super)
|
||||||
sb->s_op->write_super(sb);
|
sb->s_op->write_super(sb);
|
||||||
unlock_super(sb);
|
unlock_super(sb);
|
||||||
if (sb->s_op->sync_fs)
|
if (sb->s_op->sync_fs)
|
||||||
sb->s_op->sync_fs(sb, 1);
|
sb->s_op->sync_fs(sb, wait);
|
||||||
return sync_blockdev(sb->s_bdev);
|
return __sync_blockdev(sb->s_bdev, wait);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -310,7 +310,12 @@ static int __fsync_super(struct super_block *sb)
|
||||||
*/
|
*/
|
||||||
int fsync_super(struct super_block *sb)
|
int fsync_super(struct super_block *sb)
|
||||||
{
|
{
|
||||||
return __fsync_super(sb);
|
int ret;
|
||||||
|
|
||||||
|
ret = __fsync_super(sb, 0);
|
||||||
|
if (ret < 0)
|
||||||
|
return ret;
|
||||||
|
return __fsync_super(sb, 1);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(fsync_super);
|
EXPORT_SYMBOL_GPL(fsync_super);
|
||||||
|
|
||||||
|
@ -469,20 +474,18 @@ restart:
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Call the ->sync_fs super_op against all filesystems which are r/w and
|
* Sync all the data for all the filesystems (called by sys_sync() and
|
||||||
* which implement it.
|
* emergency sync)
|
||||||
*
|
*
|
||||||
* This operation is careful to avoid the livelock which could easily happen
|
* This operation is careful to avoid the livelock which could easily happen
|
||||||
* if two or more filesystems are being continuously dirtied. s_need_sync_fs
|
* if two or more filesystems are being continuously dirtied. s_need_sync
|
||||||
* is used only here. We set it against all filesystems and then clear it as
|
* is used only here. We set it against all filesystems and then clear it as
|
||||||
* we sync them. So redirtied filesystems are skipped.
|
* we sync them. So redirtied filesystems are skipped.
|
||||||
*
|
*
|
||||||
* But if process A is currently running sync_filesystems and then process B
|
* But if process A is currently running sync_filesystems and then process B
|
||||||
* calls sync_filesystems as well, process B will set all the s_need_sync_fs
|
* calls sync_filesystems as well, process B will set all the s_need_sync
|
||||||
* flags again, which will cause process A to resync everything. Fix that with
|
* flags again, which will cause process A to resync everything. Fix that with
|
||||||
* a local mutex.
|
* a local mutex.
|
||||||
*
|
|
||||||
* (Fabian) Avoid sync_fs with clean fs & wait mode 0
|
|
||||||
*/
|
*/
|
||||||
void sync_filesystems(int wait)
|
void sync_filesystems(int wait)
|
||||||
{
|
{
|
||||||
|
@ -492,25 +495,23 @@ void sync_filesystems(int wait)
|
||||||
mutex_lock(&mutex); /* Could be down_interruptible */
|
mutex_lock(&mutex); /* Could be down_interruptible */
|
||||||
spin_lock(&sb_lock);
|
spin_lock(&sb_lock);
|
||||||
list_for_each_entry(sb, &super_blocks, s_list) {
|
list_for_each_entry(sb, &super_blocks, s_list) {
|
||||||
if (!sb->s_op->sync_fs)
|
|
||||||
continue;
|
|
||||||
if (sb->s_flags & MS_RDONLY)
|
if (sb->s_flags & MS_RDONLY)
|
||||||
continue;
|
continue;
|
||||||
sb->s_need_sync_fs = 1;
|
sb->s_need_sync = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
restart:
|
restart:
|
||||||
list_for_each_entry(sb, &super_blocks, s_list) {
|
list_for_each_entry(sb, &super_blocks, s_list) {
|
||||||
if (!sb->s_need_sync_fs)
|
if (!sb->s_need_sync)
|
||||||
continue;
|
continue;
|
||||||
sb->s_need_sync_fs = 0;
|
sb->s_need_sync = 0;
|
||||||
if (sb->s_flags & MS_RDONLY)
|
if (sb->s_flags & MS_RDONLY)
|
||||||
continue; /* hm. Was remounted r/o meanwhile */
|
continue; /* hm. Was remounted r/o meanwhile */
|
||||||
sb->s_count++;
|
sb->s_count++;
|
||||||
spin_unlock(&sb_lock);
|
spin_unlock(&sb_lock);
|
||||||
down_read(&sb->s_umount);
|
down_read(&sb->s_umount);
|
||||||
if (sb->s_root)
|
if (sb->s_root)
|
||||||
sb->s_op->sync_fs(sb, wait);
|
__fsync_super(sb, wait);
|
||||||
up_read(&sb->s_umount);
|
up_read(&sb->s_umount);
|
||||||
/* restart only when sb is no longer on the list */
|
/* restart only when sb is no longer on the list */
|
||||||
spin_lock(&sb_lock);
|
spin_lock(&sb_lock);
|
||||||
|
@ -521,33 +522,6 @@ restart:
|
||||||
mutex_unlock(&mutex);
|
mutex_unlock(&mutex);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_BLOCK
|
|
||||||
/*
|
|
||||||
* Sync all block devices underlying some superblock
|
|
||||||
*/
|
|
||||||
void sync_blockdevs(void)
|
|
||||||
{
|
|
||||||
struct super_block *sb;
|
|
||||||
|
|
||||||
spin_lock(&sb_lock);
|
|
||||||
restart:
|
|
||||||
list_for_each_entry(sb, &super_blocks, s_list) {
|
|
||||||
if (!sb->s_bdev)
|
|
||||||
continue;
|
|
||||||
sb->s_count++;
|
|
||||||
spin_unlock(&sb_lock);
|
|
||||||
down_read(&sb->s_umount);
|
|
||||||
if (sb->s_root)
|
|
||||||
sync_blockdev(sb->s_bdev);
|
|
||||||
up_read(&sb->s_umount);
|
|
||||||
spin_lock(&sb_lock);
|
|
||||||
if (__put_super_and_need_restart(sb))
|
|
||||||
goto restart;
|
|
||||||
}
|
|
||||||
spin_unlock(&sb_lock);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get_super - get the superblock of a device
|
* get_super - get the superblock of a device
|
||||||
* @bdev: device to get the superblock for
|
* @bdev: device to get the superblock for
|
||||||
|
|
33
fs/sync.c
33
fs/sync.c
|
@ -18,35 +18,24 @@
|
||||||
#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
|
#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
|
||||||
SYNC_FILE_RANGE_WAIT_AFTER)
|
SYNC_FILE_RANGE_WAIT_AFTER)
|
||||||
|
|
||||||
/*
|
|
||||||
* sync everything. Start out by waking pdflush, because that writes back
|
|
||||||
* all queues in parallel.
|
|
||||||
*/
|
|
||||||
static void do_sync(unsigned long wait)
|
|
||||||
{
|
|
||||||
wakeup_pdflush(0);
|
|
||||||
sync_inodes(0); /* All mappings, inodes and their blockdevs */
|
|
||||||
vfs_dq_sync(NULL);
|
|
||||||
sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
|
|
||||||
sync_supers(); /* Write the superblocks */
|
|
||||||
sync_filesystems(0); /* Start syncing the filesystems */
|
|
||||||
sync_filesystems(wait); /* Waitingly sync the filesystems */
|
|
||||||
sync_blockdevs();
|
|
||||||
if (!wait)
|
|
||||||
printk("Emergency Sync complete\n");
|
|
||||||
if (unlikely(laptop_mode))
|
|
||||||
laptop_sync_completion();
|
|
||||||
}
|
|
||||||
|
|
||||||
SYSCALL_DEFINE0(sync)
|
SYSCALL_DEFINE0(sync)
|
||||||
{
|
{
|
||||||
do_sync(1);
|
sync_filesystems(0);
|
||||||
|
sync_filesystems(1);
|
||||||
|
if (unlikely(laptop_mode))
|
||||||
|
laptop_sync_completion();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void do_sync_work(struct work_struct *work)
|
static void do_sync_work(struct work_struct *work)
|
||||||
{
|
{
|
||||||
do_sync(0);
|
/*
|
||||||
|
* Sync twice to reduce the possibility we skipped some inodes / pages
|
||||||
|
* because they were temporarily locked
|
||||||
|
*/
|
||||||
|
sync_filesystems(0);
|
||||||
|
sync_filesystems(0);
|
||||||
|
printk("Emergency Sync complete\n");
|
||||||
kfree(work);
|
kfree(work);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1321,7 +1321,7 @@ struct super_block {
|
||||||
struct rw_semaphore s_umount;
|
struct rw_semaphore s_umount;
|
||||||
struct mutex s_lock;
|
struct mutex s_lock;
|
||||||
int s_count;
|
int s_count;
|
||||||
int s_need_sync_fs;
|
int s_need_sync;
|
||||||
atomic_t s_active;
|
atomic_t s_active;
|
||||||
#ifdef CONFIG_SECURITY
|
#ifdef CONFIG_SECURITY
|
||||||
void *s_security;
|
void *s_security;
|
||||||
|
|
|
@ -79,7 +79,6 @@ struct writeback_control {
|
||||||
void writeback_inodes(struct writeback_control *wbc);
|
void writeback_inodes(struct writeback_control *wbc);
|
||||||
int inode_wait(void *);
|
int inode_wait(void *);
|
||||||
void sync_inodes_sb(struct super_block *, int wait);
|
void sync_inodes_sb(struct super_block *, int wait);
|
||||||
void sync_inodes(int wait);
|
|
||||||
|
|
||||||
/* writeback.h requires fs.h; it, too, is not included from here. */
|
/* writeback.h requires fs.h; it, too, is not included from here. */
|
||||||
static inline void wait_on_inode(struct inode *inode)
|
static inline void wait_on_inode(struct inode *inode)
|
||||||
|
|
Loading…
Reference in New Issue