ext3: Wait for proper transaction commit on fsync
We cannot rely on buffer dirty bits during fsync because pdflush can come before fsync is called and clear dirty bits without forcing a transaction commit. What we do is that we track which transaction has last changed the inode and which transaction last changed allocation and force it to disk on fsync. Signed-off-by: Jan Kara <jack@suse.cz> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
This commit is contained in:
parent
ea0174a713
commit
fe8bc91c4c
|
@ -46,19 +46,21 @@
|
|||
int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
|
||||
{
|
||||
struct inode *inode = dentry->d_inode;
|
||||
struct ext3_inode_info *ei = EXT3_I(inode);
|
||||
journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
|
||||
int ret = 0;
|
||||
tid_t commit_tid;
|
||||
|
||||
if (inode->i_sb->s_flags & MS_RDONLY)
|
||||
return 0;
|
||||
|
||||
J_ASSERT(ext3_journal_current_handle() == NULL);
|
||||
|
||||
/*
|
||||
* data=writeback:
|
||||
* data=writeback,ordered:
|
||||
* The caller's filemap_fdatawrite()/wait will sync the data.
|
||||
* sync_inode() will sync the metadata
|
||||
*
|
||||
* data=ordered:
|
||||
* The caller's filemap_fdatawrite() will write the data and
|
||||
* sync_inode() will write the inode if it is dirty. Then the caller's
|
||||
* filemap_fdatawait() will wait on the pages.
|
||||
* Metadata is in the journal, we wait for a proper transaction
|
||||
* to commit here.
|
||||
*
|
||||
* data=journal:
|
||||
* filemap_fdatawrite won't do anything (the buffers are clean).
|
||||
|
@ -73,22 +75,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
|
|||
goto out;
|
||||
}
|
||||
|
||||
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
|
||||
goto flush;
|
||||
if (datasync)
|
||||
commit_tid = atomic_read(&ei->i_datasync_tid);
|
||||
else
|
||||
commit_tid = atomic_read(&ei->i_sync_tid);
|
||||
|
||||
/*
|
||||
* The VFS has written the file data. If the inode is unaltered
|
||||
* then we need not start a commit.
|
||||
*/
|
||||
if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
|
||||
struct writeback_control wbc = {
|
||||
.sync_mode = WB_SYNC_ALL,
|
||||
.nr_to_write = 0, /* sys_fsync did this */
|
||||
};
|
||||
ret = sync_inode(inode, &wbc);
|
||||
if (log_start_commit(journal, commit_tid)) {
|
||||
log_wait_commit(journal, commit_tid);
|
||||
goto out;
|
||||
}
|
||||
flush:
|
||||
|
||||
/*
|
||||
* In case we didn't commit a transaction, we have to flush
|
||||
* disk caches manually so that data really is on persistent
|
||||
|
|
|
@ -699,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
|
|||
int err = 0;
|
||||
struct ext3_block_alloc_info *block_i;
|
||||
ext3_fsblk_t current_block;
|
||||
struct ext3_inode_info *ei = EXT3_I(inode);
|
||||
|
||||
block_i = EXT3_I(inode)->i_block_alloc_info;
|
||||
block_i = ei->i_block_alloc_info;
|
||||
/*
|
||||
* If we're splicing into a [td]indirect block (as opposed to the
|
||||
* inode) then we need to get write access to the [td]indirect block
|
||||
|
@ -741,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
|
|||
|
||||
inode->i_ctime = CURRENT_TIME_SEC;
|
||||
ext3_mark_inode_dirty(handle, inode);
|
||||
/* ext3_mark_inode_dirty already updated i_sync_tid */
|
||||
atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
|
||||
|
||||
/* had we spliced it onto indirect block? */
|
||||
if (where->bh) {
|
||||
|
@ -2754,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
|
|||
struct ext3_inode_info *ei;
|
||||
struct buffer_head *bh;
|
||||
struct inode *inode;
|
||||
journal_t *journal = EXT3_SB(sb)->s_journal;
|
||||
transaction_t *transaction;
|
||||
long ret;
|
||||
int block;
|
||||
|
||||
|
@ -2831,6 +2836,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
|
|||
ei->i_data[block] = raw_inode->i_block[block];
|
||||
INIT_LIST_HEAD(&ei->i_orphan);
|
||||
|
||||
/*
|
||||
* Set transaction id's of transactions that have to be committed
|
||||
* to finish f[data]sync. We set them to currently running transaction
|
||||
* as we cannot be sure that the inode or some of its metadata isn't
|
||||
* part of the transaction - the inode could have been reclaimed and
|
||||
* now it is reread from disk.
|
||||
*/
|
||||
if (journal) {
|
||||
tid_t tid;
|
||||
|
||||
spin_lock(&journal->j_state_lock);
|
||||
if (journal->j_running_transaction)
|
||||
transaction = journal->j_running_transaction;
|
||||
else
|
||||
transaction = journal->j_committing_transaction;
|
||||
if (transaction)
|
||||
tid = transaction->t_tid;
|
||||
else
|
||||
tid = journal->j_commit_sequence;
|
||||
spin_unlock(&journal->j_state_lock);
|
||||
atomic_set(&ei->i_sync_tid, tid);
|
||||
atomic_set(&ei->i_datasync_tid, tid);
|
||||
}
|
||||
|
||||
if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
|
||||
EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
|
||||
/*
|
||||
|
@ -3015,6 +3044,7 @@ again:
|
|||
err = rc;
|
||||
ei->i_state &= ~EXT3_STATE_NEW;
|
||||
|
||||
atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
|
||||
out_brelse:
|
||||
brelse (bh);
|
||||
ext3_std_error(inode->i_sb, err);
|
||||
|
|
|
@ -466,6 +466,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
|
|||
return NULL;
|
||||
ei->i_block_alloc_info = NULL;
|
||||
ei->vfs_inode.i_version = 1;
|
||||
atomic_set(&ei->i_datasync_tid, 0);
|
||||
atomic_set(&ei->i_sync_tid, 0);
|
||||
return &ei->vfs_inode;
|
||||
}
|
||||
|
||||
|
|
|
@ -137,6 +137,14 @@ struct ext3_inode_info {
|
|||
* by other means, so we have truncate_mutex.
|
||||
*/
|
||||
struct mutex truncate_mutex;
|
||||
|
||||
/*
|
||||
* Transactions that contain inode's metadata needed to complete
|
||||
* fsync and fdatasync, respectively.
|
||||
*/
|
||||
atomic_t i_sync_tid;
|
||||
atomic_t i_datasync_tid;
|
||||
|
||||
struct inode vfs_inode;
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue