jbd2: refine waiting for shadow buffers
Currently, when we add a buffer to a transaction, we wait until the buffer is removed from the BJ_Shadow list (so that we prevent any changes to a buffer that is just being written to the journal). This can take unnecessarily long, as a lot happens between the time the buffer is submitted to the journal and the time when we remove the buffer from the BJ_Shadow list (e.g., we wait for all data buffers in the transaction, we issue a cache flush, etc.). This also creates a dependency of do_get_write_access() on transaction commit (namely, waiting for data IO to complete), which we want to avoid when implementing transaction reservation. So we modify the commit code to set a new BH_Shadow flag when the temporary shadowing buffer is created, and we clear that flag once IO on that buffer is complete. This allows do_get_write_access() to wait only for the BH_Shadow bit and thus removes the dependency on data IO completion. Reviewed-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
This commit is contained in:
parent
e5a120aeb5
commit
b34090e5e2
|
@ -30,15 +30,22 @@
|
||||||
#include <trace/events/jbd2.h>
|
#include <trace/events/jbd2.h>
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Default IO end handler for temporary BJ_IO buffer_heads.
|
* IO end handler for temporary buffer_heads handling writes to the journal.
|
||||||
*/
|
*/
|
||||||
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
|
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
|
||||||
{
|
{
|
||||||
|
struct buffer_head *orig_bh = bh->b_private;
|
||||||
|
|
||||||
BUFFER_TRACE(bh, "");
|
BUFFER_TRACE(bh, "");
|
||||||
if (uptodate)
|
if (uptodate)
|
||||||
set_buffer_uptodate(bh);
|
set_buffer_uptodate(bh);
|
||||||
else
|
else
|
||||||
clear_buffer_uptodate(bh);
|
clear_buffer_uptodate(bh);
|
||||||
|
if (orig_bh) {
|
||||||
|
clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
|
||||||
|
smp_mb__after_clear_bit();
|
||||||
|
wake_up_bit(&orig_bh->b_state, BH_Shadow);
|
||||||
|
}
|
||||||
unlock_buffer(bh);
|
unlock_buffer(bh);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -832,6 +839,7 @@ start_journal_io:
|
||||||
bh = jh2bh(jh);
|
bh = jh2bh(jh);
|
||||||
clear_buffer_jwrite(bh);
|
clear_buffer_jwrite(bh);
|
||||||
J_ASSERT_BH(bh, buffer_jbddirty(bh));
|
J_ASSERT_BH(bh, buffer_jbddirty(bh));
|
||||||
|
J_ASSERT_BH(bh, !buffer_shadow(bh));
|
||||||
|
|
||||||
/* The metadata is now released for reuse, but we need
|
/* The metadata is now released for reuse, but we need
|
||||||
to remember it against this transaction so that when
|
to remember it against this transaction so that when
|
||||||
|
@ -839,14 +847,6 @@ start_journal_io:
|
||||||
required. */
|
required. */
|
||||||
JBUFFER_TRACE(jh, "file as BJ_Forget");
|
JBUFFER_TRACE(jh, "file as BJ_Forget");
|
||||||
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
|
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
|
||||||
/*
|
|
||||||
* Wake up any transactions which were waiting for this IO to
|
|
||||||
* complete. The barrier must be here so that changes by
|
|
||||||
* jbd2_journal_file_buffer() take effect before wake_up_bit()
|
|
||||||
* does the waitqueue check.
|
|
||||||
*/
|
|
||||||
smp_mb();
|
|
||||||
wake_up_bit(&bh->b_state, BH_Unshadow);
|
|
||||||
JBUFFER_TRACE(jh, "brelse shadowed buffer");
|
JBUFFER_TRACE(jh, "brelse shadowed buffer");
|
||||||
__brelse(bh);
|
__brelse(bh);
|
||||||
}
|
}
|
||||||
|
|
|
@ -451,6 +451,7 @@ repeat:
|
||||||
new_bh->b_size = bh_in->b_size;
|
new_bh->b_size = bh_in->b_size;
|
||||||
new_bh->b_bdev = journal->j_dev;
|
new_bh->b_bdev = journal->j_dev;
|
||||||
new_bh->b_blocknr = blocknr;
|
new_bh->b_blocknr = blocknr;
|
||||||
|
new_bh->b_private = bh_in;
|
||||||
set_buffer_mapped(new_bh);
|
set_buffer_mapped(new_bh);
|
||||||
set_buffer_dirty(new_bh);
|
set_buffer_dirty(new_bh);
|
||||||
|
|
||||||
|
@ -465,6 +466,7 @@ repeat:
|
||||||
spin_lock(&journal->j_list_lock);
|
spin_lock(&journal->j_list_lock);
|
||||||
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
|
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
|
||||||
spin_unlock(&journal->j_list_lock);
|
spin_unlock(&journal->j_list_lock);
|
||||||
|
set_buffer_shadow(bh_in);
|
||||||
jbd_unlock_bh_state(bh_in);
|
jbd_unlock_bh_state(bh_in);
|
||||||
|
|
||||||
return do_escape | (done_copy_out << 1);
|
return do_escape | (done_copy_out << 1);
|
||||||
|
|
|
@ -619,6 +619,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
|
||||||
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
|
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int sleep_on_shadow_bh(void *word)
|
||||||
|
{
|
||||||
|
io_schedule();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the buffer is already part of the current transaction, then there
|
* If the buffer is already part of the current transaction, then there
|
||||||
* is nothing we need to do. If it is already part of a prior
|
* is nothing we need to do. If it is already part of a prior
|
||||||
|
@ -754,41 +760,29 @@ repeat:
|
||||||
* journaled. If the primary copy is already going to
|
* journaled. If the primary copy is already going to
|
||||||
* disk then we cannot do copy-out here. */
|
* disk then we cannot do copy-out here. */
|
||||||
|
|
||||||
if (jh->b_jlist == BJ_Shadow) {
|
if (buffer_shadow(bh)) {
|
||||||
DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
|
|
||||||
wait_queue_head_t *wqh;
|
|
||||||
|
|
||||||
wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
|
|
||||||
|
|
||||||
JBUFFER_TRACE(jh, "on shadow: sleep");
|
JBUFFER_TRACE(jh, "on shadow: sleep");
|
||||||
jbd_unlock_bh_state(bh);
|
jbd_unlock_bh_state(bh);
|
||||||
/* commit wakes up all shadow buffers after IO */
|
wait_on_bit(&bh->b_state, BH_Shadow,
|
||||||
for ( ; ; ) {
|
sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
|
||||||
prepare_to_wait(wqh, &wait.wait,
|
|
||||||
TASK_UNINTERRUPTIBLE);
|
|
||||||
if (jh->b_jlist != BJ_Shadow)
|
|
||||||
break;
|
|
||||||
schedule();
|
|
||||||
}
|
|
||||||
finish_wait(wqh, &wait.wait);
|
|
||||||
goto repeat;
|
goto repeat;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Only do the copy if the currently-owning transaction
|
/*
|
||||||
* still needs it. If it is on the Forget list, the
|
* Only do the copy if the currently-owning transaction still
|
||||||
* committing transaction is past that stage. The
|
* needs it. If buffer isn't on BJ_Metadata list, the
|
||||||
* buffer had better remain locked during the kmalloc,
|
* committing transaction is past that stage (here we use the
|
||||||
* but that should be true --- we hold the journal lock
|
* fact that BH_Shadow is set under bh_state lock together with
|
||||||
* still and the buffer is already on the BUF_JOURNAL
|
* refiling to BJ_Shadow list and at this point we know the
|
||||||
* list so won't be flushed.
|
* buffer doesn't have BH_Shadow set).
|
||||||
*
|
*
|
||||||
* Subtle point, though: if this is a get_undo_access,
|
* Subtle point, though: if this is a get_undo_access,
|
||||||
* then we will be relying on the frozen_data to contain
|
* then we will be relying on the frozen_data to contain
|
||||||
* the new value of the committed_data record after the
|
* the new value of the committed_data record after the
|
||||||
* transaction, so we HAVE to force the frozen_data copy
|
* transaction, so we HAVE to force the frozen_data copy
|
||||||
* in that case. */
|
* in that case.
|
||||||
|
*/
|
||||||
if (jh->b_jlist != BJ_Forget || force_copy) {
|
if (jh->b_jlist == BJ_Metadata || force_copy) {
|
||||||
JBUFFER_TRACE(jh, "generate frozen data");
|
JBUFFER_TRACE(jh, "generate frozen data");
|
||||||
if (!frozen_buffer) {
|
if (!frozen_buffer) {
|
||||||
JBUFFER_TRACE(jh, "allocate memory for buffer");
|
JBUFFER_TRACE(jh, "allocate memory for buffer");
|
||||||
|
|
|
@ -244,6 +244,31 @@ typedef struct journal_superblock_s
|
||||||
|
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
#include <linux/sched.h>
|
#include <linux/sched.h>
|
||||||
|
|
||||||
|
enum jbd_state_bits {
|
||||||
|
BH_JBD /* Has an attached ext3 journal_head */
|
||||||
|
= BH_PrivateStart,
|
||||||
|
BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
|
||||||
|
BH_Freed, /* Has been freed (truncated) */
|
||||||
|
BH_Revoked, /* Has been revoked from the log */
|
||||||
|
BH_RevokeValid, /* Revoked flag is valid */
|
||||||
|
BH_JBDDirty, /* Is dirty but journaled */
|
||||||
|
BH_State, /* Pins most journal_head state */
|
||||||
|
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
|
||||||
|
BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
|
||||||
|
BH_JBDPrivateStart, /* First bit available for private use by FS */
|
||||||
|
};
|
||||||
|
|
||||||
|
BUFFER_FNS(JBD, jbd)
|
||||||
|
BUFFER_FNS(JWrite, jwrite)
|
||||||
|
BUFFER_FNS(JBDDirty, jbddirty)
|
||||||
|
TAS_BUFFER_FNS(JBDDirty, jbddirty)
|
||||||
|
BUFFER_FNS(Revoked, revoked)
|
||||||
|
TAS_BUFFER_FNS(Revoked, revoked)
|
||||||
|
BUFFER_FNS(RevokeValid, revokevalid)
|
||||||
|
TAS_BUFFER_FNS(RevokeValid, revokevalid)
|
||||||
|
BUFFER_FNS(Freed, freed)
|
||||||
|
|
||||||
#include <linux/jbd_common.h>
|
#include <linux/jbd_common.h>
|
||||||
|
|
||||||
#define J_ASSERT(assert) BUG_ON(!(assert))
|
#define J_ASSERT(assert) BUG_ON(!(assert))
|
||||||
|
|
|
@ -302,6 +302,34 @@ typedef struct journal_superblock_s
|
||||||
|
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
#include <linux/sched.h>
|
#include <linux/sched.h>
|
||||||
|
|
||||||
|
enum jbd_state_bits {
|
||||||
|
BH_JBD /* Has an attached ext3 journal_head */
|
||||||
|
= BH_PrivateStart,
|
||||||
|
BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
|
||||||
|
BH_Freed, /* Has been freed (truncated) */
|
||||||
|
BH_Revoked, /* Has been revoked from the log */
|
||||||
|
BH_RevokeValid, /* Revoked flag is valid */
|
||||||
|
BH_JBDDirty, /* Is dirty but journaled */
|
||||||
|
BH_State, /* Pins most journal_head state */
|
||||||
|
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
|
||||||
|
BH_Shadow, /* IO on shadow buffer is running */
|
||||||
|
BH_Verified, /* Metadata block has been verified ok */
|
||||||
|
BH_JBDPrivateStart, /* First bit available for private use by FS */
|
||||||
|
};
|
||||||
|
|
||||||
|
BUFFER_FNS(JBD, jbd)
|
||||||
|
BUFFER_FNS(JWrite, jwrite)
|
||||||
|
BUFFER_FNS(JBDDirty, jbddirty)
|
||||||
|
TAS_BUFFER_FNS(JBDDirty, jbddirty)
|
||||||
|
BUFFER_FNS(Revoked, revoked)
|
||||||
|
TAS_BUFFER_FNS(Revoked, revoked)
|
||||||
|
BUFFER_FNS(RevokeValid, revokevalid)
|
||||||
|
TAS_BUFFER_FNS(RevokeValid, revokevalid)
|
||||||
|
BUFFER_FNS(Freed, freed)
|
||||||
|
BUFFER_FNS(Shadow, shadow)
|
||||||
|
BUFFER_FNS(Verified, verified)
|
||||||
|
|
||||||
#include <linux/jbd_common.h>
|
#include <linux/jbd_common.h>
|
||||||
|
|
||||||
#define J_ASSERT(assert) BUG_ON(!(assert))
|
#define J_ASSERT(assert) BUG_ON(!(assert))
|
||||||
|
|
|
@ -1,32 +1,6 @@
|
||||||
#ifndef _LINUX_JBD_STATE_H
|
#ifndef _LINUX_JBD_STATE_H
|
||||||
#define _LINUX_JBD_STATE_H
|
#define _LINUX_JBD_STATE_H
|
||||||
|
|
||||||
enum jbd_state_bits {
|
|
||||||
BH_JBD /* Has an attached ext3 journal_head */
|
|
||||||
= BH_PrivateStart,
|
|
||||||
BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
|
|
||||||
BH_Freed, /* Has been freed (truncated) */
|
|
||||||
BH_Revoked, /* Has been revoked from the log */
|
|
||||||
BH_RevokeValid, /* Revoked flag is valid */
|
|
||||||
BH_JBDDirty, /* Is dirty but journaled */
|
|
||||||
BH_State, /* Pins most journal_head state */
|
|
||||||
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
|
|
||||||
BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
|
|
||||||
BH_Verified, /* Metadata block has been verified ok */
|
|
||||||
BH_JBDPrivateStart, /* First bit available for private use by FS */
|
|
||||||
};
|
|
||||||
|
|
||||||
BUFFER_FNS(JBD, jbd)
|
|
||||||
BUFFER_FNS(JWrite, jwrite)
|
|
||||||
BUFFER_FNS(JBDDirty, jbddirty)
|
|
||||||
TAS_BUFFER_FNS(JBDDirty, jbddirty)
|
|
||||||
BUFFER_FNS(Revoked, revoked)
|
|
||||||
TAS_BUFFER_FNS(Revoked, revoked)
|
|
||||||
BUFFER_FNS(RevokeValid, revokevalid)
|
|
||||||
TAS_BUFFER_FNS(RevokeValid, revokevalid)
|
|
||||||
BUFFER_FNS(Freed, freed)
|
|
||||||
BUFFER_FNS(Verified, verified)
|
|
||||||
|
|
||||||
static inline struct buffer_head *jh2bh(struct journal_head *jh)
|
static inline struct buffer_head *jh2bh(struct journal_head *jh)
|
||||||
{
|
{
|
||||||
return jh->b_bh;
|
return jh->b_bh;
|
||||||
|
|
Loading…
Reference in New Issue