bcache: performance improvement for btree_flush_write()

This patch improves performance for btree_flush_write() in the following
ways:
- Use another spinlock journal.flush_write_lock to replace the very
  hot journal.lock. We don't have to use journal.lock here: selecting
  candidate btree nodes takes a lot of time, and holding journal.lock
  here will block other journaling threads and drop the overall I/O
  performance.
- Only select btree nodes to flush from the c->btree_cache list. When
  the machine has a large system memory, the mca cache may have a huge
  number of cached btree nodes. Iterating all the cached nodes will take
  a lot of CPU time, and most of the nodes on the
  c->btree_cache_freeable and c->btree_cache_freed lists are cleared and
  have no need to flush. So only traversing the mca list c->btree_cache
  to select btree nodes to flush should be enough for most cases.
- Don't iterate the whole c->btree_cache list; only select the first
  BTREE_FLUSH_NR btree nodes, in reverse order, to flush. Iterating all
  btree nodes from c->btree_cache and selecting the oldest journal pin
  btree nodes consumes a huge number of CPU cycles if the list is huge
  (pushing and popping a node into/out of a heap is expensive). The last
  several dirty btree nodes on the tail of the c->btree_cache list are
  the earliest allocated and cached btree nodes, so they are related to
  the oldest journal pin btree nodes. Therefore flushing only
  BTREE_FLUSH_NR btree nodes from the tail of c->btree_cache probably
  includes the oldest journal pin btree nodes.

In my testing, the above change decreases CPU consumption by 50%+ when
journal space is full. Sometimes IOPS drops to 0 for 5-8 seconds, but
compared with I/O being blocked for 120+ seconds in the previous code,
this is much better. Maybe there is room to improve in the future, but
at this moment the fix looks fine and performs well in my testing.

Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Coly Li 2019-06-28 19:59:59 +08:00 committed by Jens Axboe
parent 50a260e859
commit 91be66e131
2 changed files with 67 additions and 22 deletions

View File

@ -419,47 +419,87 @@ err:
static void btree_flush_write(struct cache_set *c) static void btree_flush_write(struct cache_set *c)
{ {
/* struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
* Try to find the btree node with that references the oldest journal unsigned int i, n;
* entry, best is our current candidate and is locked if non NULL:
*/ if (c->journal.btree_flushing)
struct btree *b, *best; return;
unsigned int i;
spin_lock(&c->journal.flush_write_lock);
if (c->journal.btree_flushing) {
spin_unlock(&c->journal.flush_write_lock);
return;
}
c->journal.btree_flushing = true;
spin_unlock(&c->journal.flush_write_lock);
atomic_long_inc(&c->flush_write); atomic_long_inc(&c->flush_write);
retry: memset(btree_nodes, 0, sizeof(btree_nodes));
best = NULL; n = 0;
mutex_lock(&c->bucket_lock); mutex_lock(&c->bucket_lock);
for_each_cached_btree(b, c, i) list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
if (btree_current_write(b)->journal) { if (btree_node_journal_flush(b))
if (!best) pr_err("BUG: flush_write bit should not be set here!");
best = b;
else if (journal_pin_cmp(c, mutex_lock(&b->write_lock);
btree_current_write(best)->journal,
btree_current_write(b)->journal)) { if (!btree_node_dirty(b)) {
best = b; mutex_unlock(&b->write_lock);
} continue;
}
if (!btree_current_write(b)->journal) {
mutex_unlock(&b->write_lock);
continue;
} }
b = best;
if (b)
set_btree_node_journal_flush(b); set_btree_node_journal_flush(b);
mutex_unlock(&b->write_lock);
btree_nodes[n++] = b;
if (n == BTREE_FLUSH_NR)
break;
}
mutex_unlock(&c->bucket_lock); mutex_unlock(&c->bucket_lock);
if (b) { for (i = 0; i < n; i++) {
b = btree_nodes[i];
if (!b) {
pr_err("BUG: btree_nodes[%d] is NULL", i);
continue;
}
/* safe to check without holding b->write_lock */
if (!btree_node_journal_flush(b)) {
pr_err("BUG: bnode %p: journal_flush bit cleaned", b);
continue;
}
mutex_lock(&b->write_lock); mutex_lock(&b->write_lock);
if (!btree_current_write(b)->journal) { if (!btree_current_write(b)->journal) {
clear_bit(BTREE_NODE_journal_flush, &b->flags); clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock); mutex_unlock(&b->write_lock);
/* We raced */ pr_debug("bnode %p: written by others", b);
goto retry; continue;
}
if (!btree_node_dirty(b)) {
clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
pr_debug("bnode %p: dirty bit cleaned by others", b);
continue;
} }
__bch_btree_node_write(b, NULL); __bch_btree_node_write(b, NULL);
clear_bit(BTREE_NODE_journal_flush, &b->flags); clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock); mutex_unlock(&b->write_lock);
} }
spin_lock(&c->journal.flush_write_lock);
c->journal.btree_flushing = false;
spin_unlock(&c->journal.flush_write_lock);
} }
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
@ -881,6 +921,7 @@ int bch_journal_alloc(struct cache_set *c)
struct journal *j = &c->journal; struct journal *j = &c->journal;
spin_lock_init(&j->lock); spin_lock_init(&j->lock);
spin_lock_init(&j->flush_write_lock);
INIT_DELAYED_WORK(&j->work, journal_write_work); INIT_DELAYED_WORK(&j->work, journal_write_work);
c->journal_delay_ms = 100; c->journal_delay_ms = 100;

View File

@ -103,6 +103,8 @@ struct journal_write {
/* Embedded in struct cache_set */ /* Embedded in struct cache_set */
struct journal { struct journal {
spinlock_t lock; spinlock_t lock;
spinlock_t flush_write_lock;
bool btree_flushing;
/* used when waiting because the journal was full */ /* used when waiting because the journal was full */
struct closure_waitlist wait; struct closure_waitlist wait;
struct closure io; struct closure io;
@ -154,6 +156,8 @@ struct journal_device {
struct bio_vec bv[8]; struct bio_vec bv[8];
}; };
#define BTREE_FLUSH_NR 8
#define journal_pin_cmp(c, l, r) \ #define journal_pin_cmp(c, l, r) \
(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))