Merge branch 'bcache-for-3.11' of git://evilpiepirate.org/~kent/linux-bcache into for-3.11/drivers

2013-07-02 08:32:57 +02:00 · 2013-07-02 08:32:57 +02:00 · d0e3d0238d
parent 5f0e5afa0d 8e51e414a3
commit d0e3d0238d
23 changed files with 1166 additions and 954 deletions
--- a/Documentation/bcache.txt
+++ b/Documentation/bcache.txt
@ -46,29 +46,33 @@ you format your backing devices and cache device at the same time, you won't
 have to manually attach:
  make-bcache -B /dev/sda /dev/sdb -C /dev/sdc

-To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register:
+bcache-tools now ships udev rules, and bcache devices are known to the kernel
+immediately.  Without udev, you can manually register devices like this:

  echo /dev/sdb > /sys/fs/bcache/register
  echo /dev/sdc > /sys/fs/bcache/register

-To register your bcache devices automatically, you could add something like
-this to an init script:
+Registering the backing device makes the bcache device show up in /dev; you can
+now format it and use it as normal. But the first time using a new bcache
+device, it'll be running in passthrough mode until you attach it to a cache.
+See the section on attaching.

-  echo /dev/sd* > /sys/fs/bcache/register_quiet
+The devices show up as:

-It'll look for bcache superblocks and ignore everything that doesn't have one.
+  /dev/bcache<N>

-Registering the backing device makes the bcache show up in /dev; you can now
-format it and use it as normal. But the first time using a new bcache device,
-it'll be running in passthrough mode until you attach it to a cache. See the
-section on attaching.
+As well as (with udev):

-The devices show up at /dev/bcacheN, and can be controlled via sysfs from
-/sys/block/bcacheN/bcache:
+  /dev/bcache/by-uuid/<uuid>
+  /dev/bcache/by-label/<label>
+
+To get started:

  mkfs.ext4 /dev/bcache0
  mount /dev/bcache0 /mnt

+You can control bcache devices through sysfs at /sys/block/bcache<N>/bcache .
+
 Cache devices are managed as sets; multiple caches per set isn't supported yet
 but will allow for mirroring of metadata and dirty data in the future. Your new
 cache set shows up as /sys/fs/bcache/<UUID>
@ -80,11 +84,11 @@ must be attached to your cache set to enable caching. Attaching a backing
 device to a cache set is done thusly, with the UUID of the cache set in
 /sys/fs/bcache:

-  echo <UUID> > /sys/block/bcache0/bcache/attach
+  echo <CSET-UUID> > /sys/block/bcache0/bcache/attach

 This only has to be done once. The next time you reboot, just reregister all
 your bcache devices. If a backing device has data in a cache somewhere, the
-/dev/bcache# device won't be created until the cache shows up - particularly
+/dev/bcache<N> device won't be created until the cache shows up - particularly
 important if you have writeback caching turned on.

 If you're booting up and your cache device is gone and never coming back, you
@ -181,7 +185,7 @@ want for getting the best possible numbers when benchmarking.

   In practice this isn't an issue because as soon as a write comes along it'll
   cause the btree node to be split, and you need almost no write traffic for
-   this to not show up enough to be noticable (especially since bcache's btree
+   this to not show up enough to be noticeable (especially since bcache's btree
   nodes are huge and index large regions of the device). But when you're
   benchmarking, if you're trying to warm the cache by reading a bunch of data
   and there's no other traffic - that can be a problem.
@ -191,6 +195,9 @@ want for getting the best possible numbers when benchmarking.

 SYSFS - BACKING DEVICE:

+Available at /sys/block/<bdev>/bcache, /sys/block/bcache*/bcache and
+(if attached) /sys/fs/bcache/<cset-uuid>/bdev*
+
 attach
  Echo the UUID of a cache set to this file to enable caching.

@ -222,7 +229,7 @@ running
  it's in passthrough mode or caching).

 sequential_cutoff
-  A sequential IO will bypass the cache once it passes this threshhold; the
+  A sequential IO will bypass the cache once it passes this threshold; the
  most recent 128 IOs are tracked so sequential IO can be detected even when
  it isn't all done at once.

@ -296,10 +303,12 @@ cache_miss_collisions
  since the synchronization for cache misses was rewritten)

 cache_readaheads
-  Count of times readahead occured.
+  Count of times readahead occurred.

 SYSFS - CACHE SET:

+Available at /sys/fs/bcache/<cset-uuid>
+
 average_key_size
  Average data per key in the btree.

@ -362,7 +371,7 @@ unregister
 SYSFS - CACHE SET INTERNAL:

 This directory also exposes timings for a number of internal operations, with
-separate files for average duration, average frequency, last occurence and max
+separate files for average duration, average frequency, last occurrence and max
 duration: garbage collection, btree read, btree node sorts and btree splits.

 active_journal_entries
@ -390,6 +399,8 @@ trigger_gc

 SYSFS - CACHE DEVICE:

+Available at /sys/block/<cdev>/bcache
+
 block_size
  Minimum granularity of writes - should match hardware sector size.

@ -417,7 +428,7 @@ freelist_percent
  space.

 io_errors
-  Number of errors that have occured, decayed by io_error_halflife.
+  Number of errors that have occurred, decayed by io_error_halflife.

 metadata_written
  Sum of all non data writes (btree writes and all other metadata).
--- a/2
+++ b/2
@ -1621,7 +1621,7 @@ S:	Maintained
 F:	drivers/net/hamradio/baycom*

 BCACHE (BLOCK LAYER CACHE)
-M:	Kent Overstreet <koverstreet@google.com>
+M:	Kent Overstreet <kmo@daterainc.com>
 L:	linux-bcache@vger.kernel.org
 W:	http://bcache.evilpiepirate.org
 S:	Maintained:
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@ -63,7 +63,9 @@
 #include "bcache.h"
 #include "btree.h"

+#include <linux/kthread.h>
 #include <linux/random.h>
+#include <trace/events/bcache.h>

 #define MAX_IN_FLIGHT_DISCARDS		8U

@ -151,7 +153,7 @@ static void discard_finish(struct work_struct *w)
 	mutex_unlock(&ca->set->bucket_lock);

 	closure_wake_up(&ca->set->bucket_wait);
-	wake_up(&ca->set->alloc_wait);
+	wake_up_process(ca->alloc_thread);

 	closure_put(&ca->set->cl);
 }
@ -350,38 +352,31 @@ static void invalidate_buckets(struct cache *ca)
 		break;
 	}

-	pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
-		 fifo_used(&ca->free), ca->free.size,
-		 fifo_used(&ca->free_inc), ca->free_inc.size,
-		 fifo_used(&ca->unused), ca->unused.size);
+	trace_bcache_alloc_invalidate(ca);
 }

 #define allocator_wait(ca, cond)					\
 do {									\
-	DEFINE_WAIT(__wait);						\
-									\
 	while (1) {							\
-		prepare_to_wait(&ca->set->alloc_wait,			\
-				&__wait, TASK_INTERRUPTIBLE);		\
+		set_current_state(TASK_INTERRUPTIBLE);			\
 		if (cond)						\
 			break;						\
 									\
 		mutex_unlock(&(ca)->set->bucket_lock);			\
 		if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) {	\
-			finish_wait(&ca->set->alloc_wait, &__wait);	\
-			closure_return(cl);				\
+			closure_put(&ca->set->cl);			\
+			return 0;					\
 		}							\
 									\
 		schedule();						\
 		mutex_lock(&(ca)->set->bucket_lock);			\
 	}								\
-									\
-	finish_wait(&ca->set->alloc_wait, &__wait);			\
+	__set_current_state(TASK_RUNNING);				\
 } while (0)

-void bch_allocator_thread(struct closure *cl)
+static int bch_allocator_thread(void *arg)
 {
-	struct cache *ca = container_of(cl, struct cache, alloc);
+	struct cache *ca = arg;

 	mutex_lock(&ca->set->bucket_lock);

@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
 {
 	long r = -1;
 again:
-	wake_up(&ca->set->alloc_wait);
+	wake_up_process(ca->alloc_thread);

 	if (fifo_used(&ca->free) > ca->watermark[watermark] &&
 	    fifo_pop(&ca->free, r)) {
@ -476,9 +471,7 @@ again:
 		return r;
 	}

-	pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
-		 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
-		 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+	trace_bcache_alloc_fail(ca);

 	if (cl) {
 		closure_wait(&ca->set->bucket_wait, cl);
@ -552,6 +545,19 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,

 /* Init */

+int bch_cache_allocator_start(struct cache *ca)
+{
+	ca->alloc_thread = kthread_create(bch_allocator_thread,
+					  ca, "bcache_allocator");
+	if (IS_ERR(ca->alloc_thread))
+		return PTR_ERR(ca->alloc_thread);
+
+	closure_get(&ca->set->cl);
+	wake_up_process(ca->alloc_thread);
+
+	return 0;
+}
+
 void bch_cache_allocator_exit(struct cache *ca)
 {
 	struct discard *d;
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@ -178,7 +178,6 @@
 #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__

 #include <linux/bio.h>
-#include <linux/blktrace_api.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@ -388,8 +387,6 @@ struct keybuf_key {
 typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);

 struct keybuf {
-	keybuf_pred_fn		*key_predicate;
-
 	struct bkey		last_scanned;
 	spinlock_t		lock;

@ -438,8 +435,10 @@ struct bcache_device {
 	/* If nonzero, we're detaching/unregistering from cache set */
 	atomic_t		detaching;

-	atomic_long_t		sectors_dirty;
-	unsigned long		sectors_dirty_gc;
+	uint64_t		nr_stripes;
+	unsigned		stripe_size_bits;
+	atomic_t		*stripe_sectors_dirty;
+
 	unsigned long		sectors_dirty_last;
 	long			sectors_dirty_derivative;

@ -531,6 +530,7 @@ struct cached_dev {
 	unsigned		sequential_merge:1;
 	unsigned		verify:1;

+	unsigned		partial_stripes_expensive:1;
 	unsigned		writeback_metadata:1;
 	unsigned		writeback_running:1;
 	unsigned char		writeback_percent;
@ -565,8 +565,7 @@ struct cache {

 	unsigned		watermark[WATERMARK_MAX];

-	struct closure		alloc;
-	struct workqueue_struct	*alloc_workqueue;
+	struct task_struct	*alloc_thread;

 	struct closure		prio;
 	struct prio_set		*disk_buckets;
@ -703,9 +702,6 @@ struct cache_set {
 	/* For the btree cache */
 	struct shrinker		shrink;

-	/* For the allocator itself */
-	wait_queue_head_t	alloc_wait;
-
 	/* For the btree cache and anything allocation related */
 	struct mutex		bucket_lock;

@ -823,10 +819,9 @@ struct cache_set {

 	/*
 	 * A btree node on disk could have too many bsets for an iterator to fit
-	 * on the stack - this is a single element mempool for btree_read_work()
+	 * on the stack - have to dynamically allocate them
 	 */
-	struct mutex		fill_lock;
-	struct btree_iter	*fill_iter;
+	mempool_t		*fill_iter;

 	/*
 	 * btree_sort() is a merge sort and requires temporary space - single
@ -834,6 +829,7 @@ struct cache_set {
 	 */
 	struct mutex		sort_lock;
 	struct bset		*sort;
+	unsigned		sort_crit_factor;

 	/* List of buckets we're currently writing data to */
 	struct list_head	data_buckets;
@ -906,8 +902,6 @@ static inline unsigned local_clock_us(void)
 	return local_clock() >> 10;
 }

-#define MAX_BSETS		4U
-
 #define BTREE_PRIO		USHRT_MAX
 #define INITIAL_PRIO		32768

@ -1112,23 +1106,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k)
 		atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
 }

-/* Blktrace macros */
-
-#define blktrace_msg(c, fmt, ...)					\
-do {									\
-	struct request_queue *q = bdev_get_queue(c->bdev);		\
-	if (q)								\
-		blk_add_trace_msg(q, fmt, ##__VA_ARGS__);		\
-} while (0)
-
-#define blktrace_msg_all(s, fmt, ...)					\
-do {									\
-	struct cache *_c;						\
-	unsigned i;							\
-	for_each_cache(_c, (s), i)					\
-		blktrace_msg(_c, fmt, ##__VA_ARGS__);			\
-} while (0)
-
 static inline void cached_dev_put(struct cached_dev *dc)
 {
 	if (atomic_dec_and_test(&dc->count))
@ -1173,10 +1150,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b)
 	static struct kobj_attribute ksysfs_##n =			\
 		__ATTR(n, S_IWUSR|S_IRUSR, show, store)

-/* Forward declarations */
+static inline void wake_up_allocators(struct cache_set *c)
+{
+	struct cache *ca;
+	unsigned i;

-void bch_writeback_queue(struct cached_dev *);
-void bch_writeback_add(struct cached_dev *, unsigned);
+	for_each_cache(ca, c, i)
+		wake_up_process(ca->alloc_thread);
+}
+
+/* Forward declarations */

 void bch_count_io_errors(struct cache *, int, const char *);
 void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
@ -1193,7 +1176,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
 uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
 bool bch_bucket_add_unused(struct cache *, struct bucket *);
-void bch_allocator_thread(struct closure *);

 long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
 void bch_bucket_free(struct cache_set *, struct bkey *);
@ -1241,9 +1223,9 @@ void bch_cache_set_stop(struct cache_set *);
 struct cache_set *bch_cache_set_alloc(struct cache_sb *);
 void bch_btree_cache_free(struct cache_set *);
 int bch_btree_cache_alloc(struct cache_set *);
-void bch_cached_dev_writeback_init(struct cached_dev *);
 void bch_moving_init_cache_set(struct cache_set *);

+int bch_cache_allocator_start(struct cache *ca);
 void bch_cache_allocator_exit(struct cache *ca);
 int bch_cache_allocator_init(struct cache *ca);

--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
 bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
 {
 	unsigned i;
+	char buf[80];

 	if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
 		goto bad;
@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)

 	return false;
 bad:
-	cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
+	bch_bkey_to_text(buf, sizeof(buf), k);
+	cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
 	return true;
 }

@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
 #ifdef CONFIG_BCACHE_EDEBUG
 bug:
 	mutex_unlock(&b->c->bucket_lock);
+
+	{
+		char buf[80];
+
+		bch_bkey_to_text(buf, sizeof(buf), k);
 		btree_bug(b,
 "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
-		  pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+			  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
 			  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+	}
 	return true;
 #endif
 }
@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
 	new->sets->size = 0;
 }

+#define SORT_CRIT	(4096 / sizeof(uint64_t))
+
 void bch_btree_sort_lazy(struct btree *b)
 {
-	if (b->nsets) {
-		unsigned i, j, keys = 0, total;
+	unsigned crit = SORT_CRIT;
+	int i;

-		for (i = 0; i <= b->nsets; i++)
-			keys += b->sets[i].data->keys;
+	/* Don't sort if nothing to do */
+	if (!b->nsets)
+		goto out;

-		total = keys;
-
-		for (j = 0; j < b->nsets; j++) {
-			if (keys * 2 < total ||
-			    keys < 1000) {
-				bch_btree_sort_partial(b, j);
-				return;
-			}
-
-			keys -= b->sets[j].data->keys;
-		}
-
-		/* Must sort if b->nsets == 3 or we'll overflow */
-		if (b->nsets >= (MAX_BSETS - 1) - b->level) {
+	/* If not a leaf node, always sort */
+	if (b->level) {
 		bch_btree_sort(b);
 		return;
 	}
+
+	for (i = b->nsets - 1; i >= 0; --i) {
+		crit *= b->c->sort_crit_factor;
+
+		if (b->sets[i].data->keys < crit) {
+			bch_btree_sort_partial(b, i);
+			return;
+		}
 	}

+	/* Sort if we'd overflow */
+	if (b->nsets + 1 == MAX_BSETS) {
+		bch_btree_sort(b);
+		return;
+	}
+
+out:
 	bset_build_written_tree(b);
 }

--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@ -1,6 +1,8 @@
 #ifndef _BCACHE_BSET_H
 #define _BCACHE_BSET_H

+#include <linux/slab.h>
+
 /*
 * BKEYS:
 *
@ -142,6 +144,8 @@

 /* Btree key comparison/iteration */

+#define MAX_BSETS		4U
+
 struct btree_iter {
 	size_t size, used;
 	struct btree_iter_set {
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@ -24,6 +24,7 @@
 #include "btree.h"
 #include "debug.h"
 #include "request.h"
+#include "writeback.h"

 #include <linux/slab.h>
 #include <linux/bitops.h>
@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
 	return crc ^ 0xffffffffffffffffULL;
 }

-static void btree_bio_endio(struct bio *bio, int error)
+static void bch_btree_node_read_done(struct btree *b)
 {
-	struct closure *cl = bio->bi_private;
-	struct btree *b = container_of(cl, struct btree, io.cl);
-
-	if (error)
-		set_btree_node_io_error(b);
-
-	bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
-				 ? "writing btree" : "reading btree");
-	closure_put(cl);
-}
-
-static void btree_bio_init(struct btree *b)
-{
-	BUG_ON(b->bio);
-	b->bio = bch_bbio_alloc(b->c);
-
-	b->bio->bi_end_io	= btree_bio_endio;
-	b->bio->bi_private	= &b->io.cl;
-}
-
-void bch_btree_read_done(struct closure *cl)
-{
-	struct btree *b = container_of(cl, struct btree, io.cl);
-	struct bset *i = b->sets[0].data;
-	struct btree_iter *iter = b->c->fill_iter;
 	const char *err = "bad btree header";
-	BUG_ON(b->nsets || b->written);
+	struct bset *i = b->sets[0].data;
+	struct btree_iter *iter;

-	bch_bbio_free(b->bio, b->c);
-	b->bio = NULL;
-
-	mutex_lock(&b->c->fill_lock);
+	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;

-	if (btree_node_io_error(b) ||
-	    !i->seq)
+	if (!i->seq)
 		goto err;

 	for (;
@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl)
 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(b);
 out:
-
-	mutex_unlock(&b->c->fill_lock);
-
-	spin_lock(&b->c->btree_read_time_lock);
-	bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
-	spin_unlock(&b->c->btree_read_time_lock);
-
-	smp_wmb(); /* read_done is our write lock */
-	set_btree_node_read_done(b);
-
-	closure_return(cl);
+	mempool_free(iter, b->c->fill_iter);
+	return;
 err:
 	set_btree_node_io_error(b);
 	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
@ -247,48 +212,69 @@ err:
 	goto out;
 }

-void bch_btree_read(struct btree *b)
+static void btree_node_read_endio(struct bio *bio, int error)
 {
-	BUG_ON(b->nsets || b->written);
+	struct closure *cl = bio->bi_private;
+	closure_put(cl);
+}

-	if (!closure_trylock(&b->io.cl, &b->c->cl))
-		BUG();
+void bch_btree_node_read(struct btree *b)
+{
+	uint64_t start_time = local_clock();
+	struct closure cl;
+	struct bio *bio;

-	b->io_start_time = local_clock();
+	trace_bcache_btree_read(b);

-	btree_bio_init(b);
-	b->bio->bi_rw	= REQ_META|READ_SYNC;
-	b->bio->bi_size	= KEY_SIZE(&b->key) << 9;
+	closure_init_stack(&cl);

-	bch_bio_map(b->bio, b->sets[0].data);
+	bio = bch_bbio_alloc(b->c);
+	bio->bi_rw	= REQ_META|READ_SYNC;
+	bio->bi_size	= KEY_SIZE(&b->key) << 9;
+	bio->bi_end_io	= btree_node_read_endio;
+	bio->bi_private	= &cl;

-	pr_debug("%s", pbtree(b));
-	trace_bcache_btree_read(b->bio);
-	bch_submit_bbio(b->bio, b->c, &b->key, 0);
+	bch_bio_map(bio, b->sets[0].data);

-	continue_at(&b->io.cl, bch_btree_read_done, system_wq);
+	bch_submit_bbio(bio, b->c, &b->key, 0);
+	closure_sync(&cl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		set_btree_node_io_error(b);
+
+	bch_bbio_free(bio, b->c);
+
+	if (btree_node_io_error(b))
+		goto err;
+
+	bch_btree_node_read_done(b);
+
+	spin_lock(&b->c->btree_read_time_lock);
+	bch_time_stats_update(&b->c->btree_read_time, start_time);
+	spin_unlock(&b->c->btree_read_time_lock);
+
+	return;
+err:
+	bch_cache_set_error(b->c, "io error reading bucket %lu",
+			    PTR_BUCKET_NR(b->c, &b->key, 0));
 }

 static void btree_complete_write(struct btree *b, struct btree_write *w)
 {
 	if (w->prio_blocked &&
 	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
-		wake_up(&b->c->alloc_wait);
+		wake_up_allocators(b->c);

 	if (w->journal) {
 		atomic_dec_bug(w->journal);
 		__closure_wake_up(&b->c->journal.wait);
 	}

-	if (w->owner)
-		closure_put(w->owner);
-
 	w->prio_blocked	= 0;
 	w->journal	= NULL;
-	w->owner	= NULL;
 }

-static void __btree_write_done(struct closure *cl)
+static void __btree_node_write_done(struct closure *cl)
 {
 	struct btree *b = container_of(cl, struct btree, io.cl);
 	struct btree_write *w = btree_prev_write(b);
@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl)
 	closure_return(cl);
 }

-static void btree_write_done(struct closure *cl)
+static void btree_node_write_done(struct closure *cl)
 {
 	struct btree *b = container_of(cl, struct btree, io.cl);
 	struct bio_vec *bv;
@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl)
 	__bio_for_each_segment(bv, b->bio, n, 0)
 		__free_page(bv->bv_page);

-	__btree_write_done(cl);
+	__btree_node_write_done(cl);
 }

-static void do_btree_write(struct btree *b)
+static void btree_node_write_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	struct btree *b = container_of(cl, struct btree, io.cl);
+
+	if (error)
+		set_btree_node_io_error(b);
+
+	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+	closure_put(cl);
+}
+
+static void do_btree_node_write(struct btree *b)
 {
 	struct closure *cl = &b->io.cl;
 	struct bset *i = b->sets[b->nsets].data;
@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b)
 	i->version	= BCACHE_BSET_VERSION;
 	i->csum		= btree_csum_set(b, i);

-	btree_bio_init(b);
-	b->bio->bi_rw	= REQ_META|WRITE_SYNC;
+	BUG_ON(b->bio);
+	b->bio = bch_bbio_alloc(b->c);
+
+	b->bio->bi_end_io	= btree_node_write_endio;
+	b->bio->bi_private	= &b->io.cl;
+	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA;
 	b->bio->bi_size		= set_blocks(i, b->c) * block_bytes(b->c);
 	bch_bio_map(b->bio, i);

+	/*
+	 * If we're appending to a leaf node, we don't technically need FUA -
+	 * this write just needs to be persisted before the next journal write,
+	 * which will be marked FLUSH|FUA.
+	 *
+	 * Similarly if we're writing a new btree root - the pointer is going to
+	 * be in the next journal entry.
+	 *
+	 * But if we're writing a new btree node (that isn't a root) or
+	 * appending to a non leaf btree node, we need either FUA or a flush
+	 * when we write the parent with the new pointer. FUA is cheaper than a
+	 * flush, and writes appending to leaf nodes aren't blocking anything so
+	 * just make all btree node writes FUA to keep things sane.
+	 */
+
 	bkey_copy(&k.key, &b->key);
 	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));

-	if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
+	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
 		int j;
 		struct bio_vec *bv;
 		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b)
 			memcpy(page_address(bv->bv_page),
 			       base + j * PAGE_SIZE, PAGE_SIZE);

-		trace_bcache_btree_write(b->bio);
 		bch_submit_bbio(b->bio, b->c, &k.key, 0);

-		continue_at(cl, btree_write_done, NULL);
+		continue_at(cl, btree_node_write_done, NULL);
 	} else {
 		b->bio->bi_vcnt = 0;
 		bch_bio_map(b->bio, i);

-		trace_bcache_btree_write(b->bio);
 		bch_submit_bbio(b->bio, b->c, &k.key, 0);

 		closure_sync(cl);
-		__btree_write_done(cl);
+		__btree_node_write_done(cl);
 	}
 }

-static void __btree_write(struct btree *b)
+void bch_btree_node_write(struct btree *b, struct closure *parent)
 {
 	struct bset *i = b->sets[b->nsets].data;

-	BUG_ON(current->bio_list);
+	trace_bcache_btree_write(b);
+
+	BUG_ON(current->bio_list);
+	BUG_ON(b->written >= btree_blocks(b));
+	BUG_ON(b->written && !i->keys);
+	BUG_ON(b->sets->data->seq != i->seq);
+	bch_check_key_order(b, i);

-	closure_lock(&b->io, &b->c->cl);
 	cancel_delayed_work(&b->work);

+	/* If caller isn't waiting for write, parent refcount is cache set */
+	closure_lock(&b->io, parent ?: &b->c->cl);
+
 	clear_bit(BTREE_NODE_dirty,	 &b->flags);
 	change_bit(BTREE_NODE_write_idx, &b->flags);

-	bch_check_key_order(b, i);
-	BUG_ON(b->written && !i->keys);
-
-	do_btree_write(b);
-
-	pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
+	do_btree_node_write(b);

 	b->written += set_blocks(i, b->c);
 	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
@ -387,37 +405,31 @@ static void __btree_write(struct btree *b)
 		bch_bset_init_next(b);
 }

-static void btree_write_work(struct work_struct *w)
+static void btree_node_write_work(struct work_struct *w)
 {
 	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

-	down_write(&b->lock);
+	rw_lock(true, b, b->level);

 	if (btree_node_dirty(b))
-		__btree_write(b);
-	up_write(&b->lock);
+		bch_btree_node_write(b, NULL);
+	rw_unlock(true, b);
 }

-void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
+static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
 {
 	struct bset *i = b->sets[b->nsets].data;
 	struct btree_write *w = btree_current_write(b);

-	BUG_ON(b->written &&
-	       (b->written >= btree_blocks(b) ||
-		i->seq != b->sets[0].data->seq ||
-		!i->keys));
+	BUG_ON(!b->written);
+	BUG_ON(!i->keys);
+
+	if (!btree_node_dirty(b))
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);

-	if (!btree_node_dirty(b)) {
 	set_btree_node_dirty(b);
-		queue_delayed_work(btree_io_wq, &b->work,
-				   msecs_to_jiffies(30000));
-	}

-	w->prio_blocked += b->prio_blocked;
-	b->prio_blocked = 0;
-
-	if (op && op->journal && !b->level) {
+	if (op && op->journal) {
 		if (w->journal &&
 		    journal_pin_cmp(b->c, w, op)) {
 			atomic_dec_bug(w->journal);
@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
 		}
 	}

-	if (current->bio_list)
-		return;
-
 	/* Force write if set is too big */
-	if (now ||
-	    b->level ||
-	    set_bytes(i) > PAGE_SIZE - 48) {
-		if (op && now) {
-			/* Must wait on multiple writes */
-			BUG_ON(w->owner);
-			w->owner = &op->cl;
-			closure_get(&op->cl);
-		}
-
-		__btree_write(b);
-	}
-	BUG_ON(!b->written);
+	if (set_bytes(i) > PAGE_SIZE - 48 &&
+	    !current->bio_list)
+		bch_btree_node_write(b, NULL);
 }

 /*
@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 	init_rwsem(&b->lock);
 	lockdep_set_novalidate_class(&b->lock);
 	INIT_LIST_HEAD(&b->list);
-	INIT_DELAYED_WORK(&b->work, btree_write_work);
+	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
 	b->c = c;
 	closure_init_unlocked(&b->io);

@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
 	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);

 	if (cl && btree_node_dirty(b))
-		bch_btree_write(b, true, NULL);
+		bch_btree_node_write(b, NULL);

 	if (cl)
 		closure_wait_event_async(&b->io.wait, cl,
@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	else if (!mutex_trylock(&c->bucket_lock))
 		return -1;

+	/*
+	 * It's _really_ critical that we don't free too many btree nodes - we
+	 * have to always leave ourselves a reserve. The reserve is how we
+	 * guarantee that allocating memory for a new btree node can always
+	 * succeed, so that inserting keys into the btree can always succeed and
+	 * IO can always make forward progress:
+	 */
 	nr /= c->btree_pages;
 	nr = min_t(unsigned long, nr, mca_can_free(c));

@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
 	int ret = -ENOMEM;
 	struct btree *i;

+	trace_bcache_btree_cache_cannibalize(c);
+
 	if (!cl)
 		return ERR_PTR(-ENOMEM);

@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
 		return ERR_PTR(-EAGAIN);
 	}

-	/* XXX: tracepoint */
 	c->try_harder = cl;
 	c->try_harder_start = local_clock();
 retry:
@ -905,6 +912,9 @@ retry:
 	b = mca_find(c, k);

 	if (!b) {
+		if (current->bio_list)
+			return ERR_PTR(-EAGAIN);
+
 		mutex_lock(&c->bucket_lock);
 		b = mca_alloc(c, k, level, &op->cl);
 		mutex_unlock(&c->bucket_lock);
@ -914,7 +924,7 @@ retry:
 		if (IS_ERR(b))
 			return b;

-		bch_btree_read(b);
+		bch_btree_node_read(b);

 		if (!write)
 			downgrade_write(&b->lock);
@ -937,14 +947,11 @@ retry:
 	for (; i <= b->nsets; i++)
 		prefetch(b->sets[i].data);

-	if (!closure_wait_event(&b->io.wait, &op->cl,
-				btree_node_read_done(b))) {
+	if (btree_node_io_error(b)) {
 		rw_unlock(write, b);
-		b = ERR_PTR(-EAGAIN);
-	} else if (btree_node_io_error(b)) {
-		rw_unlock(write, b);
-		b = ERR_PTR(-EIO);
-	} else
+		return ERR_PTR(-EIO);
+	}
+
 	BUG_ON(!b->written);

 	return b;
@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 	mutex_unlock(&c->bucket_lock);

 	if (!IS_ERR_OR_NULL(b)) {
-		bch_btree_read(b);
+		bch_btree_node_read(b);
 		rw_unlock(true, b);
 	}
 }
@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
 {
 	unsigned i;

+	trace_bcache_btree_node_free(b);
+
 	/*
 	 * The BUG_ON() in btree_node_get() implies that we must have a write
 	 * lock on parent to free or even invalidate a node
 	 */
 	BUG_ON(op->lock <= b->level);
 	BUG_ON(b == b->c->root);
-	pr_debug("bucket %s", pbtree(b));

 	if (btree_node_dirty(b))
 		btree_complete_write(b, btree_current_write(b));
 	clear_bit(BTREE_NODE_dirty, &b->flags);

-	if (b->prio_blocked &&
-	    !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
-		wake_up(&b->c->alloc_wait);
-
-	b->prio_blocked = 0;
-
 	cancel_delayed_work(&b->work);

 	mutex_lock(&b->c->bucket_lock);
@ -1028,17 +1030,20 @@ retry:
 		goto retry;
 	}

-	set_btree_node_read_done(b);
 	b->accessed = 1;
 	bch_bset_init_next(b);

 	mutex_unlock(&c->bucket_lock);
+
+	trace_bcache_btree_node_alloc(b);
 	return b;
 err_free:
 	bch_bucket_free(c, &k.key);
 	__bkey_put(c, &k.key);
 err:
 	mutex_unlock(&c->bucket_lock);
+
+	trace_bcache_btree_node_alloc_fail(b);
 	return b;
 }

@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
 		gc->nkeys++;

 		gc->data += KEY_SIZE(k);
-		if (KEY_DIRTY(k)) {
+		if (KEY_DIRTY(k))
 			gc->dirty += KEY_SIZE(k);
-			if (d)
-				d->sectors_dirty_gc += KEY_SIZE(k);
-		}
 	}

 	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,

 	if (!IS_ERR_OR_NULL(n)) {
 		swap(b, n);
+		__bkey_put(b->c, &b->key);

 		memcpy(k->ptr, b->key.ptr,
 		       sizeof(uint64_t) * KEY_PTRS(&b->key));

-		__bkey_put(b->c, &b->key);
-		atomic_inc(&b->c->prio_blocked);
-		b->prio_blocked++;
-
 		btree_node_free(n, op);
 		up_write(&n->lock);
 	}
@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	btree_node_free(r->b, op);
 	up_write(&r->b->lock);

-	pr_debug("coalesced %u nodes", nodes);
+	trace_bcache_btree_gc_coalesce(nodes);

 	gc->nodes--;
 	nodes--;
@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 	void write(struct btree *r)
 	{
 		if (!r->written)
-			bch_btree_write(r, true, op);
-		else if (btree_node_dirty(r)) {
-			BUG_ON(btree_current_write(r)->owner);
-			btree_current_write(r)->owner = writes;
-			closure_get(writes);
-
-			bch_btree_write(r, true, NULL);
-		}
+			bch_btree_node_write(r, &op->cl);
+		else if (btree_node_dirty(r))
+			bch_btree_node_write(r, writes);

 		up_write(&r->lock);
 	}
@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 		ret = btree_gc_recurse(b, op, writes, gc);

 	if (!b->written || btree_node_dirty(b)) {
-		atomic_inc(&b->c->prio_blocked);
-		b->prio_blocked++;
-		bch_btree_write(b, true, n ? op : NULL);
+		bch_btree_node_write(b, n ? &op->cl : NULL);
 	}

 	if (!IS_ERR_OR_NULL(n)) {
@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c)
 {
 	struct cache *ca;
 	struct bucket *b;
-	struct bcache_device **d;
 	unsigned i;

 	if (!c->gc_mark_valid)
@ -1423,12 +1414,6 @@ static void btree_gc_start(struct cache_set *c)
 				SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
 		}

-	for (d = c->devices;
-	     d < c->devices + c->nr_uuids;
-	     d++)
-		if (*d)
-			(*d)->sectors_dirty_gc = 0;
-
 	mutex_unlock(&c->bucket_lock);
 }

@ -1437,7 +1422,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 	size_t available = 0;
 	struct bucket *b;
 	struct cache *ca;
-	struct bcache_device **d;
 	unsigned i;

 	mutex_lock(&c->bucket_lock);
@ -1480,22 +1464,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 		}
 	}

-	for (d = c->devices;
-	     d < c->devices + c->nr_uuids;
-	     d++)
-		if (*d) {
-			unsigned long last =
-				atomic_long_read(&((*d)->sectors_dirty));
-			long difference = (*d)->sectors_dirty_gc - last;
-
-			pr_debug("sectors dirty off by %li", difference);
-
-			(*d)->sectors_dirty_last += difference;
-
-			atomic_long_set(&((*d)->sectors_dirty),
-					(*d)->sectors_dirty_gc);
-		}
-
 	mutex_unlock(&c->bucket_lock);
 	return available;
 }
@ -1508,10 +1476,9 @@ static void bch_btree_gc(struct closure *cl)
 	struct gc_stat stats;
 	struct closure writes;
 	struct btree_op op;
-
 	uint64_t start_time = local_clock();
-	trace_bcache_gc_start(c->sb.set_uuid);
-	blktrace_msg_all(c, "Starting gc");
+
+	trace_bcache_gc_start(c);

 	memset(&stats, 0, sizeof(struct gc_stat));
 	closure_init_stack(&writes);
@ -1520,14 +1487,14 @@ static void bch_btree_gc(struct closure *cl)

 	btree_gc_start(c);

+	atomic_inc(&c->prio_blocked);
+
 	ret = btree_root(gc_root, c, &op, &writes, &stats);
 	closure_sync(&op.cl);
 	closure_sync(&writes);

 	if (ret) {
-		blktrace_msg_all(c, "Stopped gc");
 		pr_warn("gc failed!");
-
 		continue_at(cl, bch_btree_gc, bch_gc_wq);
 	}

@ -1537,6 +1504,9 @@ static void bch_btree_gc(struct closure *cl)

 	available = bch_btree_gc_finish(c);

+	atomic_dec(&c->prio_blocked);
+	wake_up_allocators(c);
+
 	bch_time_stats_update(&c->btree_gc_time, start_time);

 	stats.key_bytes *= sizeof(uint64_t);
@ -1544,10 +1514,8 @@ static void bch_btree_gc(struct closure *cl)
 	stats.data	<<= 9;
 	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
 	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
-	blktrace_msg_all(c, "Finished gc");

-	trace_bcache_gc_end(c->sb.set_uuid);
-	wake_up(&c->alloc_wait);
+	trace_bcache_gc_end(c);

 	continue_at(cl, bch_moving_gc, bch_gc_wq);
 }
@ -1654,14 +1622,14 @@ static bool fix_overlapping_extents(struct btree *b,
 				    struct btree_iter *iter,
 				    struct btree_op *op)
 {
-	void subtract_dirty(struct bkey *k, int sectors)
+	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
 	{
-		struct bcache_device *d = b->c->devices[KEY_INODE(k)];
-
-		if (KEY_DIRTY(k) && d)
-			atomic_long_sub(sectors, &d->sectors_dirty);
+		if (KEY_DIRTY(k))
+			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+						     offset, -sectors);
 	}

+	uint64_t old_offset;
 	unsigned old_size, sectors_found = 0;

 	while (1) {
@ -1673,6 +1641,7 @@ static bool fix_overlapping_extents(struct btree *b,
 		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
 			continue;

+		old_offset = KEY_START(k);
 		old_size = KEY_SIZE(k);

 		/*
@ -1728,7 +1697,7 @@ static bool fix_overlapping_extents(struct btree *b,

 			struct bkey *top;

-			subtract_dirty(k, KEY_SIZE(insert));
+			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));

 			if (bkey_written(b, k)) {
 				/*
@ -1775,7 +1744,7 @@ static bool fix_overlapping_extents(struct btree *b,
 			}
 		}

-		subtract_dirty(k, old_size - KEY_SIZE(k));
+		subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
 	}

 check_failed:
@ -1798,7 +1767,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 {
 	struct bset *i = b->sets[b->nsets].data;
 	struct bkey *m, *prev;
-	const char *status = "insert";
+	unsigned status = BTREE_INSERT_STATUS_INSERT;

 	BUG_ON(bkey_cmp(k, &b->key) > 0);
 	BUG_ON(b->level && !KEY_PTRS(k));
@ -1831,17 +1800,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 			goto insert;

 		/* prev is in the tree, if we merge we're done */
-		status = "back merging";
+		status = BTREE_INSERT_STATUS_BACK_MERGE;
 		if (prev &&
 		    bch_bkey_try_merge(b, prev, k))
 			goto merged;

-		status = "overwrote front";
+		status = BTREE_INSERT_STATUS_OVERWROTE;
 		if (m != end(i) &&
 		    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
 			goto copy;

-		status = "front merge";
+		status = BTREE_INSERT_STATUS_FRONT_MERGE;
 		if (m != end(i) &&
 		    bch_bkey_try_merge(b, k, m))
 			goto copy;
@ -1851,21 +1820,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 insert:	shift_keys(b, m, k);
 copy:	bkey_copy(m, k);
 merged:
-	bch_check_keys(b, "%s for %s at %s: %s", status,
-		       op_type(op), pbtree(b), pkey(k));
-	bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status,
-				op_type(op), pbtree(b), pkey(k));
+	if (KEY_DIRTY(k))
+		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+					     KEY_START(k), KEY_SIZE(k));
+
+	bch_check_keys(b, "%u for %s", status, op_type(op));

 	if (b->level && !KEY_OFFSET(k))
-		b->prio_blocked++;
+		btree_current_write(b)->prio_blocked++;

-	pr_debug("%s for %s at %s: %s", status,
-		 op_type(op), pbtree(b), pkey(k));
+	trace_bcache_btree_insert_key(b, k, op->type, status);

 	return true;
 }

-bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
+static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
 {
 	bool ret = false;
 	struct bkey *k;
@ -1896,7 +1865,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 	    should_split(b))
 		goto out;

-	op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));
+	op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));

 	SET_KEY_PTRS(&op->replace, 1);
 	get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
@ -1907,7 +1876,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,

 	BUG_ON(op->type != BTREE_INSERT);
 	BUG_ON(!btree_insert_key(b, op, &tmp.k));
-	bch_btree_write(b, false, NULL);
 	ret = true;
 out:
 	downgrade_write(&b->lock);
@ -1929,12 +1897,11 @@ static int btree_split(struct btree *b, struct btree_op *op)

 	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;

-	pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
-		 pbtree(b), n1->sets[0].data->keys);
-
 	if (split) {
 		unsigned keys = 0;

+		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
+
 		n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
 		if (IS_ERR(n2))
 			goto err_free1;
@ -1967,18 +1934,21 @@ static int btree_split(struct btree *b, struct btree_op *op)
 		bkey_copy_key(&n2->key, &b->key);

 		bch_keylist_add(&op->keys, &n2->key);
-		bch_btree_write(n2, true, op);
+		bch_btree_node_write(n2, &op->cl);
 		rw_unlock(true, n2);
-	} else
+	} else {
+		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
+
 		bch_btree_insert_keys(n1, op);
+	}

 	bch_keylist_add(&op->keys, &n1->key);
-	bch_btree_write(n1, true, op);
+	bch_btree_node_write(n1, &op->cl);

 	if (n3) {
 		bkey_copy_key(&n3->key, &MAX_KEY);
 		bch_btree_insert_keys(n3, op);
-		bch_btree_write(n3, true, op);
+		bch_btree_node_write(n3, &op->cl);

 		closure_sync(&op->cl);
 		bch_btree_set_root(n3);
@ -2082,8 +2052,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,

 		BUG_ON(write_block(b) != b->sets[b->nsets].data);

-		if (bch_btree_insert_keys(b, op))
-			bch_btree_write(b, false, op);
+		if (bch_btree_insert_keys(b, op)) {
+			if (!b->level)
+				bch_btree_leaf_dirty(b, op);
+			else
+				bch_btree_node_write(b, &op->cl);
+		}
 	}

 	return 0;
@ -2140,6 +2114,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 void bch_btree_set_root(struct btree *b)
 {
 	unsigned i;
+	struct closure cl;
+
+	closure_init_stack(&cl);
+
+	trace_bcache_btree_set_root(b);

 	BUG_ON(!b->written);

@ -2153,8 +2132,8 @@ void bch_btree_set_root(struct btree *b)
 	b->c->root = b;
 	__bkey_put(b->c, &b->key);

-	bch_journal_meta(b->c, NULL);
-	pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
+	bch_journal_meta(b->c, &cl);
+	closure_sync(&cl);
 }

 /* Cache lookup */
@ -2215,9 +2194,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
 					 KEY_OFFSET(k) - bio->bi_sector);

 		n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
-		if (!n)
-			return -EAGAIN;
-
 		if (n == bio)
 			op->lookup_done = true;

@ -2240,7 +2216,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
 		n->bi_end_io	= bch_cache_read_endio;
 		n->bi_private	= &s->cl;

-		trace_bcache_cache_hit(n);
 		__bch_submit_bbio(n, b->c);
 	}

@ -2257,9 +2232,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
 	struct btree_iter iter;
 	bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));

-	pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
-		 (uint64_t) bio->bi_sector);
-
 	do {
 		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
 		if (!k) {
@ -2303,7 +2275,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
 }

 static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
-				   struct keybuf *buf, struct bkey *end)
+				   struct keybuf *buf, struct bkey *end,
+				   keybuf_pred_fn *pred)
 {
 	struct btree_iter iter;
 	bch_btree_iter_init(b, &iter, &buf->last_scanned);
@ -2322,11 +2295,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
 			if (bkey_cmp(&buf->last_scanned, end) >= 0)
 				break;

-			if (buf->key_predicate(buf, k)) {
+			if (pred(buf, k)) {
 				struct keybuf_key *w;

-				pr_debug("%s", pkey(k));
-
 				spin_lock(&buf->lock);

 				w = array_alloc(&buf->freelist);
@ -2343,7 +2314,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
 			if (!k)
 				break;

-			btree(refill_keybuf, k, b, op, buf, end);
+			btree(refill_keybuf, k, b, op, buf, end, pred);
 			/*
 			 * Might get an error here, but can't really do anything
 			 * and it'll get logged elsewhere. Just read what we
@ -2361,7 +2332,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
 }

 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
-			  struct bkey *end)
+		       struct bkey *end, keybuf_pred_fn *pred)
 {
 	struct bkey start = buf->last_scanned;
 	struct btree_op op;
@ -2369,7 +2340,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,

 	cond_resched();

-	btree_root(refill_keybuf, c, &op, buf, end);
+	btree_root(refill_keybuf, c, &op, buf, end, pred);
 	closure_sync(&op.cl);

 	pr_debug("found %s keys from %llu:%llu to %llu:%llu",
@ -2455,7 +2426,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)

 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
 					     struct keybuf *buf,
-					     struct bkey *end)
+					     struct bkey *end,
+					     keybuf_pred_fn *pred)
 {
 	struct keybuf_key *ret;

@ -2469,15 +2441,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
 			break;
 		}

-		bch_refill_keybuf(c, buf, end);
+		bch_refill_keybuf(c, buf, end, pred);
 	}

 	return ret;
 }

-void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn)
+void bch_keybuf_init(struct keybuf *buf)
 {
-	buf->key_predicate	= fn;
 	buf->last_scanned	= MAX_KEY;
 	buf->keys		= RB_ROOT;

--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@ -102,7 +102,6 @@
 #include "debug.h"

 struct btree_write {
-	struct closure		*owner;
 	atomic_t		*journal;

 	/* If btree_split() frees a btree node, it writes a new pointer to that
@ -142,16 +141,12 @@ struct btree {
 	 */
 	struct bset_tree	sets[MAX_BSETS];

-	/* Used to refcount bio splits, also protects b->bio */
+	/* For outstanding btree writes, used as a lock - protects write_idx */
 	struct closure_with_waitlist	io;

-	/* Gets transferred to w->prio_blocked - see the comment there */
-	int			prio_blocked;
-
 	struct list_head	list;
 	struct delayed_work	work;

-	uint64_t		io_start_time;
 	struct btree_write	writes[2];
 	struct bio		*bio;
 };
@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b)		\
 {	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\

 enum btree_flags {
-	BTREE_NODE_read_done,
 	BTREE_NODE_io_error,
 	BTREE_NODE_dirty,
 	BTREE_NODE_write_idx,
 };

-BTREE_FLAG(read_done);
 BTREE_FLAG(io_error);
 BTREE_FLAG(dirty);
 BTREE_FLAG(write_idx);
@ -278,6 +271,13 @@ struct btree_op {
 	BKEY_PADDED(replace);
 };

+enum {
+	BTREE_INSERT_STATUS_INSERT,
+	BTREE_INSERT_STATUS_BACK_MERGE,
+	BTREE_INSERT_STATUS_OVERWROTE,
+	BTREE_INSERT_STATUS_FRONT_MERGE,
+};
+
 void bch_btree_op_init_stack(struct btree_op *);

 static inline void rw_lock(bool w, struct btree *b, int level)
@ -293,9 +293,7 @@ static inline void rw_unlock(bool w, struct btree *b)
 #ifdef CONFIG_BCACHE_EDEBUG
 	unsigned i;

-	if (w &&
-	    b->key.ptr[0] &&
-	    btree_node_read_done(b))
+	if (w && b->key.ptr[0])
 		for (i = 0; i <= b->nsets; i++)
 			bch_check_key_order(b, b->sets[i].data);
 #endif
@ -370,9 +368,8 @@ static inline bool should_split(struct btree *b)
 		 > btree_blocks(b));
 }

-void bch_btree_read_done(struct closure *);
-void bch_btree_read(struct btree *);
-void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
+void bch_btree_node_read(struct btree *);
+void bch_btree_node_write(struct btree *, struct closure *);

 void bch_cannibalize_unlock(struct cache_set *, struct closure *);
 void bch_btree_set_root(struct btree *);
@ -380,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
 struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
 				int, struct btree_op *);

-bool bch_btree_insert_keys(struct btree *, struct btree_op *);
 bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
 				   struct bio *);
 int bch_btree_insert(struct btree_op *, struct cache_set *);
@ -393,13 +389,14 @@ void bch_moving_gc(struct closure *);
 int bch_btree_check(struct cache_set *, struct btree_op *);
 uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);

-void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *);
-void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *);
+void bch_keybuf_init(struct keybuf *);
+void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *,
+		       keybuf_pred_fn *);
 bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
 				  struct bkey *);
 void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
 struct keybuf_key *bch_keybuf_next(struct keybuf *);
-struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
-					  struct keybuf *, struct bkey *);
+struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
+					  struct bkey *, keybuf_pred_fn *);

 #endif
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
 	return "";
 }

-struct keyprint_hack bch_pkey(const struct bkey *k)
+int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
 {
 	unsigned i = 0;
-	struct keyprint_hack r;
-	char *out = r.s, *end = r.s + KEYHACK_SIZE;
+	char *out = buf, *end = buf + size;

 #define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__))

@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k)
 	if (KEY_CSUM(k))
 		p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
 #undef p
-	return r;
+	return out - buf;
 }

-struct keyprint_hack bch_pbtree(const struct btree *b)
+int bch_btree_to_text(char *buf, size_t size, const struct btree *b)
 {
-	struct keyprint_hack r;
-
-	snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0),
+	return scnprintf(buf, size, "%zu level %i/%i",
+			 PTR_BUCKET_NR(b->c, &b->key, 0),
 			 b->level, b->c->root ? b->c->root->level : -1);
-	return r;
 }

 #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i)
 {
 	struct bkey *k;
 	unsigned j;
+	char buf[80];

 	for (k = i->start; k < end(i); k = bkey_next(k)) {
+		bch_bkey_to_text(buf, sizeof(buf), k);
 		printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
-		       (uint64_t *) k - i->d, i->keys, pkey(k));
+		       (uint64_t *) k - i->d, i->keys, buf);

 		for (j = 0; j < KEY_PTRS(k); j++) {
 			size_t n = PTR_BUCKET_NR(b->c, k, j);
@ -144,7 +143,7 @@ void bch_btree_verify(struct btree *b, struct bset *new)
 	v->written = 0;
 	v->level = b->level;

-	bch_btree_read(v);
+	bch_btree_node_read(v);
 	closure_wait_event(&v->io.wait, &cl,
 			   atomic_read(&b->io.cl.remaining) == -1);

@ -200,7 +199,7 @@ void bch_data_verify(struct search *s)
 	if (!check)
 		return;

-	if (bch_bio_alloc_pages(check, GFP_NOIO))
+	if (bio_alloc_pages(check, GFP_NOIO))
 		goto out_put;

 	check->bi_rw		= READ_SYNC;
@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
 				   va_list args)
 {
 	unsigned i;
+	char buf[80];

 	console_lock();

@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,

 	console_unlock();

-	panic("at %s\n", pbtree(b));
+	bch_btree_to_text(buf, sizeof(buf), b);
+	panic("at %s\n", buf);
 }

 void bch_check_key_order_msg(struct btree *b, struct bset *i,
@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
 {
 	struct dump_iterator *i = file->private_data;
 	ssize_t ret = 0;
+	char kbuf[80];

 	while (size) {
 		struct keybuf_key *w;
@ -355,11 +357,12 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
 		if (i->bytes)
 			break;

-		w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY);
+		w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
 		if (!w)
 			break;

-		i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key));
+		bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
+		i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
 		bch_keybuf_del(&i->keys, w);
 	}

@ -377,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file)

 	file->private_data = i;
 	i->c = c;
-	bch_keybuf_init(&i->keys, dump_pred);
+	bch_keybuf_init(&i->keys);
 	i->keys.last_scanned = KEY(0, 0, 0);

 	return 0;
@ -409,142 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c)

 #endif

-/* Fuzz tester has rotted: */
-#if 0
-
-static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
-			  const char *buffer, size_t size)
-{
-	void dump(struct btree *b)
-	{
-		struct bset *i;
-
-		for (i = b->sets[0].data;
-		     index(i, b) < btree_blocks(b) &&
-		     i->seq == b->sets[0].data->seq;
-		     i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
-			dump_bset(b, i);
-	}
-
-	struct cache_sb *sb;
-	struct cache_set *c;
-	struct btree *all[3], *b, *fill, *orig;
-	int j;
-
-	struct btree_op op;
-	bch_btree_op_init_stack(&op);
-
-	sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
-	if (!sb)
-		return -ENOMEM;
-
-	sb->bucket_size = 128;
-	sb->block_size = 4;
-
-	c = bch_cache_set_alloc(sb);
-	if (!c)
-		return -ENOMEM;
-
-	for (j = 0; j < 3; j++) {
-		BUG_ON(list_empty(&c->btree_cache));
-		all[j] = list_first_entry(&c->btree_cache, struct btree, list);
-		list_del_init(&all[j]->list);
-
-		all[j]->key = KEY(0, 0, c->sb.bucket_size);
-		bkey_copy_key(&all[j]->key, &MAX_KEY);
-	}
-
-	b = all[0];
-	fill = all[1];
-	orig = all[2];
-
-	while (1) {
-		for (j = 0; j < 3; j++)
-			all[j]->written = all[j]->nsets = 0;
-
-		bch_bset_init_next(b);
-
-		while (1) {
-			struct bset *i = write_block(b);
-			struct bkey *k = op.keys.top;
-			unsigned rand;
-
-			bkey_init(k);
-			rand = get_random_int();
-
-			op.type = rand & 1
-				? BTREE_INSERT
-				: BTREE_REPLACE;
-			rand >>= 1;
-
-			SET_KEY_SIZE(k, bucket_remainder(c, rand));
-			rand >>= c->bucket_bits;
-			rand &= 1024 * 512 - 1;
-			rand += c->sb.bucket_size;
-			SET_KEY_OFFSET(k, rand);
-#if 0
-			SET_KEY_PTRS(k, 1);
-#endif
-			bch_keylist_push(&op.keys);
-			bch_btree_insert_keys(b, &op);
-
-			if (should_split(b) ||
-			    set_blocks(i, b->c) !=
-			    __set_blocks(i, i->keys + 15, b->c)) {
-				i->csum = csum_set(i);
-
-				memcpy(write_block(fill),
-				       i, set_bytes(i));
-
-				b->written += set_blocks(i, b->c);
-				fill->written = b->written;
-				if (b->written == btree_blocks(b))
-					break;
-
-				bch_btree_sort_lazy(b);
-				bch_bset_init_next(b);
-			}
-		}
-
-		memcpy(orig->sets[0].data,
-		       fill->sets[0].data,
-		       btree_bytes(c));
-
-		bch_btree_sort(b);
-		fill->written = 0;
-		bch_btree_read_done(&fill->io.cl);
-
-		if (b->sets[0].data->keys != fill->sets[0].data->keys ||
-		    memcmp(b->sets[0].data->start,
-			   fill->sets[0].data->start,
-			   b->sets[0].data->keys * sizeof(uint64_t))) {
-			struct bset *i = b->sets[0].data;
-			struct bkey *k, *l;
-
-			for (k = i->start,
-			     l = fill->sets[0].data->start;
-			     k < end(i);
-			     k = bkey_next(k), l = bkey_next(l))
-				if (bkey_cmp(k, l) ||
-				    KEY_SIZE(k) != KEY_SIZE(l))
-					pr_err("key %zi differs: %s != %s",
-					       (uint64_t *) k - i->d,
-					       pkey(k), pkey(l));
-
-			for (j = 0; j < 3; j++) {
-				pr_err("**** Set %i ****", j);
-				dump(all[j]);
-			}
-			panic("\n");
-		}
-
-		pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
-	}
-}
-
-kobj_attribute_write(fuzz, btree_fuzz);
-#endif
-
 void bch_debug_exit(void)
 {
 	if (!IS_ERR_OR_NULL(debug))
@ -554,11 +421,6 @@ void bch_debug_exit(void)
 int __init bch_debug_init(struct kobject *kobj)
 {
 	int ret = 0;
-#if 0
-	ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
-	if (ret)
-		return ret;
-#endif

 	debug = debugfs_create_dir("bcache", NULL);
 	return ret;
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@ -3,15 +3,8 @@

 /* Btree/bkey debug printing */

-#define KEYHACK_SIZE 80
-struct keyprint_hack {
-	char s[KEYHACK_SIZE];
-};
-
-struct keyprint_hack bch_pkey(const struct bkey *k);
-struct keyprint_hack bch_pbtree(const struct btree *b);
-#define pkey(k)		(&bch_pkey(k).s[0])
-#define pbtree(b)	(&bch_pbtree(b).s[0])
+int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
+int bch_btree_to_text(char *buf, size_t size, const struct btree *b);

 #ifdef CONFIG_BCACHE_EDEBUG

--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@ -9,6 +9,8 @@
 #include "bset.h"
 #include "debug.h"

+#include <linux/blkdev.h>
+
 static void bch_bi_idx_hack_endio(struct bio *bio, int error)
 {
 	struct bio *p = bio->bi_private;
@ -66,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio)
 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
 * bvec boundry; it is the caller's responsibility to ensure that @bio is not
 * freed before the split.
- *
- * If bch_bio_split() is running under generic_make_request(), it's not safe to
- * allocate more than one bio from the same bio set. Therefore, if it is running
- * under generic_make_request() it masks out __GFP_WAIT when doing the
- * allocation. The caller must check for failure if there's any possibility of
- * it being called from under generic_make_request(); it is then the caller's
- * responsibility to retry from a safe context (by e.g. punting to workqueue).
 */
 struct bio *bch_bio_split(struct bio *bio, int sectors,
 			  gfp_t gfp, struct bio_set *bs)
@ -83,20 +78,13 @@ struct bio *bch_bio_split(struct bio *bio, int sectors,

 	BUG_ON(sectors <= 0);

-	/*
-	 * If we're being called from underneath generic_make_request() and we
-	 * already allocated any bios from this bio set, we risk deadlock if we
-	 * use the mempool. So instead, we possibly fail and let the caller punt
-	 * to workqueue or somesuch and retry in a safe context.
-	 */
-	if (current->bio_list)
-		gfp &= ~__GFP_WAIT;
-
 	if (sectors >= bio_sectors(bio))
 		return bio;

 	if (bio->bi_rw & REQ_DISCARD) {
 		ret = bio_alloc_bioset(gfp, 1, bs);
+		if (!ret)
+			return NULL;
 		idx = 0;
 		goto out;
 	}
@ -160,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
 				      queue_max_segments(q));
-	struct bio_vec *bv, *end = bio_iovec(bio) +
-		min_t(int, bio_segments(bio), max_segments);

 	if (bio->bi_rw & REQ_DISCARD)
 		return min(ret, q->limits.max_discard_sectors);

 	if (bio_segments(bio) > max_segments ||
 	    q->merge_bvec_fn) {
+		struct bio_vec *bv;
+		int i, seg = 0;
+
 		ret = 0;

-		for (bv = bio_iovec(bio); bv < end; bv++) {
+		bio_for_each_segment(bv, bio, i) {
 			struct bvec_merge_data bvm = {
 				.bi_bdev	= bio->bi_bdev,
 				.bi_sector	= bio->bi_sector,
@ -178,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
 				.bi_rw		= bio->bi_rw,
 			};

+			if (seg == max_segments)
+				break;
+
 			if (q->merge_bvec_fn &&
 			    q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
 				break;

+			seg++;
 			ret += bv->bv_len >> 9;
 		}
 	}
@ -218,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error)
 	closure_put(cl);
 }

-static void __bch_bio_submit_split(struct closure *cl)
-{
-	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
-	struct bio *bio = s->bio, *n;
-
-	do {
-		n = bch_bio_split(bio, bch_bio_max_sectors(bio),
-				  GFP_NOIO, s->p->bio_split);
-		if (!n)
-			continue_at(cl, __bch_bio_submit_split, system_wq);
-
-		n->bi_end_io	= bch_bio_submit_split_endio;
-		n->bi_private	= cl;
-
-		closure_get(cl);
-		bch_generic_make_request_hack(n);
-	} while (n != bio);
-
-	continue_at(cl, bch_bio_submit_split_done, NULL);
-}
-
 void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
 {
 	struct bio_split_hook *s;
+	struct bio *n;

 	if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
 		goto submit;
@ -250,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
 		goto submit;

 	s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
+	closure_init(&s->cl, NULL);

 	s->bio		= bio;
 	s->p		= p;
@ -257,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
 	s->bi_private	= bio->bi_private;
 	bio_get(bio);

-	closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL);
-	return;
+	do {
+		n = bch_bio_split(bio, bch_bio_max_sectors(bio),
+				  GFP_NOIO, s->p->bio_split);
+
+		n->bi_end_io	= bch_bio_submit_split_endio;
+		n->bi_private	= &s->cl;
+
+		closure_get(&s->cl);
+		bch_generic_make_request_hack(n);
+	} while (n != bio);
+
+	continue_at(&s->cl, bch_bio_submit_split_done, NULL);
 submit:
 	bch_generic_make_request_hack(bio);
 }
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@ -9,6 +9,8 @@
 #include "debug.h"
 #include "request.h"

+#include <trace/events/bcache.h>
+
 /*
 * Journal replay/recovery:
 *
@ -300,7 +302,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 		for (k = i->j.start;
 		     k < end(&i->j);
 		     k = bkey_next(k)) {
-			pr_debug("%s", pkey(k));
+			trace_bcache_journal_replay_key(k);
+
 			bkey_copy(op->keys.top, k);
 			bch_keylist_push(&op->keys);

@ -384,7 +387,7 @@ out:
 		return;
 found:
 	if (btree_node_dirty(best))
-		bch_btree_write(best, true, NULL);
+		bch_btree_node_write(best, NULL);
 	rw_unlock(true, best);
 }

@ -617,7 +620,7 @@ static void journal_write_unlocked(struct closure *cl)
 		bio_reset(bio);
 		bio->bi_sector	= PTR_OFFSET(k, i);
 		bio->bi_bdev	= ca->bdev;
-		bio->bi_rw	= REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
+		bio->bi_rw	= REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
 		bio->bi_size	= sectors << 9;

 		bio->bi_end_io	= journal_write_endio;
@ -712,7 +715,8 @@ void bch_journal(struct closure *cl)
 	spin_lock(&c->journal.lock);

 	if (journal_full(&c->journal)) {
-		/* XXX: tracepoint */
+		trace_bcache_journal_full(c);
+
 		closure_wait(&c->journal.wait, cl);

 		journal_reclaim(c);
@ -728,13 +732,15 @@ void bch_journal(struct closure *cl)

 	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
 	    b > c->journal.blocks_free) {
-		/* XXX: If we were inserting so many keys that they won't fit in
+		trace_bcache_journal_entry_full(c);
+
+		/*
+		 * XXX: If we were inserting so many keys that they won't fit in
 		 * an _empty_ journal write, we'll deadlock. For now, handle
 		 * this in bch_keylist_realloc() - but something to think about.
 		 */
 		BUG_ON(!w->data->keys);

-		/* XXX: tracepoint */
 		BUG_ON(!closure_wait(&w->wait, cl));

 		closure_flush(&c->journal.io);
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@ -9,6 +9,8 @@
 #include "debug.h"
 #include "request.h"

+#include <trace/events/bcache.h>
+
 struct moving_io {
 	struct keybuf_key	*w;
 	struct search		s;
@ -44,14 +46,14 @@ static void write_moving_finish(struct closure *cl)
 {
 	struct moving_io *io = container_of(cl, struct moving_io, s.cl);
 	struct bio *bio = &io->bio.bio;
-	struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt);
+	struct bio_vec *bv;
+	int i;

-	while (bv-- != bio->bi_io_vec)
+	bio_for_each_segment_all(bv, bio, i)
 		__free_page(bv->bv_page);

-	pr_debug("%s %s", io->s.op.insert_collision
-		 ? "collision moving" : "moved",
-		 pkey(&io->w->key));
+	if (io->s.op.insert_collision)
+		trace_bcache_gc_copy_collision(&io->w->key);

 	bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);

@ -94,8 +96,6 @@ static void write_moving(struct closure *cl)
 	struct moving_io *io = container_of(s, struct moving_io, s);

 	if (!s->error) {
-		trace_bcache_write_moving(&io->bio.bio);
-
 		moving_init(io);

 		io->bio.bio.bi_sector	= KEY_START(&io->w->key);
@ -122,7 +122,6 @@ static void read_moving_submit(struct closure *cl)
 	struct moving_io *io = container_of(s, struct moving_io, s);
 	struct bio *bio = &io->bio.bio;

-	trace_bcache_read_moving(bio);
 	bch_submit_bbio(bio, s->op.c, &io->w->key, 0);

 	continue_at(cl, write_moving, bch_gc_wq);
@ -138,7 +137,8 @@ static void read_moving(struct closure *cl)
 	/* XXX: if we error, background writeback could stall indefinitely */

 	while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
-		w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
+		w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
+					   &MAX_KEY, moving_pred);
 		if (!w)
 			break;

@ -159,10 +159,10 @@ static void read_moving(struct closure *cl)
 		bio->bi_rw	= READ;
 		bio->bi_end_io	= read_moving_endio;

-		if (bch_bio_alloc_pages(bio, GFP_KERNEL))
+		if (bio_alloc_pages(bio, GFP_KERNEL))
 			goto err;

-		pr_debug("%s", pkey(&w->key));
+		trace_bcache_gc_copy(&w->key);

 		closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);

@ -250,5 +250,5 @@ void bch_moving_gc(struct closure *cl)

 void bch_moving_init_cache_set(struct cache_set *c)
 {
-	bch_keybuf_init(&c->moving_gc_keys, moving_pred);
+	bch_keybuf_init(&c->moving_gc_keys);
 }
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@ -10,6 +10,7 @@
 #include "btree.h"
 #include "debug.h"
 #include "request.h"
+#include "writeback.h"

 #include <linux/cgroup.h>
 #include <linux/module.h>
@ -21,8 +22,6 @@

 #define CUTOFF_CACHE_ADD	95
 #define CUTOFF_CACHE_READA	90
-#define CUTOFF_WRITEBACK	50
-#define CUTOFF_WRITEBACK_SYNC	75

 struct kmem_cache *bch_search_cache;

@ -510,10 +509,6 @@ static void bch_insert_data_loop(struct closure *cl)
 			goto err;

 		n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
-		if (!n) {
-			__bkey_put(op->c, k);
-			continue_at(cl, bch_insert_data_loop, bcache_wq);
-		}

 		n->bi_end_io	= bch_insert_data_endio;
 		n->bi_private	= cl;
@ -530,10 +525,9 @@ static void bch_insert_data_loop(struct closure *cl)
 		if (KEY_CSUM(k))
 			bio_csum(n, k);

-		pr_debug("%s", pkey(k));
+		trace_bcache_cache_insert(k);
 		bch_keylist_push(&op->keys);

-		trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
 		n->bi_rw |= REQ_WRITE;
 		bch_submit_bbio(n, op->c, k, 0);
 	} while (n != bio);
@ -784,11 +778,8 @@ static void request_read_error(struct closure *cl)
 	int i;

 	if (s->recoverable) {
-		/* The cache read failed, but we can retry from the backing
-		 * device.
-		 */
-		pr_debug("recovering at sector %llu",
-			 (uint64_t) s->orig_bio->bi_sector);
+		/* Retry from the backing device: */
+		trace_bcache_read_retry(s->orig_bio);

 		s->error = 0;
 		bv = s->bio.bio.bi_io_vec;
@ -806,7 +797,6 @@ static void request_read_error(struct closure *cl)

 		/* XXX: invalidate cache */

-		trace_bcache_read_retry(&s->bio.bio);
 		closure_bio_submit(&s->bio.bio, &s->cl, s->d);
 	}

@ -827,53 +817,13 @@ static void request_read_done(struct closure *cl)
 	 */

 	if (s->op.cache_bio) {
-		struct bio_vec *src, *dst;
-		unsigned src_offset, dst_offset, bytes;
-		void *dst_ptr;
-
 		bio_reset(s->op.cache_bio);
 		s->op.cache_bio->bi_sector	= s->cache_miss->bi_sector;
 		s->op.cache_bio->bi_bdev	= s->cache_miss->bi_bdev;
 		s->op.cache_bio->bi_size	= s->cache_bio_sectors << 9;
 		bch_bio_map(s->op.cache_bio, NULL);

-		src = bio_iovec(s->op.cache_bio);
-		dst = bio_iovec(s->cache_miss);
-		src_offset = src->bv_offset;
-		dst_offset = dst->bv_offset;
-		dst_ptr = kmap(dst->bv_page);
-
-		while (1) {
-			if (dst_offset == dst->bv_offset + dst->bv_len) {
-				kunmap(dst->bv_page);
-				dst++;
-				if (dst == bio_iovec_idx(s->cache_miss,
-						s->cache_miss->bi_vcnt))
-					break;
-
-				dst_offset = dst->bv_offset;
-				dst_ptr = kmap(dst->bv_page);
-			}
-
-			if (src_offset == src->bv_offset + src->bv_len) {
-				src++;
-				if (src == bio_iovec_idx(s->op.cache_bio,
-						 s->op.cache_bio->bi_vcnt))
-					BUG();
-
-				src_offset = src->bv_offset;
-			}
-
-			bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
-				    src->bv_offset + src->bv_len - src_offset);
-
-			memcpy(dst_ptr + dst_offset,
-			       page_address(src->bv_page) + src_offset,
-			       bytes);
-
-			src_offset	+= bytes;
-			dst_offset	+= bytes;
-		}
+		bio_copy_data(s->cache_miss, s->op.cache_bio);

 		bio_put(s->cache_miss);
 		s->cache_miss = NULL;
@ -899,6 +849,7 @@ static void request_read_done_bh(struct closure *cl)
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

 	bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
+	trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);

 	if (s->error)
 		continue_at_nobarrier(cl, request_read_error, bcache_wq);
@ -917,9 +868,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	struct bio *miss;

 	miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
-	if (!miss)
-		return -EAGAIN;
-
 	if (miss == bio)
 		s->op.lookup_done = true;

@ -938,8 +886,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 		reada = min(dc->readahead >> 9,
 			    sectors - bio_sectors(miss));

-		if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev))
-			reada = bdev_sectors(miss->bi_bdev) - bio_end(miss);
+		if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
+			reada = bdev_sectors(miss->bi_bdev) -
+				bio_end_sector(miss);
 	}

 	s->cache_bio_sectors = bio_sectors(miss) + reada;
@ -963,13 +912,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 		goto out_put;

 	bch_bio_map(s->op.cache_bio, NULL);
-	if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
+	if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
 		goto out_put;

 	s->cache_miss = miss;
 	bio_get(s->op.cache_bio);

-	trace_bcache_cache_miss(s->orig_bio);
 	closure_bio_submit(s->op.cache_bio, &s->cl, s->d);

 	return ret;
@ -1002,24 +950,13 @@ static void cached_dev_write_complete(struct closure *cl)
 	cached_dev_bio_complete(cl);
 }

-static bool should_writeback(struct cached_dev *dc, struct bio *bio)
-{
-	unsigned threshold = (bio->bi_rw & REQ_SYNC)
-		? CUTOFF_WRITEBACK_SYNC
-		: CUTOFF_WRITEBACK;
-
-	return !atomic_read(&dc->disk.detaching) &&
-		cache_mode(dc, bio) == CACHE_MODE_WRITEBACK &&
-		dc->disk.c->gc_stats.in_use < threshold;
-}
-
 static void request_write(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
 	struct bio *bio = &s->bio.bio;
 	struct bkey start, end;
 	start = KEY(dc->disk.id, bio->bi_sector, 0);
-	end = KEY(dc->disk.id, bio_end(bio), 0);
+	end = KEY(dc->disk.id, bio_end_sector(bio), 0);

 	bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);

@ -1034,22 +971,37 @@ static void request_write(struct cached_dev *dc, struct search *s)
 	if (bio->bi_rw & REQ_DISCARD)
 		goto skip;

+	if (should_writeback(dc, s->orig_bio,
+			     cache_mode(dc, bio),
+			     s->op.skip)) {
+		s->op.skip = false;
+		s->writeback = true;
+	}
+
 	if (s->op.skip)
 		goto skip;

-	if (should_writeback(dc, s->orig_bio))
-		s->writeback = true;
+	trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);

 	if (!s->writeback) {
 		s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
 						   dc->disk.bio_split);

-		trace_bcache_writethrough(s->orig_bio);
+		closure_bio_submit(bio, cl, s->d);
+	} else {
+		bch_writeback_add(dc);
+
+		if (s->op.flush_journal) {
+			/* Also need to send a flush to the backing device */
+			s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
+							   dc->disk.bio_split);
+
+			bio->bi_size = 0;
+			bio->bi_vcnt = 0;
 			closure_bio_submit(bio, cl, s->d);
 		} else {
 			s->op.cache_bio = bio;
-		trace_bcache_writeback(s->orig_bio);
-		bch_writeback_add(dc, bio_sectors(bio));
+		}
 	}
 out:
 	closure_call(&s->op.cl, bch_insert_data, NULL, cl);
@ -1058,7 +1010,6 @@ skip:
 	s->op.skip = true;
 	s->op.cache_bio = s->orig_bio;
 	bio_get(s->op.cache_bio);
-	trace_bcache_write_skip(s->orig_bio);

 	if ((bio->bi_rw & REQ_DISCARD) &&
 	    !blk_queue_discard(bdev_get_queue(dc->bdev)))
@ -1088,9 +1039,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s)

 /* Cached devices - read & write stuff */

-int bch_get_congested(struct cache_set *c)
+unsigned bch_get_congested(struct cache_set *c)
 {
 	int i;
+	long rand;

 	if (!c->congested_read_threshold_us &&
 	    !c->congested_write_threshold_us)
@ -1106,7 +1058,13 @@ int bch_get_congested(struct cache_set *c)

 	i += CONGESTED_MAX;

-	return i <= 0 ? 1 : fract_exp_two(i, 6);
+	if (i > 0)
+		i = fract_exp_two(i, 6);
+
+	rand = get_random_int();
+	i -= bitmap_weight(&rand, BITS_PER_LONG);
+
+	return i > 0 ? i : 1;
 }

 static void add_sequential(struct task_struct *t)
@ -1126,10 +1084,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
 {
 	struct cache_set *c = s->op.c;
 	struct bio *bio = &s->bio.bio;
-
-	long rand;
-	int cutoff = bch_get_congested(c);
 	unsigned mode = cache_mode(dc, bio);
+	unsigned sectors, congested = bch_get_congested(c);

 	if (atomic_read(&dc->disk.detaching) ||
 	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@ -1147,17 +1103,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
 		goto skip;
 	}

-	if (!cutoff) {
-		cutoff = dc->sequential_cutoff >> 9;
-
-		if (!cutoff)
+	if (!congested && !dc->sequential_cutoff)
 		goto rescale;

-		if (mode == CACHE_MODE_WRITEBACK &&
+	if (!congested &&
+	    mode == CACHE_MODE_WRITEBACK &&
 	    (bio->bi_rw & REQ_WRITE) &&
 	    (bio->bi_rw & REQ_SYNC))
 		goto rescale;
-	}

 	if (dc->sequential_merge) {
 		struct io *i;
@ -1177,7 +1130,7 @@ found:
 		if (i->sequential + bio->bi_size > i->sequential)
 			i->sequential	+= bio->bi_size;

-		i->last			 = bio_end(bio);
+		i->last			 = bio_end_sector(bio);
 		i->jiffies		 = jiffies + msecs_to_jiffies(5000);
 		s->task->sequential_io	 = i->sequential;

@ -1192,12 +1145,19 @@ found:
 		add_sequential(s->task);
 	}

-	rand = get_random_int();
-	cutoff -= bitmap_weight(&rand, BITS_PER_LONG);
+	sectors = max(s->task->sequential_io,
+		      s->task->sequential_io_avg) >> 9;

-	if (cutoff <= (int) (max(s->task->sequential_io,
-				 s->task->sequential_io_avg) >> 9))
+	if (dc->sequential_cutoff &&
+	    sectors >= dc->sequential_cutoff >> 9) {
+		trace_bcache_bypass_sequential(s->orig_bio);
 		goto skip;
+	}
+
+	if (congested && sectors >= congested) {
+		trace_bcache_bypass_congested(s->orig_bio);
+		goto skip;
+	}

 rescale:
 	bch_rescale_priorities(c, bio_sectors(bio));
@ -1288,29 +1248,24 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
 static int flash_dev_cache_miss(struct btree *b, struct search *s,
 				struct bio *bio, unsigned sectors)
 {
+	struct bio_vec *bv;
+	int i;
+
 	/* Zero fill bio */

-	while (bio->bi_idx != bio->bi_vcnt) {
-		struct bio_vec *bv = bio_iovec(bio);
+	bio_for_each_segment(bv, bio, i) {
 		unsigned j = min(bv->bv_len >> 9, sectors);

 		void *p = kmap(bv->bv_page);
 		memset(p + bv->bv_offset, 0, j << 9);
 		kunmap(bv->bv_page);

-		bv->bv_len	-= j << 9;
-		bv->bv_offset	+= j << 9;
-
-		if (bv->bv_len)
-			return 0;
-
-		bio->bi_sector	+= j;
-		bio->bi_size	-= j << 9;
-
-		bio->bi_idx++;
 		sectors	-= j;
 	}

+	bio_advance(bio, min(sectors << 9, bio->bi_size));
+
+	if (!bio->bi_size)
 		s->op.lookup_done = true;

 	return 0;
@ -1339,7 +1294,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 	} else if (bio_has_data(bio) || s->op.skip) {
 		bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
 					&KEY(d->id, bio->bi_sector, 0),
-					     &KEY(d->id, bio_end(bio), 0));
+					&KEY(d->id, bio_end_sector(bio), 0));

 		s->writeback	= true;
 		s->op.cache_bio	= bio;
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@ -30,7 +30,7 @@ struct search {
 };

 void bch_cache_read_endio(struct bio *, int);
-int bch_get_congested(struct cache_set *);
+unsigned bch_get_congested(struct cache_set *);
 void bch_insert_data(struct closure *cl);
 void bch_btree_insert_async(struct closure *);
 void bch_cache_read_endio(struct bio *, int);
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@ -10,7 +10,9 @@
 #include "btree.h"
 #include "debug.h"
 #include "request.h"
+#include "writeback.h"

+#include <linux/blkdev.h>
 #include <linux/buffer_head.h>
 #include <linux/debugfs.h>
 #include <linux/genhd.h>
@ -342,6 +344,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
 	struct closure *cl = &c->uuid_write.cl;
 	struct uuid_entry *u;
 	unsigned i;
+	char buf[80];

 	BUG_ON(!parent);
 	closure_lock(&c->uuid_write, parent);
@ -362,8 +365,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
 			break;
 	}

-	pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read",
-		 pkey(&c->uuid_bucket));
+	bch_bkey_to_text(buf, sizeof(buf), k);
+	pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);

 	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
 		if (!bch_is_zero(u->uuid, 16))
@ -543,7 +546,6 @@ void bch_prio_write(struct cache *ca)

 	pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
 		 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
-	blktrace_msg(ca, "Starting priorities: " buckets_free(ca));

 	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
 		long bucket;
@ -743,13 +745,35 @@ static void bcache_device_free(struct bcache_device *d)
 		mempool_destroy(d->unaligned_bvec);
 	if (d->bio_split)
 		bioset_free(d->bio_split);
+	if (is_vmalloc_addr(d->stripe_sectors_dirty))
+		vfree(d->stripe_sectors_dirty);
+	else
+		kfree(d->stripe_sectors_dirty);

 	closure_debug_destroy(&d->cl);
 }

-static int bcache_device_init(struct bcache_device *d, unsigned block_size)
+static int bcache_device_init(struct bcache_device *d, unsigned block_size,
+			      sector_t sectors)
 {
 	struct request_queue *q;
+	size_t n;
+
+	if (!d->stripe_size_bits)
+		d->stripe_size_bits = 31;
+
+	d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >>
+		d->stripe_size_bits;
+
+	if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t))
+		return -ENOMEM;
+
+	n = d->nr_stripes * sizeof(atomic_t);
+	d->stripe_sectors_dirty = n < PAGE_SIZE << 6
+		? kzalloc(n, GFP_KERNEL)
+		: vzalloc(n);
+	if (!d->stripe_sectors_dirty)
+		return -ENOMEM;

 	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
 	    !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
@ -759,6 +783,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
 	    !(q = blk_alloc_queue(GFP_KERNEL)))
 		return -ENOMEM;

+	set_capacity(d->disk, sectors);
 	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);

 	d->disk->major		= bcache_major;
@ -800,6 +825,17 @@ static void calc_cached_dev_sectors(struct cache_set *c)
 void bch_cached_dev_run(struct cached_dev *dc)
 {
 	struct bcache_device *d = &dc->disk;
+	char buf[SB_LABEL_SIZE + 1];
+	char *env[] = {
+		"DRIVER=bcache",
+		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
+		NULL,
+		NULL,
+	};
+
+	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
+	buf[SB_LABEL_SIZE] = '\0';
+	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

 	if (atomic_xchg(&dc->running, 1))
 		return;
@ -816,10 +852,12 @@ void bch_cached_dev_run(struct cached_dev *dc)

 	add_disk(d->disk);
 	bd_link_disk_holder(dc->bdev, dc->disk.disk);
-#if 0
-	char *env[] = { "SYMLINK=label" , NULL };
+	/* won't show up in the uevent file, use udevadm monitor -e instead
+	 * only class / kset properties are persistent */
 	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
-#endif
+	kfree(env[1]);
+	kfree(env[2]);
+
 	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
 	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
 		pr_debug("error creating sysfs link");
@ -960,6 +998,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 	atomic_set(&dc->count, 1);

 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
+		bch_sectors_dirty_init(dc);
 		atomic_set(&dc->has_dirty, 1);
 		atomic_inc(&dc->count);
 		bch_writeback_queue(dc);
@ -1045,7 +1084,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
 		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
 	}

-	ret = bcache_device_init(&dc->disk, block_size);
+	ret = bcache_device_init(&dc->disk, block_size,
+			 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
 	if (ret)
 		return ret;

@ -1144,11 +1184,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)

 	kobject_init(&d->kobj, &bch_flash_dev_ktype);

-	if (bcache_device_init(d, block_bytes(c)))
+	if (bcache_device_init(d, block_bytes(c), u->sectors))
 		goto err;

 	bcache_device_attach(d, c, u - c->uuids);
-	set_capacity(d->disk, u->sectors);
 	bch_flash_dev_request_init(d);
 	add_disk(d->disk);

@ -1255,9 +1294,10 @@ static void cache_set_free(struct closure *cl)
 	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
 	free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));

-	kfree(c->fill_iter);
 	if (c->bio_split)
 		bioset_free(c->bio_split);
+	if (c->fill_iter)
+		mempool_destroy(c->fill_iter);
 	if (c->bio_meta)
 		mempool_destroy(c->bio_meta);
 	if (c->search)
@ -1282,7 +1322,7 @@ static void cache_set_flush(struct closure *cl)

 	/* Shut down allocator threads */
 	set_bit(CACHE_SET_STOPPING_2, &c->flags);
-	wake_up(&c->alloc_wait);
+	wake_up_allocators(c);

 	bch_cache_accounting_destroy(&c->accounting);

@ -1295,7 +1335,7 @@ static void cache_set_flush(struct closure *cl)
 	/* Should skip this if we're unregistering because of an error */
 	list_for_each_entry(b, &c->btree_cache, list)
 		if (btree_node_dirty(b))
-			bch_btree_write(b, true, NULL);
+			bch_btree_node_write(b, NULL);

 	closure_return(cl);
 }
@ -1373,9 +1413,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 		c->btree_pages = max_t(int, c->btree_pages / 4,
 				       BTREE_MAX_PAGES);

-	init_waitqueue_head(&c->alloc_wait);
+	c->sort_crit_factor = int_sqrt(c->btree_pages);
+
 	mutex_init(&c->bucket_lock);
-	mutex_init(&c->fill_lock);
 	mutex_init(&c->sort_lock);
 	spin_lock_init(&c->sort_time_lock);
 	closure_init_unlocked(&c->sb_write);
@ -1401,8 +1441,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
 				sizeof(struct bbio) + sizeof(struct bio_vec) *
 				bucket_pages(c))) ||
+	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
 	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
-	    !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
 	    !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
 	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
 	    bch_journal_alloc(c) ||
@ -1410,8 +1450,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	    bch_open_buckets_alloc(c))
 		goto err;

-	c->fill_iter->size = sb->bucket_size / sb->block_size;
-
 	c->congested_read_threshold_us	= 2000;
 	c->congested_write_threshold_us	= 20000;
 	c->error_limit	= 8 << IO_ERROR_SHIFT;
@ -1496,9 +1534,10 @@ static void run_cache_set(struct cache_set *c)
 		 */
 		bch_journal_next(&c->journal);

+		err = "error starting allocator thread";
 		for_each_cache(ca, c, i)
-			closure_call(&ca->alloc, bch_allocator_thread,
-				     system_wq, &c->cl);
+			if (bch_cache_allocator_start(ca))
+				goto err;

 		/*
 		 * First place it's safe to allocate: btree_check() and
@ -1531,17 +1570,16 @@ static void run_cache_set(struct cache_set *c)

 		bch_btree_gc_finish(c);

+		err = "error starting allocator thread";
 		for_each_cache(ca, c, i)
-			closure_call(&ca->alloc, bch_allocator_thread,
-				     ca->alloc_workqueue, &c->cl);
+			if (bch_cache_allocator_start(ca))
+				goto err;

 		mutex_lock(&c->bucket_lock);
 		for_each_cache(ca, c, i)
 			bch_prio_write(ca);
 		mutex_unlock(&c->bucket_lock);

-		wake_up(&c->alloc_wait);
-
 		err = "cannot allocate new UUID bucket";
 		if (__uuid_write(c))
 			goto err_unlock_gc;
@ -1552,7 +1590,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err_unlock_gc;

 		bkey_copy_key(&c->root->key, &MAX_KEY);
-		bch_btree_write(c->root, true, &op);
+		bch_btree_node_write(c->root, &op.cl);

 		bch_btree_set_root(c->root);
 		rw_unlock(true, c->root);
@ -1673,9 +1711,6 @@ void bch_cache_release(struct kobject *kobj)

 	bio_split_pool_free(&ca->bio_split_hook);

-	if (ca->alloc_workqueue)
-		destroy_workqueue(ca->alloc_workqueue);
-
 	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
 	kfree(ca->prio_buckets);
 	vfree(ca->buckets);
@ -1723,7 +1758,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 	    !(ca->prio_buckets	= kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
 					  2, GFP_KERNEL)) ||
 	    !(ca->disk_buckets	= alloc_bucket_pages(GFP_KERNEL, ca)) ||
-	    !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
 	    bio_split_pool_init(&ca->bio_split_hook))
 		return -ENOMEM;

@ -1786,6 +1820,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
 kobj_attribute_write(register,		register_bcache);
 kobj_attribute_write(register_quiet,	register_bcache);

+static bool bch_is_open_backing(struct block_device *bdev) {
+	struct cache_set *c, *tc;
+	struct cached_dev *dc, *t;
+
+	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
+			if (dc->bdev == bdev)
+				return true;
+	list_for_each_entry_safe(dc, t, &uncached_devices, list)
+		if (dc->bdev == bdev)
+			return true;
+	return false;
+}
+
+static bool bch_is_open_cache(struct block_device *bdev) {
+	struct cache_set *c, *tc;
+	struct cache *ca;
+	unsigned i;
+
+	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+		for_each_cache(ca, c, i)
+			if (ca->bdev == bdev)
+				return true;
+	return false;
+}
+
+static bool bch_is_open(struct block_device *bdev) {
+	return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
+}
+
 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 			       const char *buffer, size_t size)
 {
@ -1810,8 +1874,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
 				  sb);
 	if (IS_ERR(bdev)) {
-		if (bdev == ERR_PTR(-EBUSY))
+		if (bdev == ERR_PTR(-EBUSY)) {
+			bdev = lookup_bdev(strim(path));
+			if (!IS_ERR(bdev) && bch_is_open(bdev))
+				err = "device already registered";
+			else
 				err = "device busy";
+		}
 		goto err;
 	}

--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@ -9,7 +9,9 @@
 #include "sysfs.h"
 #include "btree.h"
 #include "request.h"
+#include "writeback.h"

+#include <linux/blkdev.h>
 #include <linux/sort.h>

 static const char * const cache_replacement_policies[] = {
@ -79,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse);
 rw_attribute(writeback_rate_d_smooth);
 read_attribute(writeback_rate_debug);

+read_attribute(stripe_size);
+read_attribute(partial_stripes_expensive);
+
 rw_attribute(synchronous);
 rw_attribute(journal_delay_ms);
 rw_attribute(discard);
@ -127,7 +132,7 @@ SHOW(__bch_cached_dev)
 		char derivative[20];
 		char target[20];
 		bch_hprint(dirty,
-		       atomic_long_read(&dc->disk.sectors_dirty) << 9);
+			   bcache_dev_sectors_dirty(&dc->disk) << 9);
 		bch_hprint(derivative,	dc->writeback_rate_derivative << 9);
 		bch_hprint(target,	dc->writeback_rate_target << 9);

@ -143,7 +148,10 @@ SHOW(__bch_cached_dev)
 	}

 	sysfs_hprint(dirty_data,
-		     atomic_long_read(&dc->disk.sectors_dirty) << 9);
+		     bcache_dev_sectors_dirty(&dc->disk) << 9);
+
+	sysfs_hprint(stripe_size,	(1 << dc->disk.stripe_size_bits) << 9);
+	var_printf(partial_stripes_expensive,	"%u");

 	var_printf(sequential_merge,	"%i");
 	var_hprint(sequential_cutoff);
@ -170,6 +178,7 @@ STORE(__cached_dev)
 					     disk.kobj);
 	unsigned v = size;
 	struct cache_set *c;
+	struct kobj_uevent_env *env;

 #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
 #define d_strtoi_h(var)		sysfs_hatoi(var, dc->var)
@ -214,6 +223,7 @@ STORE(__cached_dev)
 	}

 	if (attr == &sysfs_label) {
+		/* note: endlines are preserved */
 		memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
 		bch_write_bdev_super(dc, NULL);
 		if (dc->disk.c) {
@ -221,6 +231,13 @@ STORE(__cached_dev)
 			       buf, SB_LABEL_SIZE);
 			bch_uuid_write(dc->disk.c);
 		}
+		env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
+		add_uevent_var(env, "DRIVER=bcache");
+		add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid),
+		add_uevent_var(env, "CACHED_LABEL=%s", buf);
+		kobject_uevent_env(
+			&disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
+		kfree(env);
 	}

 	if (attr == &sysfs_attach) {
@ -284,6 +301,8 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_writeback_rate_d_smooth,
 	&sysfs_writeback_rate_debug,
 	&sysfs_dirty_data,
+	&sysfs_stripe_size,
+	&sysfs_partial_stripes_expensive,
 	&sysfs_sequential_cutoff,
 	&sysfs_sequential_merge,
 	&sysfs_clear_stats,
@ -665,12 +684,10 @@ SHOW(__bch_cache)
 		int cmp(const void *l, const void *r)
 		{	return *((uint16_t *) r) - *((uint16_t *) l); }

-		/* Number of quantiles we compute */
-		const unsigned nq = 31;
-
 		size_t n = ca->sb.nbuckets, i, unused, btree;
 		uint64_t sum = 0;
-		uint16_t q[nq], *p, *cached;
+		/* Compute 31 quantiles */
+		uint16_t q[31], *p, *cached;
 		ssize_t ret;

 		cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
@ -703,12 +720,13 @@ SHOW(__bch_cache)
 		if (n)
 			do_div(sum, n);

-		for (i = 0; i < nq; i++)
-			q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)];
+		for (i = 0; i < ARRAY_SIZE(q); i++)
+			q[i] = INITIAL_PRIO - cached[n * (i + 1) /
+				(ARRAY_SIZE(q) + 1)];

 		vfree(p);

-		ret = snprintf(buf, PAGE_SIZE,
+		ret = scnprintf(buf, PAGE_SIZE,
 				"Unused:		%zu%%\n"
 				"Metadata:	%zu%%\n"
 				"Average:	%llu\n"
@ -716,13 +734,15 @@ SHOW(__bch_cache)
 				"Quantiles:	[",
 				unused * 100 / (size_t) ca->sb.nbuckets,
 				btree * 100 / (size_t) ca->sb.nbuckets, sum,
-			       n * ca->sb.bucket_size / (nq + 1));
+				n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));

-		for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++)
-			ret += snprintf(buf + ret, PAGE_SIZE - ret,
-					i < nq - 1 ? "%u " : "%u]\n", q[i]);
+		for (i = 0; i < ARRAY_SIZE(q); i++)
+			ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+					 "%u ", q[i]);
+		ret--;
+
+		ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");

-		buf[PAGE_SIZE - 1] = '\0';
 		return ret;
 	}

--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@ -2,6 +2,7 @@
 #include "btree.h"
 #include "request.h"

+#include <linux/blktrace_api.h>
 #include <linux/module.h>

 #define CREATE_TRACE_POINTS
@ -9,18 +10,44 @@

 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision);
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@ -228,23 +228,6 @@ start:		bv->bv_len	= min_t(size_t, PAGE_SIZE - bv->bv_offset,
 	}
 }

-int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
-{
-	int i;
-	struct bio_vec *bv;
-
-	bio_for_each_segment(bv, bio, i) {
-		bv->bv_page = alloc_page(gfp);
-		if (!bv->bv_page) {
-			while (bv-- != bio->bi_io_vec + bio->bi_idx)
-				__free_page(bv->bv_page);
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
-}
-
 /*
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
 * use permitted, subject to terms of PostgreSQL license; see.)
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@ -15,8 +15,6 @@

 struct closure;

-#include <trace/events/bcache.h>
-
 #ifdef CONFIG_BCACHE_EDEBUG

 #define atomic_dec_bug(v)	BUG_ON(atomic_dec_return(v) < 0)
@ -566,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
 	return x;
 }

-#define bio_end(bio)	((bio)->bi_sector + bio_sectors(bio))
-
 void bch_bio_map(struct bio *bio, void *base);

-int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
-
 static inline sector_t bdev_sectors(struct block_device *bdev)
 {
 	return bdev->bd_inode->i_size >> 9;
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@ -9,6 +9,9 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
+#include "writeback.h"
+
+#include <trace/events/bcache.h>

 static struct workqueue_struct *dirty_wq;

@ -36,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc)

 	int change = 0;
 	int64_t error;
-	int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
+	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
 	int64_t derivative = dirty - dc->disk.sectors_dirty_last;

 	dc->disk.sectors_dirty_last = dirty;
@ -105,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 	return KEY_DIRTY(k);
 }

+static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
+{
+	uint64_t stripe;
+	unsigned nr_sectors = KEY_SIZE(k);
+	struct cached_dev *dc = container_of(buf, struct cached_dev,
+					     writeback_keys);
+	unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
+
+	if (!KEY_DIRTY(k))
+		return false;
+
+	stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
+	while (1) {
+		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
+		    stripe_size)
+			return false;
+
+		if (nr_sectors <= stripe_size)
+			return true;
+
+		nr_sectors -= stripe_size;
+		stripe++;
+	}
+}
+
 static void dirty_init(struct keybuf_key *w)
 {
 	struct dirty_io *io = w->private;
@ -149,7 +177,22 @@ static void refill_dirty(struct closure *cl)
 		searched_from_start = true;
 	}

-	bch_refill_keybuf(dc->disk.c, buf, &end);
+	if (dc->partial_stripes_expensive) {
+		uint64_t i;
+
+		for (i = 0; i < dc->disk.nr_stripes; i++)
+			if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
+			    1 << dc->disk.stripe_size_bits)
+				goto full_stripes;
+
+		goto normal_refill;
+full_stripes:
+		bch_refill_keybuf(dc->disk.c, buf, &end,
+				  dirty_full_stripe_pred);
+	} else {
+normal_refill:
+		bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
+	}

 	if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
 		/* Searched the entire btree  - delay awhile */
@ -181,10 +224,8 @@ void bch_writeback_queue(struct cached_dev *dc)
 	}
 }

-void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
+void bch_writeback_add(struct cached_dev *dc)
 {
-	atomic_long_add(sectors, &dc->disk.sectors_dirty);
-
 	if (!atomic_read(&dc->has_dirty) &&
 	    !atomic_xchg(&dc->has_dirty, 1)) {
 		atomic_inc(&dc->count);
@ -203,6 +244,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
 	}
 }

+void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
+				  uint64_t offset, int nr_sectors)
+{
+	struct bcache_device *d = c->devices[inode];
+	unsigned stripe_size, stripe_offset;
+	uint64_t stripe;
+
+	if (!d)
+		return;
+
+	stripe_size = 1 << d->stripe_size_bits;
+	stripe = offset >> d->stripe_size_bits;
+	stripe_offset = offset & (stripe_size - 1);
+
+	while (nr_sectors) {
+		int s = min_t(unsigned, abs(nr_sectors),
+			      stripe_size - stripe_offset);
+
+		if (nr_sectors < 0)
+			s = -s;
+
+		atomic_add(s, d->stripe_sectors_dirty + stripe);
+		nr_sectors -= s;
+		stripe_offset = 0;
+		stripe++;
+	}
+}
+
 /* Background writeback - IO loop */

 static void dirty_io_destructor(struct closure *cl)
@ -216,9 +285,10 @@ static void write_dirty_finish(struct closure *cl)
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
 	struct cached_dev *dc = io->dc;
-	struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
+	struct bio_vec *bv;
+	int i;

-	while (bv-- != io->bio.bi_io_vec)
+	bio_for_each_segment_all(bv, &io->bio, i)
 		__free_page(bv->bv_page);

 	/* This is kind of a dumb way of signalling errors. */
@ -236,10 +306,12 @@ static void write_dirty_finish(struct closure *cl)
 		for (i = 0; i < KEY_PTRS(&w->key); i++)
 			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

-		pr_debug("clearing %s", pkey(&w->key));
 		bch_btree_insert(&op, dc->disk.c);
 		closure_sync(&op.cl);

+		if (op.insert_collision)
+			trace_bcache_writeback_collision(&w->key);
+
 		atomic_long_inc(op.insert_collision
 				? &dc->disk.c->writeback_keys_failed
 				: &dc->disk.c->writeback_keys_done);
@ -275,7 +347,6 @@ static void write_dirty(struct closure *cl)
 	io->bio.bi_bdev		= io->dc->bdev;
 	io->bio.bi_end_io	= dirty_endio;

-	trace_bcache_write_dirty(&io->bio);
 	closure_bio_submit(&io->bio, cl, &io->dc->disk);

 	continue_at(cl, write_dirty_finish, dirty_wq);
@ -296,7 +367,6 @@ static void read_dirty_submit(struct closure *cl)
 {
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

-	trace_bcache_read_dirty(&io->bio);
 	closure_bio_submit(&io->bio, cl, &io->dc->disk);

 	continue_at(cl, write_dirty, dirty_wq);
@ -349,10 +419,10 @@ static void read_dirty(struct closure *cl)
 		io->bio.bi_rw		= READ;
 		io->bio.bi_end_io	= read_dirty_endio;

-		if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
+		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
 			goto err_free;

-		pr_debug("%s", pkey(&w->key));
+		trace_bcache_writeback(&w->key);

 		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);

@ -375,12 +445,49 @@ err:
 	refill_dirty(cl);
 }

+/* Init */
+
+static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
+					struct cached_dev *dc)
+{
+	struct bkey *k;
+	struct btree_iter iter;
+
+	bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
+	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
+		if (!b->level) {
+			if (KEY_INODE(k) > dc->disk.id)
+				break;
+
+			if (KEY_DIRTY(k))
+				bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
+							     KEY_START(k),
+							     KEY_SIZE(k));
+		} else {
+			btree(sectors_dirty_init, k, b, op, dc);
+			if (KEY_INODE(k) > dc->disk.id)
+				break;
+
+			cond_resched();
+		}
+
+	return 0;
+}
+
+void bch_sectors_dirty_init(struct cached_dev *dc)
+{
+	struct btree_op op;
+
+	bch_btree_op_init_stack(&op);
+	btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
+}
+
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
 	closure_init_unlocked(&dc->writeback);
 	init_rwsem(&dc->writeback_lock);

-	bch_keybuf_init(&dc->writeback_keys, dirty_pred);
+	bch_keybuf_init(&dc->writeback_keys);

 	dc->writeback_metadata		= true;
 	dc->writeback_running		= true;
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@ -0,0 +1,64 @@
+#ifndef _BCACHE_WRITEBACK_H
+#define _BCACHE_WRITEBACK_H
+
+#define CUTOFF_WRITEBACK	40
+#define CUTOFF_WRITEBACK_SYNC	70
+
+static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
+{
+	uint64_t i, ret = 0;
+
+	for (i = 0; i < d->nr_stripes; i++)
+		ret += atomic_read(d->stripe_sectors_dirty + i);
+
+	return ret;
+}
+
+static inline bool bcache_dev_stripe_dirty(struct bcache_device *d,
+					   uint64_t offset,
+					   unsigned nr_sectors)
+{
+	uint64_t stripe = offset >> d->stripe_size_bits;
+
+	while (1) {
+		if (atomic_read(d->stripe_sectors_dirty + stripe))
+			return true;
+
+		if (nr_sectors <= 1 << d->stripe_size_bits)
+			return false;
+
+		nr_sectors -= 1 << d->stripe_size_bits;
+		stripe++;
+	}
+}
+
+static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
+				    unsigned cache_mode, bool would_skip)
+{
+	unsigned in_use = dc->disk.c->gc_stats.in_use;
+
+	if (cache_mode != CACHE_MODE_WRITEBACK ||
+	    atomic_read(&dc->disk.detaching) ||
+	    in_use > CUTOFF_WRITEBACK_SYNC)
+		return false;
+
+	if (dc->partial_stripes_expensive &&
+	    bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector,
+				    bio_sectors(bio)))
+		return true;
+
+	if (would_skip)
+		return false;
+
+	return bio->bi_rw & REQ_SYNC ||
+		in_use <= CUTOFF_WRITEBACK;
+}
+
+void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
+void bch_writeback_queue(struct cached_dev *);
+void bch_writeback_add(struct cached_dev *);
+
+void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_cached_dev_writeback_init(struct cached_dev *);
+
+#endif
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@ -9,9 +9,7 @@
 struct search;

 DECLARE_EVENT_CLASS(bcache_request,
-
 	TP_PROTO(struct search *s, struct bio *bio),
-
 	TP_ARGS(s, bio),

 	TP_STRUCT__entry(
@ -22,7 +20,6 @@ DECLARE_EVENT_CLASS(bcache_request,
 		__field(dev_t,		orig_sector		)
 		__field(unsigned int,	nr_sector		)
 		__array(char,		rwbs,	6		)
-		__array(char,		comm,	TASK_COMM_LEN	)
 	),

 	TP_fast_assign(
@ -33,36 +30,66 @@ DECLARE_EVENT_CLASS(bcache_request,
 		__entry->orig_sector	= bio->bi_sector - 16;
 		__entry->nr_sector	= bio->bi_size >> 9;
 		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
-		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),

-	TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)",
+	TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->rwbs,
-		  (unsigned long long)__entry->sector,
-		  __entry->nr_sector, __entry->comm,
-		  __entry->orig_major, __entry->orig_minor,
+		  __entry->rwbs, (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->orig_major, __entry->orig_minor,
 		  (unsigned long long)__entry->orig_sector)
 );

+DECLARE_EVENT_CLASS(bkey,
+	TP_PROTO(struct bkey *k),
+	TP_ARGS(k),
+
+	TP_STRUCT__entry(
+		__field(u32,	size				)
+		__field(u32,	inode				)
+		__field(u64,	offset				)
+		__field(bool,	dirty				)
+	),
+
+	TP_fast_assign(
+		__entry->inode	= KEY_INODE(k);
+		__entry->offset	= KEY_OFFSET(k);
+		__entry->size	= KEY_SIZE(k);
+		__entry->dirty	= KEY_DIRTY(k);
+	),
+
+	TP_printk("%u:%llu len %u dirty %u", __entry->inode,
+		  __entry->offset, __entry->size, __entry->dirty)
+);
+
+DECLARE_EVENT_CLASS(btree_node,
+	TP_PROTO(struct btree *b),
+	TP_ARGS(b),
+
+	TP_STRUCT__entry(
+		__field(size_t,		bucket			)
+	),
+
+	TP_fast_assign(
+		__entry->bucket	= PTR_BUCKET_NR(b->c, &b->key, 0);
+	),
+
+	TP_printk("bucket %zu", __entry->bucket)
+);
+
+/* request.c */
+
 DEFINE_EVENT(bcache_request, bcache_request_start,
-
 	TP_PROTO(struct search *s, struct bio *bio),
-
 	TP_ARGS(s, bio)
 );

 DEFINE_EVENT(bcache_request, bcache_request_end,
-
 	TP_PROTO(struct search *s, struct bio *bio),
-
 	TP_ARGS(s, bio)
 );

 DECLARE_EVENT_CLASS(bcache_bio,
-
 	TP_PROTO(struct bio *bio),
-
 	TP_ARGS(bio),

 	TP_STRUCT__entry(
@ -70,7 +97,6 @@ DECLARE_EVENT_CLASS(bcache_bio,
 		__field(sector_t,	sector			)
 		__field(unsigned int,	nr_sector		)
 		__array(char,		rwbs,	6		)
-		__array(char,		comm,	TASK_COMM_LEN	)
 	),

 	TP_fast_assign(
@ -78,191 +104,328 @@ DECLARE_EVENT_CLASS(bcache_bio,
 		__entry->sector		= bio->bi_sector;
 		__entry->nr_sector	= bio->bi_size >> 9;
 		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
-		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),

-	TP_printk("%d,%d  %s %llu + %u [%s]",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->rwbs,
-		  (unsigned long long)__entry->sector,
-		  __entry->nr_sector, __entry->comm)
+	TP_printk("%d,%d  %s %llu + %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector, __entry->nr_sector)
 );

-
-DEFINE_EVENT(bcache_bio, bcache_passthrough,
-
+DEFINE_EVENT(bcache_bio, bcache_bypass_sequential,
 	TP_PROTO(struct bio *bio),
-
 	TP_ARGS(bio)
 );

-DEFINE_EVENT(bcache_bio, bcache_cache_hit,
-
+DEFINE_EVENT(bcache_bio, bcache_bypass_congested,
 	TP_PROTO(struct bio *bio),
-
 	TP_ARGS(bio)
 );

-DEFINE_EVENT(bcache_bio, bcache_cache_miss,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_read_retry,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_writethrough,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_writeback,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_write_skip,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_btree_read,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_btree_write,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_write_dirty,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_read_dirty,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_write_moving,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_read_moving,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bcache_bio, bcache_journal_write,
-
-	TP_PROTO(struct bio *bio),
-
-	TP_ARGS(bio)
-);
-
-DECLARE_EVENT_CLASS(bcache_cache_bio,
-
-	TP_PROTO(struct bio *bio,
-		 sector_t orig_sector,
-		 struct block_device* orig_bdev),
-
-	TP_ARGS(bio, orig_sector, orig_bdev),
+TRACE_EVENT(bcache_read,
+	TP_PROTO(struct bio *bio, bool hit, bool bypass),
+	TP_ARGS(bio, hit, bypass),

 	TP_STRUCT__entry(
 		__field(dev_t,		dev			)
-		__field(dev_t,		orig_dev		)
 		__field(sector_t,	sector			)
-		__field(sector_t,	orig_sector		)
 		__field(unsigned int,	nr_sector		)
 		__array(char,		rwbs,	6		)
-		__array(char,		comm,	TASK_COMM_LEN	)
+		__field(bool,		cache_hit		)
+		__field(bool,		bypass			)
 	),

 	TP_fast_assign(
 		__entry->dev		= bio->bi_bdev->bd_dev;
-		__entry->orig_dev	= orig_bdev->bd_dev;
 		__entry->sector		= bio->bi_sector;
-		__entry->orig_sector	= orig_sector;
 		__entry->nr_sector	= bio->bi_size >> 9;
 		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
-		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+		__entry->cache_hit = hit;
+		__entry->bypass = bypass;
 	),

-	TP_printk("%d,%d  %s %llu + %u [%s] (from %d,%d %llu)",
+	TP_printk("%d,%d  %s %llu + %u hit %u bypass %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->rwbs,
-		  (unsigned long long)__entry->sector,
-		  __entry->nr_sector, __entry->comm,
-		  MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev),
-		  (unsigned long long)__entry->orig_sector)
+		  __entry->rwbs, (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->cache_hit, __entry->bypass)
 );

-DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert,
-
-	TP_PROTO(struct bio *bio,
-		 sector_t orig_sector,
-		 struct block_device *orig_bdev),
-
-	TP_ARGS(bio, orig_sector, orig_bdev)
-);
-
-DECLARE_EVENT_CLASS(bcache_gc,
-
-	TP_PROTO(uint8_t *uuid),
-
-	TP_ARGS(uuid),
+TRACE_EVENT(bcache_write,
+	TP_PROTO(struct bio *bio, bool writeback, bool bypass),
+	TP_ARGS(bio, writeback, bypass),

 	TP_STRUCT__entry(
-		__field(uint8_t *,	uuid)
+		__field(dev_t,		dev			)
+		__field(sector_t,	sector			)
+		__field(unsigned int,	nr_sector		)
+		__array(char,		rwbs,	6		)
+		__field(bool,		writeback		)
+		__field(bool,		bypass			)
 	),

 	TP_fast_assign(
-		__entry->uuid		= uuid;
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		__entry->writeback = writeback;
+		__entry->bypass = bypass;
+	),
+
+	TP_printk("%d,%d  %s %llu + %u hit %u bypass %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->writeback, __entry->bypass)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_read_retry,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bkey, bcache_cache_insert,
+	TP_PROTO(struct bkey *k),
+	TP_ARGS(k)
+);
+
+/* Journal */
+
+DECLARE_EVENT_CLASS(cache_set,
+	TP_PROTO(struct cache_set *c),
+	TP_ARGS(c),
+
+	TP_STRUCT__entry(
+		__array(char,		uuid,	16 )
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->uuid, c->sb.set_uuid, 16);
 	),

 	TP_printk("%pU", __entry->uuid)
 );

-
-DEFINE_EVENT(bcache_gc, bcache_gc_start,
-
-	     TP_PROTO(uint8_t *uuid),
-
-	     TP_ARGS(uuid)
+DEFINE_EVENT(bkey, bcache_journal_replay_key,
+	TP_PROTO(struct bkey *k),
+	TP_ARGS(k)
 );

-DEFINE_EVENT(bcache_gc, bcache_gc_end,
+DEFINE_EVENT(cache_set, bcache_journal_full,
+	TP_PROTO(struct cache_set *c),
+	TP_ARGS(c)
+);

-	     TP_PROTO(uint8_t *uuid),
+DEFINE_EVENT(cache_set, bcache_journal_entry_full,
+	TP_PROTO(struct cache_set *c),
+	TP_ARGS(c)
+);

-	     TP_ARGS(uuid)
+DEFINE_EVENT(bcache_bio, bcache_journal_write,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
+/* Btree */
+
+DEFINE_EVENT(cache_set, bcache_btree_cache_cannibalize,
+	TP_PROTO(struct cache_set *c),
+	TP_ARGS(c)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_read,
+	TP_PROTO(struct btree *b),
+	TP_ARGS(b)
+);
+
+TRACE_EVENT(bcache_btree_write,
+	TP_PROTO(struct btree *b),
+	TP_ARGS(b),
+
+	TP_STRUCT__entry(
+		__field(size_t,		bucket			)
+		__field(unsigned,	block			)
+		__field(unsigned,	keys			)
+	),
+
+	TP_fast_assign(
+		__entry->bucket	= PTR_BUCKET_NR(b->c, &b->key, 0);
+		__entry->block	= b->written;
+		__entry->keys	= b->sets[b->nsets].data->keys;
+	),
+
+	TP_printk("bucket %zu", __entry->bucket)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_node_alloc,
+	TP_PROTO(struct btree *b),
+	TP_ARGS(b)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail,
+	TP_PROTO(struct btree *b),
+	TP_ARGS(b)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_node_free,
+	TP_PROTO(struct btree *b),
+	TP_ARGS(b)
+);
+
+TRACE_EVENT(bcache_btree_gc_coalesce,
+	TP_PROTO(unsigned nodes),
+	TP_ARGS(nodes),
+
+	TP_STRUCT__entry(
+		__field(unsigned,	nodes			)
+	),
+
+	TP_fast_assign(
+		__entry->nodes	= nodes;
+	),
+
+	TP_printk("coalesced %u nodes", __entry->nodes)
+);
+
+DEFINE_EVENT(cache_set, bcache_gc_start,
+	TP_PROTO(struct cache_set *c),
+	TP_ARGS(c)
+);
+
+DEFINE_EVENT(cache_set, bcache_gc_end,
+	TP_PROTO(struct cache_set *c),
+	TP_ARGS(c)
+);
+
+DEFINE_EVENT(bkey, bcache_gc_copy,
+	TP_PROTO(struct bkey *k),
+	TP_ARGS(k)
+);
+
+DEFINE_EVENT(bkey, bcache_gc_copy_collision,
+	TP_PROTO(struct bkey *k),
+	TP_ARGS(k)
+);
+
+TRACE_EVENT(bcache_btree_insert_key,
+	TP_PROTO(struct btree *b, struct bkey *k, unsigned op, unsigned status),
+	TP_ARGS(b, k, op, status),
+
+	TP_STRUCT__entry(
+		__field(u64,	btree_node			)
+		__field(u32,	btree_level			)
+		__field(u32,	inode				)
+		__field(u64,	offset				)
+		__field(u32,	size				)
+		__field(u8,	dirty				)
+		__field(u8,	op				)
+		__field(u8,	status				)
+	),
+
+	TP_fast_assign(
+		__entry->btree_node = PTR_BUCKET_NR(b->c, &b->key, 0);
+		__entry->btree_level = b->level;
+		__entry->inode	= KEY_INODE(k);
+		__entry->offset	= KEY_OFFSET(k);
+		__entry->size	= KEY_SIZE(k);
+		__entry->dirty	= KEY_DIRTY(k);
+		__entry->op = op;
+		__entry->status = status;
+	),
+
+	TP_printk("%u for %u at %llu(%u): %u:%llu len %u dirty %u",
+		  __entry->status, __entry->op,
+		  __entry->btree_node, __entry->btree_level,
+		  __entry->inode, __entry->offset,
+		  __entry->size, __entry->dirty)
+);
+
+DECLARE_EVENT_CLASS(btree_split,
+	TP_PROTO(struct btree *b, unsigned keys),
+	TP_ARGS(b, keys),
+
+	TP_STRUCT__entry(
+		__field(size_t,		bucket			)
+		__field(unsigned,	keys			)
+	),
+
+	TP_fast_assign(
+		__entry->bucket	= PTR_BUCKET_NR(b->c, &b->key, 0);
+		__entry->keys	= keys;
+	),
+
+	TP_printk("bucket %zu keys %u", __entry->bucket, __entry->keys)
+);
+
+DEFINE_EVENT(btree_split, bcache_btree_node_split,
+	TP_PROTO(struct btree *b, unsigned keys),
+	TP_ARGS(b, keys)
+);
+
+DEFINE_EVENT(btree_split, bcache_btree_node_compact,
+	TP_PROTO(struct btree *b, unsigned keys),
+	TP_ARGS(b, keys)
+);
+
+DEFINE_EVENT(btree_node, bcache_btree_set_root,
+	TP_PROTO(struct btree *b),
+	TP_ARGS(b)
+);
+
+/* Allocator */
+
+TRACE_EVENT(bcache_alloc_invalidate,
+	TP_PROTO(struct cache *ca),
+	TP_ARGS(ca),
+
+	TP_STRUCT__entry(
+		__field(unsigned,	free			)
+		__field(unsigned,	free_inc		)
+		__field(unsigned,	free_inc_size		)
+		__field(unsigned,	unused			)
+	),
+
+	TP_fast_assign(
+		__entry->free		= fifo_used(&ca->free);
+		__entry->free_inc	= fifo_used(&ca->free_inc);
+		__entry->free_inc_size	= ca->free_inc.size;
+		__entry->unused		= fifo_used(&ca->unused);
+	),
+
+	TP_printk("free %u free_inc %u/%u unused %u", __entry->free,
+		  __entry->free_inc, __entry->free_inc_size, __entry->unused)
+);
+
+TRACE_EVENT(bcache_alloc_fail,
+	TP_PROTO(struct cache *ca),
+	TP_ARGS(ca),
+
+	TP_STRUCT__entry(
+		__field(unsigned,	free			)
+		__field(unsigned,	free_inc		)
+		__field(unsigned,	unused			)
+		__field(unsigned,	blocked			)
+	),
+
+	TP_fast_assign(
+		__entry->free		= fifo_used(&ca->free);
+		__entry->free_inc	= fifo_used(&ca->free_inc);
+		__entry->unused		= fifo_used(&ca->unused);
+		__entry->blocked	= atomic_read(&ca->set->prio_blocked);
+	),
+
+	TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free,
+		  __entry->free_inc, __entry->unused, __entry->blocked)
+);
+
+/* Background writeback */
+
+DEFINE_EVENT(bkey, bcache_writeback,
+	TP_PROTO(struct bkey *k),
+	TP_ARGS(k)
+);
+
+DEFINE_EVENT(bkey, bcache_writeback_collision,
+	TP_PROTO(struct bkey *k),
+	TP_ARGS(k)
 );

 #endif /* _TRACE_BCACHE_H */