From 5c694129c8db6d89c9be109049a16510b2f70f6d Mon Sep 17 00:00:00 2001 From: Kumar Amit Mehta Date: Tue, 28 May 2013 00:31:15 -0700 Subject: [PATCH 01/20] md: bcache: io.c: fix a potential NULL pointer dereference bio_alloc_bioset returns NULL on failure. This fix adds a missing check for potential NULL pointer dereferencing. Signed-off-by: Kumar Amit Mehta Signed-off-by: Kent Overstreet --- drivers/md/bcache/io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 48efd4dea645..d285cd49104c 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -97,6 +97,8 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, if (bio->bi_rw & REQ_DISCARD) { ret = bio_alloc_bioset(gfp, 1, bs); + if (!ret) + return NULL; idx = 0; goto out; } From bbc77aa7fb72e616edcb1b165c293e47c6d4d0cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2013 21:53:19 -0700 Subject: [PATCH 02/20] bcache: fix a spurious gcc complaint, use scnprintf An old version of gcc was complaining about using a const int as the size of a stack allocated array. Which should be fine - but using ARRAY_SIZE() is better, anyways. Also, refactor the code to use scnprintf(). Signed-off-by: Kent Overstreet --- drivers/md/bcache/sysfs.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4d9cca47e4c6..29228b8a6ffe 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -665,12 +665,10 @@ SHOW(__bch_cache) int cmp(const void *l, const void *r) { return *((uint16_t *) r) - *((uint16_t *) l); } - /* Number of quantiles we compute */ - const unsigned nq = 31; - size_t n = ca->sb.nbuckets, i, unused, btree; uint64_t sum = 0; - uint16_t q[nq], *p, *cached; + /* Compute 31 quantiles */ + uint16_t q[31], *p, *cached; ssize_t ret; cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); @@ -703,26 +701,29 @@ SHOW(__bch_cache) if (n) do_div(sum, n); - for (i = 0; i < nq; i++) - q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; + for (i = 0; i < ARRAY_SIZE(q); i++) + q[i] = INITIAL_PRIO - cached[n * (i + 1) / + (ARRAY_SIZE(q) + 1)]; vfree(p); - ret = snprintf(buf, PAGE_SIZE, - "Unused: %zu%%\n" - "Metadata: %zu%%\n" - "Average: %llu\n" - "Sectors per Q: %zu\n" - "Quantiles: [", - unused * 100 / (size_t) ca->sb.nbuckets, - btree * 100 / (size_t) ca->sb.nbuckets, sum, - n * ca->sb.bucket_size / (nq + 1)); + ret = scnprintf(buf, PAGE_SIZE, + "Unused: %zu%%\n" + "Metadata: %zu%%\n" + "Average: %llu\n" + "Sectors per Q: %zu\n" + "Quantiles: [", + unused * 100 / (size_t) ca->sb.nbuckets, + btree * 100 / (size_t) ca->sb.nbuckets, sum, + n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); - for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) - ret += snprintf(buf + ret, PAGE_SIZE - ret, - i < nq - 1 ? "%u " : "%u]\n", q[i]); + for (i = 0; i < ARRAY_SIZE(q); i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "%u ", q[i]); + ret--; + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); - buf[PAGE_SIZE - 1] = '\0'; return ret; } From a9dd53adbb84c12f769a862ba2c80404873c2c99 Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Sat, 4 May 2013 12:19:41 +0200 Subject: [PATCH 03/20] bcache: Warn when a device is already registered. Signed-off-by: Gabriel de Perthuis Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f88e2b653a3f..3de5626919ef 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1786,6 +1786,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); +static bool bch_is_open_backing(struct block_device *bdev) { + struct cache_set *c, *tc; + struct cached_dev *dc, *t; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + list_for_each_entry_safe(dc, t, &c->cached_devs, list) + if (dc->bdev == bdev) + return true; + list_for_each_entry_safe(dc, t, &uncached_devices, list) + if (dc->bdev == bdev) + return true; + return false; +} + +static bool bch_is_open_cache(struct block_device *bdev) { + struct cache_set *c, *tc; + struct cache *ca; + unsigned i; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + for_each_cache(ca, c, i) + if (ca->bdev == bdev) + return true; + return false; +} + +static bool bch_is_open(struct block_device *bdev) { + return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); +} + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { @@ -1810,8 +1840,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) { - if (bdev == ERR_PTR(-EBUSY)) - err = "device busy"; + if (bdev == ERR_PTR(-EBUSY)) { + bdev = lookup_bdev(strim(path)); + if (!IS_ERR(bdev) && bch_is_open(bdev)) + err = "device already registered"; + else + err = "device busy"; + } goto err; } From 119ba0f82839cd80eaef3e6991988f1403965d5b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Apr 2013 19:01:12 -0700 Subject: [PATCH 04/20] bcache: Convert allocator thread to kthread Using a workqueue when we just want a single thread is a bit silly. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 34 ++++++++++++++++++++++------------ drivers/md/bcache/bcache.h | 17 +++++++++++------ drivers/md/bcache/btree.c | 6 +++--- drivers/md/bcache/super.c | 19 +++++++------------ 4 files changed, 43 insertions(+), 33 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 048f2947e08b..38428f46ea74 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -63,6 +63,7 @@ #include "bcache.h" #include "btree.h" +#include #include #define MAX_IN_FLIGHT_DISCARDS 8U @@ -151,7 +152,7 @@ static void discard_finish(struct work_struct *w) mutex_unlock(&ca->set->bucket_lock); closure_wake_up(&ca->set->bucket_wait); - wake_up(&ca->set->alloc_wait); + wake_up_process(ca->alloc_thread); closure_put(&ca->set->cl); } @@ -358,30 +359,26 @@ static void invalidate_buckets(struct cache *ca) #define allocator_wait(ca, cond) \ do { \ - DEFINE_WAIT(__wait); \ - \ while (1) { \ - prepare_to_wait(&ca->set->alloc_wait, \ - &__wait, TASK_INTERRUPTIBLE); \ + set_current_state(TASK_INTERRUPTIBLE); \ if (cond) \ break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ - finish_wait(&ca->set->alloc_wait, &__wait); \ - closure_return(cl); \ + closure_put(&ca->set->cl); \ + return 0; \ } \ \ schedule(); \ mutex_lock(&(ca)->set->bucket_lock); \ } \ - \ - finish_wait(&ca->set->alloc_wait, &__wait); \ + __set_current_state(TASK_RUNNING); \ } while (0) -void bch_allocator_thread(struct closure *cl) +static int bch_allocator_thread(void *arg) { - struct cache *ca = container_of(cl, struct cache, alloc); + struct cache *ca = arg; mutex_lock(&ca->set->bucket_lock); @@ -442,7 +439,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) { long r = -1; again: - wake_up(&ca->set->alloc_wait); + wake_up_process(ca->alloc_thread); if (fifo_used(&ca->free) > ca->watermark[watermark] && fifo_pop(&ca->free, r)) { @@ -552,6 +549,19 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, /* Init */ +int bch_cache_allocator_start(struct cache *ca) +{ + ca->alloc_thread = kthread_create(bch_allocator_thread, + ca, "bcache_allocator"); + if (IS_ERR(ca->alloc_thread)) + return PTR_ERR(ca->alloc_thread); + + closure_get(&ca->set->cl); + wake_up_process(ca->alloc_thread); + + return 0; +} + void bch_cache_allocator_exit(struct cache *ca) { struct discard *d; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d3e15b42a4ab..166c8ddc0be4 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -565,8 +565,7 @@ struct cache { unsigned watermark[WATERMARK_MAX]; - struct closure alloc; - struct workqueue_struct *alloc_workqueue; + struct task_struct *alloc_thread; struct closure prio; struct prio_set *disk_buckets; @@ -703,9 +702,6 @@ struct cache_set { /* For the btree cache */ struct shrinker shrink; - /* For the allocator itself */ - wait_queue_head_t alloc_wait; - /* For the btree cache and anything allocation related */ struct mutex bucket_lock; @@ -1173,6 +1169,15 @@ static inline uint8_t bucket_disk_gen(struct bucket *b) static struct kobj_attribute ksysfs_##n = \ __ATTR(n, S_IWUSR|S_IRUSR, show, store) +static inline void wake_up_allocators(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + for_each_cache(ca, c, i) + wake_up_process(ca->alloc_thread); +} + /* Forward declarations */ void bch_writeback_queue(struct cached_dev *); @@ -1193,7 +1198,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); bool bch_bucket_add_unused(struct cache *, struct bucket *); -void bch_allocator_thread(struct closure *); long bch_bucket_alloc(struct cache *, unsigned, struct closure *); void bch_bucket_free(struct cache_set *, struct bkey *); @@ -1244,6 +1248,7 @@ int bch_btree_cache_alloc(struct cache_set *); void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); +int bch_cache_allocator_start(struct cache *ca); void bch_cache_allocator_exit(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a5658f04e62..45b88fbffbe0 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -273,7 +273,7 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) { if (w->prio_blocked && !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) - wake_up(&b->c->alloc_wait); + wake_up_allocators(b->c); if (w->journal) { atomic_dec_bug(w->journal); @@ -984,7 +984,7 @@ static void btree_node_free(struct btree *b, struct btree_op *op) if (b->prio_blocked && !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) - wake_up(&b->c->alloc_wait); + wake_up_allocators(b->c); b->prio_blocked = 0; @@ -1547,7 +1547,7 @@ static void bch_btree_gc(struct closure *cl) blktrace_msg_all(c, "Finished gc"); trace_bcache_gc_end(c->sb.set_uuid); - wake_up(&c->alloc_wait); + wake_up_allocators(c); continue_at(cl, bch_moving_gc, bch_gc_wq); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3de5626919ef..aaeda235fc75 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1282,7 +1282,7 @@ static void cache_set_flush(struct closure *cl) /* Shut down allocator threads */ set_bit(CACHE_SET_STOPPING_2, &c->flags); - wake_up(&c->alloc_wait); + wake_up_allocators(c); bch_cache_accounting_destroy(&c->accounting); @@ -1373,7 +1373,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); - init_waitqueue_head(&c->alloc_wait); mutex_init(&c->bucket_lock); mutex_init(&c->fill_lock); mutex_init(&c->sort_lock); @@ -1496,9 +1495,10 @@ static void run_cache_set(struct cache_set *c) */ bch_journal_next(&c->journal); + err = "error starting allocator thread"; for_each_cache(ca, c, i) - closure_call(&ca->alloc, bch_allocator_thread, - system_wq, &c->cl); + if (bch_cache_allocator_start(ca)) + goto err; /* * First place it's safe to allocate: btree_check() and @@ -1531,17 +1531,16 @@ static void run_cache_set(struct cache_set *c) bch_btree_gc_finish(c); + err = "error starting allocator thread"; for_each_cache(ca, c, i) - closure_call(&ca->alloc, bch_allocator_thread, - ca->alloc_workqueue, &c->cl); + if (bch_cache_allocator_start(ca)) + goto err; mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) bch_prio_write(ca); mutex_unlock(&c->bucket_lock); - wake_up(&c->alloc_wait); - err = "cannot allocate new UUID bucket"; if (__uuid_write(c)) goto err_unlock_gc; @@ -1673,9 +1672,6 @@ void bch_cache_release(struct kobject *kobj) bio_split_pool_free(&ca->bio_split_hook); - if (ca->alloc_workqueue) - destroy_workqueue(ca->alloc_workqueue); - free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); kfree(ca->prio_buckets); vfree(ca->buckets); @@ -1723,7 +1719,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * 2, GFP_KERNEL)) || !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || - !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || bio_split_pool_init(&ca->bio_split_hook)) return -ENOMEM; From 5794351146199b9ac67a5ab1beab82be8bfd7b5d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Apr 2013 13:58:35 -0700 Subject: [PATCH 05/20] bcache: Refactor btree io The most significant change is that btree reads are now done synchronously, instead of asynchronously and doing the post read stuff from a workqueue. This was originally done because we can't block on IO under generic_make_request(). But - we already have a mechanism to punt cache lookups to workqueue if needed, so if we just use that we don't have to deal with the complexity of doing things asynchronously. The main benefit is this makes the locking situation saner; we can hold our write lock on the btree node until we're finished reading it, and we don't need that btree_node_read_done() flag anymore. Also, for writes, btree_write() was broken out into btree_node_write() and btree_leaf_dirty() - the old code with the boolean argument was dumb and confusing. The prio_blocked mechanism was improved a bit too, now the only counter is in struct btree_write, we don't mess with transfering a count from struct btree anymore. This required changing garbage collection to block prios at the start and unblock when it finishes, which is cleaner than what it was doing anyways (the old code had mostly the same effect, but was doing it in a convoluted way) And the btree iter btree_node_read_done() uses was converted to a real mempool. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 5 +- drivers/md/bcache/btree.c | 278 ++++++++++++++++-------------------- drivers/md/bcache/btree.h | 19 +-- drivers/md/bcache/debug.c | 4 +- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/super.c | 12 +- 6 files changed, 142 insertions(+), 178 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 166c8ddc0be4..ad4957b52f10 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -819,10 +819,9 @@ struct cache_set { /* * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - this is a single element mempool for btree_read_work() + * on the stack - have to dynamically allocate them */ - struct mutex fill_lock; - struct btree_iter *fill_iter; + mempool_t *fill_iter; /* * btree_sort() is a merge sort and requires temporary space - single diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 45b88fbffbe0..aaec186f7ba6 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -134,44 +134,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) return crc ^ 0xffffffffffffffffULL; } -static void btree_bio_endio(struct bio *bio, int error) +void bch_btree_node_read_done(struct btree *b) { - struct closure *cl = bio->bi_private; - struct btree *b = container_of(cl, struct btree, io.cl); - - if (error) - set_btree_node_io_error(b); - - bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) - ? "writing btree" : "reading btree"); - closure_put(cl); -} - -static void btree_bio_init(struct btree *b) -{ - BUG_ON(b->bio); - b->bio = bch_bbio_alloc(b->c); - - b->bio->bi_end_io = btree_bio_endio; - b->bio->bi_private = &b->io.cl; -} - -void bch_btree_read_done(struct closure *cl) -{ - struct btree *b = container_of(cl, struct btree, io.cl); - struct bset *i = b->sets[0].data; - struct btree_iter *iter = b->c->fill_iter; const char *err = "bad btree header"; - BUG_ON(b->nsets || b->written); + struct bset *i = b->sets[0].data; + struct btree_iter *iter; - bch_bbio_free(b->bio, b->c); - b->bio = NULL; - - mutex_lock(&b->c->fill_lock); + iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); + iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; - if (btree_node_io_error(b) || - !i->seq) + if (!i->seq) goto err; for (; @@ -228,17 +201,8 @@ void bch_btree_read_done(struct closure *cl) if (b->written < btree_blocks(b)) bch_bset_init_next(b); out: - - mutex_unlock(&b->c->fill_lock); - - spin_lock(&b->c->btree_read_time_lock); - bch_time_stats_update(&b->c->btree_read_time, b->io_start_time); - spin_unlock(&b->c->btree_read_time_lock); - - smp_wmb(); /* read_done is our write lock */ - set_btree_node_read_done(b); - - closure_return(cl); + mempool_free(iter, b->c->fill_iter); + return; err: set_btree_node_io_error(b); bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", @@ -247,26 +211,51 @@ err: goto out; } -void bch_btree_read(struct btree *b) +static void btree_node_read_endio(struct bio *bio, int error) { - BUG_ON(b->nsets || b->written); + struct closure *cl = bio->bi_private; + closure_put(cl); +} - if (!closure_trylock(&b->io.cl, &b->c->cl)) - BUG(); - - b->io_start_time = local_clock(); - - btree_bio_init(b); - b->bio->bi_rw = REQ_META|READ_SYNC; - b->bio->bi_size = KEY_SIZE(&b->key) << 9; - - bch_bio_map(b->bio, b->sets[0].data); +void bch_btree_node_read(struct btree *b) +{ + uint64_t start_time = local_clock(); + struct closure cl; + struct bio *bio; + closure_init_stack(&cl); pr_debug("%s", pbtree(b)); - trace_bcache_btree_read(b->bio); - bch_submit_bbio(b->bio, b->c, &b->key, 0); - continue_at(&b->io.cl, bch_btree_read_done, system_wq); + bio = bch_bbio_alloc(b->c); + bio->bi_rw = REQ_META|READ_SYNC; + bio->bi_size = KEY_SIZE(&b->key) << 9; + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = &cl; + + bch_bio_map(bio, b->sets[0].data); + + trace_bcache_btree_read(bio); + bch_submit_bbio(bio, b->c, &b->key, 0); + closure_sync(&cl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_btree_node_io_error(b); + + bch_bbio_free(bio, b->c); + + if (btree_node_io_error(b)) + goto err; + + bch_btree_node_read_done(b); + + spin_lock(&b->c->btree_read_time_lock); + bch_time_stats_update(&b->c->btree_read_time, start_time); + spin_unlock(&b->c->btree_read_time_lock); + + return; +err: + bch_cache_set_error(b->c, "io error reading bucket %lu", + PTR_BUCKET_NR(b->c, &b->key, 0)); } static void btree_complete_write(struct btree *b, struct btree_write *w) @@ -280,15 +269,11 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) __closure_wake_up(&b->c->journal.wait); } - if (w->owner) - closure_put(w->owner); - w->prio_blocked = 0; w->journal = NULL; - w->owner = NULL; } -static void __btree_write_done(struct closure *cl) +static void __btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io.cl); struct btree_write *w = btree_prev_write(b); @@ -304,7 +289,7 @@ static void __btree_write_done(struct closure *cl) closure_return(cl); } -static void btree_write_done(struct closure *cl) +static void btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io.cl); struct bio_vec *bv; @@ -313,10 +298,22 @@ static void btree_write_done(struct closure *cl) __bio_for_each_segment(bv, b->bio, n, 0) __free_page(bv->bv_page); - __btree_write_done(cl); + __btree_node_write_done(cl); } -static void do_btree_write(struct btree *b) +static void btree_node_write_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io.cl); + + if (error) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); + closure_put(cl); +} + +static void do_btree_node_write(struct btree *b) { struct closure *cl = &b->io.cl; struct bset *i = b->sets[b->nsets].data; @@ -325,7 +322,11 @@ static void do_btree_write(struct btree *b) i->version = BCACHE_BSET_VERSION; i->csum = btree_csum_set(b, i); - btree_bio_init(b); + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_node_write_endio; + b->bio->bi_private = &b->io.cl; b->bio->bi_rw = REQ_META|WRITE_SYNC; b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); @@ -345,7 +346,7 @@ static void do_btree_write(struct btree *b) trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); - continue_at(cl, btree_write_done, NULL); + continue_at(cl, btree_node_write_done, NULL); } else { b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); @@ -354,26 +355,30 @@ static void do_btree_write(struct btree *b) bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); - __btree_write_done(cl); + __btree_node_write_done(cl); } } -static void __btree_write(struct btree *b) +void bch_btree_node_write(struct btree *b, struct closure *parent) { struct bset *i = b->sets[b->nsets].data; BUG_ON(current->bio_list); + BUG_ON(b->written >= btree_blocks(b)); + BUG_ON(b->written && !i->keys); + BUG_ON(b->sets->data->seq != i->seq); - closure_lock(&b->io, &b->c->cl); cancel_delayed_work(&b->work); + /* If caller isn't waiting for write, parent refcount is cache set */ + closure_lock(&b->io, parent ?: &b->c->cl); + clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); bch_check_key_order(b, i); - BUG_ON(b->written && !i->keys); - do_btree_write(b); + do_btree_node_write(b); pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); @@ -387,37 +392,31 @@ static void __btree_write(struct btree *b) bch_bset_init_next(b); } -static void btree_write_work(struct work_struct *w) +static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); - down_write(&b->lock); + rw_lock(true, b, b->level); if (btree_node_dirty(b)) - __btree_write(b); - up_write(&b->lock); + bch_btree_node_write(b, NULL); + rw_unlock(true, b); } -void bch_btree_write(struct btree *b, bool now, struct btree_op *op) +static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) { struct bset *i = b->sets[b->nsets].data; struct btree_write *w = btree_current_write(b); - BUG_ON(b->written && - (b->written >= btree_blocks(b) || - i->seq != b->sets[0].data->seq || - !i->keys)); + BUG_ON(!b->written); + BUG_ON(!i->keys); - if (!btree_node_dirty(b)) { - set_btree_node_dirty(b); - queue_delayed_work(btree_io_wq, &b->work, - msecs_to_jiffies(30000)); - } + if (!btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); - w->prio_blocked += b->prio_blocked; - b->prio_blocked = 0; + set_btree_node_dirty(b); - if (op && op->journal && !b->level) { + if (op && op->journal) { if (w->journal && journal_pin_cmp(b->c, w, op)) { atomic_dec_bug(w->journal); @@ -430,23 +429,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op) } } - if (current->bio_list) - return; - /* Force write if set is too big */ - if (now || - b->level || - set_bytes(i) > PAGE_SIZE - 48) { - if (op && now) { - /* Must wait on multiple writes */ - BUG_ON(w->owner); - w->owner = &op->cl; - closure_get(&op->cl); - } - - __btree_write(b); - } - BUG_ON(!b->written); + if (set_bytes(i) > PAGE_SIZE - 48 && + !current->bio_list) + bch_btree_node_write(b, NULL); } /* @@ -559,7 +545,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, init_rwsem(&b->lock); lockdep_set_novalidate_class(&b->lock); INIT_LIST_HEAD(&b->list); - INIT_DELAYED_WORK(&b->work, btree_write_work); + INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; closure_init_unlocked(&b->io); @@ -582,7 +568,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) BUG_ON(btree_node_dirty(b) && !b->sets[0].data); if (cl && btree_node_dirty(b)) - bch_btree_write(b, true, NULL); + bch_btree_node_write(b, NULL); if (cl) closure_wait_event_async(&b->io.wait, cl, @@ -905,6 +891,9 @@ retry: b = mca_find(c, k); if (!b) { + if (current->bio_list) + return ERR_PTR(-EAGAIN); + mutex_lock(&c->bucket_lock); b = mca_alloc(c, k, level, &op->cl); mutex_unlock(&c->bucket_lock); @@ -914,7 +903,7 @@ retry: if (IS_ERR(b)) return b; - bch_btree_read(b); + bch_btree_node_read(b); if (!write) downgrade_write(&b->lock); @@ -937,15 +926,12 @@ retry: for (; i <= b->nsets; i++) prefetch(b->sets[i].data); - if (!closure_wait_event(&b->io.wait, &op->cl, - btree_node_read_done(b))) { + if (btree_node_io_error(b)) { rw_unlock(write, b); - b = ERR_PTR(-EAGAIN); - } else if (btree_node_io_error(b)) { - rw_unlock(write, b); - b = ERR_PTR(-EIO); - } else - BUG_ON(!b->written); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); return b; } @@ -959,7 +945,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { - bch_btree_read(b); + bch_btree_node_read(b); rw_unlock(true, b); } } @@ -982,12 +968,6 @@ static void btree_node_free(struct btree *b, struct btree_op *op) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); - if (b->prio_blocked && - !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) - wake_up_allocators(b->c); - - b->prio_blocked = 0; - cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); @@ -1028,7 +1008,6 @@ retry: goto retry; } - set_btree_node_read_done(b); b->accessed = 1; bch_bset_init_next(b); @@ -1166,14 +1145,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, if (!IS_ERR_OR_NULL(n)) { swap(b, n); + __bkey_put(b->c, &b->key); memcpy(k->ptr, b->key.ptr, sizeof(uint64_t) * KEY_PTRS(&b->key)); - __bkey_put(b->c, &b->key); - atomic_inc(&b->c->prio_blocked); - b->prio_blocked++; - btree_node_free(n, op); up_write(&n->lock); } @@ -1293,14 +1269,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, void write(struct btree *r) { if (!r->written) - bch_btree_write(r, true, op); - else if (btree_node_dirty(r)) { - BUG_ON(btree_current_write(r)->owner); - btree_current_write(r)->owner = writes; - closure_get(writes); - - bch_btree_write(r, true, NULL); - } + bch_btree_node_write(r, &op->cl); + else if (btree_node_dirty(r)) + bch_btree_node_write(r, writes); up_write(&r->lock); } @@ -1386,9 +1357,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, ret = btree_gc_recurse(b, op, writes, gc); if (!b->written || btree_node_dirty(b)) { - atomic_inc(&b->c->prio_blocked); - b->prio_blocked++; - bch_btree_write(b, true, n ? op : NULL); + bch_btree_node_write(b, n ? &op->cl : NULL); } if (!IS_ERR_OR_NULL(n)) { @@ -1508,8 +1477,8 @@ static void bch_btree_gc(struct closure *cl) struct gc_stat stats; struct closure writes; struct btree_op op; - uint64_t start_time = local_clock(); + trace_bcache_gc_start(c->sb.set_uuid); blktrace_msg_all(c, "Starting gc"); @@ -1520,6 +1489,8 @@ static void bch_btree_gc(struct closure *cl) btree_gc_start(c); + atomic_inc(&c->prio_blocked); + ret = btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&op.cl); closure_sync(&writes); @@ -1537,6 +1508,9 @@ static void bch_btree_gc(struct closure *cl) available = bch_btree_gc_finish(c); + atomic_dec(&c->prio_blocked); + wake_up_allocators(c); + bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); @@ -1544,10 +1518,9 @@ static void bch_btree_gc(struct closure *cl) stats.data <<= 9; stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); - blktrace_msg_all(c, "Finished gc"); + blktrace_msg_all(c, "Finished gc"); trace_bcache_gc_end(c->sb.set_uuid); - wake_up_allocators(c); continue_at(cl, bch_moving_gc, bch_gc_wq); } @@ -1857,7 +1830,7 @@ merged: op_type(op), pbtree(b), pkey(k)); if (b->level && !KEY_OFFSET(k)) - b->prio_blocked++; + btree_current_write(b)->prio_blocked++; pr_debug("%s for %s at %s: %s", status, op_type(op), pbtree(b), pkey(k)); @@ -1907,7 +1880,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, BUG_ON(op->type != BTREE_INSERT); BUG_ON(!btree_insert_key(b, op, &tmp.k)); - bch_btree_write(b, false, NULL); ret = true; out: downgrade_write(&b->lock); @@ -1967,18 +1939,18 @@ static int btree_split(struct btree *b, struct btree_op *op) bkey_copy_key(&n2->key, &b->key); bch_keylist_add(&op->keys, &n2->key); - bch_btree_write(n2, true, op); + bch_btree_node_write(n2, &op->cl); rw_unlock(true, n2); } else bch_btree_insert_keys(n1, op); bch_keylist_add(&op->keys, &n1->key); - bch_btree_write(n1, true, op); + bch_btree_node_write(n1, &op->cl); if (n3) { bkey_copy_key(&n3->key, &MAX_KEY); bch_btree_insert_keys(n3, op); - bch_btree_write(n3, true, op); + bch_btree_node_write(n3, &op->cl); closure_sync(&op->cl); bch_btree_set_root(n3); @@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, BUG_ON(write_block(b) != b->sets[b->nsets].data); - if (bch_btree_insert_keys(b, op)) - bch_btree_write(b, false, op); + if (bch_btree_insert_keys(b, op)) { + if (!b->level) + bch_btree_leaf_dirty(b, op); + else + bch_btree_node_write(b, &op->cl); + } } return 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index af4a7092a28c..809bd77847a2 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -102,7 +102,6 @@ #include "debug.h" struct btree_write { - struct closure *owner; atomic_t *journal; /* If btree_split() frees a btree node, it writes a new pointer to that @@ -142,16 +141,12 @@ struct btree { */ struct bset_tree sets[MAX_BSETS]; - /* Used to refcount bio splits, also protects b->bio */ + /* For outstanding btree writes, used as a lock - protects write_idx */ struct closure_with_waitlist io; - /* Gets transferred to w->prio_blocked - see the comment there */ - int prio_blocked; - struct list_head list; struct delayed_work work; - uint64_t io_start_time; struct btree_write writes[2]; struct bio *bio; }; @@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ enum btree_flags { - BTREE_NODE_read_done, BTREE_NODE_io_error, BTREE_NODE_dirty, BTREE_NODE_write_idx, }; -BTREE_FLAG(read_done); BTREE_FLAG(io_error); BTREE_FLAG(dirty); BTREE_FLAG(write_idx); @@ -293,9 +286,7 @@ static inline void rw_unlock(bool w, struct btree *b) #ifdef CONFIG_BCACHE_EDEBUG unsigned i; - if (w && - b->key.ptr[0] && - btree_node_read_done(b)) + if (w && b->key.ptr[0]) for (i = 0; i <= b->nsets; i++) bch_check_key_order(b, b->sets[i].data); #endif @@ -370,9 +361,9 @@ static inline bool should_split(struct btree *b) > btree_blocks(b)); } -void bch_btree_read_done(struct closure *); -void bch_btree_read(struct btree *); -void bch_btree_write(struct btree *b, bool now, struct btree_op *op); +void bch_btree_node_read(struct btree *); +void bch_btree_node_read_done(struct btree *); +void bch_btree_node_write(struct btree *, struct closure *); void bch_cannibalize_unlock(struct cache_set *, struct closure *); void bch_btree_set_root(struct btree *); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 89fd5204924e..ae6096c6845d 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -144,7 +144,7 @@ void bch_btree_verify(struct btree *b, struct bset *new) v->written = 0; v->level = b->level; - bch_btree_read(v); + bch_btree_node_read(v); closure_wait_event(&v->io.wait, &cl, atomic_read(&b->io.cl.remaining) == -1); @@ -512,7 +512,7 @@ static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, bch_btree_sort(b); fill->written = 0; - bch_btree_read_done(&fill->io.cl); + bch_btree_node_read_done(fill); if (b->sets[0].data->keys != fill->sets[0].data->keys || memcmp(b->sets[0].data->start, diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8c8dfdcd9d4c..970d819d4350 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -384,7 +384,7 @@ out: return; found: if (btree_node_dirty(best)) - bch_btree_write(best, true, NULL); + bch_btree_node_write(best, NULL); rw_unlock(true, best); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index aaeda235fc75..e53f89988b08 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1255,9 +1255,10 @@ static void cache_set_free(struct closure *cl) free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); - kfree(c->fill_iter); if (c->bio_split) bioset_free(c->bio_split); + if (c->fill_iter) + mempool_destroy(c->fill_iter); if (c->bio_meta) mempool_destroy(c->bio_meta); if (c->search) @@ -1295,7 +1296,7 @@ static void cache_set_flush(struct closure *cl) /* Should skip this if we're unregistering because of an error */ list_for_each_entry(b, &c->btree_cache, list) if (btree_node_dirty(b)) - bch_btree_write(b, true, NULL); + bch_btree_node_write(b, NULL); closure_return(cl); } @@ -1374,7 +1375,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) BTREE_MAX_PAGES); mutex_init(&c->bucket_lock); - mutex_init(&c->fill_lock); mutex_init(&c->sort_lock); spin_lock_init(&c->sort_time_lock); closure_init_unlocked(&c->sb_write); @@ -1400,8 +1400,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) !(c->bio_meta = mempool_create_kmalloc_pool(2, sizeof(struct bbio) + sizeof(struct bio_vec) * bucket_pages(c))) || + !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) || !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || bch_journal_alloc(c) || @@ -1409,8 +1409,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bch_open_buckets_alloc(c)) goto err; - c->fill_iter->size = sb->bucket_size / sb->block_size; - c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = 8 << IO_ERROR_SHIFT; @@ -1551,7 +1549,7 @@ static void run_cache_set(struct cache_set *c) goto err_unlock_gc; bkey_copy_key(&c->root->key, &MAX_KEY); - bch_btree_write(c->root, true, &op); + bch_btree_node_write(c->root, &op.cl); bch_btree_set_root(c->root); rw_unlock(true, c->root); From c37511b863f36c1cc6e18440717fd4cc0e881b8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Apr 2013 15:39:55 -0700 Subject: [PATCH 06/20] bcache: Fix/revamp tracepoints The tracepoints were reworked to be more sensible, and fixed a null pointer deref in one of the tracepoints. Converted some of the pr_debug()s to tracepoints - this is partly a performance optimization; it used to be that with DEBUG or CONFIG_DYNAMIC_DEBUG pr_debug() was an empty macro; but at some point it was changed to an empty inline function. Some of the pr_debug() statements had rather expensive function calls as part of the arguments, so this code was getting run unnecessarily even on non debug kernels - in some fast paths, too. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 10 +- drivers/md/bcache/bcache.h | 20 -- drivers/md/bcache/bset.h | 4 + drivers/md/bcache/btree.c | 47 ++-- drivers/md/bcache/io.c | 2 + drivers/md/bcache/journal.c | 14 +- drivers/md/bcache/movinggc.c | 12 +- drivers/md/bcache/request.c | 65 ++--- drivers/md/bcache/request.h | 2 +- drivers/md/bcache/super.c | 2 +- drivers/md/bcache/sysfs.c | 1 + drivers/md/bcache/trace.c | 45 +++- drivers/md/bcache/util.h | 2 - drivers/md/bcache/writeback.c | 10 +- include/trace/events/bcache.h | 444 ++++++++++++++++++++++------------ 15 files changed, 413 insertions(+), 267 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 38428f46ea74..b54b73b9b2b7 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -65,6 +65,7 @@ #include #include +#include #define MAX_IN_FLIGHT_DISCARDS 8U @@ -351,10 +352,7 @@ static void invalidate_buckets(struct cache *ca) break; } - pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", - fifo_used(&ca->free), ca->free.size, - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->unused), ca->unused.size); + trace_bcache_alloc_invalidate(ca); } #define allocator_wait(ca, cond) \ @@ -473,9 +471,7 @@ again: return r; } - pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", - atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), - fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + trace_bcache_alloc_fail(ca); if (cl) { closure_wait(&ca->set->bucket_wait, cl); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index ad4957b52f10..59c15e09e4dd 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -178,7 +178,6 @@ #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ #include -#include #include #include #include @@ -901,8 +900,6 @@ static inline unsigned local_clock_us(void) return local_clock() >> 10; } -#define MAX_BSETS 4U - #define BTREE_PRIO USHRT_MAX #define INITIAL_PRIO 32768 @@ -1107,23 +1104,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k) atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); } -/* Blktrace macros */ - -#define blktrace_msg(c, fmt, ...) \ -do { \ - struct request_queue *q = bdev_get_queue(c->bdev); \ - if (q) \ - blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \ -} while (0) - -#define blktrace_msg_all(s, fmt, ...) \ -do { \ - struct cache *_c; \ - unsigned i; \ - for_each_cache(_c, (s), i) \ - blktrace_msg(_c, fmt, ##__VA_ARGS__); \ -} while (0) - static inline void cached_dev_put(struct cached_dev *dc) { if (atomic_dec_and_test(&dc->count)) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 57a9cff41546..ae115a253d73 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -1,6 +1,8 @@ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H +#include + /* * BKEYS: * @@ -142,6 +144,8 @@ /* Btree key comparison/iteration */ +#define MAX_BSETS 4U + struct btree_iter { size_t size, used; struct btree_iter_set { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index aaec186f7ba6..218d486259a3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -223,8 +223,9 @@ void bch_btree_node_read(struct btree *b) struct closure cl; struct bio *bio; + trace_bcache_btree_read(b); + closure_init_stack(&cl); - pr_debug("%s", pbtree(b)); bio = bch_bbio_alloc(b->c); bio->bi_rw = REQ_META|READ_SYNC; @@ -234,7 +235,6 @@ void bch_btree_node_read(struct btree *b) bch_bio_map(bio, b->sets[0].data); - trace_bcache_btree_read(bio); bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); @@ -343,7 +343,6 @@ static void do_btree_node_write(struct btree *b) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); - trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); continue_at(cl, btree_node_write_done, NULL); @@ -351,7 +350,6 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); - trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); @@ -363,10 +361,13 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) { struct bset *i = b->sets[b->nsets].data; + trace_bcache_btree_write(b); + BUG_ON(current->bio_list); BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); BUG_ON(b->sets->data->seq != i->seq); + bch_check_key_order(b, i); cancel_delayed_work(&b->work); @@ -376,12 +377,8 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); - bch_check_key_order(b, i); - do_btree_node_write(b); - pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); - b->written += set_blocks(i, b->c); atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); @@ -752,6 +749,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, int ret = -ENOMEM; struct btree *i; + trace_bcache_btree_cache_cannibalize(c); + if (!cl) return ERR_PTR(-ENOMEM); @@ -770,7 +769,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, return ERR_PTR(-EAGAIN); } - /* XXX: tracepoint */ c->try_harder = cl; c->try_harder_start = local_clock(); retry: @@ -956,13 +954,14 @@ static void btree_node_free(struct btree *b, struct btree_op *op) { unsigned i; + trace_bcache_btree_node_free(b); + /* * The BUG_ON() in btree_node_get() implies that we must have a write * lock on parent to free or even invalidate a node */ BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); - pr_debug("bucket %s", pbtree(b)); if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); @@ -1012,12 +1011,16 @@ retry: bch_bset_init_next(b); mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc(b); return b; err_free: bch_bucket_free(c, &k.key); __bkey_put(c, &k.key); err: mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc_fail(b); return b; } @@ -1254,7 +1257,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, btree_node_free(r->b, op); up_write(&r->b->lock); - pr_debug("coalesced %u nodes", nodes); + trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; nodes--; @@ -1479,8 +1482,7 @@ static void bch_btree_gc(struct closure *cl) struct btree_op op; uint64_t start_time = local_clock(); - trace_bcache_gc_start(c->sb.set_uuid); - blktrace_msg_all(c, "Starting gc"); + trace_bcache_gc_start(c); memset(&stats, 0, sizeof(struct gc_stat)); closure_init_stack(&writes); @@ -1496,9 +1498,7 @@ static void bch_btree_gc(struct closure *cl) closure_sync(&writes); if (ret) { - blktrace_msg_all(c, "Stopped gc"); pr_warn("gc failed!"); - continue_at(cl, bch_btree_gc, bch_gc_wq); } @@ -1519,8 +1519,7 @@ static void bch_btree_gc(struct closure *cl) stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); - blktrace_msg_all(c, "Finished gc"); - trace_bcache_gc_end(c->sb.set_uuid); + trace_bcache_gc_end(c); continue_at(cl, bch_moving_gc, bch_gc_wq); } @@ -1901,12 +1900,11 @@ static int btree_split(struct btree *b, struct btree_op *op) split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; - pr_debug("%ssplitting at %s keys %i", split ? "" : "not ", - pbtree(b), n1->sets[0].data->keys); - if (split) { unsigned keys = 0; + trace_bcache_btree_node_split(b, n1->sets[0].data->keys); + n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); if (IS_ERR(n2)) goto err_free1; @@ -1941,8 +1939,11 @@ static int btree_split(struct btree *b, struct btree_op *op) bch_keylist_add(&op->keys, &n2->key); bch_btree_node_write(n2, &op->cl); rw_unlock(true, n2); - } else + } else { + trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); + bch_btree_insert_keys(n1, op); + } bch_keylist_add(&op->keys, &n1->key); bch_btree_node_write(n1, &op->cl); @@ -2117,6 +2118,8 @@ void bch_btree_set_root(struct btree *b) { unsigned i; + trace_bcache_btree_set_root(b); + BUG_ON(!b->written); for (i = 0; i < KEY_PTRS(&b->key); i++) @@ -2130,7 +2133,6 @@ void bch_btree_set_root(struct btree *b) __bkey_put(b->c, &b->key); bch_journal_meta(b->c, NULL); - pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); } /* Cache lookup */ @@ -2216,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, n->bi_end_io = bch_cache_read_endio; n->bi_private = &s->cl; - trace_bcache_cache_hit(n); __bch_submit_bbio(n, b->c); } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index d285cd49104c..0f6d69658b61 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -9,6 +9,8 @@ #include "bset.h" #include "debug.h" +#include + static void bch_bi_idx_hack_endio(struct bio *bio, int error) { struct bio *p = bio->bi_private; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 970d819d4350..5ca22149b749 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -9,6 +9,8 @@ #include "debug.h" #include "request.h" +#include + /* * Journal replay/recovery: * @@ -300,7 +302,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, for (k = i->j.start; k < end(&i->j); k = bkey_next(k)) { - pr_debug("%s", pkey(k)); + trace_bcache_journal_replay_key(k); + bkey_copy(op->keys.top, k); bch_keylist_push(&op->keys); @@ -712,7 +715,8 @@ void bch_journal(struct closure *cl) spin_lock(&c->journal.lock); if (journal_full(&c->journal)) { - /* XXX: tracepoint */ + trace_bcache_journal_full(c); + closure_wait(&c->journal.wait, cl); journal_reclaim(c); @@ -728,13 +732,15 @@ void bch_journal(struct closure *cl) if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || b > c->journal.blocks_free) { - /* XXX: If we were inserting so many keys that they won't fit in + trace_bcache_journal_entry_full(c); + + /* + * XXX: If we were inserting so many keys that they won't fit in * an _empty_ journal write, we'll deadlock. For now, handle * this in bch_keylist_realloc() - but something to think about. */ BUG_ON(!w->data->keys); - /* XXX: tracepoint */ BUG_ON(!closure_wait(&w->wait, cl)); closure_flush(&c->journal.io); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 8589512c972e..04f6b97ffda6 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -9,6 +9,8 @@ #include "debug.h" #include "request.h" +#include + struct moving_io { struct keybuf_key *w; struct search s; @@ -49,9 +51,8 @@ static void write_moving_finish(struct closure *cl) while (bv-- != bio->bi_io_vec) __free_page(bv->bv_page); - pr_debug("%s %s", io->s.op.insert_collision - ? "collision moving" : "moved", - pkey(&io->w->key)); + if (io->s.op.insert_collision) + trace_bcache_gc_copy_collision(&io->w->key); bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); @@ -94,8 +95,6 @@ static void write_moving(struct closure *cl) struct moving_io *io = container_of(s, struct moving_io, s); if (!s->error) { - trace_bcache_write_moving(&io->bio.bio); - moving_init(io); io->bio.bio.bi_sector = KEY_START(&io->w->key); @@ -122,7 +121,6 @@ static void read_moving_submit(struct closure *cl) struct moving_io *io = container_of(s, struct moving_io, s); struct bio *bio = &io->bio.bio; - trace_bcache_read_moving(bio); bch_submit_bbio(bio, s->op.c, &io->w->key, 0); continue_at(cl, write_moving, bch_gc_wq); @@ -162,7 +160,7 @@ static void read_moving(struct closure *cl) if (bch_bio_alloc_pages(bio, GFP_KERNEL)) goto err; - pr_debug("%s", pkey(&w->key)); + trace_bcache_gc_copy(&w->key); closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e5ff12e52d5b..695469958c1e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -530,10 +530,9 @@ static void bch_insert_data_loop(struct closure *cl) if (KEY_CSUM(k)) bio_csum(n, k); - pr_debug("%s", pkey(k)); + trace_bcache_cache_insert(k); bch_keylist_push(&op->keys); - trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev); n->bi_rw |= REQ_WRITE; bch_submit_bbio(n, op->c, k, 0); } while (n != bio); @@ -784,11 +783,8 @@ static void request_read_error(struct closure *cl) int i; if (s->recoverable) { - /* The cache read failed, but we can retry from the backing - * device. - */ - pr_debug("recovering at sector %llu", - (uint64_t) s->orig_bio->bi_sector); + /* Retry from the backing device: */ + trace_bcache_read_retry(s->orig_bio); s->error = 0; bv = s->bio.bio.bi_io_vec; @@ -806,7 +802,6 @@ static void request_read_error(struct closure *cl) /* XXX: invalidate cache */ - trace_bcache_read_retry(&s->bio.bio); closure_bio_submit(&s->bio.bio, &s->cl, s->d); } @@ -899,6 +894,7 @@ static void request_read_done_bh(struct closure *cl) struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); if (s->error) continue_at_nobarrier(cl, request_read_error, bcache_wq); @@ -969,7 +965,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_miss = miss; bio_get(s->op.cache_bio); - trace_bcache_cache_miss(s->orig_bio); closure_bio_submit(s->op.cache_bio, &s->cl, s->d); return ret; @@ -1040,15 +1035,15 @@ static void request_write(struct cached_dev *dc, struct search *s) if (should_writeback(dc, s->orig_bio)) s->writeback = true; + trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); + if (!s->writeback) { s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, dc->disk.bio_split); - trace_bcache_writethrough(s->orig_bio); closure_bio_submit(bio, cl, s->d); } else { s->op.cache_bio = bio; - trace_bcache_writeback(s->orig_bio); bch_writeback_add(dc, bio_sectors(bio)); } out: @@ -1058,7 +1053,6 @@ skip: s->op.skip = true; s->op.cache_bio = s->orig_bio; bio_get(s->op.cache_bio); - trace_bcache_write_skip(s->orig_bio); if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(dc->bdev))) @@ -1088,9 +1082,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s) /* Cached devices - read & write stuff */ -int bch_get_congested(struct cache_set *c) +unsigned bch_get_congested(struct cache_set *c) { int i; + long rand; if (!c->congested_read_threshold_us && !c->congested_write_threshold_us) @@ -1106,7 +1101,13 @@ int bch_get_congested(struct cache_set *c) i += CONGESTED_MAX; - return i <= 0 ? 1 : fract_exp_two(i, 6); + if (i > 0) + i = fract_exp_two(i, 6); + + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? i : 1; } static void add_sequential(struct task_struct *t) @@ -1126,10 +1127,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) { struct cache_set *c = s->op.c; struct bio *bio = &s->bio.bio; - - long rand; - int cutoff = bch_get_congested(c); unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); if (atomic_read(&dc->disk.detaching) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || @@ -1147,17 +1146,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) goto skip; } - if (!cutoff) { - cutoff = dc->sequential_cutoff >> 9; + if (!congested && !dc->sequential_cutoff) + goto rescale; - if (!cutoff) - goto rescale; - - if (mode == CACHE_MODE_WRITEBACK && - (bio->bi_rw & REQ_WRITE) && - (bio->bi_rw & REQ_SYNC)) - goto rescale; - } + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; if (dc->sequential_merge) { struct io *i; @@ -1192,12 +1188,19 @@ found: add_sequential(s->task); } - rand = get_random_int(); - cutoff -= bitmap_weight(&rand, BITS_PER_LONG); + sectors = max(s->task->sequential_io, + s->task->sequential_io_avg) >> 9; - if (cutoff <= (int) (max(s->task->sequential_io, - s->task->sequential_io_avg) >> 9)) + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(s->orig_bio); goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(s->orig_bio); + goto skip; + } rescale: bch_rescale_priorities(c, bio_sectors(bio)); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 254d9ab5707c..57dc4784f4f4 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -30,7 +30,7 @@ struct search { }; void bch_cache_read_endio(struct bio *, int); -int bch_get_congested(struct cache_set *); +unsigned bch_get_congested(struct cache_set *); void bch_insert_data(struct closure *cl); void bch_btree_insert_async(struct closure *); void bch_cache_read_endio(struct bio *, int); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e53f89988b08..47bc13745068 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -11,6 +11,7 @@ #include "debug.h" #include "request.h" +#include #include #include #include @@ -543,7 +544,6 @@ void bch_prio_write(struct cache *ca) pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), fifo_used(&ca->free_inc), fifo_used(&ca->unused)); - blktrace_msg(ca, "Starting priorities: " buckets_free(ca)); for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 29228b8a6ffe..f5c2d8695230 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -10,6 +10,7 @@ #include "btree.h" #include "request.h" +#include #include static const char * const cache_replacement_policies[] = { diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index 983f9bb411bc..7f4f38aa16ae 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -2,6 +2,7 @@ #include "btree.h" #include "request.h" +#include #include #define CREATE_TRACE_POINTS @@ -9,18 +10,42 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize); + EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 577393e38c3a..e02780545f12 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,8 +15,6 @@ struct closure; -#include - #ifdef CONFIG_BCACHE_EDEBUG #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 2714ed3991d1..82f6d4577be2 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -10,6 +10,8 @@ #include "btree.h" #include "debug.h" +#include + static struct workqueue_struct *dirty_wq; static void read_dirty(struct closure *); @@ -236,10 +238,12 @@ static void write_dirty_finish(struct closure *cl) for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - pr_debug("clearing %s", pkey(&w->key)); bch_btree_insert(&op, dc->disk.c); closure_sync(&op.cl); + if (op.insert_collision) + trace_bcache_writeback_collision(&w->key); + atomic_long_inc(op.insert_collision ? &dc->disk.c->writeback_keys_failed : &dc->disk.c->writeback_keys_done); @@ -275,7 +279,6 @@ static void write_dirty(struct closure *cl) io->bio.bi_bdev = io->dc->bdev; io->bio.bi_end_io = dirty_endio; - trace_bcache_write_dirty(&io->bio); closure_bio_submit(&io->bio, cl, &io->dc->disk); continue_at(cl, write_dirty_finish, dirty_wq); @@ -296,7 +299,6 @@ static void read_dirty_submit(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); - trace_bcache_read_dirty(&io->bio); closure_bio_submit(&io->bio, cl, &io->dc->disk); continue_at(cl, write_dirty, dirty_wq); @@ -352,7 +354,7 @@ static void read_dirty(struct closure *cl) if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) goto err_free; - pr_debug("%s", pkey(&w->key)); + trace_bcache_writeback(&w->key); closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 3cc5a0b278c3..c9952b36fcea 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -9,9 +9,7 @@ struct search; DECLARE_EVENT_CLASS(bcache_request, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio), TP_STRUCT__entry( @@ -22,7 +20,6 @@ DECLARE_EVENT_CLASS(bcache_request, __field(dev_t, orig_sector ) __field(unsigned int, nr_sector ) __array(char, rwbs, 6 ) - __array(char, comm, TASK_COMM_LEN ) ), TP_fast_assign( @@ -33,36 +30,66 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->orig_sector = bio->bi_sector - 16; __entry->nr_sector = bio->bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), - TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)", + TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm, - __entry->orig_major, __entry->orig_minor, + __entry->rwbs, (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->orig_major, __entry->orig_minor, (unsigned long long)__entry->orig_sector) ); +DECLARE_EVENT_CLASS(bkey, + TP_PROTO(struct bkey *k), + TP_ARGS(k), + + TP_STRUCT__entry( + __field(u32, size ) + __field(u32, inode ) + __field(u64, offset ) + __field(bool, dirty ) + ), + + TP_fast_assign( + __entry->inode = KEY_INODE(k); + __entry->offset = KEY_OFFSET(k); + __entry->size = KEY_SIZE(k); + __entry->dirty = KEY_DIRTY(k); + ), + + TP_printk("%u:%llu len %u dirty %u", __entry->inode, + __entry->offset, __entry->size, __entry->dirty) +); + +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct btree *b), + TP_ARGS(b), + + TP_STRUCT__entry( + __field(size_t, bucket ) + ), + + TP_fast_assign( + __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); + ), + + TP_printk("bucket %zu", __entry->bucket) +); + +/* request.c */ + DEFINE_EVENT(bcache_request, bcache_request_start, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio) ); DEFINE_EVENT(bcache_request, bcache_request_end, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio) ); DECLARE_EVENT_CLASS(bcache_bio, - TP_PROTO(struct bio *bio), - TP_ARGS(bio), TP_STRUCT__entry( @@ -70,7 +97,6 @@ DECLARE_EVENT_CLASS(bcache_bio, __field(sector_t, sector ) __field(unsigned int, nr_sector ) __array(char, rwbs, 6 ) - __array(char, comm, TASK_COMM_LEN ) ), TP_fast_assign( @@ -78,191 +104,295 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->sector = bio->bi_sector; __entry->nr_sector = bio->bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) + TP_printk("%d,%d %s %llu + %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, __entry->nr_sector) ); - -DEFINE_EVENT(bcache_bio, bcache_passthrough, - +DEFINE_EVENT(bcache_bio, bcache_bypass_sequential, TP_PROTO(struct bio *bio), - TP_ARGS(bio) ); -DEFINE_EVENT(bcache_bio, bcache_cache_hit, - +DEFINE_EVENT(bcache_bio, bcache_bypass_congested, TP_PROTO(struct bio *bio), - TP_ARGS(bio) ); -DEFINE_EVENT(bcache_bio, bcache_cache_miss, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_read_retry, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_writethrough, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_writeback, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_write_skip, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_btree_read, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_btree_write, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_write_dirty, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_read_dirty, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_write_moving, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_read_moving, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DEFINE_EVENT(bcache_bio, bcache_journal_write, - - TP_PROTO(struct bio *bio), - - TP_ARGS(bio) -); - -DECLARE_EVENT_CLASS(bcache_cache_bio, - - TP_PROTO(struct bio *bio, - sector_t orig_sector, - struct block_device* orig_bdev), - - TP_ARGS(bio, orig_sector, orig_bdev), +TRACE_EVENT(bcache_read, + TP_PROTO(struct bio *bio, bool hit, bool bypass), + TP_ARGS(bio, hit, bypass), TP_STRUCT__entry( __field(dev_t, dev ) - __field(dev_t, orig_dev ) __field(sector_t, sector ) - __field(sector_t, orig_sector ) __field(unsigned int, nr_sector ) __array(char, rwbs, 6 ) - __array(char, comm, TASK_COMM_LEN ) + __field(bool, cache_hit ) + __field(bool, bypass ) ), TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->orig_dev = orig_bdev->bd_dev; __entry->sector = bio->bi_sector; - __entry->orig_sector = orig_sector; __entry->nr_sector = bio->bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + __entry->cache_hit = hit; + __entry->bypass = bypass; ), - TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)", + TP_printk("%d,%d %s %llu + %u hit %u bypass %u", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm, - MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev), - (unsigned long long)__entry->orig_sector) + __entry->rwbs, (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->cache_hit, __entry->bypass) ); -DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert, - - TP_PROTO(struct bio *bio, - sector_t orig_sector, - struct block_device *orig_bdev), - - TP_ARGS(bio, orig_sector, orig_bdev) -); - -DECLARE_EVENT_CLASS(bcache_gc, - - TP_PROTO(uint8_t *uuid), - - TP_ARGS(uuid), +TRACE_EVENT(bcache_write, + TP_PROTO(struct bio *bio, bool writeback, bool bypass), + TP_ARGS(bio, writeback, bypass), TP_STRUCT__entry( - __field(uint8_t *, uuid) + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __field(bool, writeback ) + __field(bool, bypass ) ), TP_fast_assign( - __entry->uuid = uuid; + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + __entry->writeback = writeback; + __entry->bypass = bypass; + ), + + TP_printk("%d,%d %s %llu + %u hit %u bypass %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->writeback, __entry->bypass) +); + +DEFINE_EVENT(bcache_bio, bcache_read_retry, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bkey, bcache_cache_insert, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); + +/* Journal */ + +DECLARE_EVENT_CLASS(cache_set, + TP_PROTO(struct cache_set *c), + TP_ARGS(c), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.set_uuid, 16); ), TP_printk("%pU", __entry->uuid) ); - -DEFINE_EVENT(bcache_gc, bcache_gc_start, - - TP_PROTO(uint8_t *uuid), - - TP_ARGS(uuid) +DEFINE_EVENT(bkey, bcache_journal_replay_key, + TP_PROTO(struct bkey *k), + TP_ARGS(k) ); -DEFINE_EVENT(bcache_gc, bcache_gc_end, +DEFINE_EVENT(cache_set, bcache_journal_full, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); - TP_PROTO(uint8_t *uuid), +DEFINE_EVENT(cache_set, bcache_journal_entry_full, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); - TP_ARGS(uuid) +DEFINE_EVENT(bcache_bio, bcache_journal_write, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +/* Btree */ + +DEFINE_EVENT(cache_set, bcache_btree_cache_cannibalize, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); + +DEFINE_EVENT(btree_node, bcache_btree_read, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); + +TRACE_EVENT(bcache_btree_write, + TP_PROTO(struct btree *b), + TP_ARGS(b), + + TP_STRUCT__entry( + __field(size_t, bucket ) + __field(unsigned, block ) + __field(unsigned, keys ) + ), + + TP_fast_assign( + __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); + __entry->block = b->written; + __entry->keys = b->sets[b->nsets].data->keys; + ), + + TP_printk("bucket %zu", __entry->bucket) +); + +DEFINE_EVENT(btree_node, bcache_btree_node_alloc, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); + +DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); + +DEFINE_EVENT(btree_node, bcache_btree_node_free, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); + +TRACE_EVENT(bcache_btree_gc_coalesce, + TP_PROTO(unsigned nodes), + TP_ARGS(nodes), + + TP_STRUCT__entry( + __field(unsigned, nodes ) + ), + + TP_fast_assign( + __entry->nodes = nodes; + ), + + TP_printk("coalesced %u nodes", __entry->nodes) +); + +DEFINE_EVENT(cache_set, bcache_gc_start, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); + +DEFINE_EVENT(cache_set, bcache_gc_end, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bkey, bcache_gc_copy, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); + +DEFINE_EVENT(bkey, bcache_gc_copy_collision, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); + +DECLARE_EVENT_CLASS(btree_split, + TP_PROTO(struct btree *b, unsigned keys), + TP_ARGS(b, keys), + + TP_STRUCT__entry( + __field(size_t, bucket ) + __field(unsigned, keys ) + ), + + TP_fast_assign( + __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); + __entry->keys = keys; + ), + + TP_printk("bucket %zu keys %u", __entry->bucket, __entry->keys) +); + +DEFINE_EVENT(btree_split, bcache_btree_node_split, + TP_PROTO(struct btree *b, unsigned keys), + TP_ARGS(b, keys) +); + +DEFINE_EVENT(btree_split, bcache_btree_node_compact, + TP_PROTO(struct btree *b, unsigned keys), + TP_ARGS(b, keys) +); + +DEFINE_EVENT(btree_node, bcache_btree_set_root, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); + +/* Allocator */ + +TRACE_EVENT(bcache_alloc_invalidate, + TP_PROTO(struct cache *ca), + TP_ARGS(ca), + + TP_STRUCT__entry( + __field(unsigned, free ) + __field(unsigned, free_inc ) + __field(unsigned, free_inc_size ) + __field(unsigned, unused ) + ), + + TP_fast_assign( + __entry->free = fifo_used(&ca->free); + __entry->free_inc = fifo_used(&ca->free_inc); + __entry->free_inc_size = ca->free_inc.size; + __entry->unused = fifo_used(&ca->unused); + ), + + TP_printk("free %u free_inc %u/%u unused %u", __entry->free, + __entry->free_inc, __entry->free_inc_size, __entry->unused) +); + +TRACE_EVENT(bcache_alloc_fail, + TP_PROTO(struct cache *ca), + TP_ARGS(ca), + + TP_STRUCT__entry( + __field(unsigned, free ) + __field(unsigned, free_inc ) + __field(unsigned, unused ) + __field(unsigned, blocked ) + ), + + TP_fast_assign( + __entry->free = fifo_used(&ca->free); + __entry->free_inc = fifo_used(&ca->free_inc); + __entry->unused = fifo_used(&ca->unused); + __entry->blocked = atomic_read(&ca->set->prio_blocked); + ), + + TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free, + __entry->free_inc, __entry->unused, __entry->blocked) +); + +/* Background writeback */ + +DEFINE_EVENT(bkey, bcache_writeback, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); + +DEFINE_EVENT(bkey, bcache_writeback_collision, + TP_PROTO(struct bkey *k), + TP_ARGS(k) ); #endif /* _TRACE_BCACHE_H */ From 85b1492ee113486d871de7676a61f506a43ca475 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 May 2013 20:33:16 -0700 Subject: [PATCH 07/20] bcache: Rip out pkey()/pbtree() Old gcc doesnt like the struct hack, and it is kind of ugly. So finish off the work to convert pr_debug() statements to tracepoints, and delete pkey()/pbtree(). Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 16 ++++++++++---- drivers/md/bcache/btree.c | 21 ++++++------------ drivers/md/bcache/btree.h | 7 ++++++ drivers/md/bcache/debug.c | 40 ++++++++++++++++++++++------------- drivers/md/bcache/debug.h | 11 ++-------- drivers/md/bcache/super.c | 5 +++-- drivers/md/bcache/trace.c | 2 ++ include/trace/events/bcache.h | 33 +++++++++++++++++++++++++++++ 8 files changed, 90 insertions(+), 45 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index cb4578a327b9..e9399ed7f688 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) { unsigned i; + char buf[80]; if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) goto bad; @@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) return false; bad: - cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k)); + bch_bkey_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); return true; } @@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) #ifdef CONFIG_BCACHE_EDEBUG bug: mutex_unlock(&b->c->bucket_lock); - btree_bug(b, + + { + char buf[80]; + + bch_bkey_to_text(buf, sizeof(buf), k); + btree_bug(b, "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + } return true; #endif } diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 218d486259a3..53a0f4ef4e32 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1770,7 +1770,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, { struct bset *i = b->sets[b->nsets].data; struct bkey *m, *prev; - const char *status = "insert"; + unsigned status = BTREE_INSERT_STATUS_INSERT; BUG_ON(bkey_cmp(k, &b->key) > 0); BUG_ON(b->level && !KEY_PTRS(k)); @@ -1803,17 +1803,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, goto insert; /* prev is in the tree, if we merge we're done */ - status = "back merging"; + status = BTREE_INSERT_STATUS_BACK_MERGE; if (prev && bch_bkey_try_merge(b, prev, k)) goto merged; - status = "overwrote front"; + status = BTREE_INSERT_STATUS_OVERWROTE; if (m != end(i) && KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) goto copy; - status = "front merge"; + status = BTREE_INSERT_STATUS_FRONT_MERGE; if (m != end(i) && bch_bkey_try_merge(b, k, m)) goto copy; @@ -1823,16 +1823,12 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, insert: shift_keys(b, m, k); copy: bkey_copy(m, k); merged: - bch_check_keys(b, "%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); - bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); + bch_check_keys(b, "%u for %s", status, op_type(op)); if (b->level && !KEY_OFFSET(k)) btree_current_write(b)->prio_blocked++; - pr_debug("%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); + trace_bcache_btree_insert_key(b, k, op->type, status); return true; } @@ -2234,9 +2230,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) struct btree_iter iter; bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); - pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, - (uint64_t) bio->bi_sector); - do { k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); if (!k) { @@ -2302,8 +2295,6 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (buf->key_predicate(buf, k)) { struct keybuf_key *w; - pr_debug("%s", pkey(k)); - spin_lock(&buf->lock); w = array_alloc(&buf->freelist); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 809bd77847a2..2b016b93cad4 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -271,6 +271,13 @@ struct btree_op { BKEY_PADDED(replace); }; +enum { + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, +}; + void bch_btree_op_init_stack(struct btree_op *); static inline void rw_lock(bool w, struct btree *b, int level) diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index ae6096c6845d..82e3a07771ec 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) return ""; } -struct keyprint_hack bch_pkey(const struct bkey *k) +int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) { unsigned i = 0; - struct keyprint_hack r; - char *out = r.s, *end = r.s + KEYHACK_SIZE; + char *out = buf, *end = buf + size; #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) @@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k) if (KEY_CSUM(k)) p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); #undef p - return r; + return out - buf; } -struct keyprint_hack bch_pbtree(const struct btree *b) +int bch_btree_to_text(char *buf, size_t size, const struct btree *b) { - struct keyprint_hack r; - - snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), - b->level, b->c->root ? b->c->root->level : -1); - return r; + return scnprintf(buf, size, "%zu level %i/%i", + PTR_BUCKET_NR(b->c, &b->key, 0), + b->level, b->c->root ? b->c->root->level : -1); } #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) @@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i) { struct bkey *k; unsigned j; + char buf[80]; for (k = i->start; k < end(i); k = bkey_next(k)) { + bch_bkey_to_text(buf, sizeof(buf), k); printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), - (uint64_t *) k - i->d, i->keys, pkey(k)); + (uint64_t *) k - i->d, i->keys, buf); for (j = 0; j < KEY_PTRS(k); j++) { size_t n = PTR_BUCKET_NR(b->c, k, j); @@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt, va_list args) { unsigned i; + char buf[80]; console_lock(); @@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt, console_unlock(); - panic("at %s\n", pbtree(b)); + bch_btree_to_text(buf, sizeof(buf), b); + panic("at %s\n", buf); } void bch_check_key_order_msg(struct btree *b, struct bset *i, @@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, { struct dump_iterator *i = file->private_data; ssize_t ret = 0; + char kbuf[80]; while (size) { struct keybuf_key *w; @@ -359,7 +361,8 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, if (!w) break; - i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); + bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); + i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); bch_keybuf_del(&i->keys, w); } @@ -526,10 +529,17 @@ static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, k < end(i); k = bkey_next(k), l = bkey_next(l)) if (bkey_cmp(k, l) || - KEY_SIZE(k) != KEY_SIZE(l)) + KEY_SIZE(k) != KEY_SIZE(l)) { + char buf1[80]; + char buf2[80]; + + bch_bkey_to_text(buf1, sizeof(buf1), k); + bch_bkey_to_text(buf2, sizeof(buf2), l); + pr_err("key %zi differs: %s != %s", (uint64_t *) k - i->d, - pkey(k), pkey(l)); + buf1, buf2); + } for (j = 0; j < 3; j++) { pr_err("**** Set %i ****", j); diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index f9378a218148..1c39b5a2489b 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -3,15 +3,8 @@ /* Btree/bkey debug printing */ -#define KEYHACK_SIZE 80 -struct keyprint_hack { - char s[KEYHACK_SIZE]; -}; - -struct keyprint_hack bch_pkey(const struct bkey *k); -struct keyprint_hack bch_pbtree(const struct btree *b); -#define pkey(k) (&bch_pkey(k).s[0]) -#define pbtree(b) (&bch_pbtree(b).s[0]) +int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); +int bch_btree_to_text(char *buf, size_t size, const struct btree *b); #ifdef CONFIG_BCACHE_EDEBUG diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 47bc13745068..f24c2e0cbb1c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -343,6 +343,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, struct closure *cl = &c->uuid_write.cl; struct uuid_entry *u; unsigned i; + char buf[80]; BUG_ON(!parent); closure_lock(&c->uuid_write, parent); @@ -363,8 +364,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw, break; } - pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", - pkey(&c->uuid_bucket)); + bch_bkey_to_text(buf, sizeof(buf), k); + pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) if (!bch_is_zero(u->uuid, 16)) diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index 7f4f38aa16ae..f7b6c197f90f 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -40,6 +40,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key); + EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index c9952b36fcea..5ebda976ea93 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -305,6 +305,39 @@ DEFINE_EVENT(bkey, bcache_gc_copy_collision, TP_ARGS(k) ); +TRACE_EVENT(bcache_btree_insert_key, + TP_PROTO(struct btree *b, struct bkey *k, unsigned op, unsigned status), + TP_ARGS(b, k, op, status), + + TP_STRUCT__entry( + __field(u64, btree_node ) + __field(u32, btree_level ) + __field(u32, inode ) + __field(u64, offset ) + __field(u32, size ) + __field(u8, dirty ) + __field(u8, op ) + __field(u8, status ) + ), + + TP_fast_assign( + __entry->btree_node = PTR_BUCKET_NR(b->c, &b->key, 0); + __entry->btree_level = b->level; + __entry->inode = KEY_INODE(k); + __entry->offset = KEY_OFFSET(k); + __entry->size = KEY_SIZE(k); + __entry->dirty = KEY_DIRTY(k); + __entry->op = op; + __entry->status = status; + ), + + TP_printk("%u for %u at %llu(%u): %u:%llu len %u dirty %u", + __entry->status, __entry->op, + __entry->btree_node, __entry->btree_level, + __entry->inode, __entry->offset, + __entry->size, __entry->dirty) +); + DECLARE_EVENT_CLASS(btree_split, TP_PROTO(struct btree *b, unsigned keys), TP_ARGS(b, keys), From 6ded34d1a54c046a45db071d3cb7b37bd0a4a31f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 May 2013 15:59:37 -0700 Subject: [PATCH 08/20] bcache: Improve lazy sorting The old lazy sorting code was kind of hacky - rewrite in a way that mathematically makes more sense; the idea is that the size of the sets of keys in a btree node should increase by a more or less fixed ratio from smallest to biggest. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/bset.c | 40 ++++++++++++++++++++++---------------- drivers/md/bcache/super.c | 2 ++ 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 59c15e09e4dd..6fa5a1e33c49 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -828,6 +828,7 @@ struct cache_set { */ struct mutex sort_lock; struct bset *sort; + unsigned sort_crit_factor; /* List of buckets we're currently writing data to */ struct list_head data_buckets; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index e9399ed7f688..a0f190ac17a4 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1092,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) new->sets->size = 0; } +#define SORT_CRIT (4096 / sizeof(uint64_t)) + void bch_btree_sort_lazy(struct btree *b) { - if (b->nsets) { - unsigned i, j, keys = 0, total; + unsigned crit = SORT_CRIT; + int i; - for (i = 0; i <= b->nsets; i++) - keys += b->sets[i].data->keys; + /* Don't sort if nothing to do */ + if (!b->nsets) + goto out; - total = keys; + /* If not a leaf node, always sort */ + if (b->level) { + bch_btree_sort(b); + return; + } - for (j = 0; j < b->nsets; j++) { - if (keys * 2 < total || - keys < 1000) { - bch_btree_sort_partial(b, j); - return; - } + for (i = b->nsets - 1; i >= 0; --i) { + crit *= b->c->sort_crit_factor; - keys -= b->sets[j].data->keys; - } - - /* Must sort if b->nsets == 3 or we'll overflow */ - if (b->nsets >= (MAX_BSETS - 1) - b->level) { - bch_btree_sort(b); + if (b->sets[i].data->keys < crit) { + bch_btree_sort_partial(b, i); return; } } + /* Sort if we'd overflow */ + if (b->nsets + 1 == MAX_BSETS) { + bch_btree_sort(b); + return; + } + +out: bset_build_written_tree(b); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f24c2e0cbb1c..3c8474161e8e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1375,6 +1375,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); + c->sort_crit_factor = int_sqrt(c->btree_pages); + mutex_init(&c->bucket_lock); mutex_init(&c->sort_lock); spin_lock_init(&c->sort_time_lock); From 444fc0b6b167ed164e7436621a9d095e042644dd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 May 2013 17:07:26 -0700 Subject: [PATCH 09/20] bcache: Initialize sectors_dirty when attaching Previously, dirty_data wouldn't get initialized until the first garbage collection... which was a bit of a problem for background writeback (as the PD controller keys off of it) and also confusing for users. This is also prep work for making background writeback aware of raid5/6 stripes. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/btree.c | 29 +--------------------------- drivers/md/bcache/super.c | 1 + drivers/md/bcache/writeback.c | 36 +++++++++++++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6fa5a1e33c49..d099d8894c2f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -438,7 +438,6 @@ struct bcache_device { atomic_t detaching; atomic_long_t sectors_dirty; - unsigned long sectors_dirty_gc; unsigned long sectors_dirty_last; long sectors_dirty_derivative; @@ -1225,6 +1224,7 @@ void bch_cache_set_stop(struct cache_set *); struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); +void bch_sectors_dirty_init(struct cached_dev *); void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 53a0f4ef4e32..230c3a6d9be2 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1119,11 +1119,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, gc->nkeys++; gc->data += KEY_SIZE(k); - if (KEY_DIRTY(k)) { + if (KEY_DIRTY(k)) gc->dirty += KEY_SIZE(k); - if (d) - d->sectors_dirty_gc += KEY_SIZE(k); - } } for (t = b->sets; t <= &b->sets[b->nsets]; t++) @@ -1377,7 +1374,6 @@ static void btree_gc_start(struct cache_set *c) { struct cache *ca; struct bucket *b; - struct bcache_device **d; unsigned i; if (!c->gc_mark_valid) @@ -1395,12 +1391,6 @@ static void btree_gc_start(struct cache_set *c) SET_GC_MARK(b, GC_MARK_RECLAIMABLE); } - for (d = c->devices; - d < c->devices + c->nr_uuids; - d++) - if (*d) - (*d)->sectors_dirty_gc = 0; - mutex_unlock(&c->bucket_lock); } @@ -1409,7 +1399,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) size_t available = 0; struct bucket *b; struct cache *ca; - struct bcache_device **d; unsigned i; mutex_lock(&c->bucket_lock); @@ -1452,22 +1441,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) } } - for (d = c->devices; - d < c->devices + c->nr_uuids; - d++) - if (*d) { - unsigned long last = - atomic_long_read(&((*d)->sectors_dirty)); - long difference = (*d)->sectors_dirty_gc - last; - - pr_debug("sectors dirty off by %li", difference); - - (*d)->sectors_dirty_last += difference; - - atomic_long_set(&((*d)->sectors_dirty), - (*d)->sectors_dirty_gc); - } - mutex_unlock(&c->bucket_lock); return available; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3c8474161e8e..dbfa1c38e85e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -961,6 +961,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) atomic_set(&dc->count, 1); if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + bch_sectors_dirty_init(dc); atomic_set(&dc->has_dirty, 1); atomic_inc(&dc->count); bch_writeback_queue(dc); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 82f6d4577be2..553949eefd51 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -377,6 +377,42 @@ err: refill_dirty(cl); } +/* Init */ + +static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, + struct cached_dev *dc) +{ + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); + while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) + if (!b->level) { + if (KEY_INODE(k) > dc->disk.id) + break; + + if (KEY_DIRTY(k)) + atomic_long_add(KEY_SIZE(k), + &dc->disk.sectors_dirty); + } else { + btree(sectors_dirty_init, k, b, op, dc); + if (KEY_INODE(k) > dc->disk.id) + break; + + cond_resched(); + } + + return 0; +} + +void bch_sectors_dirty_init(struct cached_dev *dc) +{ + struct btree_op op; + + bch_btree_op_init_stack(&op); + btree_root(sectors_dirty_init, dc->disk.c, &op, dc); +} + void bch_cached_dev_writeback_init(struct cached_dev *dc) { closure_init_unlocked(&dc->writeback); From 279afbad4e54acbd61bf88a54a73af3bbfdeb5dd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2013 06:21:07 -0700 Subject: [PATCH 10/20] bcache: Track dirty data by stripe To make background writeback aware of raid5/6 stripes, we first need to track the amount of dirty data within each stripe - we do this by breaking up the existing sectors_dirty into per stripe atomic_ts Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 10 ++++----- drivers/md/bcache/btree.c | 20 ++++++++++++------ drivers/md/bcache/request.c | 3 ++- drivers/md/bcache/super.c | 32 ++++++++++++++++++++++++---- drivers/md/bcache/sysfs.c | 5 +++-- drivers/md/bcache/writeback.c | 40 +++++++++++++++++++++++++++++------ drivers/md/bcache/writeback.h | 21 ++++++++++++++++++ 7 files changed, 105 insertions(+), 26 deletions(-) create mode 100644 drivers/md/bcache/writeback.h diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d099d8894c2f..dbddef0cdb59 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -437,7 +437,10 @@ struct bcache_device { /* If nonzero, we're detaching/unregistering from cache set */ atomic_t detaching; - atomic_long_t sectors_dirty; + uint64_t nr_stripes; + unsigned stripe_size_bits; + atomic_t *stripe_sectors_dirty; + unsigned long sectors_dirty_last; long sectors_dirty_derivative; @@ -1159,9 +1162,6 @@ static inline void wake_up_allocators(struct cache_set *c) /* Forward declarations */ -void bch_writeback_queue(struct cached_dev *); -void bch_writeback_add(struct cached_dev *, unsigned); - void bch_count_io_errors(struct cache *, int, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, int, const char *); @@ -1224,8 +1224,6 @@ void bch_cache_set_stop(struct cache_set *); struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); -void bch_sectors_dirty_init(struct cached_dev *); -void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 230c3a6d9be2..b93cf56260a4 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -24,6 +24,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include #include @@ -1599,14 +1600,14 @@ static bool fix_overlapping_extents(struct btree *b, struct btree_iter *iter, struct btree_op *op) { - void subtract_dirty(struct bkey *k, int sectors) + void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) { - struct bcache_device *d = b->c->devices[KEY_INODE(k)]; - - if (KEY_DIRTY(k) && d) - atomic_long_sub(sectors, &d->sectors_dirty); + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + offset, -sectors); } + uint64_t old_offset; unsigned old_size, sectors_found = 0; while (1) { @@ -1618,6 +1619,7 @@ static bool fix_overlapping_extents(struct btree *b, if (bkey_cmp(k, &START_KEY(insert)) <= 0) continue; + old_offset = KEY_START(k); old_size = KEY_SIZE(k); /* @@ -1673,7 +1675,7 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *top; - subtract_dirty(k, KEY_SIZE(insert)); + subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); if (bkey_written(b, k)) { /* @@ -1720,7 +1722,7 @@ static bool fix_overlapping_extents(struct btree *b, } } - subtract_dirty(k, old_size - KEY_SIZE(k)); + subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); } check_failed: @@ -1796,6 +1798,10 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, insert: shift_keys(b, m, k); copy: bkey_copy(m, k); merged: + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + bch_check_keys(b, "%u for %s", status, op_type(op)); if (b->level && !KEY_OFFSET(k)) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 695469958c1e..017c95fced8e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -10,6 +10,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include #include @@ -1044,7 +1045,7 @@ static void request_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl, s->d); } else { s->op.cache_bio = bio; - bch_writeback_add(dc, bio_sectors(bio)); + bch_writeback_add(dc); } out: closure_call(&s->op.cl, bch_insert_data, NULL, cl); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dbfa1c38e85e..8c73f0c7f28a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -10,6 +10,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include #include @@ -744,13 +745,35 @@ static void bcache_device_free(struct bcache_device *d) mempool_destroy(d->unaligned_bvec); if (d->bio_split) bioset_free(d->bio_split); + if (is_vmalloc_addr(d->stripe_sectors_dirty)) + vfree(d->stripe_sectors_dirty); + else + kfree(d->stripe_sectors_dirty); closure_debug_destroy(&d->cl); } -static int bcache_device_init(struct bcache_device *d, unsigned block_size) +static int bcache_device_init(struct bcache_device *d, unsigned block_size, + sector_t sectors) { struct request_queue *q; + size_t n; + + if (!d->stripe_size_bits) + d->stripe_size_bits = 31; + + d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> + d->stripe_size_bits; + + if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) + return -ENOMEM; + + n = d->nr_stripes * sizeof(atomic_t); + d->stripe_sectors_dirty = n < PAGE_SIZE << 6 + ? kzalloc(n, GFP_KERNEL) + : vzalloc(n); + if (!d->stripe_sectors_dirty) + return -ENOMEM; if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, @@ -760,6 +783,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) !(q = blk_alloc_queue(GFP_KERNEL))) return -ENOMEM; + set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); d->disk->major = bcache_major; @@ -1047,7 +1071,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } - ret = bcache_device_init(&dc->disk, block_size); + ret = bcache_device_init(&dc->disk, block_size, + dc->bdev->bd_part->nr_sects - dc->sb.data_offset); if (ret) return ret; @@ -1146,11 +1171,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) kobject_init(&d->kobj, &bch_flash_dev_ktype); - if (bcache_device_init(d, block_bytes(c))) + if (bcache_device_init(d, block_bytes(c), u->sectors)) goto err; bcache_device_attach(d, c, u - c->uuids); - set_capacity(d->disk, u->sectors); bch_flash_dev_request_init(d); add_disk(d->disk); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index f5c2d8695230..cf8d91ec3238 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -9,6 +9,7 @@ #include "sysfs.h" #include "btree.h" #include "request.h" +#include "writeback.h" #include #include @@ -128,7 +129,7 @@ SHOW(__bch_cached_dev) char derivative[20]; char target[20]; bch_hprint(dirty, - atomic_long_read(&dc->disk.sectors_dirty) << 9); + bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(derivative, dc->writeback_rate_derivative << 9); bch_hprint(target, dc->writeback_rate_target << 9); @@ -144,7 +145,7 @@ SHOW(__bch_cached_dev) } sysfs_hprint(dirty_data, - atomic_long_read(&dc->disk.sectors_dirty) << 9); + bcache_dev_sectors_dirty(&dc->disk) << 9); var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 553949eefd51..dd815475c524 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -9,6 +9,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "writeback.h" #include @@ -38,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc) int change = 0; int64_t error; - int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); + int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); int64_t derivative = dirty - dc->disk.sectors_dirty_last; dc->disk.sectors_dirty_last = dirty; @@ -183,10 +184,8 @@ void bch_writeback_queue(struct cached_dev *dc) } } -void bch_writeback_add(struct cached_dev *dc, unsigned sectors) +void bch_writeback_add(struct cached_dev *dc) { - atomic_long_add(sectors, &dc->disk.sectors_dirty); - if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { atomic_inc(&dc->count); @@ -205,6 +204,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors) } } +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, + uint64_t offset, int nr_sectors) +{ + struct bcache_device *d = c->devices[inode]; + unsigned stripe_size, stripe_offset; + uint64_t stripe; + + if (!d) + return; + + stripe_size = 1 << d->stripe_size_bits; + stripe = offset >> d->stripe_size_bits; + stripe_offset = offset & (stripe_size - 1); + + while (nr_sectors) { + int s = min_t(unsigned, abs(nr_sectors), + stripe_size - stripe_offset); + + if (nr_sectors < 0) + s = -s; + + atomic_add(s, d->stripe_sectors_dirty + stripe); + nr_sectors -= s; + stripe_offset = 0; + stripe++; + } +} + /* Background writeback - IO loop */ static void dirty_io_destructor(struct closure *cl) @@ -392,8 +419,9 @@ static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, break; if (KEY_DIRTY(k)) - atomic_long_add(KEY_SIZE(k), - &dc->disk.sectors_dirty); + bcache_dev_sectors_dirty_add(b->c, dc->disk.id, + KEY_START(k), + KEY_SIZE(k)); } else { btree(sectors_dirty_init, k, b, op, dc); if (KEY_INODE(k) > dc->disk.id) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h new file mode 100644 index 000000000000..5ce9771df047 --- /dev/null +++ b/drivers/md/bcache/writeback.h @@ -0,0 +1,21 @@ +#ifndef _BCACHE_WRITEBACK_H +#define _BCACHE_WRITEBACK_H + +static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) +{ + uint64_t i, ret = 0; + + for (i = 0; i < d->nr_stripes; i++) + ret += atomic_read(d->stripe_sectors_dirty + i); + + return ret; +} + +void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); +void bch_writeback_queue(struct cached_dev *); +void bch_writeback_add(struct cached_dev *); + +void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_cached_dev_writeback_init(struct cached_dev *); + +#endif From 72c270612bd33192fa836ad0f2939af1ca218292 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2013 06:24:39 -0700 Subject: [PATCH 11/20] bcache: Write out full stripes Now that we're tracking dirty data per stripe, we can add two optimizations for raid5/6: * If a stripe is already dirty, force writes to that stripe to writeback mode - to help build up full stripes of dirty data * When flushing dirty data, preferentially write out full stripes first if there are any. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 3 +-- drivers/md/bcache/btree.c | 19 ++++++++------- drivers/md/bcache/btree.h | 9 +++---- drivers/md/bcache/debug.c | 4 ++-- drivers/md/bcache/movinggc.c | 5 ++-- drivers/md/bcache/request.c | 23 ++++++------------ drivers/md/bcache/sysfs.c | 8 +++++++ drivers/md/bcache/writeback.c | 44 +++++++++++++++++++++++++++++++++-- drivers/md/bcache/writeback.h | 43 ++++++++++++++++++++++++++++++++++ 9 files changed, 121 insertions(+), 37 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index dbddef0cdb59..342ba86c6e4f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -387,8 +387,6 @@ struct keybuf_key { typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); struct keybuf { - keybuf_pred_fn *key_predicate; - struct bkey last_scanned; spinlock_t lock; @@ -532,6 +530,7 @@ struct cached_dev { unsigned sequential_merge:1; unsigned verify:1; + unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; unsigned writeback_running:1; unsigned char writeback_percent; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index b93cf56260a4..09fb8a2f43da 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2252,7 +2252,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, } static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, - struct keybuf *buf, struct bkey *end) + struct keybuf *buf, struct bkey *end, + keybuf_pred_fn *pred) { struct btree_iter iter; bch_btree_iter_init(b, &iter, &buf->last_scanned); @@ -2271,7 +2272,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (bkey_cmp(&buf->last_scanned, end) >= 0) break; - if (buf->key_predicate(buf, k)) { + if (pred(buf, k)) { struct keybuf_key *w; spin_lock(&buf->lock); @@ -2290,7 +2291,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (!k) break; - btree(refill_keybuf, k, b, op, buf, end); + btree(refill_keybuf, k, b, op, buf, end, pred); /* * Might get an error here, but can't really do anything * and it'll get logged elsewhere. Just read what we @@ -2308,7 +2309,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, - struct bkey *end) + struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; struct btree_op op; @@ -2316,7 +2317,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, cond_resched(); - btree_root(refill_keybuf, c, &op, buf, end); + btree_root(refill_keybuf, c, &op, buf, end, pred); closure_sync(&op.cl); pr_debug("found %s keys from %llu:%llu to %llu:%llu", @@ -2402,7 +2403,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, struct keybuf *buf, - struct bkey *end) + struct bkey *end, + keybuf_pred_fn *pred) { struct keybuf_key *ret; @@ -2416,15 +2418,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, break; } - bch_refill_keybuf(c, buf, end); + bch_refill_keybuf(c, buf, end, pred); } return ret; } -void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) +void bch_keybuf_init(struct keybuf *buf) { - buf->key_predicate = fn; buf->last_scanned = MAX_KEY; buf->keys = RB_ROOT; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 2b016b93cad4..f66d69a7baf1 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -391,13 +391,14 @@ void bch_moving_gc(struct closure *); int bch_btree_check(struct cache_set *, struct btree_op *); uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); -void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); +void bch_keybuf_init(struct keybuf *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, + keybuf_pred_fn *); bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, struct bkey *); void bch_keybuf_del(struct keybuf *, struct keybuf_key *); struct keybuf_key *bch_keybuf_next(struct keybuf *); -struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, - struct keybuf *, struct bkey *); +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); #endif diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 82e3a07771ec..1c8fd319846e 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -357,7 +357,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, if (i->bytes) break; - w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); + w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); if (!w) break; @@ -380,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file) file->private_data = i; i->c = c; - bch_keybuf_init(&i->keys, dump_pred); + bch_keybuf_init(&i->keys); i->keys.last_scanned = KEY(0, 0, 0); return 0; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 04f6b97ffda6..a241e9fd4f7f 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -136,7 +136,8 @@ static void read_moving(struct closure *cl) /* XXX: if we error, background writeback could stall indefinitely */ while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { - w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); + w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, + &MAX_KEY, moving_pred); if (!w) break; @@ -248,5 +249,5 @@ void bch_moving_gc(struct closure *cl) void bch_moving_init_cache_set(struct cache_set *c) { - bch_keybuf_init(&c->moving_gc_keys, moving_pred); + bch_keybuf_init(&c->moving_gc_keys); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 017c95fced8e..17bd59704eba 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -22,8 +22,6 @@ #define CUTOFF_CACHE_ADD 95 #define CUTOFF_CACHE_READA 90 -#define CUTOFF_WRITEBACK 50 -#define CUTOFF_WRITEBACK_SYNC 75 struct kmem_cache *bch_search_cache; @@ -998,17 +996,6 @@ static void cached_dev_write_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static bool should_writeback(struct cached_dev *dc, struct bio *bio) -{ - unsigned threshold = (bio->bi_rw & REQ_SYNC) - ? CUTOFF_WRITEBACK_SYNC - : CUTOFF_WRITEBACK; - - return !atomic_read(&dc->disk.detaching) && - cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && - dc->disk.c->gc_stats.in_use < threshold; -} - static void request_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; @@ -1030,12 +1017,16 @@ static void request_write(struct cached_dev *dc, struct search *s) if (bio->bi_rw & REQ_DISCARD) goto skip; + if (should_writeback(dc, s->orig_bio, + cache_mode(dc, bio), + s->op.skip)) { + s->op.skip = false; + s->writeback = true; + } + if (s->op.skip) goto skip; - if (should_writeback(dc, s->orig_bio)) - s->writeback = true; - trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); if (!s->writeback) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index cf8d91ec3238..70c6dff0d0cd 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -81,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse); rw_attribute(writeback_rate_d_smooth); read_attribute(writeback_rate_debug); +read_attribute(stripe_size); +read_attribute(partial_stripes_expensive); + rw_attribute(synchronous); rw_attribute(journal_delay_ms); rw_attribute(discard); @@ -147,6 +150,9 @@ SHOW(__bch_cached_dev) sysfs_hprint(dirty_data, bcache_dev_sectors_dirty(&dc->disk) << 9); + sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); + var_printf(partial_stripes_expensive, "%u"); + var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); var_hprint(readahead); @@ -286,6 +292,8 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_d_smooth, &sysfs_writeback_rate_debug, &sysfs_dirty_data, + &sysfs_stripe_size, + &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, &sysfs_sequential_merge, &sysfs_clear_stats, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index dd815475c524..d81ee5ccc726 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -108,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) return KEY_DIRTY(k); } +static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) +{ + uint64_t stripe; + unsigned nr_sectors = KEY_SIZE(k); + struct cached_dev *dc = container_of(buf, struct cached_dev, + writeback_keys); + unsigned stripe_size = 1 << dc->disk.stripe_size_bits; + + if (!KEY_DIRTY(k)) + return false; + + stripe = KEY_START(k) >> dc->disk.stripe_size_bits; + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != + stripe_size) + return false; + + if (nr_sectors <= stripe_size) + return true; + + nr_sectors -= stripe_size; + stripe++; + } +} + static void dirty_init(struct keybuf_key *w) { struct dirty_io *io = w->private; @@ -152,7 +177,22 @@ static void refill_dirty(struct closure *cl) searched_from_start = true; } - bch_refill_keybuf(dc->disk.c, buf, &end); + if (dc->partial_stripes_expensive) { + uint64_t i; + + for (i = 0; i < dc->disk.nr_stripes; i++) + if (atomic_read(dc->disk.stripe_sectors_dirty + i) == + 1 << dc->disk.stripe_size_bits) + goto full_stripes; + + goto normal_refill; +full_stripes: + bch_refill_keybuf(dc->disk.c, buf, &end, + dirty_full_stripe_pred); + } else { +normal_refill: + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + } if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { /* Searched the entire btree - delay awhile */ @@ -446,7 +486,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); - bch_keybuf_init(&dc->writeback_keys, dirty_pred); + bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; dc->writeback_running = true; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 5ce9771df047..c91f61bb95b6 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -1,6 +1,9 @@ #ifndef _BCACHE_WRITEBACK_H #define _BCACHE_WRITEBACK_H +#define CUTOFF_WRITEBACK 40 +#define CUTOFF_WRITEBACK_SYNC 70 + static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) { uint64_t i, ret = 0; @@ -11,6 +14,46 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } +static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, + uint64_t offset, + unsigned nr_sectors) +{ + uint64_t stripe = offset >> d->stripe_size_bits; + + while (1) { + if (atomic_read(d->stripe_sectors_dirty + stripe)) + return true; + + if (nr_sectors <= 1 << d->stripe_size_bits) + return false; + + nr_sectors -= 1 << d->stripe_size_bits; + stripe++; + } +} + +static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, + unsigned cache_mode, bool would_skip) +{ + unsigned in_use = dc->disk.c->gc_stats.in_use; + + if (cache_mode != CACHE_MODE_WRITEBACK || + atomic_read(&dc->disk.detaching) || + in_use > CUTOFF_WRITEBACK_SYNC) + return false; + + if (dc->partial_stripes_expensive && + bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, + bio_sectors(bio))) + return true; + + if (would_skip) + return false; + + return bio->bi_rw & REQ_SYNC || + in_use <= CUTOFF_WRITEBACK; +} + void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); void bch_writeback_queue(struct cached_dev *); void bch_writeback_add(struct cached_dev *); From bd206b51bca1e0a1c5f4b00218d56213a1f6d3bd Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 20 May 2013 00:04:35 +0900 Subject: [PATCH 12/20] doc: Fix typo in documentation/bcache.txt Correct spelling typo in documentation/bcache.txt Signed-off-by: Masanari Iida Signed-off-by: Kent Overstreet --- Documentation/bcache.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt index b3a7e7d384f6..c3365f26b2d9 100644 --- a/Documentation/bcache.txt +++ b/Documentation/bcache.txt @@ -181,7 +181,7 @@ want for getting the best possible numbers when benchmarking. In practice this isn't an issue because as soon as a write comes along it'll cause the btree node to be split, and you need almost no write traffic for - this to not show up enough to be noticable (especially since bcache's btree + this to not show up enough to be noticeable (especially since bcache's btree nodes are huge and index large regions of the device). But when you're benchmarking, if you're trying to warm the cache by reading a bunch of data and there's no other traffic - that can be a problem. @@ -222,7 +222,7 @@ running it's in passthrough mode or caching). sequential_cutoff - A sequential IO will bypass the cache once it passes this threshhold; the + A sequential IO will bypass the cache once it passes this threshold; the most recent 128 IOs are tracked so sequential IO can be detected even when it isn't all done at once. @@ -296,7 +296,7 @@ cache_miss_collisions since the synchronization for cache misses was rewritten) cache_readaheads - Count of times readahead occured. + Count of times readahead occurred. SYSFS - CACHE SET: @@ -362,7 +362,7 @@ unregister SYSFS - CACHE SET INTERNAL: This directory also exposes timings for a number of internal operations, with -separate files for average duration, average frequency, last occurence and max +separate files for average duration, average frequency, last occurrence and max duration: garbage collection, btree read, btree node sorts and btree splits. active_journal_entries @@ -417,7 +417,7 @@ freelist_percent space. io_errors - Number of errors that have occured, decayed by io_error_halflife. + Number of errors that have occurred, decayed by io_error_halflife. metadata_written Sum of all non data writes (btree writes and all other metadata). From a25c32bedeff3573b53572b87bcafe10ec5b75a9 Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Fri, 7 Jun 2013 23:27:01 +0200 Subject: [PATCH 13/20] bcache: Send a uevent with a cached device's UUID Signed-off-by: Gabriel de Perthuis --- drivers/md/bcache/super.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8c73f0c7f28a..a104dad2c7fa 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -825,6 +825,11 @@ static void calc_cached_dev_sectors(struct cache_set *c) void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; + char *env[] = { + "DRIVER=bcache", + kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), + NULL + }; if (atomic_xchg(&dc->running, 1)) return; @@ -841,10 +846,11 @@ void bch_cached_dev_run(struct cached_dev *dc) add_disk(d->disk); bd_link_disk_holder(dc->bdev, dc->disk.disk); -#if 0 - char *env[] = { "SYMLINK=label" , NULL }; + /* won't show up in the uevent file, use udevadm monitor -e instead + * only class / kset properties are persistent */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); -#endif + kfree(env[1]); + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) pr_debug("error creating sysfs link"); From ab9e14002e271eba41f7f9ab7e9b03cac4adc22d Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Sun, 9 Jun 2013 00:54:48 +0200 Subject: [PATCH 14/20] bcache: Send label uevents Signed-off-by: Gabriel de Perthuis Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 9 ++++++++- drivers/md/bcache/sysfs.c | 9 +++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a104dad2c7fa..cff2d182dfb0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -825,12 +825,18 @@ static void calc_cached_dev_sectors(struct cache_set *c) void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; + char buf[SB_LABEL_SIZE + 1]; char *env[] = { "DRIVER=bcache", kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), - NULL + NULL, + NULL, }; + memcpy(buf, dc->sb.label, SB_LABEL_SIZE); + buf[SB_LABEL_SIZE] = '\0'; + env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); + if (atomic_xchg(&dc->running, 1)) return; @@ -850,6 +856,7 @@ void bch_cached_dev_run(struct cached_dev *dc) * only class / kset properties are persistent */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); kfree(env[1]); + kfree(env[2]); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 70c6dff0d0cd..dd3f00a42729 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -178,6 +178,7 @@ STORE(__cached_dev) disk.kobj); unsigned v = size; struct cache_set *c; + struct kobj_uevent_env *env; #define d_strtoul(var) sysfs_strtoul(var, dc->var) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) @@ -222,6 +223,7 @@ STORE(__cached_dev) } if (attr == &sysfs_label) { + /* note: endlines are preserved */ memcpy(dc->sb.label, buf, SB_LABEL_SIZE); bch_write_bdev_super(dc, NULL); if (dc->disk.c) { @@ -229,6 +231,13 @@ STORE(__cached_dev) buf, SB_LABEL_SIZE); bch_uuid_write(dc->disk.c); } + env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); + add_uevent_var(env, "DRIVER=bcache"); + add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid), + add_uevent_var(env, "CACHED_LABEL=%s", buf); + kobject_uevent_env( + &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp); + kfree(env); } if (attr == &sysfs_attach) { From cecd628d9a9966ed0af1237df5cc5818945fe9f2 Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Thu, 27 Jun 2013 02:12:07 +0200 Subject: [PATCH 15/20] bcache: Refresh usage docs Mention udev autoregistration, symlinks. Write down some sysfs paths. Signed-off-by: Gabriel de Perthuis Signed-off-by: Kent Overstreet --- Documentation/bcache.txt | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt index c3365f26b2d9..32b6c3189d98 100644 --- a/Documentation/bcache.txt +++ b/Documentation/bcache.txt @@ -46,29 +46,33 @@ you format your backing devices and cache device at the same time, you won't have to manually attach: make-bcache -B /dev/sda /dev/sdb -C /dev/sdc -To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register: +bcache-tools now ships udev rules, and bcache devices are known to the kernel +immediately. Without udev, you can manually register devices like this: echo /dev/sdb > /sys/fs/bcache/register echo /dev/sdc > /sys/fs/bcache/register -To register your bcache devices automatically, you could add something like -this to an init script: +Registering the backing device makes the bcache device show up in /dev; you can +now format it and use it as normal. But the first time using a new bcache +device, it'll be running in passthrough mode until you attach it to a cache. +See the section on attaching. - echo /dev/sd* > /sys/fs/bcache/register_quiet +The devices show up as: -It'll look for bcache superblocks and ignore everything that doesn't have one. + /dev/bcache -Registering the backing device makes the bcache show up in /dev; you can now -format it and use it as normal. But the first time using a new bcache device, -it'll be running in passthrough mode until you attach it to a cache. See the -section on attaching. +As well as (with udev): -The devices show up at /dev/bcacheN, and can be controlled via sysfs from -/sys/block/bcacheN/bcache: + /dev/bcache/by-uuid/ + /dev/bcache/by-label/