From 85cbe1f88cb189322e3e4ef98816c19ab12161ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Feb 2014 13:44:06 -0800 Subject: bcache: Fix another compiler warning on m68k Use a bigger hammer this time Signed-off-by: Kent Overstreet Cc: linux-stable --- drivers/md/bcache/bset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 3f74b4b0747b..545416415305 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -23,8 +23,8 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) for (k = i->start; k < bset_bkey_last(i); k = next) { next = bkey_next(k); - printk(KERN_ERR "block %u key %li/%u: ", set, - (uint64_t *) k - i->d, i->keys); + printk(KERN_ERR "block %u key %u/%u: ", set, + (unsigned) ((u64 *) k - i->d), i->keys); if (b->ops->key_dump) b->ops->key_dump(b, k); -- cgit v1.2.3 From 1b4eaf3d3809a658c85911e92d9ff64086931efa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Jan 2014 15:04:18 -0800 Subject: bcache: Fix flash_dev_cache_miss() for real this time The code was using sectors to count the number of sectors it was zeroing... but then it passed it to bio_advance()... after it had been set to 0. Amusing... Signed-off-by: Kent Overstreet --- drivers/md/bcache/request.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 5d5d031cf381..fc14ba3f6d05 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1203,22 +1203,13 @@ void bch_cached_dev_request_init(struct cached_dev *dc) static int flash_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { - struct bio_vec bv; - struct bvec_iter iter; - - /* Zero fill bio */ - - bio_for_each_segment(bv, bio, iter) { - unsigned j = min(bv.bv_len >> 9, sectors); - - void *p = kmap(bv.bv_page); - memset(p + bv.bv_offset, 0, j << 9); - kunmap(bv.bv_page); + unsigned bytes = min(sectors, bio_sectors(bio)) << 9; - sectors -= j; - } + swap(bio->bi_iter.bi_size, bytes); + zero_fill_bio(bio); + swap(bio->bi_iter.bi_size, bytes); - bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size)); + bio_advance(bio, bytes); if (!bio->bi_iter.bi_size) return MAP_DONE; -- cgit v1.2.3 From dabb44334060b4b84051b34c58573e57cc7432b2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Feb 2014 19:48:26 -0800 Subject: bcache: Fix a shutdown bug Shutdown wasn't cancelling/waiting on journal_write_work() Signed-off-by: Kent Overstreet --- drivers/md/bcache/journal.c | 9 +++++++-- drivers/md/bcache/journal.h | 1 + drivers/md/bcache/super.c | 4 ++++ 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 18039affc306..e38c5997bf12 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -536,6 +536,7 @@ void bch_journal_next(struct journal *j) atomic_set(&fifo_back(&j->pin), 1); j->cur->data->seq = ++j->seq; + j->cur->dirty = false; j->cur->need_write = false; j->cur->data->keys = 0; @@ -731,7 +732,10 @@ static void journal_write_work(struct work_struct *work) struct cache_set, journal.work); spin_lock(&c->journal.lock); - journal_try_write(c); + if (c->journal.cur->dirty) + journal_try_write(c); + else + spin_unlock(&c->journal.lock); } /* @@ -761,7 +765,8 @@ atomic_t *bch_journal(struct cache_set *c, if (parent) { closure_wait(&w->wait, parent); journal_try_write(c); - } else if (!w->need_write) { + } else if (!w->dirty) { + w->dirty = true; schedule_delayed_work(&c->journal.work, msecs_to_jiffies(c->journal_delay_ms)); spin_unlock(&c->journal.lock); diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 9180c4465075..e3c39457afbb 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -95,6 +95,7 @@ struct journal_write { struct cache_set *c; struct closure_waitlist wait; + bool dirty; bool need_write; }; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 24a3a1546caa..c70521fe57a6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1403,6 +1403,10 @@ static void cache_set_flush(struct closure *cl) if (ca->alloc_thread) kthread_stop(ca->alloc_thread); + cancel_delayed_work_sync(&c->journal.work); + /* flush last journal entry if needed */ + c->journal.work.work.func(&c->journal.work.work); + closure_return(cl); } -- cgit v1.2.3 From 4fa03402cda2fac1a54248c7578b939d95931dc0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Mar 2014 18:58:55 -0700 Subject: bcache: Fix a lockdep splat in an error path Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c70521fe57a6..5136e11eadb0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1873,7 +1873,10 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page, if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) goto err; + mutex_lock(&bch_register_lock); err = register_cache_set(ca); + mutex_unlock(&bch_register_lock); + if (err) goto err; @@ -1935,8 +1938,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!try_module_get(THIS_MODULE)) return -EBUSY; - mutex_lock(&bch_register_lock); - if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) goto err; @@ -1969,7 +1970,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!dc) goto err_close; + mutex_lock(&bch_register_lock); register_bdev(sb, sb_page, bdev, dc); + mutex_unlock(&bch_register_lock); } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) @@ -1982,7 +1985,6 @@ out: put_page(sb_page); kfree(sb); kfree(path); - mutex_unlock(&bch_register_lock); module_put(THIS_MODULE); return ret; -- cgit v1.2.3 From 65ddf45a3102916fb622c71f7af158b19d49dc7f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Feb 2014 19:55:28 -0800 Subject: bcache: Fix a null ptr deref in journal replay Signed-off-by: Kent Overstreet --- drivers/md/bcache/journal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index e38c5997bf12..97e6a92da999 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -287,9 +287,13 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) k < bset_bkey_last(&i->j); k = bkey_next(k)) { unsigned j; + struct bucket *g; for (j = 0; j < KEY_PTRS(k); j++) { - struct bucket *g = PTR_BUCKET(c, k, j); + if (!ptr_available(c, k, j)) + continue; + + g = PTR_BUCKET(c, k, j); atomic_inc(&g->pin); if (g->prio == BTREE_PRIO && -- cgit v1.2.3 From 27201cfdaa2aeb571191494c1bae6863ffb04108 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2014 13:44:21 -0700 Subject: bcache: Fix a journalling reclaim after recovery bug On recovery we weren't correctly keeping track of what journal buckets had open journal entries, thus it was possible for them to be overwritten until we'd written all new journal entries. Signed-off-by: Kent Overstreet --- drivers/md/bcache/journal.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 97e6a92da999..4152a9119896 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -237,8 +237,14 @@ bsearch: for (i = 0; i < ca->sb.njournal_buckets; i++) if (ja->seq[i] > seq) { seq = ja->seq[i]; - ja->cur_idx = ja->discard_idx = - ja->last_idx = i; + /* + * When journal_reclaim() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + ja->last_idx = ja->discard_idx = (i + 1) % + ca->sb.njournal_buckets; } } -- cgit v1.2.3 From 0bd143fd800055b1db756693289bbebdb93f2a73 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Mar 2014 17:56:24 -0800 Subject: bcache: Fix a bug recovering from unclean shutdown The code to fixup incorrect bucket prios incorrectly did not skip btree node freeing keys Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5f9c2a665ca5..2d4a864865eb 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1726,9 +1726,9 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, !ptr_stale(b->c, k, i)) { g->gen = PTR_GEN(k, i); - if (b->level) + if (b->level && bkey_cmp(k, &ZERO_KEY)) g->prio = BTREE_PRIO; - else if (g->prio == BTREE_PRIO) + else if (!b->level && g->prio == BTREE_PRIO) g->prio = INITIAL_PRIO; } } -- cgit v1.2.3 From 487dded86ea065317aea121bec8f1816f2f235c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Mar 2014 15:13:26 -0700 Subject: bcache: Fix another bug recovering from unclean shutdown The on disk bucket gens are allowed to be out of date, when we reuse buckets that didn't have any live data in them. To deal with this, the initial gc has to update the bucket gen when we find a pointer gen newer than the bucket's gen. Unfortunately we weren't doing this for pointers in the journal that we're about to replay. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 82 +++++++++++++++++---------------------------- drivers/md/bcache/btree.h | 2 +- drivers/md/bcache/journal.c | 17 +++------- 3 files changed, 36 insertions(+), 65 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 2d4a864865eb..5f587ce57e3a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1126,7 +1126,8 @@ static int btree_check_reserve(struct btree *b, struct btree_op *op) /* Garbage collection */ -uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) +static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, + struct bkey *k) { uint8_t stale = 0; unsigned i; @@ -1177,6 +1178,26 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) +void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + !ptr_stale(c, k, i)) { + struct bucket *b = PTR_BUCKET(c, k, i); + + b->gen = PTR_GEN(k, i); + + if (level && bkey_cmp(k, &ZERO_KEY)) + b->prio = BTREE_PRIO; + else if (!level && b->prio == BTREE_PRIO) + b->prio = INITIAL_PRIO; + } + + __bch_btree_mark_key(c, level, k); +} + static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; @@ -1511,6 +1532,8 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, } } + __bch_btree_mark_key(b->c, b->level + 1, &b->key); + if (b->level) { ret = btree_gc_recurse(b, op, writes, gc); if (ret) @@ -1561,11 +1584,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) c->gc_mark_valid = 1; c->need_gc = 0; - if (c->root) - for (i = 0; i < KEY_PTRS(&c->root->key); i++) - SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), - GC_MARK_METADATA); - for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), GC_MARK_METADATA); @@ -1705,36 +1723,16 @@ int bch_gc_thread_start(struct cache_set *c) /* Initial partial gc */ -static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, - unsigned long **seen) +static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; - unsigned i; struct bkey *k, *p = NULL; - struct bucket *g; struct btree_iter iter; - for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { - for (i = 0; i < KEY_PTRS(k); i++) { - if (!ptr_available(b->c, k, i)) - continue; - - g = PTR_BUCKET(b->c, k, i); - - if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), - seen[PTR_DEV(k, i)]) || - !ptr_stale(b->c, k, i)) { - g->gen = PTR_GEN(k, i); - - if (b->level && bkey_cmp(k, &ZERO_KEY)) - g->prio = BTREE_PRIO; - else if (!b->level && g->prio == BTREE_PRIO) - g->prio = INITIAL_PRIO; - } - } + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(b->c, b->level, k); - btree_mark_key(b, k); - } + bch_initial_mark_key(b->c, b->level + 1, &b->key); if (b->level) { bch_btree_iter_init(&b->keys, &iter, NULL); @@ -1746,40 +1744,22 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, btree_node_prefetch(b->c, k, b->level - 1); if (p) - ret = btree(check_recurse, p, b, op, seen); + ret = btree(check_recurse, p, b, op); p = k; } while (p && !ret); } - return 0; + return ret; } int bch_btree_check(struct cache_set *c) { - int ret = -ENOMEM; - unsigned i; - unsigned long *seen[MAX_CACHES_PER_SET]; struct btree_op op; - memset(seen, 0, sizeof(seen)); bch_btree_op_init(&op, SHRT_MAX); - for (i = 0; c->cache[i]; i++) { - size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); - seen[i] = kmalloc(n, GFP_KERNEL); - if (!seen[i]) - goto err; - - /* Disables the seen array until prio_read() uses it too */ - memset(seen[i], 0xFF, n); - } - - ret = btree_root(check_recurse, c, &op, seen); -err: - for (i = 0; i < MAX_CACHES_PER_SET; i++) - kfree(seen[i]); - return ret; + return btree_root(check_recurse, c, &op); } /* Btree insertion */ diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index af065e97e55c..def9dc4a822f 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -251,7 +251,7 @@ int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); void bch_moving_gc(struct cache_set *); int bch_btree_check(struct cache_set *); -uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); +void bch_initial_mark_key(struct cache_set *, int, struct bkey *); static inline void wake_up_gc(struct cache_set *c) { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 4152a9119896..cf8e0932aad2 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -293,21 +293,12 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) k < bset_bkey_last(&i->j); k = bkey_next(k)) { unsigned j; - struct bucket *g; - for (j = 0; j < KEY_PTRS(k); j++) { - if (!ptr_available(c, k, j)) - continue; + for (j = 0; j < KEY_PTRS(k); j++) + if (ptr_available(c, k, j)) + atomic_inc(&PTR_BUCKET(c, k, j)->pin); - g = PTR_BUCKET(c, k, j); - atomic_inc(&g->pin); - - if (g->prio == BTREE_PRIO && - !ptr_stale(c, k, j)) - g->prio = INITIAL_PRIO; - } - - __bch_btree_mark_key(c, 0, k); + bch_initial_mark_key(c, 0, k); } } } -- cgit v1.2.3 From 90db6919f5f1614d1b7a92052445506bc6c564d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2014 17:26:40 -0800 Subject: bcache: Fix discard granularity blk_stack_limits() doesn't like a discard granularity of 0. Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5136e11eadb0..fb343276beef 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -843,6 +843,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, q->limits.max_segment_size = UINT_MAX; q->limits.max_segments = BIO_MAX_PAGES; q->limits.max_discard_sectors = UINT_MAX; + q->limits.discard_granularity = 512; q->limits.io_min = block_size; q->limits.logical_block_size = block_size; q->limits.physical_block_size = block_size; -- cgit v1.2.3 From da415a096fc06e49d1a15f7a06bcfe6ad44c5d38 Mon Sep 17 00:00:00 2001 From: Nicholas Swenson Date: Thu, 9 Jan 2014 16:03:04 -0800 Subject: bcache: Fix moving_gc deadlocking with a foreground write Deadlock happened because a foreground write slept, waiting for a bucket to be allocated. Normally the gc would mark buckets available for invalidation. But the moving_gc was stuck waiting for outstanding writes to complete. These writes used the bcache_wq, the same queue foreground writes used. This fix gives moving_gc its own work queue, so it was still finish moving even if foreground writes are stuck waiting for allocation. It also makes work queue a parameter to the data_insert path, so moving_gc can use its workqueue for writes. Signed-off-by: Nicholas Swenson Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 ++ drivers/md/bcache/movinggc.c | 5 +++-- drivers/md/bcache/request.c | 13 +++++++------ drivers/md/bcache/request.h | 1 + drivers/md/bcache/super.c | 3 +++ 5 files changed, 16 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index a4c7306ff43d..6d814f463d9e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -628,6 +628,8 @@ struct cache_set { /* Number of moving GC bios in flight */ struct semaphore moving_in_flight; + struct workqueue_struct *moving_gc_wq; + struct btree *root; #ifdef CONFIG_BCACHE_DEBUG diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 9eb60d102de8..8c7205186d08 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -115,7 +115,7 @@ static void write_moving(struct closure *cl) closure_call(&op->cl, bch_data_insert, NULL, cl); } - continue_at(cl, write_moving_finish, system_wq); + continue_at(cl, write_moving_finish, op->wq); } static void read_moving_submit(struct closure *cl) @@ -125,7 +125,7 @@ static void read_moving_submit(struct closure *cl) bch_submit_bbio(bio, io->op.c, &io->w->key, 0); - continue_at(cl, write_moving, system_wq); + continue_at(cl, write_moving, io->op.wq); } static void read_moving(struct cache_set *c) @@ -160,6 +160,7 @@ static void read_moving(struct cache_set *c) io->w = w; io->op.inode = KEY_INODE(&w->key); io->op.c = c; + io->op.wq = c->moving_gc_wq; moving_init(io); bio = &io->bio.bio; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index fc14ba3f6d05..3e880869871f 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -248,7 +248,7 @@ static void bch_data_insert_keys(struct closure *cl) atomic_dec_bug(journal_ref); if (!op->insert_data_done) - continue_at(cl, bch_data_insert_start, bcache_wq); + continue_at(cl, bch_data_insert_start, op->wq); bch_keylist_free(&op->insert_keys); closure_return(cl); @@ -297,7 +297,7 @@ static void bch_data_invalidate(struct closure *cl) op->insert_data_done = true; bio_put(bio); out: - continue_at(cl, bch_data_insert_keys, bcache_wq); + continue_at(cl, bch_data_insert_keys, op->wq); } static void bch_data_insert_error(struct closure *cl) @@ -340,7 +340,7 @@ static void bch_data_insert_endio(struct bio *bio, int error) if (op->writeback) op->error = error; else if (!op->replace) - set_closure_fn(cl, bch_data_insert_error, bcache_wq); + set_closure_fn(cl, bch_data_insert_error, op->wq); else set_closure_fn(cl, NULL, NULL); } @@ -376,7 +376,7 @@ static void bch_data_insert_start(struct closure *cl) if (bch_keylist_realloc(&op->insert_keys, 3 + (op->csum ? 1 : 0), op->c)) - continue_at(cl, bch_data_insert_keys, bcache_wq); + continue_at(cl, bch_data_insert_keys, op->wq); k = op->insert_keys.top; bkey_init(k); @@ -413,7 +413,7 @@ static void bch_data_insert_start(struct closure *cl) } while (n != bio); op->insert_data_done = true; - continue_at(cl, bch_data_insert_keys, bcache_wq); + continue_at(cl, bch_data_insert_keys, op->wq); err: /* bch_alloc_sectors() blocks if s->writeback = true */ BUG_ON(op->writeback); @@ -442,7 +442,7 @@ err: bio_put(bio); if (!bch_keylist_empty(&op->insert_keys)) - continue_at(cl, bch_data_insert_keys, bcache_wq); + continue_at(cl, bch_data_insert_keys, op->wq); else closure_return(cl); } @@ -824,6 +824,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.error = 0; s->iop.flags = 0; s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->iop.wq = bcache_wq; return s; } diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 39f21dbedc38..c117c4082aa2 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -7,6 +7,7 @@ struct data_insert_op { struct closure cl; struct cache_set *c; struct bio *bio; + struct workqueue_struct *wq; unsigned inode; uint16_t write_point; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fb343276beef..ddfde380b49f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1356,6 +1356,8 @@ static void cache_set_free(struct closure *cl) bch_bset_sort_state_free(&c->sort); free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); + if (c->moving_gc_wq) + destroy_workqueue(c->moving_gc_wq); if (c->bio_split) bioset_free(c->bio_split); if (c->fill_iter) @@ -1522,6 +1524,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || + !(c->moving_gc_wq = create_workqueue("bcache_gc")) || bch_journal_alloc(c) || bch_btree_cache_alloc(c) || bch_open_buckets_alloc(c) || -- cgit v1.2.3 From 10d9dcf6ee5909e1aabd3685c60fdd1b1306d046 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Feb 2014 15:48:36 -0800 Subject: bcache: Fix moving_pred() Avoid a potential null pointer deref (e.g. from check keys for cache misses) Signed-off-by: Kent Overstreet --- drivers/md/bcache/movinggc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 8c7205186d08..5e8e58701d37 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -24,12 +24,10 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) moving_gc_keys); unsigned i; - for (i = 0; i < KEY_PTRS(k); i++) { - struct bucket *g = PTR_BUCKET(c, k, i); - - if (GC_MOVE(g)) + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + GC_MOVE(PTR_BUCKET(c, k, i))) return true; - } return false; } -- cgit v1.2.3 From 3f6ef38110b6955327fea3105f004a3b61a3f65f Mon Sep 17 00:00:00 2001 From: Nicholas Swenson Date: Thu, 23 Jan 2014 15:21:02 -0800 Subject: bcache: stop moving_gc marking buckets that can't be moved. Signed-off-by: Nicholas Swenson --- drivers/md/bcache/movinggc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 5e8e58701d37..cd7490311e51 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -215,7 +215,10 @@ void bch_moving_gc(struct cache_set *c) ca->heap.used = 0; for_each_bucket(b, ca) { - if (!GC_SECTORS_USED(b)) + if (GC_MARK(b) == GC_MARK_METADATA || + !GC_SECTORS_USED(b) || + GC_SECTORS_USED(b) == ca->sb.bucket_size || + atomic_read(&b->pin)) continue; if (!heap_full(&ca->heap)) { -- cgit v1.2.3 From 3f5e0a34daed197aa55d0c6b466bb4cd03babb4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Jan 2014 04:42:58 -0800 Subject: bcache: Kill dead cgroup code This hasn't been used or even enabled in ages. Signed-off-by: Kent Overstreet --- drivers/md/bcache/Kconfig | 8 --- drivers/md/bcache/btree.c | 4 -- drivers/md/bcache/request.c | 169 -------------------------------------------- drivers/md/bcache/request.h | 18 ----- drivers/md/bcache/stats.c | 3 - 5 files changed, 202 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 2638417b19aa..4d200883c505 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG Keeps all active closures in a linked list and provides a debugfs interface to list them, which makes it possible to see asynchronous operations that get stuck. - -# cgroup code needs to be updated: -# -#config CGROUP_BCACHE -# bool "Cgroup controls for bcache" -# depends on BCACHE && BLK_CGROUP -# ---help--- -# TODO diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5f587ce57e3a..ea5a59e2d740 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -68,15 +68,11 @@ * alloc_bucket() cannot fail. This should be true but is not completely * obvious. * - * Make sure all allocations get charged to the root cgroup - * * Plugging? * * If data write is less than hard sector size of ssd, round up offset in open * bucket to the next whole sector * - * Also lookup by cgroup in get_open_bucket() - * * Superblock needs to be fleshed out for multiple cache devices * * Add a sysfs tunable for the number of writeback IOs in flight diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 3e880869871f..15fff4f68a7c 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -12,11 +12,9 @@ #include "request.h" #include "writeback.h" -#include #include #include #include -#include "blk-cgroup.h" #include @@ -27,171 +25,13 @@ struct kmem_cache *bch_search_cache; static void bch_data_insert_start(struct closure *); -/* Cgroup interface */ - -#ifdef CONFIG_CGROUP_BCACHE -static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; - -static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) -{ - struct cgroup_subsys_state *css; - return cgroup && - (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) - ? container_of(css, struct bch_cgroup, css) - : &bcache_default_cgroup; -} - -struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) -{ - struct cgroup_subsys_state *css = bio->bi_css - ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) - : task_subsys_state(current, bcache_subsys_id); - - return css - ? container_of(css, struct bch_cgroup, css) - : &bcache_default_cgroup; -} - -static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) -{ - char tmp[1024]; - int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, - cgroup_to_bcache(cgrp)->cache_mode + 1); - - if (len < 0) - return len; - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} - -static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, - const char *buf) -{ - int v = bch_read_string_list(buf, bch_cache_modes); - if (v < 0) - return v; - - cgroup_to_bcache(cgrp)->cache_mode = v - 1; - return 0; -} - -static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) -{ - return cgroup_to_bcache(cgrp)->verify; -} - -static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) -{ - cgroup_to_bcache(cgrp)->verify = val; - return 0; -} - -static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_hits); -} - -static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_misses); -} - -static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, - struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_bypass_hits); -} - -static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, - struct cftype *cft) -{ - struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); - return atomic_read(&bcachecg->stats.cache_bypass_misses); -} - -static struct cftype bch_files[] = { - { - .name = "cache_mode", - .read = cache_mode_read, - .write_string = cache_mode_write, - }, - { - .name = "verify", - .read_u64 = bch_verify_read, - .write_u64 = bch_verify_write, - }, - { - .name = "cache_hits", - .read_u64 = bch_cache_hits_read, - }, - { - .name = "cache_misses", - .read_u64 = bch_cache_misses_read, - }, - { - .name = "cache_bypass_hits", - .read_u64 = bch_cache_bypass_hits_read, - }, - { - .name = "cache_bypass_misses", - .read_u64 = bch_cache_bypass_misses_read, - }, - { } /* terminate */ -}; - -static void init_bch_cgroup(struct bch_cgroup *cg) -{ - cg->cache_mode = -1; -} - -static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) -{ - struct bch_cgroup *cg; - - cg = kzalloc(sizeof(*cg), GFP_KERNEL); - if (!cg) - return ERR_PTR(-ENOMEM); - init_bch_cgroup(cg); - return &cg->css; -} - -static void bcachecg_destroy(struct cgroup *cgroup) -{ - struct bch_cgroup *cg = cgroup_to_bcache(cgroup); - kfree(cg); -} - -struct cgroup_subsys bcache_subsys = { - .create = bcachecg_create, - .destroy = bcachecg_destroy, - .subsys_id = bcache_subsys_id, - .name = "bcache", - .module = THIS_MODULE, -}; -EXPORT_SYMBOL_GPL(bcache_subsys); -#endif - static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) { -#ifdef CONFIG_CGROUP_BCACHE - int r = bch_bio_to_cgroup(bio)->cache_mode; - if (r >= 0) - return r; -#endif return BDEV_CACHE_MODE(&dc->sb); } static bool verify(struct cached_dev *dc, struct bio *bio) { -#ifdef CONFIG_CGROUP_BCACHE - if (bch_bio_to_cgroup(bio)->verify) - return true; -#endif return dc->verify; } @@ -1305,9 +1145,6 @@ void bch_flash_dev_request_init(struct bcache_device *d) void bch_request_exit(void) { -#ifdef CONFIG_CGROUP_BCACHE - cgroup_unload_subsys(&bcache_subsys); -#endif if (bch_search_cache) kmem_cache_destroy(bch_search_cache); } @@ -1318,11 +1155,5 @@ int __init bch_request_init(void) if (!bch_search_cache) return -ENOMEM; -#ifdef CONFIG_CGROUP_BCACHE - cgroup_load_subsys(&bcache_subsys); - init_bch_cgroup(&bcache_default_cgroup); - - cgroup_add_cftypes(&bcache_subsys, bch_files); -#endif return 0; } diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index c117c4082aa2..1ff36875c2b3 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -1,8 +1,6 @@ #ifndef _BCACHE_REQUEST_H_ #define _BCACHE_REQUEST_H_ -#include - struct data_insert_op { struct closure cl; struct cache_set *c; @@ -42,20 +40,4 @@ void bch_flash_dev_request_init(struct bcache_device *d); extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; -struct bch_cgroup { -#ifdef CONFIG_CGROUP_BCACHE - struct cgroup_subsys_state css; -#endif - /* - * We subtract one from the index into bch_cache_modes[], so that - * default == -1; this makes it so the rest match up with d->cache_mode, - * and we use d->cache_mode if cgrp->cache_mode < 0 - */ - short cache_mode; - bool verify; - struct cache_stat_collector stats; -}; - -struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio); - #endif /* _BCACHE_REQUEST_H_ */ diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 84d0782f702e..0ca072c20d0d 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -201,9 +201,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, struct cached_dev *dc = container_of(d, struct cached_dev, disk); mark_cache_stats(&dc->accounting.collector, hit, bypass); mark_cache_stats(&c->accounting.collector, hit, bypass); -#ifdef CONFIG_CGROUP_BCACHE - mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); -#endif } void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) -- cgit v1.2.3 From 7159b1ad3dded9da040b5c608acf3d52d50f661e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Feb 2014 18:43:32 -0800 Subject: bcache: Better alloc tracepoints Change the invalidate tracepoint to indicate how much data we're invalidating, and change the alloc tracepoints to indicate what offset they're for. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 15 ++++++++++---- drivers/md/bcache/trace.c | 2 +- include/trace/events/bcache.h | 48 ++++++++++++++++++++++++++++++------------- 3 files changed, 46 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index c0d37d082443..a3e1427945f2 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -162,10 +162,15 @@ static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) static void invalidate_one_bucket(struct cache *ca, struct bucket *b) { + size_t bucket = b - ca->buckets; + + if (GC_SECTORS_USED(b)) + trace_bcache_invalidate(ca, bucket); + bch_inc_gen(ca, b); b->prio = INITIAL_PRIO; atomic_inc(&b->pin); - fifo_push(&ca->free_inc, b - ca->buckets); + fifo_push(&ca->free_inc, bucket); } /* @@ -301,8 +306,6 @@ static void invalidate_buckets(struct cache *ca) invalidate_buckets_random(ca); break; } - - trace_bcache_alloc_invalidate(ca); } #define allocator_wait(ca, cond) \ @@ -408,8 +411,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) fifo_pop(&ca->free[reserve], r)) goto out; - if (!wait) + if (!wait) { + trace_bcache_alloc_fail(ca, reserve); return -1; + } do { prepare_to_wait(&ca->set->bucket_wait, &w, @@ -425,6 +430,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) out: wake_up_process(ca->alloc_thread); + trace_bcache_alloc(ca, reserve); + if (expensive_debug_checks(ca->set)) { size_t iter; long i; diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index adbc3df17a80..b7820b0d2621 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -45,7 +45,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 7110897c3dfa..8fc2a7134d3c 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -399,26 +399,43 @@ TRACE_EVENT(bcache_keyscan, /* Allocator */ -TRACE_EVENT(bcache_alloc_invalidate, - TP_PROTO(struct cache *ca), - TP_ARGS(ca), +TRACE_EVENT(bcache_invalidate, + TP_PROTO(struct cache *ca, size_t bucket), + TP_ARGS(ca, bucket), TP_STRUCT__entry( - __field(unsigned, free ) - __field(unsigned, free_inc ) - __field(unsigned, free_inc_size ) - __field(unsigned, unused ) + __field(unsigned, sectors ) + __field(dev_t, dev ) + __field(__u64, offset ) ), TP_fast_assign( - __entry->free = fifo_used(&ca->free[RESERVE_NONE]); - __entry->free_inc = fifo_used(&ca->free_inc); - __entry->free_inc_size = ca->free_inc.size; - __entry->unused = fifo_used(&ca->unused); + __entry->dev = ca->bdev->bd_dev; + __entry->offset = bucket << ca->set->bucket_bits; + __entry->sectors = GC_SECTORS_USED(&ca->buckets[bucket]); ), - TP_printk("free %u free_inc %u/%u unused %u", __entry->free, - __entry->free_inc, __entry->free_inc_size, __entry->unused) + TP_printk("invalidated %u sectors at %d,%d sector=%llu", + __entry->sectors, MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->offset) +); + +TRACE_EVENT(bcache_alloc, + TP_PROTO(struct cache *ca, size_t bucket), + TP_ARGS(ca, bucket), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(__u64, offset ) + ), + + TP_fast_assign( + __entry->dev = ca->bdev->bd_dev; + __entry->offset = bucket << ca->set->bucket_bits; + ), + + TP_printk("allocated %d,%d sector=%llu", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->offset) ); TRACE_EVENT(bcache_alloc_fail, @@ -426,6 +443,7 @@ TRACE_EVENT(bcache_alloc_fail, TP_ARGS(ca, reserve), TP_STRUCT__entry( + __field(dev_t, dev ) __field(unsigned, free ) __field(unsigned, free_inc ) __field(unsigned, unused ) @@ -433,13 +451,15 @@ TRACE_EVENT(bcache_alloc_fail, ), TP_fast_assign( + __entry->dev = ca->bdev->bd_dev; __entry->free = fifo_used(&ca->free[reserve]); __entry->free_inc = fifo_used(&ca->free_inc); __entry->unused = fifo_used(&ca->unused); __entry->blocked = atomic_read(&ca->set->prio_blocked); ), - TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free, + TP_printk("alloc fail %d,%d free %u free_inc %u unused %u blocked %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->free, __entry->free_inc, __entry->unused, __entry->blocked) ); -- cgit v1.2.3 From 15754020524a56517df082799f07de880f4b29e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Feb 2014 17:34:21 -0800 Subject: bcache: Improve priority_stats Break down data into clean data/dirty data/metadata. Signed-off-by: Kent Overstreet --- drivers/md/bcache/sysfs.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d8458d477a12..662b9484ed5e 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -761,7 +761,9 @@ SHOW(__bch_cache) int cmp(const void *l, const void *r) { return *((uint16_t *) r) - *((uint16_t *) l); } - size_t n = ca->sb.nbuckets, i, unused, btree; + struct bucket *b; + size_t n = ca->sb.nbuckets, i; + size_t unused = 0, available = 0, dirty = 0, meta = 0; uint64_t sum = 0; /* Compute 31 quantiles */ uint16_t q[31], *p, *cached; @@ -772,6 +774,17 @@ SHOW(__bch_cache) return -ENOMEM; mutex_lock(&ca->set->bucket_lock); + for_each_bucket(b, ca) { + if (!GC_SECTORS_USED(b)) + unused++; + if (GC_MARK(b) == GC_MARK_RECLAIMABLE) + available++; + if (GC_MARK(b) == GC_MARK_DIRTY) + dirty++; + if (GC_MARK(b) == GC_MARK_METADATA) + meta++; + } + for (i = ca->sb.first_bucket; i < n; i++) p[i] = ca->buckets[i].prio; mutex_unlock(&ca->set->bucket_lock); @@ -786,10 +799,7 @@ SHOW(__bch_cache) while (cached < p + n && *cached == BTREE_PRIO) - cached++; - - btree = cached - p; - n -= btree; + cached++, n--; for (i = 0; i < n; i++) sum += INITIAL_PRIO - cached[i]; @@ -805,12 +815,16 @@ SHOW(__bch_cache) ret = scnprintf(buf, PAGE_SIZE, "Unused: %zu%%\n" + "Clean: %zu%%\n" + "Dirty: %zu%%\n" "Metadata: %zu%%\n" "Average: %llu\n" "Sectors per Q: %zu\n" "Quantiles: [", unused * 100 / (size_t) ca->sb.nbuckets, - btree * 100 / (size_t) ca->sb.nbuckets, sum, + available * 100 / (size_t) ca->sb.nbuckets, + dirty * 100 / (size_t) ca->sb.nbuckets, + meta * 100 / (size_t) ca->sb.nbuckets, sum, n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); for (i = 0; i < ARRAY_SIZE(q); i++) -- cgit v1.2.3 From c13f3af9247db929fe1be86c0442ef161e615ac4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Jan 2014 21:22:02 -0800 Subject: bcache: Add bch_keylist_init_single() This will potentially save us an allocation when we've got inode/dirent bkeys that don't fit in the keylist's inline keys. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.h | 6 ++++++ drivers/md/bcache/journal.c | 5 +---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 003260f4ddf6..5f6728d5d4dd 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -478,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l) l->top_p = l->keys_p = l->inline_keys; } +static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k) +{ + l->keys = k; + l->top = bkey_next(k); +} + static inline void bch_keylist_push(struct keylist *l) { l->top = bkey_next(l->top); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index cf8e0932aad2..c8bfc28cd2bd 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -313,8 +313,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) uint64_t start = i->j.last_seq, end = i->j.seq, n = start; struct keylist keylist; - bch_keylist_init(&keylist); - list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); @@ -327,8 +325,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) k = bkey_next(k)) { trace_bcache_journal_replay_key(k); - bkey_copy(keylist.top, k); - bch_keylist_push(&keylist); + bch_keylist_init_single(&keylist, k); ret = bch_btree_insert(s, &keylist, i->pin, NULL); if (ret) -- cgit v1.2.3 From 4fe6a816707aace9e8e297b708411c5930537793 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2014 13:46:29 -0700 Subject: bcache: Add a real GC_MARK_RECLAIMABLE This means the garbage collection code can better check for data and metadata pointers to the same buckets. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 5 +++-- drivers/md/bcache/bcache.h | 6 +++--- drivers/md/bcache/btree.c | 18 ++++++++++++------ drivers/md/bcache/extents.c | 6 +++--- 4 files changed, 21 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a3e1427945f2..5ba4eaea57f4 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -155,7 +155,8 @@ add: static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) { - return GC_MARK(b) == GC_MARK_RECLAIMABLE && + return (!GC_MARK(b) || + GC_MARK(b) == GC_MARK_RECLAIMABLE) && !atomic_read(&b->pin) && can_inc_bucket_gen(b); } @@ -475,7 +476,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) { struct bucket *b = PTR_BUCKET(c, k, i); - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); bch_bucket_add_unused(PTR_CACHE(c, k, i), b); } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6d814f463d9e..014236e411d8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -207,9 +207,9 @@ struct bucket { */ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); -#define GC_MARK_RECLAIMABLE 0 -#define GC_MARK_DIRTY 1 -#define GC_MARK_METADATA 2 +#define GC_MARK_RECLAIMABLE 1 +#define GC_MARK_DIRTY 2 +#define GC_MARK_METADATA 3 #define GC_SECTORS_USED_SIZE 13 #define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ea5a59e2d740..1672db348c8b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1160,6 +1160,8 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, SET_GC_MARK(g, GC_MARK_METADATA); else if (KEY_DIRTY(k)) SET_GC_MARK(g, GC_MARK_DIRTY); + else if (!GC_MARK(g)) + SET_GC_MARK(g, GC_MARK_RECLAIMABLE); /* guard against overflow */ SET_GC_SECTORS_USED(g, min_t(unsigned, @@ -1559,7 +1561,7 @@ static void btree_gc_start(struct cache_set *c) for_each_bucket(b, ca) { b->gc_gen = b->gen; if (!atomic_read(&b->pin)) { - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); } } @@ -1622,12 +1624,16 @@ size_t bch_btree_gc_finish(struct cache_set *c) b->last_gc = b->gc_gen; c->need_gc = max(c->need_gc, bucket_gc_gen(b)); - if (!atomic_read(&b->pin) && - GC_MARK(b) == GC_MARK_RECLAIMABLE) { + if (atomic_read(&b->pin)) + continue; + + BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); + + if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) available++; - if (!GC_SECTORS_USED(b)) - bch_bucket_add_unused(ca, b); - } + + if (!GC_MARK(b)) + bch_bucket_add_unused(ca, b); } } diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 416d1a3e028e..82d5e3288a6c 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -499,9 +499,9 @@ static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, if (mutex_trylock(&b->c->bucket_lock)) { if (b->c->gc_mark_valid && - ((GC_MARK(g) != GC_MARK_DIRTY && - KEY_DIRTY(k)) || - GC_MARK(g) == GC_MARK_METADATA)) + (!GC_MARK(g) || + GC_MARK(g) == GC_MARK_METADATA || + (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k)))) goto err; if (g->prio == BTREE_PRIO) -- cgit v1.2.3 From 05335cff9f01555b769ac97b7bacc472b7ed047a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Mar 2014 18:22:34 -0700 Subject: bcache: Fix a race when freeing btree nodes This isn't a bulletproof fix; btree_node_free() -> bch_bucket_free() puts the bucket on the unused freelist, where it can be reused right away without any ordering requirements. It would be better to wait on at least a journal write to go down before reusing the bucket. bch_btree_set_root() does this, and inserting into non leaf nodes is completely synchronous so we should be ok, but future patches are just going to get rid of the unused freelist - it was needed in the past for various reasons but shouldn't be anymore. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 53 ++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 33 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 1672db348c8b..e83732e2d912 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1006,8 +1006,6 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) static void btree_node_free(struct btree *b) { - unsigned i; - trace_bcache_btree_node_free(b); BUG_ON(b == b->c->root); @@ -1019,14 +1017,6 @@ static void btree_node_free(struct btree *b) cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); - - for (i = 0; i < KEY_PTRS(&b->key); i++) { - BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); - - bch_inc_gen(PTR_CACHE(b->c, &b->key, i), - PTR_BUCKET(b->c, &b->key, i)); - } - bch_bucket_free(b->c, &b->key); mca_bucket_free(b); mutex_unlock(&b->c->bucket_lock); @@ -1086,16 +1076,19 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) { unsigned i; + mutex_lock(&b->c->bucket_lock); + + atomic_inc(&b->c->prio_blocked); + bkey_copy(k, &b->key); bkey_copy_key(k, &ZERO_KEY); - for (i = 0; i < KEY_PTRS(k); i++) { - uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; - - SET_PTR_GEN(k, i, g); - } + for (i = 0; i < KEY_PTRS(k); i++) + SET_PTR_GEN(k, i, + bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + PTR_BUCKET(b->c, &b->key, i))); - atomic_inc(&b->c->prio_blocked); + mutex_unlock(&b->c->bucket_lock); } static int btree_check_reserve(struct btree *b, struct btree_op *op) @@ -1342,6 +1335,13 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, bch_keylist_add(keylist, &new_nodes[i]->key); } + closure_sync(&cl); + + /* We emptied out this node */ + BUG_ON(btree_bset_first(new_nodes[0])->keys); + btree_node_free(new_nodes[0]); + rw_unlock(true, new_nodes[0]); + for (i = 0; i < nodes; i++) { if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) goto out_nocoalesce; @@ -1350,12 +1350,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, bch_keylist_push(keylist); } - /* We emptied out this node */ - BUG_ON(btree_bset_first(new_nodes[0])->keys); - btree_node_free(new_nodes[0]); - rw_unlock(true, new_nodes[0]); - - closure_sync(&cl); + bch_btree_insert_node(b, op, keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(keylist)); for (i = 0; i < nodes; i++) { btree_node_free(r[i].b); @@ -1364,9 +1360,6 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, r[i].b = new_nodes[i]; } - bch_btree_insert_node(b, op, keylist, NULL, NULL); - BUG_ON(!bch_keylist_empty(keylist)); - memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); r[nodes - 1].b = ERR_PTR(-EINTR); @@ -1456,12 +1449,11 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, keys.top); bch_keylist_push(&keys); - btree_node_free(last->b); - bch_btree_insert_node(b, op, &keys, NULL, NULL); BUG_ON(!bch_keylist_empty(&keys)); + btree_node_free(last->b); rw_unlock(true, last->b); last->b = n; @@ -1924,26 +1916,21 @@ static int btree_split(struct btree *b, struct btree_op *op, closure_sync(&cl); bch_btree_set_root(n3); rw_unlock(true, n3); - - btree_node_free(b); } else if (!b->parent) { /* Root filled up but didn't need to be split */ closure_sync(&cl); bch_btree_set_root(n1); - - btree_node_free(b); } else { /* Split a non root node */ closure_sync(&cl); make_btree_freeing_key(b, parent_keys.top); bch_keylist_push(&parent_keys); - btree_node_free(b); - bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); BUG_ON(!bch_keylist_empty(&parent_keys)); } + btree_node_free(b); rw_unlock(true, n1); bch_time_stats_update(&b->c->btree_split_time, start_time); -- cgit v1.2.3 From 2a285686c109816ba71a00b9278262cf02648258 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Mar 2014 16:42:42 -0800 Subject: bcache: btree locking rework Add a new lock, b->write_lock, which is required to actually modify - or write - a btree node; this lock is only held for short durations. This means we can write out a btree node without taking b->lock, which _is_ held for long durations - solving a deadlock when btree_flush_write() (from the journalling code) is called with a btree node locked. Right now just occurs in bch_btree_set_root(), but with an upcoming journalling rework is going to happen a lot more. This also turns b->lock is now more of a read/intent lock instead of a read/write lock - but not completely, since it still blocks readers. May turn it into a real intent lock at some point in the future. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 164 ++++++++++++++++++++++++++++++++------------ drivers/md/bcache/btree.h | 3 + drivers/md/bcache/journal.c | 9 ++- drivers/md/bcache/super.c | 9 ++- 4 files changed, 133 insertions(+), 52 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index e83732e2d912..01b1b7e23cf2 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -167,6 +167,20 @@ static inline struct bset *write_block(struct btree *b) return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); } +static void bch_btree_init_next(struct btree *b) +{ + /* If not a leaf node, always sort */ + if (b->level && b->keys.nsets) + bch_btree_sort(&b->keys, &b->c->sort); + else + bch_btree_sort_lazy(&b->keys, &b->c->sort); + + if (b->written < btree_blocks(b)) + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); + +} + /* Btree key manipulation */ void bkey_put(struct cache_set *c, struct bkey *k) @@ -438,10 +452,12 @@ static void do_btree_node_write(struct btree *b) } } -void bch_btree_node_write(struct btree *b, struct closure *parent) +void __bch_btree_node_write(struct btree *b, struct closure *parent) { struct bset *i = btree_bset_last(b); + lockdep_assert_held(&b->write_lock); + trace_bcache_btree_write(b); BUG_ON(current->bio_list); @@ -465,23 +481,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); b->written += set_blocks(i, block_bytes(b->c)); +} - /* If not a leaf node, always sort */ - if (b->level && b->keys.nsets) - bch_btree_sort(&b->keys, &b->c->sort); - else - bch_btree_sort_lazy(&b->keys, &b->c->sort); +void bch_btree_node_write(struct btree *b, struct closure *parent) +{ + unsigned nsets = b->keys.nsets; + + lockdep_assert_held(&b->lock); + + __bch_btree_node_write(b, parent); /* * do verify if there was more than one set initially (i.e. we did a * sort) and we sorted down to a single set: */ - if (i != b->keys.set->data && !b->keys.nsets) + if (nsets && !b->keys.nsets) bch_btree_verify(b); - if (b->written < btree_blocks(b)) - bch_bset_init_next(&b->keys, write_block(b), - bset_magic(&b->c->sb)); + bch_btree_init_next(b); } static void bch_btree_node_write_sync(struct btree *b) @@ -489,7 +506,11 @@ static void bch_btree_node_write_sync(struct btree *b) struct closure cl; closure_init_stack(&cl); + + mutex_lock(&b->write_lock); bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + closure_sync(&cl); } @@ -497,11 +518,10 @@ static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); - rw_lock(true, b, b->level); - + mutex_lock(&b->write_lock); if (btree_node_dirty(b)) - bch_btree_node_write(b, NULL); - rw_unlock(true, b); + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); } static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) @@ -509,6 +529,8 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) struct bset *i = btree_bset_last(b); struct btree_write *w = btree_current_write(b); + lockdep_assert_held(&b->write_lock); + BUG_ON(!b->written); BUG_ON(!i->keys); @@ -593,6 +615,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, init_rwsem(&b->lock); lockdep_set_novalidate_class(&b->lock); + mutex_init(&b->write_lock); + lockdep_set_novalidate_class(&b->write_lock); INIT_LIST_HEAD(&b->list); INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; @@ -626,8 +650,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) up(&b->io_mutex); } + mutex_lock(&b->write_lock); if (btree_node_dirty(b)) - bch_btree_node_write_sync(b); + __bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); /* wait for any in flight btree write */ down(&b->io_mutex); @@ -1010,10 +1038,14 @@ static void btree_node_free(struct btree *b) BUG_ON(b == b->c->root); + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); + mutex_unlock(&b->write_lock); + cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); @@ -1065,8 +1097,10 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) { struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); if (!IS_ERR_OR_NULL(n)) { + mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); bkey_copy_key(&n->key, &b->key); + mutex_unlock(&n->write_lock); } return n; @@ -1269,6 +1303,9 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, goto out_nocoalesce; } + for (i = 0; i < nodes; i++) + mutex_lock(&new_nodes[i]->write_lock); + for (i = nodes - 1; i > 0; --i) { struct bset *n1 = btree_bset_first(new_nodes[i]); struct bset *n2 = btree_bset_first(new_nodes[i - 1]); @@ -1335,6 +1372,9 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, bch_keylist_add(keylist, &new_nodes[i]->key); } + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); + closure_sync(&cl); /* We emptied out this node */ @@ -1399,7 +1439,6 @@ static unsigned btree_gc_count_keys(struct btree *b) static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { - unsigned i; int ret = 0; bool should_rewrite; struct btree *n; @@ -1407,13 +1446,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct keylist keys; struct btree_iter iter; struct gc_merge_info r[GC_MERGE_NODES]; - struct gc_merge_info *last = r + GC_MERGE_NODES - 1; + struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; bch_keylist_init(&keys); bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); - for (i = 0; i < GC_MERGE_NODES; i++) - r[i].b = ERR_PTR(-EINTR); + for (i = r; i < r + ARRAY_SIZE(r); i++) + i->b = ERR_PTR(-EINTR); while (1) { k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); @@ -1443,6 +1482,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!IS_ERR_OR_NULL(n)) { bch_btree_node_write_sync(n); + bch_keylist_add(&keys, &n->key); make_btree_freeing_key(last->b, @@ -1475,8 +1515,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, * Must flush leaf nodes before gc ends, since replace * operations aren't journalled */ + mutex_lock(&last->b->write_lock); if (btree_node_dirty(last->b)) bch_btree_node_write(last->b, writes); + mutex_unlock(&last->b->write_lock); rw_unlock(true, last->b); } @@ -1489,11 +1531,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, } } - for (i = 0; i < GC_MERGE_NODES; i++) - if (!IS_ERR_OR_NULL(r[i].b)) { - if (btree_node_dirty(r[i].b)) - bch_btree_node_write(r[i].b, writes); - rw_unlock(true, r[i].b); + for (i = r; i < r + ARRAY_SIZE(r); i++) + if (!IS_ERR_OR_NULL(i->b)) { + mutex_lock(&i->b->write_lock); + if (btree_node_dirty(i->b)) + bch_btree_node_write(i->b, writes); + mutex_unlock(&i->b->write_lock); + rw_unlock(true, i->b); } bch_keylist_free(&keys); @@ -1514,6 +1558,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (!IS_ERR_OR_NULL(n)) { bch_btree_node_write_sync(n); + bch_btree_set_root(n); btree_node_free(b); rw_unlock(true, n); @@ -1871,6 +1916,9 @@ static int btree_split(struct btree *b, struct btree_op *op, goto err_free2; } + mutex_lock(&n1->write_lock); + mutex_lock(&n2->write_lock); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); /* @@ -1897,21 +1945,26 @@ static int btree_split(struct btree *b, struct btree_op *op, bch_keylist_add(&parent_keys, &n2->key); bch_btree_node_write(n2, &cl); + mutex_unlock(&n2->write_lock); rw_unlock(true, n2); } else { trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); + mutex_lock(&n1->write_lock); bch_btree_insert_keys(n1, op, insert_keys, replace_key); } bch_keylist_add(&parent_keys, &n1->key); bch_btree_node_write(n1, &cl); + mutex_unlock(&n1->write_lock); if (n3) { /* Depth increases, make a new root */ + mutex_lock(&n3->write_lock); bkey_copy_key(&n3->key, &MAX_KEY); bch_btree_insert_keys(n3, op, &parent_keys, NULL); bch_btree_node_write(n3, &cl); + mutex_unlock(&n3->write_lock); closure_sync(&cl); bch_btree_set_root(n3); @@ -1960,33 +2013,54 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, atomic_t *journal_ref, struct bkey *replace_key) { + struct closure cl; + BUG_ON(b->level && replace_key); + closure_init_stack(&cl); + + mutex_lock(&b->write_lock); + + if (write_block(b) != btree_bset_last(b) && + b->keys.last_set_unwritten) + bch_btree_init_next(b); /* just wrote a set */ + if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { - if (current->bio_list) { - op->lock = b->c->root->level + 1; - return -EAGAIN; - } else if (op->lock <= b->c->root->level) { - op->lock = b->c->root->level + 1; - return -EINTR; - } else { - /* Invalidated all iterators */ - int ret = btree_split(b, op, insert_keys, replace_key); + mutex_unlock(&b->write_lock); + goto split; + } - return bch_keylist_empty(insert_keys) ? - 0 : ret ?: -EINTR; - } - } else { - BUG_ON(write_block(b) != btree_bset_last(b)); + BUG_ON(write_block(b) != btree_bset_last(b)); - if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { - if (!b->level) - bch_btree_leaf_dirty(b, journal_ref); - else - bch_btree_node_write_sync(b); - } + if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { + if (!b->level) + bch_btree_leaf_dirty(b, journal_ref); + else + bch_btree_node_write(b, &cl); + } - return 0; + mutex_unlock(&b->write_lock); + + /* wait for btree node write if necessary, after unlock */ + closure_sync(&cl); + + return 0; +split: + if (current->bio_list) { + op->lock = b->c->root->level + 1; + return -EAGAIN; + } else if (op->lock <= b->c->root->level) { + op->lock = b->c->root->level + 1; + return -EINTR; + } else { + /* Invalidated all iterators */ + int ret = btree_split(b, op, insert_keys, replace_key); + + if (bch_keylist_empty(insert_keys)) + return 0; + else if (!ret) + return -EINTR; + return ret; } } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index def9dc4a822f..acebf26809cc 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -127,6 +127,8 @@ struct btree { struct cache_set *c; struct btree *parent; + struct mutex write_lock; + unsigned long flags; uint16_t written; /* would be nice to kill */ uint8_t level; @@ -236,6 +238,7 @@ static inline void rw_unlock(bool w, struct btree *b) } void bch_btree_node_read_done(struct btree *); +void __bch_btree_node_write(struct btree *, struct closure *); void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_set_root(struct btree *); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index c8bfc28cd2bd..59e82021b5bb 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -381,16 +381,15 @@ retry: b = best; if (b) { - rw_lock(true, b, b->level); - + mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { - rw_unlock(true, b); + mutex_unlock(&b->write_lock); /* We raced */ goto retry; } - bch_btree_node_write(b, NULL); - rw_unlock(true, b); + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); } } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ddfde380b49f..9ded06434e11 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1398,9 +1398,12 @@ static void cache_set_flush(struct closure *cl) list_add(&c->root->list, &c->btree_cache); /* Should skip this if we're unregistering because of an error */ - list_for_each_entry(b, &c->btree_cache, list) + list_for_each_entry(b, &c->btree_cache, list) { + mutex_lock(&b->write_lock); if (btree_node_dirty(b)) - bch_btree_node_write(b, NULL); + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); + } for_each_cache(ca, c, i) if (ca->alloc_thread) @@ -1667,8 +1670,10 @@ static void run_cache_set(struct cache_set *c) if (IS_ERR_OR_NULL(c->root)) goto err; + mutex_lock(&c->root->write_lock); bkey_copy_key(&c->root->key, &MAX_KEY); bch_btree_node_write(c->root, &cl); + mutex_unlock(&c->root->write_lock); bch_btree_set_root(c->root); rw_unlock(true, c->root); -- cgit v1.2.3 From 56b30770b27d54d68ad51eccc6d888282b568cee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Jan 2014 01:44:55 -0800 Subject: bcache: Kill btree_io_wq With the locking rework in the last patch, this shouldn't be needed anymore - btree_node_write_work() only takes b->write_lock which is never held for very long. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 -- drivers/md/bcache/btree.c | 22 ++-------------------- drivers/md/bcache/super.c | 2 -- 3 files changed, 2 insertions(+), 24 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 014236e411d8..15d26236caf9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -962,7 +962,5 @@ void bch_debug_exit(void); int bch_debug_init(struct kobject *); void bch_request_exit(void); int bch_request_init(void); -void bch_btree_exit(void); -int bch_btree_init(void); #endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 01b1b7e23cf2..beb32551da78 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -93,8 +93,6 @@ #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) -static struct workqueue_struct *btree_io_wq; - #define insert_lock(s, b) ((b)->level <= (s)->lock) /* @@ -362,8 +360,7 @@ static void __btree_node_write_done(struct closure *cl) btree_complete_write(b, w); if (btree_node_dirty(b)) - queue_delayed_work(btree_io_wq, &b->work, - msecs_to_jiffies(30000)); + schedule_delayed_work(&b->work, 30 * HZ); closure_return_with_destructor(cl, btree_node_write_unlock); } @@ -535,7 +532,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) BUG_ON(!i->keys); if (!btree_node_dirty(b)) - queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); + schedule_delayed_work(&b->work, 30 * HZ); set_btree_node_dirty(b); @@ -2446,18 +2443,3 @@ void bch_keybuf_init(struct keybuf *buf) spin_lock_init(&buf->lock); array_allocator_init(&buf->freelist); } - -void bch_btree_exit(void) -{ - if (btree_io_wq) - destroy_workqueue(btree_io_wq); -} - -int __init bch_btree_init(void) -{ - btree_io_wq = create_singlethread_workqueue("bch_btree_io"); - if (!btree_io_wq) - return -ENOMEM; - - return 0; -} diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 9ded06434e11..307fe378ea43 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2072,7 +2072,6 @@ static void bcache_exit(void) { bch_debug_exit(); bch_request_exit(); - bch_btree_exit(); if (bcache_kobj) kobject_put(bcache_kobj); if (bcache_wq) @@ -2102,7 +2101,6 @@ static int __init bcache_init(void) if (!(bcache_wq = create_workqueue("bcache")) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || sysfs_create_files(bcache_kobj, files) || - bch_btree_init() || bch_request_init() || bch_debug_init(bcache_kobj)) goto err; -- cgit v1.2.3 From 0a63b66db566cffdf90182eb6e66fdd4d0479e63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Mar 2014 17:15:53 -0700 Subject: bcache: Rework btree cache reserve handling This changes the bucket allocation reserves to use _real_ reserves - separate freelists - instead of watermarks, which if nothing else makes the current code saner to reason about and is going to be important in the future when we add support for multiple btrees. It also adds btree_check_reserve(), which checks (and locks) the reserves for both bucket allocation and memory allocation for btree nodes; the old code just kinda sorta assumed that since (e.g. for btree node splits) it had the root locked and that meant no other threads could try to make use of the same reserve; this technically should have been ok for memory allocation (we should always have a reserve for memory allocation (the btree node cache is used as a reserve and we preallocate it)), but multiple btrees will mean that locking the root won't be sufficient anymore, and for the bucket allocation reserve it was technically possible for the old code to deadlock. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 23 +---- drivers/md/bcache/bcache.h | 15 +-- drivers/md/bcache/btree.c | 225 +++++++++++++++++++++++++++------------------ drivers/md/bcache/btree.h | 5 +- drivers/md/bcache/super.c | 13 +-- drivers/md/bcache/sysfs.c | 3 - 6 files changed, 145 insertions(+), 139 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 5ba4eaea57f4..a59ef6147fc7 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -375,6 +375,7 @@ static int bch_allocator_thread(void *arg) } allocator_wait(ca, bch_allocator_push(ca, bucket)); + wake_up(&ca->set->btree_cache_wait); wake_up(&ca->set->bucket_wait); } @@ -717,25 +718,3 @@ int bch_cache_allocator_start(struct cache *ca) ca->alloc_thread = k; return 0; } - -int bch_cache_allocator_init(struct cache *ca) -{ - /* - * Reserve: - * Prio/gen writes first - * Then 8 for btree allocations - * Then half for the moving garbage collector - */ -#if 0 - ca->watermark[WATERMARK_PRIO] = 0; - - ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); - - ca->watermark[WATERMARK_MOVINGGC] = 8 + - ca->watermark[WATERMARK_METADATA]; - - ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + - ca->watermark[WATERMARK_MOVINGGC]; -#endif - return 0; -} diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 15d26236caf9..171cda89cb6b 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -562,19 +562,16 @@ struct cache_set { struct list_head btree_cache_freed; /* Number of elements in btree_cache + btree_cache_freeable lists */ - unsigned bucket_cache_used; + unsigned btree_cache_used; /* * If we need to allocate memory for a new btree node and that * allocation fails, we can cannibalize another node in the btree cache - * to satisfy the allocation. However, only one thread can be doing this - * at a time, for obvious reasons - try_harder and try_wait are - * basically a lock for this that we can wait on asynchronously. The - * btree_root() macro releases the lock when it returns. + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: */ - struct task_struct *try_harder; - wait_queue_head_t try_wait; - uint64_t try_harder_start; + wait_queue_head_t btree_cache_wait; + struct task_struct *btree_cache_alloc_lock; /* * When we free a btree node, we increment the gen of the bucket the @@ -669,7 +666,6 @@ struct cache_set { struct time_stats btree_gc_time; struct time_stats btree_split_time; struct time_stats btree_read_time; - struct time_stats try_harder_time; atomic_long_t cache_read_races; atomic_long_t writeback_keys_done; @@ -956,7 +952,6 @@ int bch_open_buckets_alloc(struct cache_set *); void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); -int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); int bch_debug_init(struct kobject *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index beb32551da78..be90596a9e2a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -117,7 +117,7 @@ ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ if (!IS_ERR(_child)) { \ _child->parent = (b); \ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ @@ -146,17 +146,12 @@ _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ } \ rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ if (_r == -EINTR) \ schedule(); \ - bch_cannibalize_unlock(c); \ - if (_r == -ENOSPC) { \ - wait_event((c)->try_wait, \ - !(c)->try_harder); \ - _r = -EINTR; \ - } \ } while (_r == -EINTR); \ \ - finish_wait(&(c)->bucket_wait, &(op)->wait); \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ _r; \ }) @@ -563,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) #define mca_reserve(c) (((c->root && c->root->level) \ ? c->root->level : 1) * 8 + 16) #define mca_can_free(c) \ - max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) + max_t(int, 0, c->btree_cache_used - mca_reserve(c)) static void mca_data_free(struct btree *b) { @@ -571,7 +566,7 @@ static void mca_data_free(struct btree *b) bch_btree_keys_free(&b->keys); - b->c->bucket_cache_used--; + b->c->btree_cache_used--; list_move(&b->list, &b->c->btree_cache_freed); } @@ -596,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) ilog2(b->c->btree_pages), btree_order(k)), gfp)) { - b->c->bucket_cache_used++; + b->c->btree_cache_used++; list_move(&b->list, &b->c->btree_cache); } else { list_move(&b->list, &b->c->btree_cache_freed); @@ -675,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, if (c->shrinker_disabled) return SHRINK_STOP; - if (c->try_harder) + if (c->btree_cache_alloc_lock) return SHRINK_STOP; /* Return -1 if we can't do anything right now */ @@ -707,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, } } - for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { + for (i = 0; (nr--) && i < c->btree_cache_used; i++) { if (list_empty(&c->btree_cache)) goto out; @@ -736,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, if (c->shrinker_disabled) return 0; - if (c->try_harder) + if (c->btree_cache_alloc_lock) return 0; return mca_can_free(c) * c->btree_pages; @@ -840,17 +835,30 @@ out: return b; } -static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) +static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) +{ + struct task_struct *old; + + old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); + if (old && old != current) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + return -EINTR; + } + + return 0; +} + +static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, + struct bkey *k) { struct btree *b; trace_bcache_btree_cache_cannibalize(c); - if (!c->try_harder) { - c->try_harder = current; - c->try_harder_start = local_clock(); - } else if (c->try_harder != current) - return ERR_PTR(-ENOSPC); + if (mca_cannibalize_lock(c, op)) + return ERR_PTR(-EINTR); list_for_each_entry_reverse(b, &c->btree_cache, list) if (!mca_reap(b, btree_order(k), false)) @@ -860,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) if (!mca_reap(b, btree_order(k), true)) return b; + WARN(1, "btree cache cannibalize failed\n"); return ERR_PTR(-ENOMEM); } @@ -871,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) */ static void bch_cannibalize_unlock(struct cache_set *c) { - if (c->try_harder == current) { - bch_time_stats_update(&c->try_harder_time, c->try_harder_start); - c->try_harder = NULL; - wake_up(&c->try_wait); + if (c->btree_cache_alloc_lock == current) { + c->btree_cache_alloc_lock = NULL; + wake_up(&c->btree_cache_wait); } } -static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) +static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level) { struct btree *b; @@ -941,7 +950,7 @@ err: if (b) rw_unlock(true, b); - b = mca_cannibalize(c, k); + b = mca_cannibalize(c, op, k); if (!IS_ERR(b)) goto out; @@ -957,8 +966,8 @@ err: * The btree node will have either a read or a write lock held, depending on * level and op->lock. */ -struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, - int level, bool write) +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level, bool write) { int i = 0; struct btree *b; @@ -972,7 +981,7 @@ retry: return ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level); + b = mca_alloc(c, op, k, level); mutex_unlock(&c->bucket_lock); if (!b) @@ -1018,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) struct btree *b; mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level); + b = mca_alloc(c, NULL, k, level); mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { @@ -1051,20 +1060,21 @@ static void btree_node_free(struct btree *b) mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) +struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL)) goto err; bkey_put(c, &k.key); SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); - b = mca_alloc(c, &k.key, level); + b = mca_alloc(c, op, &k.key, level); if (IS_ERR(b)) goto err_free; @@ -1090,9 +1100,10 @@ err: return b; } -static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) +static struct btree *btree_node_alloc_replacement(struct btree *b, + struct btree_op *op) { - struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); + struct btree *n = bch_btree_node_alloc(b->c, op, b->level); if (!IS_ERR_OR_NULL(n)) { mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); @@ -1126,22 +1137,22 @@ static int btree_check_reserve(struct btree *b, struct btree_op *op) { struct cache_set *c = b->c; struct cache *ca; - unsigned i, reserve = c->root->level * 2 + 1; - int ret = 0; + unsigned i, reserve = (c->root->level - b->level) * 2 + 1; mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { if (op) - prepare_to_wait(&c->bucket_wait, &op->wait, + prepare_to_wait(&c->btree_cache_wait, &op->wait, TASK_UNINTERRUPTIBLE); - ret = -EINTR; - break; + mutex_unlock(&c->bucket_lock); + return -EINTR; } mutex_unlock(&c->bucket_lock); - return ret; + + return mca_cannibalize_lock(b->c, op); } /* Garbage collection */ @@ -1273,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *, struct keylist *, atomic_t *, struct bkey *); static int btree_gc_coalesce(struct btree *b, struct btree_op *op, - struct keylist *keylist, struct gc_stat *gc, - struct gc_merge_info *r) + struct gc_stat *gc, struct gc_merge_info *r) { unsigned i, nodes = 0, keys = 0, blocks; struct btree *new_nodes[GC_MERGE_NODES]; + struct keylist keylist; struct closure cl; struct bkey *k; + bch_keylist_init(&keylist); + + if (btree_check_reserve(b, NULL)) + return 0; + memset(new_nodes, 0, sizeof(new_nodes)); closure_init_stack(&cl); @@ -1295,11 +1311,20 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, return 0; for (i = 0; i < nodes; i++) { - new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); + new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); if (IS_ERR_OR_NULL(new_nodes[i])) goto out_nocoalesce; } + /* + * We have to check the reserve here, after we've allocated our new + * nodes, to make sure the insert below will succeed - we also check + * before as an optimization to potentially avoid a bunch of expensive + * allocs/sorts + */ + if (btree_check_reserve(b, NULL)) + goto out_nocoalesce; + for (i = 0; i < nodes; i++) mutex_lock(&new_nodes[i]->write_lock); @@ -1361,12 +1386,12 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, n2->keys -= keys; - if (__bch_keylist_realloc(keylist, + if (__bch_keylist_realloc(&keylist, bkey_u64s(&new_nodes[i]->key))) goto out_nocoalesce; bch_btree_node_write(new_nodes[i], &cl); - bch_keylist_add(keylist, &new_nodes[i]->key); + bch_keylist_add(&keylist, &new_nodes[i]->key); } for (i = 0; i < nodes; i++) @@ -1380,15 +1405,15 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, rw_unlock(true, new_nodes[0]); for (i = 0; i < nodes; i++) { - if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) + if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) goto out_nocoalesce; - make_btree_freeing_key(r[i].b, keylist->top); - bch_keylist_push(keylist); + make_btree_freeing_key(r[i].b, keylist.top); + bch_keylist_push(&keylist); } - bch_btree_insert_node(b, op, keylist, NULL, NULL); - BUG_ON(!bch_keylist_empty(keylist)); + bch_btree_insert_node(b, op, &keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keylist)); for (i = 0; i < nodes; i++) { btree_node_free(r[i].b); @@ -1403,13 +1428,16 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; + bch_keylist_free(&keylist); + /* Invalidated our iterator */ return -EINTR; out_nocoalesce: closure_sync(&cl); + bch_keylist_free(&keylist); - while ((k = bch_keylist_pop(keylist))) + while ((k = bch_keylist_pop(&keylist))) if (!bkey_cmp(k, &ZERO_KEY)) atomic_dec(&b->c->prio_blocked); @@ -1421,6 +1449,42 @@ out_nocoalesce: return 0; } +static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, + struct btree *replace) +{ + struct keylist keys; + struct btree *n; + + if (btree_check_reserve(b, NULL)) + return 0; + + n = btree_node_alloc_replacement(replace, NULL); + + /* recheck reserve after allocating replacement node */ + if (btree_check_reserve(b, NULL)) { + btree_node_free(n); + rw_unlock(true, n); + return 0; + } + + bch_btree_node_write_sync(n); + + bch_keylist_init(&keys); + bch_keylist_add(&keys, &n->key); + + make_btree_freeing_key(replace, keys.top); + bch_keylist_push(&keys); + + bch_btree_insert_node(b, op, &keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keys)); + + btree_node_free(replace); + rw_unlock(true, n); + + /* Invalidated our iterator */ + return -EINTR; +} + static unsigned btree_gc_count_keys(struct btree *b) { struct bkey *k; @@ -1438,14 +1502,11 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, { int ret = 0; bool should_rewrite; - struct btree *n; struct bkey *k; - struct keylist keys; struct btree_iter iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - bch_keylist_init(&keys); bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) @@ -1454,7 +1515,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, while (1) { k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) { - r->b = bch_btree_node_get(b->c, k, b->level - 1, true); + r->b = bch_btree_node_get(b->c, op, k, b->level - 1, + true); if (IS_ERR(r->b)) { ret = PTR_ERR(r->b); break; @@ -1462,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, r->keys = btree_gc_count_keys(r->b); - ret = btree_gc_coalesce(b, op, &keys, gc, r); + ret = btree_gc_coalesce(b, op, gc, r); if (ret) break; } @@ -1472,32 +1534,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!IS_ERR(last->b)) { should_rewrite = btree_gc_mark_node(last->b, gc); - if (should_rewrite && - !btree_check_reserve(b, NULL)) { - n = btree_node_alloc_replacement(last->b, - false); - - if (!IS_ERR_OR_NULL(n)) { - bch_btree_node_write_sync(n); - - bch_keylist_add(&keys, &n->key); - - make_btree_freeing_key(last->b, - keys.top); - bch_keylist_push(&keys); - - bch_btree_insert_node(b, op, &keys, - NULL, NULL); - BUG_ON(!bch_keylist_empty(&keys)); - - btree_node_free(last->b); - rw_unlock(true, last->b); - last->b = n; - - /* Invalidated our iterator */ - ret = -EINTR; + if (should_rewrite) { + ret = btree_gc_rewrite_node(b, op, last->b); + if (ret) break; - } } if (last->b->level) { @@ -1537,8 +1577,6 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, rw_unlock(true, i->b); } - bch_keylist_free(&keys); - return ret; } @@ -1551,7 +1589,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, should_rewrite = btree_gc_mark_node(b, gc); if (should_rewrite) { - n = btree_node_alloc_replacement(b, false); + n = btree_node_alloc_replacement(b, NULL); if (!IS_ERR_OR_NULL(n)) { bch_btree_node_write_sync(n); @@ -1887,11 +1925,14 @@ static int btree_split(struct btree *b, struct btree_op *op, closure_init_stack(&cl); bch_keylist_init(&parent_keys); - if (!b->level && - btree_check_reserve(b, op)) - return -EINTR; + if (btree_check_reserve(b, op)) { + if (!b->level) + return -EINTR; + else + WARN(1, "insufficient reserve for split\n"); + } - n1 = btree_node_alloc_replacement(b, true); + n1 = btree_node_alloc_replacement(b, op); if (IS_ERR(n1)) goto err; @@ -1903,12 +1944,12 @@ static int btree_split(struct btree *b, struct btree_op *op, trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); - n2 = bch_btree_node_alloc(b->c, b->level, true); + n2 = bch_btree_node_alloc(b->c, op, b->level); if (IS_ERR(n2)) goto err_free1; if (!b->parent) { - n3 = bch_btree_node_alloc(b->c, b->level + 1, true); + n3 = bch_btree_node_alloc(b->c, op, b->level + 1); if (IS_ERR(n3)) goto err_free2; } @@ -1995,7 +2036,7 @@ err_free1: btree_node_free(n1); rw_unlock(true, n1); err: - WARN(1, "bcache: btree split failed"); + WARN(1, "bcache: btree split failed (level %u)", b->level); if (n3 == ERR_PTR(-EAGAIN) || n2 == ERR_PTR(-EAGAIN) || diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index acebf26809cc..3ce371fa7f98 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -242,8 +242,9 @@ void __bch_btree_node_write(struct btree *, struct closure *); void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); -struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); +struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int); +struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, + struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 307fe378ea43..2d4a56219ec7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1495,14 +1495,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); - init_waitqueue_head(&c->try_wait); + init_waitqueue_head(&c->btree_cache_wait); init_waitqueue_head(&c->bucket_wait); sema_init(&c->uuid_write_mutex, 1); spin_lock_init(&c->btree_gc_time.lock); spin_lock_init(&c->btree_split_time.lock); spin_lock_init(&c->btree_read_time.lock); - spin_lock_init(&c->try_harder_time.lock); bch_moving_init_cache_set(c); @@ -1591,7 +1590,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, k, j->btree_level, true); + c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true); if (IS_ERR_OR_NULL(c->root)) goto err; @@ -1666,7 +1665,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "cannot allocate new btree root"; - c->root = bch_btree_node_alloc(c, 0, true); + c->root = bch_btree_node_alloc(c, NULL, 0); if (IS_ERR_OR_NULL(c->root)) goto err; @@ -1847,13 +1846,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) for_each_bucket(b, ca) atomic_set(&b->pin, 0); - if (bch_cache_allocator_init(ca)) - goto err; - return 0; -err: - kobject_put(&ca->kobj); - return -ENOMEM; } static void register_cache(struct cache_sb *sb, struct page *sb_page, diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 662b9484ed5e..89aaa2ef38f9 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms); sysfs_time_stats_attribute(btree_split, sec, us); sysfs_time_stats_attribute(btree_sort, ms, us); sysfs_time_stats_attribute(btree_read, ms, us); -sysfs_time_stats_attribute(try_harder, ms, us); read_attribute(btree_nodes); read_attribute(btree_used_percent); @@ -534,7 +533,6 @@ lock_root: sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); - sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); sysfs_print(btree_used_percent, btree_used(c)); sysfs_print(btree_nodes, c->gc_stats.nodes); @@ -709,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = { sysfs_time_stats_attribute_list(btree_split, sec, us) sysfs_time_stats_attribute_list(btree_sort, ms, us) sysfs_time_stats_attribute_list(btree_read, ms, us) - sysfs_time_stats_attribute_list(try_harder, ms, us) &sysfs_btree_nodes, &sysfs_btree_used_percent, -- cgit v1.2.3 From 2531d9ee61fa08a5a9ab8f002c50779888d232c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Mar 2014 16:55:55 -0700 Subject: bcache: Kill unused freelist This was originally added as at optimization that for various reasons isn't needed anymore, but it does add a lot of nasty corner cases (and it was responsible for some recently fixed bugs). Just get rid of it now. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 140 +++++++++++++++++------------------------- drivers/md/bcache/bcache.h | 28 ++------- drivers/md/bcache/btree.c | 41 +++++++++++-- drivers/md/bcache/btree.h | 2 +- drivers/md/bcache/super.c | 24 +++----- include/trace/events/bcache.h | 6 +- 6 files changed, 112 insertions(+), 129 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a59ef6147fc7..443d03fbac47 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); - if (CACHE_SYNC(&ca->set->sb)) { - ca->need_save_prio = max(ca->need_save_prio, - bucket_disk_gen(b)); - WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); - } - return ret; } @@ -120,58 +114,46 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) mutex_unlock(&c->bucket_lock); } -/* Allocation */ +/* + * Background allocation thread: scans for buckets to be invalidated, + * invalidates them, rewrites prios/gens (marking them as invalidated on disk), + * then optionally issues discard commands to the newly free buckets, then puts + * them on the various freelists. + */ static inline bool can_inc_bucket_gen(struct bucket *b) { - return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && - bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; + return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX; } -bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) +bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { - BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); + BUG_ON(!ca->set->gc_mark_valid); - if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) { - unsigned i; - - for (i = 0; i < RESERVE_NONE; i++) - if (!fifo_full(&ca->free[i])) - goto add; - - return false; - } -add: - b->prio = 0; - - if (can_inc_bucket_gen(b) && - fifo_push(&ca->unused, b - ca->buckets)) { - atomic_inc(&b->pin); - return true; - } - - return false; -} - -static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) -{ return (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) && !atomic_read(&b->pin) && can_inc_bucket_gen(b); } -static void invalidate_one_bucket(struct cache *ca, struct bucket *b) +void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) { - size_t bucket = b - ca->buckets; + lockdep_assert_held(&ca->set->bucket_lock); + BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE); if (GC_SECTORS_USED(b)) - trace_bcache_invalidate(ca, bucket); + trace_bcache_invalidate(ca, b - ca->buckets); bch_inc_gen(ca, b); b->prio = INITIAL_PRIO; atomic_inc(&b->pin); - fifo_push(&ca->free_inc, bucket); +} + +static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + __bch_invalidate_one_bucket(ca, b); + + fifo_push(&ca->free_inc, b - ca->buckets); } /* @@ -201,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca) ca->heap.used = 0; for_each_bucket(b, ca) { - /* - * If we fill up the unused list, if we then return before - * adding anything to the free_inc list we'll skip writing - * prios/gens and just go back to allocating from the unused - * list: - */ - if (fifo_full(&ca->unused)) - return; - - if (!can_invalidate_bucket(ca, b)) - continue; - - if (!GC_SECTORS_USED(b) && - bch_bucket_add_unused(ca, b)) + if (!bch_can_invalidate_bucket(ca, b)) continue; if (!heap_full(&ca->heap)) @@ -239,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca) return; } - invalidate_one_bucket(ca, b); + bch_invalidate_one_bucket(ca, b); } } @@ -255,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca) b = ca->buckets + ca->fifo_last_bucket++; - if (can_invalidate_bucket(ca, b)) - invalidate_one_bucket(ca, b); + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); if (++checked >= ca->sb.nbuckets) { ca->invalidate_needs_gc = 1; @@ -280,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca) b = ca->buckets + n; - if (can_invalidate_bucket(ca, b)) - invalidate_one_bucket(ca, b); + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); if (++checked >= ca->sb.nbuckets / 2) { ca->invalidate_needs_gc = 1; @@ -293,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca) static void invalidate_buckets(struct cache *ca) { - if (ca->invalidate_needs_gc) - return; + BUG_ON(ca->invalidate_needs_gc); switch (CACHE_REPLACEMENT(&ca->sb)) { case CACHE_REPLACEMENT_LRU: @@ -354,17 +322,10 @@ static int bch_allocator_thread(void *arg) * possibly issue discards to them, then we add the bucket to * the free list: */ - while (1) { + while (!fifo_empty(&ca->free_inc)) { long bucket; - if ((!atomic_read(&ca->set->prio_blocked) || - !CACHE_SYNC(&ca->set->sb)) && - !fifo_empty(&ca->unused)) - fifo_pop(&ca->unused, bucket); - else if (!fifo_empty(&ca->free_inc)) - fifo_pop(&ca->free_inc, bucket); - else - break; + fifo_pop(&ca->free_inc, bucket); if (ca->discard) { mutex_unlock(&ca->set->bucket_lock); @@ -385,9 +346,9 @@ static int bch_allocator_thread(void *arg) * them to the free_inc list: */ +retry_invalidate: allocator_wait(ca, ca->set->gc_mark_valid && - (ca->need_save_prio > 64 || - !ca->invalidate_needs_gc)); + !ca->invalidate_needs_gc); invalidate_buckets(ca); /* @@ -395,13 +356,28 @@ static int bch_allocator_thread(void *arg) * new stuff to them: */ allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); - if (CACHE_SYNC(&ca->set->sb) && - (!fifo_empty(&ca->free_inc) || - ca->need_save_prio > 64)) + if (CACHE_SYNC(&ca->set->sb)) { + /* + * This could deadlock if an allocation with a btree + * node locked ever blocked - having the btree node + * locked would block garbage collection, but here we're + * waiting on garbage collection before we invalidate + * and free anything. + * + * But this should be safe since the btree code always + * uses btree_check_reserve() before allocating now, and + * if it fails it blocks without btree nodes locked. + */ + if (!fifo_full(&ca->free_inc)) + goto retry_invalidate; + bch_prio_write(ca); + } } } +/* Allocation */ + long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) { DEFINE_WAIT(w); @@ -447,8 +423,6 @@ out: BUG_ON(i == r); fifo_for_each(i, &ca->free_inc, iter) BUG_ON(i == r); - fifo_for_each(i, &ca->unused, iter) - BUG_ON(i == r); } b = ca->buckets + r; @@ -470,17 +444,19 @@ out: return r; } +void __bch_bucket_free(struct cache *ca, struct bucket *b) +{ + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); +} + void bch_bucket_free(struct cache_set *c, struct bkey *k) { unsigned i; - for (i = 0; i < KEY_PTRS(k); i++) { - struct bucket *b = PTR_BUCKET(c, k, i); - - SET_GC_MARK(b, 0); - SET_GC_SECTORS_USED(b, 0); - bch_bucket_add_unused(PTR_CACHE(c, k, i), b); - } + for (i = 0; i < KEY_PTRS(k); i++) + __bch_bucket_free(PTR_CACHE(c, k, i), + PTR_BUCKET(c, k, i)); } int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 171cda89cb6b..200efc1b5cf8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -195,7 +195,6 @@ struct bucket { atomic_t pin; uint16_t prio; uint8_t gen; - uint8_t disk_gen; uint8_t last_gc; /* Most out of date gen in the btree */ uint8_t gc_gen; uint16_t gc_mark; /* Bitfield used by GC. See below for field */ @@ -426,14 +425,9 @@ struct cache { * their new gen to disk. After prio_write() finishes writing the new * gens/prios, they'll be moved to the free list (and possibly discarded * in the process) - * - * unused: GC found nothing pointing into these buckets (possibly - * because all the data they contained was overwritten), so we only - * need to discard them before they can be moved to the free list. */ DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); - DECLARE_FIFO(long, unused); size_t fifo_last_bucket; @@ -442,12 +436,6 @@ struct cache { DECLARE_HEAP(struct bucket *, heap); - /* - * max(gen - disk_gen) for all buckets. When it gets too big we have to - * call prio_write() to keep gens from wrapping. - */ - uint8_t need_save_prio; - /* * If nonzero, we know we aren't going to find any buckets to invalidate * until a gc finishes - otherwise we could pointlessly burn a ton of @@ -848,9 +836,6 @@ static inline bool cached_dev_get(struct cached_dev *dc) /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree (last_gc). - * - * bucket_disk_gen() returns the difference between the current gen and the gen - * on disk; they're both used to make sure gens don't wrap around. */ static inline uint8_t bucket_gc_gen(struct bucket *b) @@ -858,13 +843,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) return b->gen - b->last_gc; } -static inline uint8_t bucket_disk_gen(struct bucket *b) -{ - return b->gen - b->disk_gen; -} - #define BUCKET_GC_GEN_MAX 96U -#define BUCKET_DISK_GEN_MAX 64U #define kobj_attribute_write(n, fn) \ static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) @@ -897,11 +876,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); -bool bch_bucket_add_unused(struct cache *, struct bucket *); -long bch_bucket_alloc(struct cache *, unsigned, bool); +bool bch_can_invalidate_bucket(struct cache *, struct bucket *); +void __bch_invalidate_one_bucket(struct cache *, struct bucket *); + +void __bch_bucket_free(struct cache *, struct bucket *); void bch_bucket_free(struct cache_set *, struct bkey *); +long bch_bucket_alloc(struct cache *, unsigned, bool); int __bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index be90596a9e2a..4c340c85b122 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1641,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c) mutex_unlock(&c->bucket_lock); } -size_t bch_btree_gc_finish(struct cache_set *c) +static size_t bch_btree_gc_finish(struct cache_set *c) { size_t available = 0; struct bucket *b; @@ -1703,9 +1703,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) available++; - - if (!GC_MARK(b)) - bch_bucket_add_unused(ca, b); } } @@ -1836,6 +1833,42 @@ int bch_btree_check(struct cache_set *c) return btree_root(check_recurse, c, &op); } +void bch_initial_gc_finish(struct cache_set *c) +{ + struct cache *ca; + struct bucket *b; + unsigned i; + + bch_btree_gc_finish(c); + + mutex_lock(&c->bucket_lock); + + /* + * We need to put some unused buckets directly on the prio freelist in + * order to get the allocator thread started - it needs freed buckets in + * order to rewrite the prios and gens, and it needs to rewrite prios + * and gens in order to free buckets. + * + * This is only safe for buckets that have no live data in them, which + * there should always be some of. + */ + for_each_cache(ca, c, i) { + for_each_bucket(b, ca) { + if (fifo_full(&ca->free[RESERVE_PRIO])) + break; + + if (bch_can_invalidate_bucket(ca, b) && + !GC_MARK(b)) { + __bch_invalidate_one_bucket(ca, b); + fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets); + } + } + } + + mutex_unlock(&c->bucket_lock); +} + /* Btree insertion */ static bool btree_insert_key(struct btree *b, struct bkey *k, diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 3ce371fa7f98..91dfa5e69685 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -252,7 +252,7 @@ int bch_btree_insert(struct cache_set *, struct keylist *, atomic_t *, struct bkey *); int bch_gc_thread_start(struct cache_set *); -size_t bch_btree_gc_finish(struct cache_set *); +void bch_initial_gc_finish(struct cache_set *); void bch_moving_gc(struct cache_set *); int bch_btree_check(struct cache_set *); void bch_initial_mark_key(struct cache_set *, int, struct bkey *); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 2d4a56219ec7..a8c57d59a726 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -541,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) closure_sync(cl); } -#define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \ - fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused) - void bch_prio_write(struct cache *ca) { int i; @@ -554,10 +551,6 @@ void bch_prio_write(struct cache *ca) lockdep_assert_held(&ca->set->bucket_lock); - for (b = ca->buckets; - b < ca->buckets + ca->sb.nbuckets; b++) - b->disk_gen = b->gen; - ca->disk_buckets->seq++; atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), @@ -601,14 +594,17 @@ void bch_prio_write(struct cache *ca) mutex_lock(&ca->set->bucket_lock); - ca->need_save_prio = 0; - /* * Don't want the old priorities to get garbage collected until after we * finish writing the new ones, and they're journalled */ - for (i = 0; i < prio_buckets(ca); i++) + for (i = 0; i < prio_buckets(ca); i++) { + if (ca->prio_last_buckets[i]) + __bch_bucket_free(ca, + &ca->buckets[ca->prio_last_buckets[i]]); + ca->prio_last_buckets[i] = ca->prio_buckets[i]; + } } static void prio_read(struct cache *ca, uint64_t bucket) @@ -639,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) } b->prio = le16_to_cpu(d->prio); - b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen; + b->gen = b->last_gc = b->gc_gen = d->gen; } } @@ -1606,7 +1602,7 @@ static void run_cache_set(struct cache_set *c) goto err; bch_journal_mark(c, &journal); - bch_btree_gc_finish(c); + bch_initial_gc_finish(c); pr_debug("btree_check() done"); /* @@ -1648,7 +1644,7 @@ static void run_cache_set(struct cache_set *c) ca->sb.d[j] = ca->sb.first_bucket + j; } - bch_btree_gc_finish(c); + bch_initial_gc_finish(c); err = "error starting allocator thread"; for_each_cache(ca, c, i) @@ -1794,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj) vfree(ca->buckets); free_heap(&ca->heap); - free_fifo(&ca->unused); free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) @@ -1831,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || - !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || !init_heap(&ca->heap, free << 3, GFP_KERNEL) || !(ca->buckets = vzalloc(sizeof(struct bucket) * ca->sb.nbuckets)) || diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 8fc2a7134d3c..c9c3c044b32f 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -446,7 +446,6 @@ TRACE_EVENT(bcache_alloc_fail, __field(dev_t, dev ) __field(unsigned, free ) __field(unsigned, free_inc ) - __field(unsigned, unused ) __field(unsigned, blocked ) ), @@ -454,13 +453,12 @@ TRACE_EVENT(bcache_alloc_fail, __entry->dev = ca->bdev->bd_dev; __entry->free = fifo_used(&ca->free[reserve]); __entry->free_inc = fifo_used(&ca->free_inc); - __entry->unused = fifo_used(&ca->unused); __entry->blocked = atomic_read(&ca->set->prio_blocked); ), - TP_printk("alloc fail %d,%d free %u free_inc %u unused %u blocked %u", + TP_printk("alloc fail %d,%d free %u free_inc %u blocked %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->free, - __entry->free_inc, __entry->unused, __entry->blocked) + __entry->free_inc, __entry->blocked) ); /* Background writeback */ -- cgit v1.2.3 From 3a2fd9d5090b83aab85378a846fa10f39b0b5aa7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 27 Feb 2014 17:51:12 -0800 Subject: bcache: Kill bucket->gc_gen gc_gen was a temporary used to recalculate last_gc, but since we only need bucket->last_gc when gc isn't running (gc_mark_valid = 1), we can just update last_gc directly. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 3 +-- drivers/md/bcache/btree.c | 7 +++---- drivers/md/bcache/extents.c | 8 ++++---- drivers/md/bcache/super.c | 2 +- 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 200efc1b5cf8..82c9c5d35251 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -196,7 +196,6 @@ struct bucket { uint16_t prio; uint8_t gen; uint8_t last_gc; /* Most out of date gen in the btree */ - uint8_t gc_gen; uint16_t gc_mark; /* Bitfield used by GC. See below for field */ }; @@ -588,7 +587,7 @@ struct cache_set { uint16_t min_prio; /* - * max(gen - gc_gen) for all buckets. When it gets too big we have to gc + * max(gen - last_gc) for all buckets. When it gets too big we have to gc * to keep gens from wrapping around. */ uint8_t need_gc; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 4c340c85b122..7347b6100961 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1178,8 +1178,8 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, g = PTR_BUCKET(c, k, i); - if (gen_after(g->gc_gen, PTR_GEN(k, i))) - g->gc_gen = PTR_GEN(k, i); + if (gen_after(g->last_gc, PTR_GEN(k, i))) + g->last_gc = PTR_GEN(k, i); if (ptr_stale(c, k, i)) { stale = max(stale, ptr_stale(c, k, i)); @@ -1631,7 +1631,7 @@ static void btree_gc_start(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(b, ca) { - b->gc_gen = b->gen; + b->last_gc = b->gen; if (!atomic_read(&b->pin)) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); @@ -1693,7 +1693,6 @@ static size_t bch_btree_gc_finish(struct cache_set *c) SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); for_each_bucket(b, ca) { - b->last_gc = b->gc_gen; c->need_gc = max(c->need_gc, bucket_gc_gen(b)); if (atomic_read(&b->pin)) diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 82d5e3288a6c..35887330b49d 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -194,9 +194,9 @@ err: mutex_unlock(&b->c->bucket_lock); bch_extent_to_text(buf, sizeof(buf), k); btree_bug(b, -"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", +"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu", buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + g->prio, g->gen, g->last_gc, GC_MARK(g)); return true; } @@ -515,9 +515,9 @@ err: mutex_unlock(&b->c->bucket_lock); bch_extent_to_text(buf, sizeof(buf), k); btree_bug(b, -"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", +"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu", buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + g->prio, g->gen, g->last_gc, GC_MARK(g)); return true; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a8c57d59a726..926ded8ccbf5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -635,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) } b->prio = le16_to_cpu(d->prio); - b->gen = b->last_gc = b->gc_gen = d->gen; + b->gen = b->last_gc = d->gen; } } -- cgit v1.2.3 From cb85114956dc88b287afca2872658f562acbc302 Mon Sep 17 00:00:00 2001 From: John Sheu Date: Mon, 17 Mar 2014 23:13:56 -0700 Subject: bcache: remove nested function usage Uninlined nested functions can cause crashes when using ftrace, as they don't follow the normal calling convention and confuse the ftrace function graph tracer as it examines the stack. Also, nested functions are supported as a gcc extension, but may fail on other compilers (e.g. llvm). Signed-off-by: John Sheu --- drivers/md/bcache/extents.c | 22 ++++---- drivers/md/bcache/sysfs.c | 126 ++++++++++++++++++++++---------------------- 2 files changed, 76 insertions(+), 72 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 35887330b49d..3a0de4cf9771 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -308,6 +308,16 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, return NULL; } +static void bch_subtract_dirty(struct bkey *k, + struct cache_set *c, + uint64_t offset, + int sectors) +{ + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(k), + offset, -sectors); +} + static bool bch_extent_insert_fixup(struct btree_keys *b, struct bkey *insert, struct btree_iter *iter, @@ -315,13 +325,6 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, { struct cache_set *c = container_of(b, struct btree, keys)->c; - void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) - { - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(c, KEY_INODE(k), - offset, -sectors); - } - uint64_t old_offset; unsigned old_size, sectors_found = 0; @@ -398,7 +401,8 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, struct bkey *top; - subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); + bch_subtract_dirty(k, c, KEY_START(insert), + KEY_SIZE(insert)); if (bkey_written(b, k)) { /* @@ -448,7 +452,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, } } - subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); + bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k)); } check_failed: diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 89aaa2ef38f9..b3ff57d61dde 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -405,7 +405,7 @@ struct bset_stats_op { struct bset_stats stats; }; -static int btree_bset_stats(struct btree_op *b_op, struct btree *b) +static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b) { struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); @@ -423,7 +423,7 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf) memset(&op, 0, sizeof(op)); bch_btree_op_init(&op.op, -1); - ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats); + ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats); if (ret < 0) return ret; @@ -441,81 +441,81 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf) op.stats.floats, op.stats.failed); } -SHOW(__bch_cache_set) +static unsigned bch_root_usage(struct cache_set *c) { - unsigned root_usage(struct cache_set *c) - { - unsigned bytes = 0; - struct bkey *k; - struct btree *b; - struct btree_iter iter; + unsigned bytes = 0; + struct bkey *k; + struct btree *b; + struct btree_iter iter; - goto lock_root; + goto lock_root; - do { - rw_unlock(false, b); + do { + rw_unlock(false, b); lock_root: - b = c->root; - rw_lock(false, b, b->level); - } while (b != c->root); - - for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) - bytes += bkey_bytes(k); + b = c->root; + rw_lock(false, b, b->level); + } while (b != c->root); - rw_unlock(false, b); + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) + bytes += bkey_bytes(k); - return (bytes * 100) / btree_bytes(c); - } + rw_unlock(false, b); - size_t cache_size(struct cache_set *c) - { - size_t ret = 0; - struct btree *b; + return (bytes * 100) / btree_bytes(c); +} - mutex_lock(&c->bucket_lock); - list_for_each_entry(b, &c->btree_cache, list) - ret += 1 << (b->keys.page_order + PAGE_SHIFT); +static size_t bch_cache_size(struct cache_set *c) +{ + size_t ret = 0; + struct btree *b; - mutex_unlock(&c->bucket_lock); - return ret; - } + mutex_lock(&c->bucket_lock); + list_for_each_entry(b, &c->btree_cache, list) + ret += 1 << (b->keys.page_order + PAGE_SHIFT); - unsigned cache_max_chain(struct cache_set *c) - { - unsigned ret = 0; - struct hlist_head *h; + mutex_unlock(&c->bucket_lock); + return ret; +} - mutex_lock(&c->bucket_lock); +static unsigned bch_cache_max_chain(struct cache_set *c) +{ + unsigned ret = 0; + struct hlist_head *h; - for (h = c->bucket_hash; - h < c->bucket_hash + (1 << BUCKET_HASH_BITS); - h++) { - unsigned i = 0; - struct hlist_node *p; + mutex_lock(&c->bucket_lock); - hlist_for_each(p, h) - i++; + for (h = c->bucket_hash; + h < c->bucket_hash + (1 << BUCKET_HASH_BITS); + h++) { + unsigned i = 0; + struct hlist_node *p; - ret = max(ret, i); - } + hlist_for_each(p, h) + i++; - mutex_unlock(&c->bucket_lock); - return ret; + ret = max(ret, i); } - unsigned btree_used(struct cache_set *c) - { - return div64_u64(c->gc_stats.key_bytes * 100, - (c->gc_stats.nodes ?: 1) * btree_bytes(c)); - } + mutex_unlock(&c->bucket_lock); + return ret; +} - unsigned average_key_size(struct cache_set *c) - { - return c->gc_stats.nkeys - ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) - : 0; - } +static unsigned bch_btree_used(struct cache_set *c) +{ + return div64_u64(c->gc_stats.key_bytes * 100, + (c->gc_stats.nodes ?: 1) * btree_bytes(c)); +} +static unsigned bch_average_key_size(struct cache_set *c) +{ + return c->gc_stats.nkeys + ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) + : 0; +} + +SHOW(__bch_cache_set) +{ struct cache_set *c = container_of(kobj, struct cache_set, kobj); sysfs_print(synchronous, CACHE_SYNC(&c->sb)); @@ -523,10 +523,10 @@ lock_root: sysfs_hprint(bucket_size, bucket_bytes(c)); sysfs_hprint(block_size, block_bytes(c)); sysfs_print(tree_depth, c->root->level); - sysfs_print(root_usage_percent, root_usage(c)); + sysfs_print(root_usage_percent, bch_root_usage(c)); - sysfs_hprint(btree_cache_size, cache_size(c)); - sysfs_print(btree_cache_max_chain, cache_max_chain(c)); + sysfs_hprint(btree_cache_size, bch_cache_size(c)); + sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c)); sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); @@ -534,9 +534,9 @@ lock_root: sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); - sysfs_print(btree_used_percent, btree_used(c)); + sysfs_print(btree_used_percent, bch_btree_used(c)); sysfs_print(btree_nodes, c->gc_stats.nodes); - sysfs_hprint(average_key_size, average_key_size(c)); + sysfs_hprint(average_key_size, bch_average_key_size(c)); sysfs_print(cache_read_races, atomic_long_read(&c->cache_read_races)); -- cgit v1.2.3