Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/bcache.h    |  24
-rw-r--r-- | drivers/md/bcache/bset.c      |  63
-rw-r--r-- | drivers/md/bcache/btree.c     |  63
-rw-r--r-- | drivers/md/bcache/btree.h     |   2
-rw-r--r-- | drivers/md/bcache/closure.c   |  13
-rw-r--r-- | drivers/md/bcache/closure.h   |   4
-rw-r--r-- | drivers/md/bcache/debug.c     |  17
-rw-r--r-- | drivers/md/bcache/journal.c   |   1
-rw-r--r-- | drivers/md/bcache/request.c   |  75
-rw-r--r-- | drivers/md/bcache/super.c     |  59
-rw-r--r-- | drivers/md/bcache/sysfs.c     |  48
-rw-r--r-- | drivers/md/bcache/util.c      |   2
-rw-r--r-- | drivers/md/bcache/util.h      |   2
-rw-r--r-- | drivers/md/bcache/writeback.c | 125
-rw-r--r-- | drivers/md/bcache/writeback.h |  19
-rw-r--r-- | drivers/md/dm-writecache.c    |  43
-rw-r--r-- | drivers/md/dm.c               |   6
-rw-r--r-- | drivers/md/md.c               |  12
18 files changed, 425 insertions, 153 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d6bf294f3907..05f82ff6f016 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -328,13 +328,6 @@ struct cached_dev { */ atomic_t has_dirty; - /* - * Set to zero by things that touch the backing volume-- except - * writeback. Incremented by writeback. Used to determine when to - * accelerate idle writeback. - */ - atomic_t backing_idle; - struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; @@ -423,9 +416,9 @@ struct cache { /* * When allocating new buckets, prio_write() gets first dibs - since we * may not be allocate at all without writing priorities and gens. - * prio_buckets[] contains the last buckets we wrote priorities to (so - * gc can mark them as metadata), prio_next[] contains the buckets - * allocated for the next prio write. + * prio_last_buckets[] contains the last buckets we wrote priorities to + * (so gc can mark them as metadata), prio_buckets[] contains the + * buckets allocated for the next prio write. */ uint64_t *prio_buckets; uint64_t *prio_last_buckets; @@ -474,6 +467,7 @@ struct cache { struct gc_stat { size_t nodes; + size_t nodes_pre; size_t key_bytes; size_t nkeys; @@ -514,6 +508,8 @@ struct cache_set { struct cache_accounting accounting; unsigned long flags; + atomic_t idle_counter; + atomic_t at_max_writeback_rate; struct cache_sb sb; @@ -523,8 +519,10 @@ struct cache_set { struct bcache_device **devices; unsigned devices_max_used; + atomic_t attached_dev_nr; struct list_head cached_devs; uint64_t cached_dev_sectors; + atomic_long_t flash_dev_dirty_sectors; struct closure caching; struct closure sb_write; @@ -603,6 +601,10 @@ struct cache_set { */ atomic_t rescale; /* + * used for GC, identify if any front side I/Os is inflight + */ + atomic_t search_inflight; + /* * When we invalidate buckets, we use both the priority and the amount * of good data to determine which buckets to reuse first - to weight * those together consistently we keep track of the smallest nonzero @@ -995,7 +997,7 @@ void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); void bch_debug_exit(void); -int bch_debug_init(struct kobject *); +void bch_debug_init(struct kobject *kobj); void bch_request_exit(void); int bch_request_init(void); diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f3403b45bc28..596c93b44e9b 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -366,6 +366,10 @@ EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ +/* + * return array index next to j when does in-order traverse + * of a binary tree which is stored in a linear array + */ static unsigned inorder_next(unsigned j, unsigned size) { if (j * 2 + 1 < size) { @@ -379,6 +383,10 @@ static unsigned inorder_next(unsigned j, unsigned size) return j; } +/* + * return array index previous to j when does in-order traverse + * of a binary tree which is stored in a linear array + */ static unsigned inorder_prev(unsigned j, unsigned size) { if (j * 2 < size) { @@ -421,6 +429,10 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra) return j; } +/* + * Return the cacheline index in bset_tree->data, where j is index + * from a linear array which stores the auxiliar binary tree + */ static unsigned to_inorder(unsigned j, struct bset_tree *t) { return __to_inorder(j, t->size, t->extra); @@ -441,6 +453,10 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, 
unsigned extra) return j; } +/* + * Return an index from a linear array which stores the auxiliar binary + * tree, j is the cacheline index of t->data. + */ static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) { return __inorder_to_tree(j, t->size, t->extra); @@ -546,6 +562,20 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) return low; } +/* + * Calculate mantissa value for struct bkey_float. + * If most significant bit of f->exponent is not set, then + * - f->exponent >> 6 is 0 + * - p[0] points to bkey->low + * - p[-1] borrows bits from KEY_INODE() of bkey->high + * if most isgnificant bits of f->exponent is set, then + * - f->exponent >> 6 is 1 + * - p[0] points to bits from KEY_INODE() of bkey->high + * - p[-1] points to other bits from KEY_INODE() of + * bkey->high too. + * See make_bfloat() to check when most significant bit of f->exponent + * is set or not. + */ static inline unsigned bfloat_mantissa(const struct bkey *k, struct bkey_float *f) { @@ -570,6 +600,16 @@ static void make_bfloat(struct bset_tree *t, unsigned j) BUG_ON(m < l || m > r); BUG_ON(bkey_next(p) != m); + /* + * If l and r have different KEY_INODE values (different backing + * device), f->exponent records how many least significant bits + * are different in KEY_INODE values and sets most significant + * bits to 1 (by +64). + * If l and r have same KEY_INODE value, f->exponent records + * how many different bits in least significant bits of bkey->low. + * See bfloat_mantiss() how the most significant bit of + * f->exponent is used to calculate bfloat mantissa value. + */ if (KEY_INODE(l) != KEY_INODE(r)) f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; else @@ -633,6 +673,15 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) } EXPORT_SYMBOL(bch_bset_init_next); +/* + * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to + * accelerate bkey search in a btree node (pointed by bset_tree->data in + * memory). After search in the auxiliar tree by calling bset_search_tree(), + * a struct bset_search_iter is returned which indicates range [l, r] from + * bset_tree->data where the searching bkey might be inside. Then a followed + * linear comparison does the exact search, see __bch_bset_search() for how + * the auxiliary tree is used. + */ void bch_bset_build_written_tree(struct btree_keys *b) { struct bset_tree *t = bset_tree_last(b); @@ -898,6 +947,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, unsigned inorder, j, n = 1; do { + /* + * A bit trick here. + * If p < t->size, (int)(p - t->size) is a minus value and + * the most significant bit is set, right shifting 31 bits + * gets 1. If p >= t->size, the most significant bit is + * not set, right shifting 31 bits gets 0. + * So the following 2 lines equals to + * if (p >= t->size) + * p = 0; + * but a branch instruction is avoided. + */ unsigned p = n << 4; p &= ((int) (p - t->size)) >> 31; @@ -907,6 +967,9 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, f = &t->tree[j]; /* + * Similar bit trick, use subtract operation to avoid a branch + * instruction. + * * n = (f->mantissa > bfloat_mantissa()) * ? 
j * 2 * : j * 2 + 1; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 547c9eedc2f4..c19f7716df88 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -90,6 +90,9 @@ #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 +#define MAX_GC_TIMES 100 +#define MIN_GC_NODES 100 +#define GC_SLEEP_MS 100 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) @@ -1008,6 +1011,13 @@ retry: BUG_ON(b->level != level); } + if (btree_node_io_error(b)) { + rw_unlock(write, b); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); + b->parent = parent; b->accessed = 1; @@ -1019,13 +1029,6 @@ retry: for (; i <= b->keys.nsets; i++) prefetch(b->keys.set[i].data); - if (btree_node_io_error(b)) { - rw_unlock(write, b); - return ERR_PTR(-EIO); - } - - BUG_ON(!b->written); - return b; } @@ -1520,6 +1523,32 @@ static unsigned btree_gc_count_keys(struct btree *b) return ret; } +static size_t btree_gc_min_nodes(struct cache_set *c) +{ + size_t min_nodes; + + /* + * Since incremental GC would stop 100ms when front + * side I/O comes, so when there are many btree nodes, + * if GC only processes constant (100) nodes each time, + * GC would last a long time, and the front side I/Os + * would run out of the buckets (since no new bucket + * can be allocated during GC), and be blocked again. + * So GC should not process constant nodes, but varied + * nodes according to the number of btree nodes, which + * realized by dividing GC into constant(100) times, + * so when there are many btree nodes, GC can process + * more nodes each time, otherwise, GC will process less + * nodes each time (but no less than MIN_GC_NODES) + */ + min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; + if (min_nodes < MIN_GC_NODES) + min_nodes = MIN_GC_NODES; + + return min_nodes; +} + + static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { @@ -1585,6 +1614,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); r->b = NULL; + if (atomic_read(&b->c->search_inflight) && + gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { + gc->nodes_pre = gc->nodes; + ret = -EAGAIN; + break; + } + if (need_resched()) { ret = -EAGAIN; break; @@ -1753,7 +1789,10 @@ static void bch_btree_gc(struct cache_set *c) closure_sync(&writes); cond_resched(); - if (ret && ret != -EAGAIN) + if (ret == -EAGAIN) + schedule_timeout_interruptible(msecs_to_jiffies + (GC_SLEEP_MS)); + else if (ret) pr_warn("gc failed!"); } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -1834,8 +1873,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) do { k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); - if (k) + if (k) { btree_node_prefetch(b, k); + /* + * initiallize c->gc_stats.nodes + * for incremental GC + */ + b->c->gc_stats.nodes++; + } if (p) ret = btree(check_recurse, p, b, op); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d211e2c25b6b..68e9d926134d 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -152,7 +152,7 @@ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ \ static inline void set_btree_node_ ## flag(struct btree *b) \ -{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } enum btree_flags { BTREE_NODE_io_error, diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 0e14969182c6..618253683d40 100644 --- 
a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -199,11 +199,16 @@ static const struct file_operations debug_ops = { .release = single_release }; -int __init closure_debug_init(void) +void __init closure_debug_init(void) { - closure_debug = debugfs_create_file("closures", - 0400, bcache_debug, NULL, &debug_ops); - return IS_ERR_OR_NULL(closure_debug); + if (!IS_ERR_OR_NULL(bcache_debug)) + /* + * it is unnecessary to check return value of + * debugfs_create_file(), we should not care + * about this. + */ + closure_debug = debugfs_create_file( + "closures", 0400, bcache_debug, NULL, &debug_ops); } #endif diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 71427eb5fdae..7c2c5bc7c88b 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -186,13 +186,13 @@ static inline void closure_sync(struct closure *cl) #ifdef CONFIG_BCACHE_CLOSURES_DEBUG -int closure_debug_init(void); +void closure_debug_init(void); void closure_debug_create(struct closure *cl); void closure_debug_destroy(struct closure *cl); #else -static inline int closure_debug_init(void) { return 0; } +static inline void closure_debug_init(void) {} static inline void closure_debug_create(struct closure *cl) {} static inline void closure_debug_destroy(struct closure *cl) {} diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index d030ce3025a6..12034c07257b 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -110,11 +110,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) struct bio_vec bv, cbv; struct bvec_iter iter, citer = { 0 }; - check = bio_clone_kmalloc(bio, GFP_NOIO); + check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); if (!check) return; + check->bi_disk = bio->bi_disk; check->bi_opf = REQ_OP_READ; + check->bi_iter.bi_sector = bio->bi_iter.bi_sector; + check->bi_iter.bi_size = bio->bi_iter.bi_size; + bch_bio_map(check, NULL); if (bch_bio_alloc_pages(check, GFP_NOIO)) goto out_put; @@ -248,11 +252,12 @@ void bch_debug_exit(void) debugfs_remove_recursive(bcache_debug); } -int __init bch_debug_init(struct kobject *kobj) +void __init bch_debug_init(struct kobject *kobj) { - if (!IS_ENABLED(CONFIG_DEBUG_FS)) - return 0; - + /* + * it is unnecessary to check return value of + * debugfs_create_file(), we should not care + * about this. + */ bcache_debug = debugfs_create_dir("bcache", NULL); - return IS_ERR_OR_NULL(bcache_debug); } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 18f1b5239620..10748c626a1d 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -828,6 +828,7 @@ void bch_journal_free(struct cache_set *c) free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); free_fifo(&c->journal.pin); + free_heap(&c->flush_btree); } int bch_journal_alloc(struct cache_set *c) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index ae67f5fa8047..7dbe8b6316a0 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -107,7 +107,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned u64s, /* * The journalling code doesn't handle the case where the keys to insert * is bigger than an empty write: If we just return -ENOMEM here, - * bio_insert() and bio_invalidate() will insert the keys created so far + * bch_data_insert_keys() will insert the keys created so far * and finish the rest when the keylist is empty. 
*/ if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) @@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - generic_end_io_acct(s->d->disk->queue, - bio_data_dir(s->orig_bio), + generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio), &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); @@ -702,6 +701,8 @@ static void search_free(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); + atomic_dec(&s->d->c->search_inflight); + if (s->iop.bio) bio_put(s->iop.bio); @@ -719,6 +720,7 @@ static inline struct search *search_alloc(struct bio *bio, closure_init(&s->cl, NULL); do_bio_hook(s, bio, request_endio); + atomic_inc(&d->c->search_inflight); s->orig_bio = bio; s->cache_miss = NULL; @@ -1062,8 +1064,7 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io = ddip->bi_end_io; bio->bi_private = ddip->bi_private; - generic_end_io_acct(ddip->d->disk->queue, - bio_data_dir(bio), + generic_end_io_acct(ddip->d->disk->queue, bio_op(bio), &ddip->d->disk->part0, ddip->start_time); if (bio->bi_status) { @@ -1102,6 +1103,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) generic_make_request(bio); } +static void quit_max_writeback_rate(struct cache_set *c, + struct cached_dev *this_dc) +{ + int i; + struct bcache_device *d; + struct cached_dev *dc; + + /* + * mutex bch_register_lock may compete with other parallel requesters, + * or attach/detach operations on other backing device. Waiting to + * the mutex lock may increase I/O request latency for seconds or more. + * To avoid such situation, if mutext_trylock() failed, only writeback + * rate of current cached device is set to 1, and __update_write_back() + * will decide writeback rate of other cached devices (remember now + * c->idle_counter is 0 already). + */ + if (mutex_trylock(&bch_register_lock)) { + for (i = 0; i < c->devices_max_used; i++) { + if (!c->devices[i]) + continue; + + if (UUID_FLASH_ONLY(&c->uuids[i])) + continue; + + d = c->devices[i]; + dc = container_of(d, struct cached_dev, disk); + /* + * set writeback rate to default minimum value, + * then let update_writeback_rate() to decide the + * upcoming rate. + */ + atomic_long_set(&dc->writeback_rate.rate, 1); + } + mutex_unlock(&bch_register_lock); + } else + atomic_long_set(&this_dc->writeback_rate.rate, 1); +} + /* Cached devices - read & write stuff */ static blk_qc_t cached_dev_make_request(struct request_queue *q, @@ -1119,8 +1158,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, return BLK_QC_T_NONE; } - atomic_set(&dc->backing_idle, 0); - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + if (likely(d->c)) { + if (atomic_read(&d->c->idle_counter)) + atomic_set(&d->c->idle_counter, 0); + /* + * If at_max_writeback_rate of cache set is true and new I/O + * comes, quit max writeback rate of all cached devices + * attached to this cache set, and set at_max_writeback_rate + * to false. 
+ */ + if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) { + atomic_set(&d->c->at_max_writeback_rate, 0); + quit_max_writeback_rate(d->c, dc); + } + } + + generic_start_io_acct(q, + bio_op(bio), + bio_sectors(bio), + &d->disk->part0); bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; @@ -1229,7 +1285,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, struct search *s; struct closure *cl; struct bcache_device *d = bio->bi_disk->private_data; - int rw = bio_data_dir(bio); if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { bio->bi_status = BLK_STS_IOERR; @@ -1237,7 +1292,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, return BLK_QC_T_NONE; } - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0); s = search_alloc(bio, d); cl = &s->cl; @@ -1254,7 +1309,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, flash_dev_nodata, bcache_wq); return BLK_QC_T_NONE; - } else if (rw) { + } else if (bio_data_dir(bio)) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_iter.bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fa4058e43202..55a37641aa95 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, goto err; } - sb->last_mount = get_seconds(); + sb->last_mount = (u32)ktime_get_real_seconds(); err = NULL; get_page(bh->b_page); @@ -696,12 +696,14 @@ static void bcache_device_detach(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); + atomic_dec(&d->c->attached_dev_nr); + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { struct uuid_entry *u = d->c->uuids + d->id; SET_UUID_FLASH_ONLY(u, 0); memcpy(u->uuid, invalid_uuid, 16); - u->invalidated = cpu_to_le32(get_seconds()); + u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds()); bch_uuid_write(d->c); } @@ -796,11 +798,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, return idx; if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), - BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || - !(d->disk = alloc_disk(BCACHE_MINORS))) { - ida_simple_remove(&bcache_device_idx, idx); - return -ENOMEM; - } + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) + goto err; + + d->disk = alloc_disk(BCACHE_MINORS); + if (!d->disk) + goto err; set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); @@ -834,6 +837,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, blk_queue_write_cache(q, true, true); return 0; + +err: + ida_simple_remove(&bcache_device_idx, idx); + return -ENOMEM; + } /* Cached device */ @@ -1027,7 +1035,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint8_t *set_uuid) { - uint32_t rtime = cpu_to_le32(get_seconds()); + uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); struct uuid_entry *u; struct cached_dev *exist_dc, *t; @@ -1070,7 +1078,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { memcpy(u->uuid, invalid_uuid, 16); - u->invalidated = cpu_to_le32(get_seconds()); + u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds()); u = NULL; } @@ 
-1138,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, bch_cached_dev_run(dc); bcache_device_link(&dc->disk, c, "bdev"); + atomic_inc(&c->attached_dev_nr); /* Allow the writeback thread to proceed */ up_write(&dc->writeback_lock); @@ -1285,6 +1294,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, pr_info("registered backing device %s", dc->backing_dev_name); list_add(&dc->list, &uncached_devices); + /* attach to a matched cache set if it exists */ list_for_each_entry(c, &bch_cache_sets, list) bch_cached_dev_attach(dc, c, NULL); @@ -1311,6 +1321,8 @@ static void flash_dev_free(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); mutex_lock(&bch_register_lock); + atomic_long_sub(bcache_dev_sectors_dirty(d), + &d->c->flash_dev_dirty_sectors); bcache_device_free(d); mutex_unlock(&bch_register_lock); kobject_put(&d->kobj); @@ -1390,7 +1402,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) get_random_bytes(u->uuid, 16); memset(u->label, 0, 32); - u->first_reg = u->last_reg = cpu_to_le32(get_seconds()); + u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds()); SET_UUID_FLASH_ONLY(u, 1); u->sectors = size >> 9; @@ -1687,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); c->devices_max_used = 0; + atomic_set(&c->attached_dev_nr, 0); c->btree_pages = bucket_pages(c); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, @@ -1894,7 +1907,7 @@ static void run_cache_set(struct cache_set *c) goto err; closure_sync(&cl); - c->sb.last_mount = get_seconds(); + c->sb.last_mount = (u32)ktime_get_real_seconds(); bcache_write_super(c); list_for_each_entry_safe(dc, t, &uncached_devices, list) @@ -2163,8 +2176,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!try_module_get(THIS_MODULE)) return -EBUSY; - if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || - !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) + path = kstrndup(buffer, size, GFP_KERNEL); + if (!path) + goto err; + + sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); + if (!sb) goto err; err = "failed to open device"; @@ -2324,13 +2341,21 @@ static int __init bcache_init(void) return bcache_major; } - if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || - !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || - bch_request_init() || - bch_debug_init(bcache_kobj) || closure_debug_init() || + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); + if (!bcache_wq) + goto err; + + bcache_kobj = kobject_create_and_add("bcache", fs_kobj); + if (!bcache_kobj) + goto err; + + if (bch_request_init() || sysfs_create_files(bcache_kobj, files)) goto err; + bch_debug_init(bcache_kobj); + closure_debug_init(); + return 0; err: bcache_exit(); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 225b15aa0340..81d3520b0702 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -149,6 +149,7 @@ SHOW(__bch_cached_dev) struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; + int wb = dc->writeback_running; #define var(stat) (dc->stat) @@ -170,7 +171,8 @@ SHOW(__bch_cached_dev) var_printf(writeback_running, "%i"); var_print(writeback_delay); var_print(writeback_percent); - 
sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); + sysfs_hprint(writeback_rate, + wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); sysfs_printf(io_error_limit, "%i", dc->error_limit); sysfs_printf(io_disable, "%i", dc->io_disable); @@ -188,15 +190,22 @@ SHOW(__bch_cached_dev) char change[20]; s64 next_io; - bch_hprint(rate, dc->writeback_rate.rate << 9); - bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); - bch_hprint(target, dc->writeback_rate_target << 9); - bch_hprint(proportional,dc->writeback_rate_proportional << 9); - bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); - bch_hprint(change, dc->writeback_rate_change << 9); - - next_io = div64_s64(dc->writeback_rate.next - local_clock(), - NSEC_PER_MSEC); + /* + * Except for dirty and target, other values should + * be 0 if writeback is not running. + */ + bch_hprint(rate, + wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 + : 0); + bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); + bch_hprint(target, dc->writeback_rate_target << 9); + bch_hprint(proportional, + wb ? dc->writeback_rate_proportional << 9 : 0); + bch_hprint(integral, + wb ? dc->writeback_rate_integral_scaled << 9 : 0); + bch_hprint(change, wb ? dc->writeback_rate_change << 9 : 0); + next_io = wb ? div64_s64(dc->writeback_rate.next-local_clock(), + NSEC_PER_MSEC) : 0; return sprintf(buf, "rate:\t\t%s/sec\n" @@ -255,8 +264,19 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, INT_MAX); + if (attr == &sysfs_writeback_rate) { + ssize_t ret; + long int v = atomic_long_read(&dc->writeback_rate.rate); + + ret = strtoul_safe_clamp(buf, v, 1, INT_MAX); + + if (!ret) { + atomic_long_set(&dc->writeback_rate.rate, v); + ret = size; + } + + return ret; + } sysfs_strtoul_clamp(writeback_rate_update_seconds, dc->writeback_rate_update_seconds, @@ -338,8 +358,8 @@ STORE(__cached_dev) if (!v) return size; } - - pr_err("Can't attach %s: cache set not found", buf); + if (v == -ENOENT) + pr_err("Can't attach %s: cache set not found", buf); return v; } diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index fc479b026d6d..b15256bcf0e7 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { uint64_t now = local_clock(); - d->next += div_u64(done * NSEC_PER_SEC, d->rate); + d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate)); /* Bound the time. 
Don't let us fall further than 2 seconds behind * (this prevents unnecessary backlog that would make it impossible diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cced87f8eb27..f7b0133c9d2f 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -442,7 +442,7 @@ struct bch_ratelimit { * Rate at which we want to do work, in units per second * The units here correspond to the units passed to bch_next_delay() */ - uint32_t rate; + atomic_long_t rate; }; static inline void bch_ratelimit_reset(struct bch_ratelimit *d) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ad45ebe1a74b..481d4cf38ac0 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -27,7 +27,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) * flash-only devices */ uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - - bcache_flash_devs_sectors_dirty(c); + atomic_long_read(&c->flash_dev_dirty_sectors); /* * Unfortunately there is no control of global dirty data. If the @@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc) dc->writeback_rate_proportional = proportional_scaled; dc->writeback_rate_integral_scaled = integral_scaled; - dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; - dc->writeback_rate.rate = new_rate; + dc->writeback_rate_change = new_rate - + atomic_long_read(&dc->writeback_rate.rate); + atomic_long_set(&dc->writeback_rate.rate, new_rate); dc->writeback_rate_target = target; } +static bool set_at_max_writeback_rate(struct cache_set *c, + struct cached_dev *dc) +{ + /* + * Idle_counter is increased everytime when update_writeback_rate() is + * called. If all backing devices attached to the same cache set have + * identical dc->writeback_rate_update_seconds values, it is about 6 + * rounds of update_writeback_rate() on each backing device before + * c->at_max_writeback_rate is set to 1, and then max wrteback rate set + * to each dc->writeback_rate.rate. + * In order to avoid extra locking cost for counting exact dirty cached + * devices number, c->attached_dev_nr is used to calculate the idle + * throushold. It might be bigger if not all cached device are in write- + * back mode, but it still works well with limited extra rounds of + * update_writeback_rate(). + */ + if (atomic_inc_return(&c->idle_counter) < + atomic_read(&c->attached_dev_nr) * 6) + return false; + + if (atomic_read(&c->at_max_writeback_rate) != 1) + atomic_set(&c->at_max_writeback_rate, 1); + + atomic_long_set(&dc->writeback_rate.rate, INT_MAX); + + /* keep writeback_rate_target as existing value */ + dc->writeback_rate_proportional = 0; + dc->writeback_rate_integral_scaled = 0; + dc->writeback_rate_change = 0; + + /* + * Check c->idle_counter and c->at_max_writeback_rate agagain in case + * new I/O arrives during before set_at_max_writeback_rate() returns. + * Then the writeback rate is set to 1, and its new value should be + * decided via __update_writeback_rate(). 
+ */ + if ((atomic_read(&c->idle_counter) < + atomic_read(&c->attached_dev_nr) * 6) || + !atomic_read(&c->at_max_writeback_rate)) + return false; + + return true; +} + static void update_writeback_rate(struct work_struct *work) { struct cached_dev *dc = container_of(to_delayed_work(work), @@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work) return; } - down_read(&dc->writeback_lock); - - if (atomic_read(&dc->has_dirty) && - dc->writeback_percent) - __update_writeback_rate(dc); + if (atomic_read(&dc->has_dirty) && dc->writeback_percent) { + /* + * If the whole cache set is idle, set_at_max_writeback_rate() + * will set writeback rate to a max number. Then it is + * unncessary to update writeback rate for an idle cache set + * in maximum writeback rate number(s). + */ + if (!set_at_max_writeback_rate(c, dc)) { + down_read(&dc->writeback_lock); + __update_writeback_rate(dc); + up_read(&dc->writeback_lock); + } + } - up_read(&dc->writeback_lock); /* * CACHE_SET_IO_DISABLE might be set via sysfs interface, @@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc) delay = writeback_delay(dc, size); - /* If the control system would wait for at least half a - * second, and there's been no reqs hitting the backing disk - * for awhile: use an alternate mode where we have at most - * one contiguous set of writebacks in flight at a time. If - * someone wants to do IO it will be quick, as it will only - * have to contend with one operation in flight, and we'll - * be round-tripping data to the backing disk as quickly as - * it can accept it. - */ - if (delay >= HZ / 2) { - /* 3 means at least 1.5 seconds, up to 7.5 if we - * have slowed way down. - */ - if (atomic_inc_return(&dc->backing_idle) >= 3) { - /* Wait for current I/Os to finish */ - closure_sync(&cl); - /* And immediately launch a new set. 
*/ - delay = 0; - } - } - while (!kthread_should_stop() && !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && delay) { @@ -476,6 +507,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, if (!d) return; + if (UUID_FLASH_ONLY(&c->uuids[inode])) + atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); + stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); @@ -673,10 +707,14 @@ static int bch_writeback_thread(void *arg) } /* Init */ +#define INIT_KEYS_EACH_TIME 500000 +#define INIT_KEYS_SLEEP_MS 100 struct sectors_dirty_init { struct btree_op op; unsigned inode; + size_t count; + struct bkey start; }; static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, @@ -691,18 +729,37 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); + op->count++; + if (atomic_read(&b->c->search_inflight) && + !(op->count % INIT_KEYS_EACH_TIME)) { + bkey_copy_key(&op->start, k); + return -EAGAIN; + } + return MAP_CONTINUE; } void bch_sectors_dirty_init(struct bcache_device *d) { struct sectors_dirty_init op; + int ret; bch_btree_op_init(&op.op, -1); op.inode = d->id; - - bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), - sectors_dirty_init_fn, 0); + op.count = 0; + op.start = KEY(op.inode, 0, 0); + + do { + ret = bch_btree_map_keys(&op.op, d->c, &op.start, + sectors_dirty_init_fn, 0); + if (ret == -EAGAIN) + schedule_timeout_interruptible( + msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); + else if (ret < 0) { + pr_warn("sectors dirty init failed, ret=%d!", ret); + break; + } + } while (ret == -EAGAIN); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -715,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_running = true; dc->writeback_percent = 10; dc->writeback_delay = 30; - dc->writeback_rate.rate = 1024; + atomic_long_set(&dc->writeback_rate.rate, 1024); dc->writeback_rate_minimum = 8; dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 610fb01de629..3745d7004c47 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -28,25 +28,6 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) -{ - uint64_t i, ret = 0; - - mutex_lock(&bch_register_lock); - - for (i = 0; i < c->devices_max_used; i++) { - struct bcache_device *d = c->devices[i]; - - if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) - continue; - ret += bcache_dev_sectors_dirty(d); - } - - mutex_unlock(&bch_register_lock); - - return ret; -} - static inline unsigned offset_to_stripe(struct bcache_device *d, uint64_t offset) { diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 07ea6a48aac6..87107c995cb5 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -136,6 +136,7 @@ struct dm_writecache { struct dm_target *ti; struct dm_dev *dev; struct dm_dev *ssd_dev; + sector_t start_sector; void *memory_map; uint64_t memory_map_size; size_t metadata_sectors; @@ -293,6 +294,10 @@ static int persistent_memory_claim(struct dm_writecache *wc) } dax_read_unlock(id); + + wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT; + wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT; + return 0; err3: kvfree(pages); @@ -311,7 +316,7 @@ static int 
persistent_memory_claim(struct dm_writecache *wc) static void persistent_memory_release(struct dm_writecache *wc) { if (wc->memory_vmapped) - vunmap(wc->memory_map); + vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT)); } static struct page *persistent_memory_page(void *addr) @@ -359,7 +364,7 @@ static void *memory_data(struct dm_writecache *wc, struct wc_entry *e) static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e) { - return wc->metadata_sectors + + return wc->start_sector + wc->metadata_sectors + ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT)); } @@ -471,6 +476,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc) if (unlikely(region.sector + region.count > wc->metadata_sectors)) region.count = wc->metadata_sectors - region.sector; + region.sector += wc->start_sector; atomic_inc(&endio.count); req.bi_op = REQ_OP_WRITE; req.bi_op_flags = REQ_SYNC; @@ -1946,14 +1952,6 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) } wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode); - if (WC_MODE_PMEM(wc)) { - r = persistent_memory_claim(wc); - if (r) { - ti->error = "Unable to map persistent memory for cache"; - goto bad; - } - } - /* * Parse the cache block size */ @@ -1982,7 +1980,16 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) while (opt_params) { string = dm_shift_arg(&as), opt_params--; - if (!strcasecmp(string, "high_watermark") && opt_params >= 1) { + if (!strcasecmp(string, "start_sector") && opt_params >= 1) { + unsigned long long start_sector; + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) + goto invalid_optional; + wc->start_sector = start_sector; + if (wc->start_sector != start_sector || + wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) + goto invalid_optional; + } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) { string = dm_shift_arg(&as), opt_params--; if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1) goto invalid_optional; @@ -2039,12 +2046,20 @@ invalid_optional: goto bad; } - if (!WC_MODE_PMEM(wc)) { + if (WC_MODE_PMEM(wc)) { + r = persistent_memory_claim(wc); + if (r) { + ti->error = "Unable to map persistent memory for cache"; + goto bad; + } + } else { struct dm_io_region region; struct dm_io_request req; size_t n_blocks, n_metadata_blocks; uint64_t n_bitmap_bits; + wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT; + bio_list_init(&wc->flush_list); wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush"); if (IS_ERR(wc->flush_thread)) { @@ -2097,7 +2112,7 @@ invalid_optional: } region.bdev = wc->ssd_dev->bdev; - region.sector = 0; + region.sector = wc->start_sector; region.count = wc->metadata_sectors; req.bi_op = REQ_OP_READ; req.bi_op_flags = REQ_SYNC; @@ -2265,7 +2280,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type, static struct target_type writecache_target = { .name = "writecache", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b0dd7027848b..20f7e4ef5342 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io) io->start_time = jiffies; - generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0); + generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), + 
&dm_disk(md)->part0); atomic_set(&dm_disk(md)->part0.in_flight[rw], atomic_inc_return(&md->pending[rw])); @@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io) int pending; int rw = bio_data_dir(bio); - generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time); + generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, + io->start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), diff --git a/drivers/md/md.c b/drivers/md/md.c index da83d8710579..724def2f9eaa 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -204,10 +204,6 @@ static int start_readonly; */ static bool create_on_open = true; -/* bio_clone_mddev - * like bio_clone_bioset, but with a local bio set - */ - struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev) { @@ -335,6 +331,7 @@ EXPORT_SYMBOL(md_handle_request); static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); + const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = q->queuedata; unsigned int sectors; int cpu; @@ -363,8 +360,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) md_handle_request(mddev, bio); cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors); part_stat_unlock(); return BLK_QC_T_NONE; @@ -8063,8 +8060,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + - (int)part_stat_read(&disk->part0, sectors[1]) - + curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and |
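
A detail worth calling out from the patch above is the branch-free clamp that the new comment in bset.c documents for bset_search_tree(). The following is a minimal standalone sketch of that trick, not code from the patch itself: the helper name clamp_to_zero and the test values are illustrative, and it assumes arithmetic right shift of a negative int (which holds on the targets Linux supports) and differences small enough to fit in a non-negative int.

#include <assert.h>
#include <stdio.h>

/*
 * Branch-free equivalent of "if (p >= size) p = 0;", as described by
 * the comment added to bset_search_tree() in bset.c.
 */
static unsigned clamp_to_zero(unsigned p, unsigned size)
{
	/*
	 * p < size:  (int)(p - size) is negative, so >> 31 yields all
	 *            ones and the AND keeps p unchanged.
	 * p >= size: (int)(p - size) is a small non-negative value, so
	 *            >> 31 yields 0 and p becomes 0.
	 */
	p &= ((int)(p - size)) >> 31;
	return p;
}

int main(void)
{
	assert(clamp_to_zero(5, 16) == 5);	/* in range: kept */
	assert(clamp_to_zero(16, 16) == 0);	/* out of range: zeroed */
	assert(clamp_to_zero(40, 16) == 0);
	printf("clamp_to_zero() matches 'if (p >= size) p = 0;'\n");
	return 0;
}

The point of the idiom, as the added comment explains, is to avoid a conditional branch in the hot bkey-lookup loop while still wrapping the cacheline index back to zero once it runs past t->size.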