From 742c8fdc31e820503f9267070311d894978d1349 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 21 Oct 2016 10:06:40 -0400 Subject: dm bio prison v2: new interface for the bio prison The deferred set is gone and all methods have _v2 appended to the end of their names to allow for continued use of the original bio prison in DM thin-provisioning. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md/Makefile') diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 3cbda1af87a0..d378b1db7852 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -11,6 +11,7 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-mirror-y += dm-raid1.o dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o +dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o dm-cache-smq-y += dm-cache-policy-smq.o -- cgit v1.2.3 From b29d4986d0da1a27cd35917cdb433672f5c95d7f Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 15 Dec 2016 04:57:31 -0500 Subject: dm cache: significant rework to leverage dm-bio-prison-v2 The cache policy interfaces have been updated to work well with the new bio-prison v2 interface's ability to queue work immediately (promotion, demotion, etc) -- overriding benefit being reduced latency on processing IO through the cache. Previously such work would be left for the DM cache core to queue on various lists and then process in batches later -- this caused a serious delay in latency for IO driven by the cache. The background tracker code was factored out so that all cache policies can make use of it. Also, the "cleaner" policy has been removed and is now a variant of the smq policy that simply disallows migrations. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 8 - drivers/md/Makefile | 5 +- drivers/md/dm-cache-background-tracker.c | 238 +++ drivers/md/dm-cache-background-tracker.h | 46 + drivers/md/dm-cache-metadata.h | 2 + drivers/md/dm-cache-policy-cleaner.c | 469 ------ drivers/md/dm-cache-policy-internal.h | 76 +- drivers/md/dm-cache-policy-smq.c | 821 +++++----- drivers/md/dm-cache-policy.h | 187 +-- drivers/md/dm-cache-target.c | 2485 +++++++++++++----------------- 10 files changed, 1930 insertions(+), 2407 deletions(-) create mode 100644 drivers/md/dm-cache-background-tracker.c create mode 100644 drivers/md/dm-cache-background-tracker.h delete mode 100644 drivers/md/dm-cache-policy-cleaner.c (limited to 'drivers/md/Makefile') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b7767da50c26..982cd0626bc7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -325,14 +325,6 @@ config DM_CACHE_SMQ of less memory utilization, improved performance and increased adaptability in the face of changing workloads. -config DM_CACHE_CLEANER - tristate "Cleaner Cache Policy (EXPERIMENTAL)" - depends on DM_CACHE - default y - ---help--- - A simple cache policy that writes back all data to the - origin. Used when decommissioning a dm-cache. - config DM_ERA tristate "Era target (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index d378b1db7852..2801b2fb452d 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -13,9 +13,9 @@ dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o -dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o +dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ + dm-cache-background-tracker.o dm-cache-smq-y += dm-cache-policy-smq.o -dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o dm-verity-y += dm-verity-target.o md-mod-y += md.o bitmap.o @@ -57,7 +57,6 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o -obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c new file mode 100644 index 000000000000..9b1afdfb13f0 --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.c @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2017 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-cache-background-tracker.h" + +/*----------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "dm-background-tracker" + +struct bt_work { + struct list_head list; + struct rb_node node; + struct policy_work work; +}; + +struct background_tracker { + unsigned max_work; + atomic_t pending_promotes; + atomic_t pending_writebacks; + atomic_t pending_demotes; + + struct list_head issued; + struct list_head queued; + struct rb_root pending; + + struct kmem_cache *work_cache; +}; + +struct background_tracker *btracker_create(unsigned max_work) +{ + struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL); + + b->max_work = max_work; + atomic_set(&b->pending_promotes, 0); + atomic_set(&b->pending_writebacks, 0); + atomic_set(&b->pending_demotes, 0); + + INIT_LIST_HEAD(&b->issued); + INIT_LIST_HEAD(&b->queued); + + b->pending = RB_ROOT; + b->work_cache = KMEM_CACHE(bt_work, 0); + if (!b->work_cache) { + DMERR("couldn't create mempool for background work items"); + kfree(b); + b = NULL; + } + + return b; +} +EXPORT_SYMBOL_GPL(btracker_create); + +void btracker_destroy(struct background_tracker *b) +{ + kmem_cache_destroy(b->work_cache); + kfree(b); +} +EXPORT_SYMBOL_GPL(btracker_destroy); + +static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs) +{ + if (from_oblock(lhs) < from_oblock(rhs)) + return -1; + + if (from_oblock(rhs) < from_oblock(lhs)) + return 1; + + return 0; +} + +static bool __insert_pending(struct background_tracker *b, + struct bt_work *nw) +{ + int cmp; + struct bt_work *w; + struct rb_node **new = &b->pending.rb_node, *parent = NULL; + + while (*new) { + w = container_of(*new, struct bt_work, node); + + parent = *new; + cmp = cmp_oblock(w->work.oblock, nw->work.oblock); + if (cmp < 0) + new = &((*new)->rb_left); + + else if (cmp > 0) + new = &((*new)->rb_right); + + else + /* already present */ + return false; + } + + rb_link_node(&nw->node, parent, new); + rb_insert_color(&nw->node, &b->pending); + + return true; +} + +static struct bt_work *__find_pending(struct background_tracker *b, + dm_oblock_t oblock) +{ + int cmp; + struct bt_work *w; + struct rb_node **new = &b->pending.rb_node; + + while (*new) { + w = container_of(*new, struct bt_work, node); + + cmp = cmp_oblock(w->work.oblock, oblock); + if (cmp < 0) + new = &((*new)->rb_left); + + else if (cmp > 0) + new = &((*new)->rb_right); + + else + break; + } + + return *new ? w : NULL; +} + + +static void update_stats(struct background_tracker *b, struct policy_work *w, int delta) +{ + switch (w->op) { + case POLICY_PROMOTE: + atomic_add(delta, &b->pending_promotes); + break; + + case POLICY_DEMOTE: + atomic_add(delta, &b->pending_demotes); + break; + + case POLICY_WRITEBACK: + atomic_add(delta, &b->pending_writebacks); + break; + } +} + +unsigned btracker_nr_writebacks_queued(struct background_tracker *b) +{ + return atomic_read(&b->pending_writebacks); +} +EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued); + +unsigned btracker_nr_demotions_queued(struct background_tracker *b) +{ + return atomic_read(&b->pending_demotes); +} +EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued); + +static bool max_work_reached(struct background_tracker *b) +{ + // FIXME: finish + return false; +} + +int btracker_queue(struct background_tracker *b, + struct policy_work *work, + struct policy_work **pwork) +{ + struct bt_work *w; + + if (pwork) + *pwork = NULL; + + if (max_work_reached(b)) + return -ENOMEM; + + w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT); + if (!w) + return -ENOMEM; + + memcpy(&w->work, work, sizeof(*work)); + + if (!__insert_pending(b, w)) { + /* + * There was a race, we'll just ignore this second + * bit of work for the same oblock. + */ + kmem_cache_free(b->work_cache, w); + return -EINVAL; + } + + if (pwork) { + *pwork = &w->work; + list_add(&w->list, &b->issued); + } else + list_add(&w->list, &b->queued); + update_stats(b, &w->work, 1); + + return 0; +} +EXPORT_SYMBOL_GPL(btracker_queue); + +/* + * Returns -ENODATA if there's no work. + */ +int btracker_issue(struct background_tracker *b, struct policy_work **work) +{ + struct bt_work *w; + + if (list_empty(&b->queued)) + return -ENODATA; + + w = list_first_entry(&b->queued, struct bt_work, list); + list_move(&w->list, &b->issued); + *work = &w->work; + + return 0; +} +EXPORT_SYMBOL_GPL(btracker_issue); + +void btracker_complete(struct background_tracker *b, + struct policy_work *op) +{ + struct bt_work *w = container_of(op, struct bt_work, work); + + update_stats(b, &w->work, -1); + rb_erase(&w->node, &b->pending); + list_del(&w->list); + kmem_cache_free(b->work_cache, w); +} +EXPORT_SYMBOL_GPL(btracker_complete); + +bool btracker_promotion_already_present(struct background_tracker *b, + dm_oblock_t oblock) +{ + return __find_pending(b, oblock) != NULL; +} +EXPORT_SYMBOL_GPL(btracker_promotion_already_present); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h new file mode 100644 index 000000000000..27ab90dbc275 --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_BACKGROUND_WORK_H +#define DM_CACHE_BACKGROUND_WORK_H + +#include +#include "dm-cache-policy.h" + +/*----------------------------------------------------------------*/ + +struct background_work; +struct background_tracker; + +/* + * FIXME: discuss lack of locking in all methods. + */ +struct background_tracker *btracker_create(unsigned max_work); +void btracker_destroy(struct background_tracker *b); + +unsigned btracker_nr_writebacks_queued(struct background_tracker *b); +unsigned btracker_nr_demotions_queued(struct background_tracker *b); + +/* + * returns -EINVAL iff the work is already queued. -ENOMEM if the work + * couldn't be queued for another reason. + */ +int btracker_queue(struct background_tracker *b, + struct policy_work *work, + struct policy_work **pwork); + +/* + * Returns -ENODATA if there's no work. + */ +int btracker_issue(struct background_tracker *b, struct policy_work **work); +void btracker_complete(struct background_tracker *b, + struct policy_work *op); +bool btracker_promotion_already_present(struct background_tracker *b, + dm_oblock_t oblock); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 4f07c08cf107..179ed5bf81a3 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -50,6 +50,8 @@ #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL +struct dm_cache_metadata; + /* * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on * failure. If reopening then features must match. diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c deleted file mode 100644 index 2e8a8f1d8358..000000000000 --- a/drivers/md/dm-cache-policy-cleaner.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. - * - * writeback cache policy supporting flushing out dirty cache blocks. - * - * This file is released under the GPL. - */ - -#include "dm-cache-policy.h" -#include "dm.h" - -#include -#include -#include -#include - -/*----------------------------------------------------------------*/ - -#define DM_MSG_PREFIX "cache cleaner" - -/* Cache entry struct. */ -struct wb_cache_entry { - struct list_head list; - struct hlist_node hlist; - - dm_oblock_t oblock; - dm_cblock_t cblock; - bool dirty:1; - bool pending:1; -}; - -struct hash { - struct hlist_head *table; - dm_block_t hash_bits; - unsigned nr_buckets; -}; - -struct policy { - struct dm_cache_policy policy; - spinlock_t lock; - - struct list_head free; - struct list_head clean; - struct list_head clean_pending; - struct list_head dirty; - - /* - * We know exactly how many cblocks will be needed, - * so we can allocate them up front. - */ - dm_cblock_t cache_size, nr_cblocks_allocated; - struct wb_cache_entry *cblocks; - struct hash chash; -}; - -/*----------------------------------------------------------------------------*/ - -/* - * Low-level functions. - */ -static unsigned next_power(unsigned n, unsigned min) -{ - return roundup_pow_of_two(max(n, min)); -} - -static struct policy *to_policy(struct dm_cache_policy *p) -{ - return container_of(p, struct policy, policy); -} - -static struct list_head *list_pop(struct list_head *q) -{ - struct list_head *r = q->next; - - list_del(r); - - return r; -} - -/*----------------------------------------------------------------------------*/ - -/* Allocate/free various resources. */ -static int alloc_hash(struct hash *hash, unsigned elts) -{ - hash->nr_buckets = next_power(elts >> 4, 16); - hash->hash_bits = __ffs(hash->nr_buckets); - hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); - - return hash->table ? 0 : -ENOMEM; -} - -static void free_hash(struct hash *hash) -{ - vfree(hash->table); -} - -static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size) -{ - int r = -ENOMEM; - - p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size)); - if (p->cblocks) { - unsigned u = from_cblock(cache_size); - - while (u--) - list_add(&p->cblocks[u].list, &p->free); - - p->nr_cblocks_allocated = 0; - - /* Cache entries hash. */ - r = alloc_hash(&p->chash, from_cblock(cache_size)); - if (r) - vfree(p->cblocks); - } - - return r; -} - -static void free_cache_blocks_and_hash(struct policy *p) -{ - free_hash(&p->chash); - vfree(p->cblocks); -} - -static struct wb_cache_entry *alloc_cache_entry(struct policy *p) -{ - struct wb_cache_entry *e; - - BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size)); - - e = list_entry(list_pop(&p->free), struct wb_cache_entry, list); - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1); - - return e; -} - -/*----------------------------------------------------------------------------*/ - -/* Hash functions (lookup, insert, remove). */ -static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock) -{ - struct hash *hash = &p->chash; - unsigned h = hash_64(from_oblock(oblock), hash->hash_bits); - struct wb_cache_entry *cur; - struct hlist_head *bucket = &hash->table[h]; - - hlist_for_each_entry(cur, bucket, hlist) { - if (cur->oblock == oblock) { - /* Move upfront bucket for faster access. */ - hlist_del(&cur->hlist); - hlist_add_head(&cur->hlist, bucket); - return cur; - } - } - - return NULL; -} - -static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e) -{ - unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits); - - hlist_add_head(&e->hlist, &p->chash.table[h]); -} - -static void remove_cache_hash_entry(struct wb_cache_entry *e) -{ - hlist_del(&e->hlist); -} - -/* Public interface (see dm-cache-policy.h */ -static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - result->op = POLICY_MISS; - - if (can_block) - spin_lock_irqsave(&p->lock, flags); - - else if (!spin_trylock_irqsave(&p->lock, flags)) - return -EWOULDBLOCK; - - e = lookup_cache_entry(p, oblock); - if (e) { - result->op = POLICY_HIT; - result->cblock = e->cblock; - - } - - spin_unlock_irqrestore(&p->lock, flags); - - return 0; -} - -static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock) -{ - int r; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - if (!spin_trylock_irqsave(&p->lock, flags)) - return -EWOULDBLOCK; - - e = lookup_cache_entry(p, oblock); - if (e) { - *cblock = e->cblock; - r = 0; - - } else - r = -ENOENT; - - spin_unlock_irqrestore(&p->lock, flags); - - return r; -} - -static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - - e = lookup_cache_entry(p, oblock); - BUG_ON(!e); - - if (set) { - if (!e->dirty) { - e->dirty = true; - list_move(&e->list, &p->dirty); - } - - } else { - if (e->dirty) { - e->pending = false; - e->dirty = false; - list_move(&e->list, &p->clean); - } - } -} - -static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - __set_clear_dirty(pe, oblock, true); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - __set_clear_dirty(pe, oblock, false); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void add_cache_entry(struct policy *p, struct wb_cache_entry *e) -{ - insert_cache_hash_entry(p, e); - if (e->dirty) - list_add(&e->list, &p->dirty); - else - list_add(&e->list, &p->clean); -} - -static int wb_load_mapping(struct dm_cache_policy *pe, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) -{ - int r; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e = alloc_cache_entry(p); - - if (e) { - e->cblock = cblock; - e->oblock = oblock; - e->dirty = false; /* blocks default to clean */ - add_cache_entry(p, e); - r = 0; - - } else - r = -ENOMEM; - - return r; -} - -static void wb_destroy(struct dm_cache_policy *pe) -{ - struct policy *p = to_policy(pe); - - free_cache_blocks_and_hash(p); - kfree(p); -} - -static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock) -{ - struct wb_cache_entry *r = lookup_cache_entry(p, oblock); - - BUG_ON(!r); - - remove_cache_hash_entry(r); - list_del(&r->list); - - return r; -} - -static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - e = __wb_force_remove_mapping(p, oblock); - list_add_tail(&e->list, &p->free); - BUG_ON(!from_cblock(p->nr_cblocks_allocated)); - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void wb_force_mapping(struct dm_cache_policy *pe, - dm_oblock_t current_oblock, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - e = __wb_force_remove_mapping(p, current_oblock); - e->oblock = oblock; - add_cache_entry(p, e); - spin_unlock_irqrestore(&p->lock, flags); -} - -static struct wb_cache_entry *get_next_dirty_entry(struct policy *p) -{ - struct list_head *l; - struct wb_cache_entry *r; - - if (list_empty(&p->dirty)) - return NULL; - - l = list_pop(&p->dirty); - r = container_of(l, struct wb_cache_entry, list); - list_add(l, &p->clean_pending); - - return r; -} - -static int wb_writeback_work(struct dm_cache_policy *pe, - dm_oblock_t *oblock, - dm_cblock_t *cblock, - bool critical_only) -{ - int r = -ENOENT; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - - e = get_next_dirty_entry(p); - if (e) { - *oblock = e->oblock; - *cblock = e->cblock; - r = 0; - } - - spin_unlock_irqrestore(&p->lock, flags); - - return r; -} - -static dm_cblock_t wb_residency(struct dm_cache_policy *pe) -{ - return to_policy(pe)->nr_cblocks_allocated; -} - -/* Init the policy plugin interface function pointers. */ -static void init_policy_functions(struct policy *p) -{ - p->policy.destroy = wb_destroy; - p->policy.map = wb_map; - p->policy.lookup = wb_lookup; - p->policy.set_dirty = wb_set_dirty; - p->policy.clear_dirty = wb_clear_dirty; - p->policy.load_mapping = wb_load_mapping; - p->policy.get_hint = NULL; - p->policy.remove_mapping = wb_remove_mapping; - p->policy.writeback_work = wb_writeback_work; - p->policy.force_mapping = wb_force_mapping; - p->policy.residency = wb_residency; - p->policy.tick = NULL; -} - -static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size) -{ - int r; - struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL); - - if (!p) - return NULL; - - init_policy_functions(p); - INIT_LIST_HEAD(&p->free); - INIT_LIST_HEAD(&p->clean); - INIT_LIST_HEAD(&p->clean_pending); - INIT_LIST_HEAD(&p->dirty); - - p->cache_size = cache_size; - spin_lock_init(&p->lock); - - /* Allocate cache entry structs and add them to free list. */ - r = alloc_cache_blocks_with_hash(p, cache_size); - if (!r) - return &p->policy; - - kfree(p); - - return NULL; -} -/*----------------------------------------------------------------------------*/ - -static struct dm_cache_policy_type wb_policy_type = { - .name = "cleaner", - .version = {1, 0, 0}, - .hint_size = 4, - .owner = THIS_MODULE, - .create = wb_create -}; - -static int __init wb_init(void) -{ - int r = dm_cache_policy_register(&wb_policy_type); - - if (r < 0) - DMERR("register failed %d", r); - else - DMINFO("version %u.%u.%u loaded", - wb_policy_type.version[0], - wb_policy_type.version[1], - wb_policy_type.version[2]); - - return r; -} - -static void __exit wb_exit(void) -{ - dm_cache_policy_unregister(&wb_policy_type); -} - -module_init(wb_init); -module_exit(wb_exit); - -MODULE_AUTHOR("Heinz Mauelshagen "); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("cleaner cache policy"); diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 808ee0e2b2c4..56f0a23f698c 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -12,70 +12,65 @@ /*----------------------------------------------------------------*/ -/* - * Little inline functions that simplify calling the policy methods. - */ -static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) +static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, bool *background_queued) { - return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result); + return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued); } -static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) +static inline int policy_lookup_with_work(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work) { - BUG_ON(!p->lookup); - return p->lookup(p, oblock, cblock); -} + if (!p->lookup_with_work) { + *work = NULL; + return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL); + } -static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - if (p->set_dirty) - p->set_dirty(p, oblock); + return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work); } -static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static inline int policy_get_background_work(struct dm_cache_policy *p, + bool idle, struct policy_work **result) { - if (p->clear_dirty) - p->clear_dirty(p, oblock); + return p->get_background_work(p, idle, result); } -static inline int policy_load_mapping(struct dm_cache_policy *p, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) +static inline void policy_complete_background_work(struct dm_cache_policy *p, + struct policy_work *work, + bool success) { - return p->load_mapping(p, oblock, cblock, hint, hint_valid); + return p->complete_background_work(p, work, success); } -static inline uint32_t policy_get_hint(struct dm_cache_policy *p, - dm_cblock_t cblock) +static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) { - return p->get_hint ? p->get_hint(p, cblock) : 0; + p->set_dirty(p, cblock); } -static inline int policy_writeback_work(struct dm_cache_policy *p, - dm_oblock_t *oblock, - dm_cblock_t *cblock, - bool critical_only) +static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) { - return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT; + p->clear_dirty(p, cblock); } -static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) +static inline int policy_load_mapping(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) { - p->remove_mapping(p, oblock); + return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid); } -static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +static inline int policy_invalidate_mapping(struct dm_cache_policy *p, + dm_cblock_t cblock) { - return p->remove_cblock(p, cblock); + return p->invalidate_mapping(p, cblock); } -static inline void policy_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) +static inline uint32_t policy_get_hint(struct dm_cache_policy *p, + dm_cblock_t cblock) { - return p->force_mapping(p, current_oblock, new_oblock); + return p->get_hint ? p->get_hint(p, cblock) : 0; } static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) @@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p, return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; } +static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow) +{ + return p->allow_migrations(p, allow); +} + /*----------------------------------------------------------------*/ /* diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index f19c6930a67c..74436dc2122f 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -4,8 +4,9 @@ * This file is released under the GPL. */ -#include "dm-cache-policy.h" +#include "dm-cache-background-tracker.h" #include "dm-cache-policy-internal.h" +#include "dm-cache-policy.h" #include "dm.h" #include @@ -38,10 +39,11 @@ struct entry { unsigned hash_next:28; unsigned prev:28; unsigned next:28; - unsigned level:7; + unsigned level:6; bool dirty:1; bool allocated:1; bool sentinel:1; + bool pending_work:1; dm_oblock_t oblock; }; @@ -279,14 +281,28 @@ static unsigned q_size(struct queue *q) */ static void q_push(struct queue *q, struct entry *e) { + BUG_ON(e->pending_work); + if (!e->sentinel) q->nr_elts++; l_add_tail(q->es, q->qs + e->level, e); } +static void q_push_front(struct queue *q, struct entry *e) +{ + BUG_ON(e->pending_work); + + if (!e->sentinel) + q->nr_elts++; + + l_add_head(q->es, q->qs + e->level, e); +} + static void q_push_before(struct queue *q, struct entry *old, struct entry *e) { + BUG_ON(e->pending_work); + if (!e->sentinel) q->nr_elts++; @@ -335,19 +351,6 @@ static struct entry *q_pop(struct queue *q) return e; } -/* - * Pops an entry from a level that is not past a sentinel. - */ -static struct entry *q_pop_old(struct queue *q, unsigned max_level) -{ - struct entry *e = q_peek(q, max_level, false); - - if (e) - q_del(q, e); - - return e; -} - /* * This function assumes there is a non-sentinel entry to pop. It's only * used by redistribute, so we know this is true. It also doesn't adjust @@ -446,45 +449,49 @@ static void q_redistribute(struct queue *q) break; e->level = level + 1u; - l_add_head(q->es, l_above, e); + l_add_tail(q->es, l_above, e); } } } -static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels) +static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels, + struct entry *s1, struct entry *s2) { struct entry *de; - unsigned new_level; - - q_del(q, e); + unsigned sentinels_passed = 0; + unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels); + /* try and find an entry to swap with */ if (extra_levels && (e->level < q->nr_levels - 1u)) { - new_level = min(q->nr_levels - 1u, e->level + extra_levels); - for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) { - if (de->sentinel) - continue; + for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de)) + sentinels_passed++; + if (de) { q_del(q, de); de->level = e->level; + if (s1) { + switch (sentinels_passed) { + case 0: + q_push_before(q, s1, de); + break; + + case 1: + q_push_before(q, s2, de); + break; - if (dest) - q_push_before(q, dest, de); - else + default: + q_push(q, de); + } + } else q_push(q, de); - break; } - - e->level = new_level; } + q_del(q, e); + e->level = new_level; q_push(q, e); } -static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels) -{ - q_requeue_before(q, NULL, e, extra_levels); -} - /*----------------------------------------------------------------*/ #define FP_SHIFT 8 @@ -550,7 +557,7 @@ static enum performance stats_assess(struct stats *s) /*----------------------------------------------------------------*/ -struct hash_table { +struct smq_hash_table { struct entry_space *es; unsigned long long hash_bits; unsigned *buckets; @@ -560,7 +567,7 @@ struct hash_table { * All cache entries are stored in a chained hash table. To save space we * use indexing again, and only store indexes to the next entry. */ -static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries) +static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries) { unsigned i, nr_buckets; @@ -578,34 +585,34 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent return 0; } -static void h_exit(struct hash_table *ht) +static void h_exit(struct smq_hash_table *ht) { vfree(ht->buckets); } -static struct entry *h_head(struct hash_table *ht, unsigned bucket) +static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket) { return to_entry(ht->es, ht->buckets[bucket]); } -static struct entry *h_next(struct hash_table *ht, struct entry *e) +static struct entry *h_next(struct smq_hash_table *ht, struct entry *e) { return to_entry(ht->es, e->hash_next); } -static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e) +static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e) { e->hash_next = ht->buckets[bucket]; ht->buckets[bucket] = to_index(ht->es, e); } -static void h_insert(struct hash_table *ht, struct entry *e) +static void h_insert(struct smq_hash_table *ht, struct entry *e) { unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); __h_insert(ht, h, e); } -static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock, +static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock, struct entry **prev) { struct entry *e; @@ -621,7 +628,7 @@ static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t o return NULL; } -static void __h_unlink(struct hash_table *ht, unsigned h, +static void __h_unlink(struct smq_hash_table *ht, unsigned h, struct entry *e, struct entry *prev) { if (prev) @@ -633,7 +640,7 @@ static void __h_unlink(struct hash_table *ht, unsigned h, /* * Also moves each entry to the front of the bucket. */ -static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) +static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock) { struct entry *e, *prev; unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); @@ -651,7 +658,7 @@ static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) return e; } -static void h_remove(struct hash_table *ht, struct entry *e) +static void h_remove(struct smq_hash_table *ht, struct entry *e) { unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); struct entry *prev; @@ -699,7 +706,10 @@ static void init_entry(struct entry *e) e->next = INDEXER_NULL; e->prev = INDEXER_NULL; e->level = 0u; + e->dirty = true; /* FIXME: audit */ e->allocated = true; + e->sentinel = false; + e->pending_work = false; } static struct entry *alloc_entry(struct entry_alloc *ea) @@ -762,11 +772,11 @@ static struct entry *get_entry(struct entry_alloc *ea, unsigned index) #define NR_HOTSPOT_LEVELS 64u #define NR_CACHE_LEVELS 64u -#define WRITEBACK_PERIOD (10 * HZ) -#define DEMOTE_PERIOD (60 * HZ) +#define WRITEBACK_PERIOD (10ul * HZ) +#define DEMOTE_PERIOD (60ul * HZ) #define HOTSPOT_UPDATE_PERIOD (HZ) -#define CACHE_UPDATE_PERIOD (10u * HZ) +#define CACHE_UPDATE_PERIOD (60ul * HZ) struct smq_policy { struct dm_cache_policy policy; @@ -814,8 +824,8 @@ struct smq_policy { * The hash tables allows us to quickly find an entry by origin * block. */ - struct hash_table table; - struct hash_table hotspot_table; + struct smq_hash_table table; + struct smq_hash_table hotspot_table; bool current_writeback_sentinels; unsigned long next_writeback_period; @@ -828,6 +838,10 @@ struct smq_policy { unsigned long next_hotspot_period; unsigned long next_cache_period; + + struct background_tracker *bg_work; + + bool migrations_allowed; }; /*----------------------------------------------------------------*/ @@ -876,15 +890,15 @@ static void __update_demote_sentinels(struct smq_policy *mq) static void update_sentinels(struct smq_policy *mq) { if (time_after(jiffies, mq->next_writeback_period)) { - __update_writeback_sentinels(mq); mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; mq->current_writeback_sentinels = !mq->current_writeback_sentinels; + __update_writeback_sentinels(mq); } if (time_after(jiffies, mq->next_demote_period)) { - __update_demote_sentinels(mq); mq->next_demote_period = jiffies + DEMOTE_PERIOD; mq->current_demote_sentinels = !mq->current_demote_sentinels; + __update_demote_sentinels(mq); } } @@ -920,55 +934,40 @@ static void sentinels_init(struct smq_policy *mq) /*----------------------------------------------------------------*/ -/* - * These methods tie together the dirty queue, clean queue and hash table. - */ -static void push_new(struct smq_policy *mq, struct entry *e) +static void del_queue(struct smq_policy *mq, struct entry *e) { - struct queue *q = e->dirty ? &mq->dirty : &mq->clean; - h_insert(&mq->table, e); - q_push(q, e); + q_del(e->dirty ? &mq->dirty : &mq->clean, e); } -static void push(struct smq_policy *mq, struct entry *e) +static void push_queue(struct smq_policy *mq, struct entry *e) { - struct entry *sentinel; - - h_insert(&mq->table, e); - - /* - * Punch this into the queue just in front of the sentinel, to - * ensure it's cleaned straight away. - */ - if (e->dirty) { - sentinel = writeback_sentinel(mq, e->level); - q_push_before(&mq->dirty, sentinel, e); - } else { - sentinel = demote_sentinel(mq, e->level); - q_push_before(&mq->clean, sentinel, e); - } + if (e->dirty) + q_push(&mq->dirty, e); + else + q_push(&mq->clean, e); } -/* - * Removes an entry from cache. Removes from the hash table. - */ -static void __del(struct smq_policy *mq, struct queue *q, struct entry *e) +// !h, !q, a -> h, q, a +static void push(struct smq_policy *mq, struct entry *e) { - q_del(q, e); - h_remove(&mq->table, e); + h_insert(&mq->table, e); + if (!e->pending_work) + push_queue(mq, e); } -static void del(struct smq_policy *mq, struct entry *e) +static void push_queue_front(struct smq_policy *mq, struct entry *e) { - __del(mq, e->dirty ? &mq->dirty : &mq->clean, e); + if (e->dirty) + q_push_front(&mq->dirty, e); + else + q_push_front(&mq->clean, e); } -static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level) +static void push_front(struct smq_policy *mq, struct entry *e) { - struct entry *e = q_pop_old(q, max_level); - if (e) - h_remove(&mq->table, e); - return e; + h_insert(&mq->table, e); + if (!e->pending_work) + push_queue_front(mq, e); } static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) @@ -978,16 +977,21 @@ static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) static void requeue(struct smq_policy *mq, struct entry *e) { - struct entry *sentinel; + /* + * Pending work has temporarily been taken out of the queues. + */ + if (e->pending_work) + return; if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { - if (e->dirty) { - sentinel = writeback_sentinel(mq, e->level); - q_requeue_before(&mq->dirty, sentinel, e, 1u); - } else { - sentinel = demote_sentinel(mq, e->level); - q_requeue_before(&mq->clean, sentinel, e, 1u); + if (!e->dirty) { + q_requeue(&mq->clean, e, 1u, NULL, NULL); + return; } + + q_requeue(&mq->dirty, e, 1u, + get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels), + get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels)); } } @@ -1026,6 +1030,8 @@ static void update_promote_levels(struct smq_policy *mq) unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); + threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS); + /* * If the hotspot queue is performing badly then we have little * confidence that we know which blocks to promote. So we cut down @@ -1045,7 +1051,7 @@ static void update_promote_levels(struct smq_policy *mq) } mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; - mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u; + mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level); } /* @@ -1095,34 +1101,142 @@ static void end_cache_period(struct smq_policy *mq) } } -static int demote_cblock(struct smq_policy *mq, - struct policy_locker *locker, - dm_oblock_t *oblock) +/*----------------------------------------------------------------*/ + +/* + * Targets are given as a percentage. + */ +#define CLEAN_TARGET 25u +#define FREE_TARGET 25u + +static unsigned percent_to_target(struct smq_policy *mq, unsigned p) { - struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false); - if (!demoted) - /* - * We could get a block from mq->dirty, but that - * would add extra latency to the triggering bio as it - * waits for the writeback. Better to not promote this - * time and hope there's a clean block next time this block - * is hit. - */ - return -ENOSPC; + return from_cblock(mq->cache_size) * p / 100u; +} + +static bool clean_target_met(struct smq_policy *mq, bool idle) +{ + /* + * Cache entries may not be populated. So we cannot rely on the + * size of the clean queue. + */ + unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); - if (locker->fn(locker, demoted->oblock)) + if (idle) /* - * We couldn't lock this block. + * We'd like to clean everything. */ - return -EBUSY; + return q_size(&mq->dirty) == 0u; + else + return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >= + percent_to_target(mq, CLEAN_TARGET); +} - del(mq, demoted); - *oblock = demoted->oblock; - free_entry(&mq->cache_alloc, demoted); +static bool free_target_met(struct smq_policy *mq, bool idle) +{ + unsigned nr_free = from_cblock(mq->cache_size) - + mq->cache_alloc.nr_allocated; - return 0; + if (idle) + return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= + percent_to_target(mq, FREE_TARGET); + else + return true; } +/*----------------------------------------------------------------*/ + +static void mark_pending(struct smq_policy *mq, struct entry *e) +{ + BUG_ON(e->sentinel); + BUG_ON(!e->allocated); + BUG_ON(e->pending_work); + e->pending_work = true; +} + +static void clear_pending(struct smq_policy *mq, struct entry *e) +{ + BUG_ON(!e->pending_work); + e->pending_work = false; +} + +static void queue_writeback(struct smq_policy *mq) +{ + int r; + struct policy_work work; + struct entry *e; + + e = q_peek(&mq->dirty, mq->dirty.nr_levels, false); + if (e) { + mark_pending(mq, e); + q_del(&mq->dirty, e); + + work.op = POLICY_WRITEBACK; + work.oblock = e->oblock; + work.cblock = infer_cblock(mq, e); + + r = btracker_queue(mq->bg_work, &work, NULL); + WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race. + } +} + +static void queue_demotion(struct smq_policy *mq) +{ + struct policy_work work; + struct entry *e; + + if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) + return; + + e = q_peek(&mq->clean, mq->clean.nr_levels, true); + if (!e) { + if (!clean_target_met(mq, false)) + queue_writeback(mq); + return; + } + + mark_pending(mq, e); + q_del(&mq->clean, e); + + work.op = POLICY_DEMOTE; + work.oblock = e->oblock; + work.cblock = infer_cblock(mq, e); + btracker_queue(mq->bg_work, &work, NULL); +} + +static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, + struct policy_work **workp) +{ + struct entry *e; + struct policy_work work; + + if (!mq->migrations_allowed) + return; + + if (allocator_empty(&mq->cache_alloc)) { + if (!free_target_met(mq, false)) + queue_demotion(mq); + return; + } + + if (btracker_promotion_already_present(mq->bg_work, oblock)) + return; + + /* + * We allocate the entry now to reserve the cblock. If the + * background work is aborted we must remember to free it. + */ + e = alloc_entry(&mq->cache_alloc); + BUG_ON(!e); + e->pending_work = true; + work.op = POLICY_PROMOTE; + work.oblock = oblock; + work.cblock = infer_cblock(mq, e); + btracker_queue(mq->bg_work, &work, workp); +} + +/*----------------------------------------------------------------*/ + enum promote_result { PROMOTE_NOT, PROMOTE_TEMPORARY, @@ -1137,49 +1251,18 @@ static enum promote_result maybe_promote(bool promote) return promote ? PROMOTE_PERMANENT : PROMOTE_NOT; } -static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio, - bool fast_promote) +static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, + int data_dir, bool fast_promote) { - if (bio_data_dir(bio) == WRITE) { + if (data_dir == WRITE) { if (!allocator_empty(&mq->cache_alloc) && fast_promote) return PROMOTE_TEMPORARY; - else - return maybe_promote(hs_e->level >= mq->write_promote_level); + return maybe_promote(hs_e->level >= mq->write_promote_level); } else return maybe_promote(hs_e->level >= mq->read_promote_level); } -static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock, - struct policy_locker *locker, - struct policy_result *result, enum promote_result pr) -{ - int r; - struct entry *e; - - if (allocator_empty(&mq->cache_alloc)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (r) { - result->op = POLICY_MISS; - return; - } - - } else - result->op = POLICY_NEW; - - e = alloc_entry(&mq->cache_alloc); - BUG_ON(!e); - e->oblock = oblock; - - if (pr == PROMOTE_TEMPORARY) - push(mq, e); - else - push_new(mq, e); - - result->cblock = infer_cblock(mq, e); -} - static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) { sector_t r = from_oblock(b); @@ -1187,7 +1270,7 @@ static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) return to_oblock(r); } -static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio) +static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b) { unsigned hi; dm_oblock_t hb = to_hblock(mq, b); @@ -1199,7 +1282,8 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, hi = get_index(&mq->hotspot_alloc, e); q_requeue(&mq->hotspot, e, test_and_set_bit(hi, mq->hotspot_hit_bits) ? - 0u : mq->hotspot_level_jump); + 0u : mq->hotspot_level_jump, + NULL, NULL); } else { stats_miss(&mq->hotspot_stats); @@ -1225,47 +1309,6 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, return e; } -/* - * Looks the oblock up in the hash table, then decides whether to put in - * pre_cache, or cache etc. - */ -static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock, - bool can_migrate, bool fast_promote, - struct policy_locker *locker, struct policy_result *result) -{ - struct entry *e, *hs_e; - enum promote_result pr; - - hs_e = update_hotspot_queue(mq, oblock, bio); - - e = h_lookup(&mq->table, oblock); - if (e) { - stats_level_accessed(&mq->cache_stats, e->level); - - requeue(mq, e); - result->op = POLICY_HIT; - result->cblock = infer_cblock(mq, e); - - } else { - stats_miss(&mq->cache_stats); - - pr = should_promote(mq, hs_e, bio, fast_promote); - if (pr == PROMOTE_NOT) - result->op = POLICY_MISS; - - else { - if (!can_migrate) { - result->op = POLICY_MISS; - return -EWOULDBLOCK; - } - - insert_in_cache(mq, oblock, locker, result, pr); - } - } - - return 0; -} - /*----------------------------------------------------------------*/ /* @@ -1282,6 +1325,7 @@ static void smq_destroy(struct dm_cache_policy *p) { struct smq_policy *mq = to_smq_policy(p); + btracker_destroy(mq->bg_work); h_exit(&mq->hotspot_table); h_exit(&mq->table); free_bitset(mq->hotspot_hit_bits); @@ -1290,234 +1334,247 @@ static void smq_destroy(struct dm_cache_policy *p) kfree(mq); } -static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool fast_promote, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) -{ - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - - result->op = POLICY_MISS; - - spin_lock_irqsave(&mq->lock, flags); - r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result); - spin_unlock_irqrestore(&mq->lock, flags); - - return r; -} +/*----------------------------------------------------------------*/ -static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) +static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work, bool *background_work) { - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - struct entry *e; + struct entry *e, *hs_e; + enum promote_result pr; + + *background_work = false; - spin_lock_irqsave(&mq->lock, flags); e = h_lookup(&mq->table, oblock); if (e) { + stats_level_accessed(&mq->cache_stats, e->level); + + requeue(mq, e); *cblock = infer_cblock(mq, e); - r = 0; - } else - r = -ENOENT; - spin_unlock_irqrestore(&mq->lock, flags); + return 0; - return r; -} + } else { + stats_miss(&mq->cache_stats); -static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set) -{ - struct entry *e; + /* + * The hotspot queue only gets updated with misses. + */ + hs_e = update_hotspot_queue(mq, oblock); - e = h_lookup(&mq->table, oblock); - BUG_ON(!e); + pr = should_promote(mq, hs_e, data_dir, fast_copy); + if (pr != PROMOTE_NOT) { + queue_promotion(mq, oblock, work); + *background_work = true; + } - del(mq, e); - e->dirty = set; - push(mq, e); + return -ENOENT; + } } -static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + bool *background_work) { + int r; unsigned long flags; struct smq_policy *mq = to_smq_policy(p); spin_lock_irqsave(&mq->lock, flags); - __smq_set_clear_dirty(mq, oblock, true); + r = __lookup(mq, oblock, cblock, + data_dir, fast_copy, + NULL, background_work); spin_unlock_irqrestore(&mq->lock, flags); + + return r; } -static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static int smq_lookup_with_work(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work) { - struct smq_policy *mq = to_smq_policy(p); + int r; + bool background_queued; unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); spin_lock_irqsave(&mq->lock, flags); - __smq_set_clear_dirty(mq, oblock, false); + r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued); spin_unlock_irqrestore(&mq->lock, flags); -} -static unsigned random_level(dm_cblock_t cblock) -{ - return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1); + return r; } -static int smq_load_mapping(struct dm_cache_policy *p, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) +static int smq_get_background_work(struct dm_cache_policy *p, bool idle, + struct policy_work **result) { + int r; + unsigned long flags; struct smq_policy *mq = to_smq_policy(p); - struct entry *e; - e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); - e->oblock = oblock; - e->dirty = false; /* this gets corrected in a minute */ - e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); - push(mq, e); - - return 0; -} + spin_lock_irqsave(&mq->lock, flags); + r = btracker_issue(mq->bg_work, result); + if (r == -ENODATA) { + /* find some writeback work to do */ + if (mq->migrations_allowed && !free_target_met(mq, idle)) + queue_demotion(mq); -static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) -{ - struct smq_policy *mq = to_smq_policy(p); - struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + else if (!clean_target_met(mq, idle)) + queue_writeback(mq); - if (!e->allocated) - return 0; + r = btracker_issue(mq->bg_work, result); + } + spin_unlock_irqrestore(&mq->lock, flags); - return e->level; + return r; } -static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) -{ - struct entry *e; +/* + * We need to clear any pending work flags that have been set, and in the + * case of promotion free the entry for the destination cblock. + */ +static void __complete_background_work(struct smq_policy *mq, + struct policy_work *work, + bool success) +{ + struct entry *e = get_entry(&mq->cache_alloc, + from_cblock(work->cblock)); + + switch (work->op) { + case POLICY_PROMOTE: + // !h, !q, a + clear_pending(mq, e); + if (success) { + e->oblock = work->oblock; + push(mq, e); + // h, q, a + } else { + free_entry(&mq->cache_alloc, e); + // !h, !q, !a + } + break; - e = h_lookup(&mq->table, oblock); - BUG_ON(!e); + case POLICY_DEMOTE: + // h, !q, a + if (success) { + h_remove(&mq->table, e); + free_entry(&mq->cache_alloc, e); + // !h, !q, !a + } else { + clear_pending(mq, e); + push_queue(mq, e); + // h, q, a + } + break; - del(mq, e); - free_entry(&mq->cache_alloc, e); + case POLICY_WRITEBACK: + // h, !q, a + clear_pending(mq, e); + push_queue(mq, e); + // h, q, a + break; + } + + btracker_complete(mq->bg_work, work); } -static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) +static void smq_complete_background_work(struct dm_cache_policy *p, + struct policy_work *work, + bool success) { - struct smq_policy *mq = to_smq_policy(p); unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); spin_lock_irqsave(&mq->lock, flags); - __remove_mapping(mq, oblock); + __complete_background_work(mq, work, success); spin_unlock_irqrestore(&mq->lock, flags); } -static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) +// in_hash(oblock) -> in_hash(oblock) +static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set) { struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); - if (!e || !e->allocated) - return -ENODATA; - - del(mq, e); - free_entry(&mq->cache_alloc, e); - - return 0; + if (e->pending_work) + e->dirty = set; + else { + del_queue(mq, e); + e->dirty = set; + push_queue(mq, e); + } } -static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) { - int r; unsigned long flags; struct smq_policy *mq = to_smq_policy(p); spin_lock_irqsave(&mq->lock, flags); - r = __remove_cblock(mq, cblock); + __smq_set_clear_dirty(mq, cblock, true); spin_unlock_irqrestore(&mq->lock, flags); - - return r; } - -#define CLEAN_TARGET_CRITICAL 5u /* percent */ - -static bool clean_target_met(struct smq_policy *mq, bool critical) +static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) { - if (critical) { - /* - * Cache entries may not be populated. So we're cannot rely on the - * size of the clean queue. - */ - unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); - unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u; + struct smq_policy *mq = to_smq_policy(p); + unsigned long flags; - return nr_clean >= target; - } else - return !q_size(&mq->dirty); + spin_lock_irqsave(&mq->lock, flags); + __smq_set_clear_dirty(mq, cblock, false); + spin_unlock_irqrestore(&mq->lock, flags); } -static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) +static unsigned random_level(dm_cblock_t cblock) { - struct entry *e = NULL; - bool target_met = clean_target_met(mq, critical_only); - - if (critical_only) - /* - * Always try and keep the bottom level clean. - */ - e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels); + return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1); +} - else - e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels); +static int smq_load_mapping(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + struct smq_policy *mq = to_smq_policy(p); + struct entry *e; - if (!e) - return -ENODATA; + e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); + e->oblock = oblock; + e->dirty = dirty; + e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); + e->pending_work = false; - *oblock = e->oblock; - *cblock = infer_cblock(mq, e); - e->dirty = false; - push_new(mq, e); + /* + * When we load mappings we push ahead of both sentinels in order to + * allow demotions and cleaning to occur immediately. + */ + push_front(mq, e); return 0; } -static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) +static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) { - int r; - unsigned long flags; struct smq_policy *mq = to_smq_policy(p); + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); - spin_lock_irqsave(&mq->lock, flags); - r = __smq_writeback_work(mq, oblock, cblock, critical_only); - spin_unlock_irqrestore(&mq->lock, flags); - - return r; -} - -static void __force_mapping(struct smq_policy *mq, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - struct entry *e = h_lookup(&mq->table, current_oblock); + if (!e->allocated) + return -ENODATA; - if (e) { - del(mq, e); - e->oblock = new_oblock; - e->dirty = true; - push(mq, e); - } + // FIXME: what if this block has pending background work? + del_queue(mq, e); + h_remove(&mq->table, e); + free_entry(&mq->cache_alloc, e); + return 0; } -static void smq_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) +static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) { - unsigned long flags; struct smq_policy *mq = to_smq_policy(p); + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); - spin_lock_irqsave(&mq->lock, flags); - __force_mapping(mq, current_oblock, new_oblock); - spin_unlock_irqrestore(&mq->lock, flags); + if (!e->allocated) + return 0; + + return e->level; } static dm_cblock_t smq_residency(struct dm_cache_policy *p) @@ -1546,6 +1603,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block) spin_unlock_irqrestore(&mq->lock, flags); } +static void smq_allow_migrations(struct dm_cache_policy *p, bool allow) +{ + struct smq_policy *mq = to_smq_policy(p); + mq->migrations_allowed = allow; +} + /* * smq has no config values, but the old mq policy did. To avoid breaking * software we continue to accept these configurables for the mq policy, @@ -1590,18 +1653,18 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result, static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) { mq->policy.destroy = smq_destroy; - mq->policy.map = smq_map; mq->policy.lookup = smq_lookup; + mq->policy.lookup_with_work = smq_lookup_with_work; + mq->policy.get_background_work = smq_get_background_work; + mq->policy.complete_background_work = smq_complete_background_work; mq->policy.set_dirty = smq_set_dirty; mq->policy.clear_dirty = smq_clear_dirty; mq->policy.load_mapping = smq_load_mapping; + mq->policy.invalidate_mapping = smq_invalidate_mapping; mq->policy.get_hint = smq_get_hint; - mq->policy.remove_mapping = smq_remove_mapping; - mq->policy.remove_cblock = smq_remove_cblock; - mq->policy.writeback_work = smq_writeback_work; - mq->policy.force_mapping = smq_force_mapping; mq->policy.residency = smq_residency; mq->policy.tick = smq_tick; + mq->policy.allow_migrations = smq_allow_migrations; if (mimic_mq) { mq->policy.set_config_value = mq_set_config_value; @@ -1633,7 +1696,8 @@ static void calc_hotspot_params(sector_t origin_size, static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size, - bool mimic_mq) + bool mimic_mq, + bool migrations_allowed) { unsigned i; unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; @@ -1658,11 +1722,11 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, } init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); - for (i = 0; i < nr_sentinels_per_queue; i++) + for (i = 0; i < nr_sentinels_per_queue; i++) get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); - for (i = 0; i < nr_sentinels_per_queue; i++) + for (i = 0; i < nr_sentinels_per_queue; i++) get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels, @@ -1715,8 +1779,16 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, mq->next_hotspot_period = jiffies; mq->next_cache_period = jiffies; + mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */ + if (!mq->bg_work) + goto bad_btracker; + + mq->migrations_allowed = migrations_allowed; + return &mq->policy; +bad_btracker: + h_exit(&mq->hotspot_table); bad_alloc_hotspot_table: h_exit(&mq->table); bad_alloc_table: @@ -1735,21 +1807,28 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, false); + return __smq_create(cache_size, origin_size, cache_block_size, false, true); } static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, true); + return __smq_create(cache_size, origin_size, cache_block_size, true, true); +} + +static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, false, false); } /*----------------------------------------------------------------*/ static struct dm_cache_policy_type smq_policy_type = { .name = "smq", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create @@ -1757,15 +1836,23 @@ static struct dm_cache_policy_type smq_policy_type = { static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create, }; +static struct dm_cache_policy_type cleaner_policy_type = { + .name = "cleaner", + .version = {2, 0, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = cleaner_create, +}; + static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create, @@ -1785,23 +1872,36 @@ static int __init smq_init(void) r = dm_cache_policy_register(&mq_policy_type); if (r) { DMERR("register failed (as mq) %d", r); - dm_cache_policy_unregister(&smq_policy_type); - return -ENOMEM; + goto out_mq; + } + + r = dm_cache_policy_register(&cleaner_policy_type); + if (r) { + DMERR("register failed (as cleaner) %d", r); + goto out_cleaner; } r = dm_cache_policy_register(&default_policy_type); if (r) { DMERR("register failed (as default) %d", r); - dm_cache_policy_unregister(&mq_policy_type); - dm_cache_policy_unregister(&smq_policy_type); - return -ENOMEM; + goto out_default; } return 0; + +out_default: + dm_cache_policy_unregister(&cleaner_policy_type); +out_cleaner: + dm_cache_policy_unregister(&mq_policy_type); +out_mq: + dm_cache_policy_unregister(&smq_policy_type); + + return -ENOMEM; } static void __exit smq_exit(void) { + dm_cache_policy_unregister(&cleaner_policy_type); dm_cache_policy_unregister(&smq_policy_type); dm_cache_policy_unregister(&mq_policy_type); dm_cache_policy_unregister(&default_policy_type); @@ -1816,3 +1916,4 @@ MODULE_DESCRIPTION("smq cache policy"); MODULE_ALIAS("dm-cache-default"); MODULE_ALIAS("dm-cache-mq"); +MODULE_ALIAS("dm-cache-cleaner"); diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index aa10b1493f34..c05fc3436cef 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -13,183 +13,100 @@ /*----------------------------------------------------------------*/ -/* FIXME: make it clear which methods are optional. Get debug policy to - * double check this at start. - */ - /* * The cache policy makes the important decisions about which blocks get to * live on the faster cache device. - * - * When the core target has to remap a bio it calls the 'map' method of the - * policy. This returns an instruction telling the core target what to do. - * - * POLICY_HIT: - * That block is in the cache. Remap to the cache and carry on. - * - * POLICY_MISS: - * This block is on the origin device. Remap and carry on. - * - * POLICY_NEW: - * This block is currently on the origin device, but the policy wants to - * move it. The core should: - * - * - hold any further io to this origin block - * - copy the origin to the given cache block - * - release all the held blocks - * - remap the original block to the cache - * - * POLICY_REPLACE: - * This block is currently on the origin device. The policy wants to - * move it to the cache, with the added complication that the destination - * cache block needs a writeback first. The core should: - * - * - hold any further io to this origin block - * - hold any further io to the origin block that's being written back - * - writeback - * - copy new block to cache - * - release held blocks - * - remap bio to cache and reissue. - * - * Should the core run into trouble while processing a POLICY_NEW or - * POLICY_REPLACE instruction it will roll back the policies mapping using - * remove_mapping() or force_mapping(). These methods must not fail. This - * approach avoids having transactional semantics in the policy (ie, the - * core informing the policy when a migration is complete), and hence makes - * it easier to write new policies. - * - * In general policy methods should never block, except in the case of the - * map function when can_migrate is set. So be careful to implement using - * bounded, preallocated memory. */ enum policy_operation { - POLICY_HIT, - POLICY_MISS, - POLICY_NEW, - POLICY_REPLACE -}; - -/* - * When issuing a POLICY_REPLACE the policy needs to make a callback to - * lock the block being demoted. This doesn't need to occur during a - * writeback operation since the block remains in the cache. - */ -struct policy_locker; -typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock); - -struct policy_locker { - policy_lock_fn fn; + POLICY_PROMOTE, + POLICY_DEMOTE, + POLICY_WRITEBACK }; /* * This is the instruction passed back to the core target. */ -struct policy_result { +struct policy_work { enum policy_operation op; - dm_oblock_t old_oblock; /* POLICY_REPLACE */ - dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ + dm_oblock_t oblock; + dm_cblock_t cblock; }; /* - * The cache policy object. Just a bunch of methods. It is envisaged that - * this structure will be embedded in a bigger, policy specific structure - * (ie. use container_of()). + * The cache policy object. It is envisaged that this structure will be + * embedded in a bigger, policy specific structure (ie. use container_of()). */ struct dm_cache_policy { - - /* - * FIXME: make it clear which methods are optional, and which may - * block. - */ - /* * Destroys this object. */ void (*destroy)(struct dm_cache_policy *p); /* - * See large comment above. - * - * oblock - the origin block we're interested in. - * - * can_block - indicates whether the current thread is allowed to - * block. -EWOULDBLOCK returned if it can't and would. - * - * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE - * instructions. If denied and the policy would have - * returned one of these instructions it should - * return -EWOULDBLOCK. + * Find the location of a block. * - * discarded_oblock - indicates whether the whole origin block is - * in a discarded state (FIXME: better to tell the - * policy about this sooner, so it can recycle that - * cache block if it wants.) - * bio - the bio that triggered this call. - * result - gets filled in with the instruction. + * Must not block. * - * May only return 0, or -EWOULDBLOCK (if !can_migrate) + * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for + * other errors (-EWOULDBLOCK would be typical). data_dir should be + * READ or WRITE. fast_copy should be set if migrating this block would + * be 'cheap' somehow (eg, discarded data). background_queued will be set + * if a migration has just been queued. */ - int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result); + int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, bool *background_queued); /* - * Sometimes we want to see if a block is in the cache, without - * triggering any update of stats. (ie. it's not a real hit). - * - * Must not block. + * Sometimes the core target can optimise a migration, eg, the + * block may be discarded, or the bio may cover an entire block. + * In order to optimise it needs the migration immediately though + * so it knows to do something different with the bio. * - * Returns 0 if in cache, -ENOENT if not, < 0 for other errors - * (-EWOULDBLOCK would be typical). + * This method is optional (policy-internal will fallback to using + * lookup). */ - int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); - - void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); - void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); + int (*lookup_with_work)(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work); /* - * Called when a cache target is first created. Used to load a - * mapping from the metadata device into the policy. + * Retrieves background work. Returns -ENODATA when there's no + * background work. */ - int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, - dm_cblock_t cblock, uint32_t hint, bool hint_valid); + int (*get_background_work)(struct dm_cache_policy *p, bool idle, + struct policy_work **result); /* - * Gets the hint for a given cblock. Called in a single threaded - * context. So no locking required. + * You must pass in the same work pointer that you were given, not + * a copy. */ - uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); + void (*complete_background_work)(struct dm_cache_policy *p, + struct policy_work *work, + bool success); + + void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); + void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); /* - * Override functions used on the error paths of the core target. - * They must succeed. + * Called when a cache target is first created. Used to load a + * mapping from the metadata device into the policy. */ - void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); - void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, - dm_oblock_t new_oblock); + int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, + dm_cblock_t cblock, bool dirty, + uint32_t hint, bool hint_valid); /* - * This is called via the invalidate_cblocks message. It is - * possible the particular cblock has already been removed due to a - * write io in passthrough mode. In which case this should return - * -ENODATA. + * Drops the mapping, irrespective of whether it's clean or dirty. + * Returns -ENODATA if cblock is not mapped. */ - int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); + int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock); /* - * Provide a dirty block to be written back by the core target. If - * critical_only is set then the policy should only provide work if - * it urgently needs it. - * - * Returns: - * - * 0 and @cblock,@oblock: block to write back provided - * - * -ENODATA: no dirty blocks available + * Gets the hint for a given cblock. Called in a single threaded + * context. So no locking required. */ - int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, - bool critical_only); + uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); /* * How full is the cache? @@ -202,6 +119,8 @@ struct dm_cache_policy { * queue merging has occurred). To stop the policy being fooled by * these, the core target sends regular tick() calls to the policy. * The policy should only count an entry as hit once per tick. + * + * This method is optional. */ void (*tick)(struct dm_cache_policy *p, bool can_block); @@ -213,6 +132,8 @@ struct dm_cache_policy { int (*set_config_value)(struct dm_cache_policy *p, const char *key, const char *value); + void (*allow_migrations)(struct dm_cache_policy *p, bool allow); + /* * Book keeping ptr for the policy register, not for general use. */ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2eaa414e1509..b7de289a10bb 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -5,7 +5,7 @@ */ #include "dm.h" -#include "dm-bio-prison-v1.h" +#include "dm-bio-prison-v2.h" #include "dm-bio-record.h" #include "dm-cache-metadata.h" @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, /*----------------------------------------------------------------*/ -#define IOT_RESOLUTION 4 +/* + * Glossary: + * + * oblock: index of an origin block + * cblock: index of a cache block + * promotion: movement of a block from origin to cache + * demotion: movement of a block from cache to origin + * migration: movement of a block between the origin and cache device, + * either direction + */ + +/*----------------------------------------------------------------*/ struct io_tracker { spinlock_t lock; @@ -99,18 +111,177 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) /*----------------------------------------------------------------*/ /* - * Glossary: - * - * oblock: index of an origin block - * cblock: index of a cache block - * promotion: movement of a block from origin to cache - * demotion: movement of a block from cache to origin - * migration: movement of a block between the origin and cache device, - * either direction + * Represents a chunk of future work. 'input' allows continuations to pass + * values between themselves, typically error values. */ +struct continuation { + struct work_struct ws; + int input; +}; + +static inline void init_continuation(struct continuation *k, + void (*fn)(struct work_struct *)) +{ + INIT_WORK(&k->ws, fn); + k->input = 0; +} + +static inline void queue_continuation(struct workqueue_struct *wq, + struct continuation *k) +{ + queue_work(wq, &k->ws); +} /*----------------------------------------------------------------*/ +/* + * The batcher collects together pieces of work that need a particular + * operation to occur before they can proceed (typically a commit). + */ +struct batcher { + /* + * The operation that everyone is waiting for. + */ + int (*commit_op)(void *context); + void *commit_context; + + /* + * This is how bios should be issued once the commit op is complete + * (accounted_request). + */ + void (*issue_op)(struct bio *bio, void *context); + void *issue_context; + + /* + * Queued work gets put on here after commit. + */ + struct workqueue_struct *wq; + + spinlock_t lock; + struct list_head work_items; + struct bio_list bios; + struct work_struct commit_work; + + bool commit_scheduled; +}; + +static void __commit(struct work_struct *_ws) +{ + struct batcher *b = container_of(_ws, struct batcher, commit_work); + + int r; + unsigned long flags; + struct list_head work_items; + struct work_struct *ws, *tmp; + struct continuation *k; + struct bio *bio; + struct bio_list bios; + + INIT_LIST_HEAD(&work_items); + bio_list_init(&bios); + + /* + * We have to grab these before the commit_op to avoid a race + * condition. + */ + spin_lock_irqsave(&b->lock, flags); + list_splice_init(&b->work_items, &work_items); + bio_list_merge(&bios, &b->bios); + bio_list_init(&b->bios); + b->commit_scheduled = false; + spin_unlock_irqrestore(&b->lock, flags); + + r = b->commit_op(b->commit_context); + + list_for_each_entry_safe(ws, tmp, &work_items, entry) { + k = container_of(ws, struct continuation, ws); + k->input = r; + INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ + queue_work(b->wq, ws); + } + + while ((bio = bio_list_pop(&bios))) { + if (r) { + bio->bi_error = r; + bio_endio(bio); + } else + b->issue_op(bio, b->issue_context); + } +} + +static void batcher_init(struct batcher *b, + int (*commit_op)(void *), + void *commit_context, + void (*issue_op)(struct bio *bio, void *), + void *issue_context, + struct workqueue_struct *wq) +{ + b->commit_op = commit_op; + b->commit_context = commit_context; + b->issue_op = issue_op; + b->issue_context = issue_context; + b->wq = wq; + + spin_lock_init(&b->lock); + INIT_LIST_HEAD(&b->work_items); + bio_list_init(&b->bios); + INIT_WORK(&b->commit_work, __commit); + b->commit_scheduled = false; +} + +static void async_commit(struct batcher *b) +{ + queue_work(b->wq, &b->commit_work); +} + +static void continue_after_commit(struct batcher *b, struct continuation *k) +{ + unsigned long flags; + bool commit_scheduled; + + spin_lock_irqsave(&b->lock, flags); + commit_scheduled = b->commit_scheduled; + list_add_tail(&k->ws.entry, &b->work_items); + spin_unlock_irqrestore(&b->lock, flags); + + if (commit_scheduled) + async_commit(b); +} + +/* + * Bios are errored if commit failed. + */ +static void issue_after_commit(struct batcher *b, struct bio *bio) +{ + unsigned long flags; + bool commit_scheduled; + + spin_lock_irqsave(&b->lock, flags); + commit_scheduled = b->commit_scheduled; + bio_list_add(&b->bios, bio); + spin_unlock_irqrestore(&b->lock, flags); + + if (commit_scheduled) + async_commit(b); +} + +/* + * Call this if some urgent work is waiting for the commit to complete. + */ +static void schedule_commit(struct batcher *b) +{ + bool immediate; + unsigned long flags; + + spin_lock_irqsave(&b->lock, flags); + immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); + b->commit_scheduled = true; + spin_unlock_irqrestore(&b->lock, flags); + + if (immediate) + async_commit(b); +} + /* * There are a couple of places where we let a bio run, but want to do some * work before calling its endio function. We do this by temporarily @@ -189,31 +360,13 @@ struct cache_stats { atomic_t write_miss; atomic_t demotion; atomic_t promotion; + atomic_t writeback; atomic_t copies_avoided; atomic_t cache_cell_clash; atomic_t commit_count; atomic_t discard_count; }; -/* - * Defines a range of cblocks, begin to (end - 1) are in the range. end is - * the one-past-the-end value. - */ -struct cblock_range { - dm_cblock_t begin; - dm_cblock_t end; -}; - -struct invalidation_request { - struct list_head list; - struct cblock_range *cblocks; - - atomic_t complete; - int err; - - wait_queue_head_t result_wait; -}; - struct cache { struct dm_target *ti; struct dm_target_callbacks callbacks; @@ -255,11 +408,7 @@ struct cache { spinlock_t lock; struct list_head deferred_cells; struct bio_list deferred_bios; - struct bio_list deferred_flush_bios; struct bio_list deferred_writethrough_bios; - struct list_head quiesced_migrations; - struct list_head completed_migrations; - struct list_head need_commit_migrations; sector_t migration_threshold; wait_queue_head_t migration_wait; atomic_t nr_allocated_migrations; @@ -270,9 +419,7 @@ struct cache { */ atomic_t nr_io_migrations; - wait_queue_head_t quiescing_wait; - atomic_t quiescing; - atomic_t quiescing_ack; + struct rw_semaphore quiesce_lock; /* * cache_size entries, dirty if set @@ -296,13 +443,11 @@ struct cache { struct dm_kcopyd_client *copier; struct workqueue_struct *wq; - struct work_struct worker; - + struct work_struct deferred_bio_worker; + struct work_struct deferred_writethrough_worker; + struct work_struct migration_worker; struct delayed_work waker; - unsigned long last_commit_jiffies; - - struct dm_bio_prison *prison; - struct dm_deferred_set *all_io_ds; + struct dm_bio_prison_v2 *prison; mempool_t *migration_pool; @@ -330,12 +475,17 @@ struct cache { struct list_head invalidation_requests; struct io_tracker origin_tracker; + + struct work_struct commit_ws; + struct batcher committer; + + struct rw_semaphore background_work_lock; }; struct per_bio_data { bool tick:1; unsigned req_nr:2; - struct dm_deferred_entry *all_io_entry; + struct dm_bio_prison_cell_v2 *cell; struct dm_hook_info hook_info; sector_t len; @@ -350,55 +500,64 @@ struct per_bio_data { }; struct dm_cache_migration { - struct list_head list; + struct continuation k; struct cache *cache; - unsigned long start_jiffies; - dm_oblock_t old_oblock; - dm_oblock_t new_oblock; - dm_cblock_t cblock; - - bool err:1; - bool discard:1; - bool writeback:1; - bool demote:1; - bool promote:1; - bool requeue_holder:1; - bool invalidate:1; + struct policy_work *op; + struct bio *overwrite_bio; + struct dm_bio_prison_cell_v2 *cell; - struct dm_bio_prison_cell *old_ocell; - struct dm_bio_prison_cell *new_ocell; + dm_cblock_t invalidate_cblock; + dm_oblock_t invalidate_oblock; }; -/* - * Processing a bio in the worker thread may require these memory - * allocations. We prealloc to avoid deadlocks (the same worker thread - * frees them back to the mempool). - */ -struct prealloc { - struct dm_cache_migration *mg; - struct dm_bio_prison_cell *cell1; - struct dm_bio_prison_cell *cell2; -}; +/*----------------------------------------------------------------*/ + +static bool writethrough_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITETHROUGH; +} + +static bool writeback_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITEBACK; +} + +static inline bool passthrough_mode(struct cache_features *f) +{ + return unlikely(f->io_mode == CM_IO_PASSTHROUGH); +} + +/*----------------------------------------------------------------*/ + +static void wake_deferred_bio_worker(struct cache *cache) +{ + queue_work(cache->wq, &cache->deferred_bio_worker); +} -static enum cache_metadata_mode get_cache_mode(struct cache *cache); +static void wake_deferred_writethrough_worker(struct cache *cache) +{ + queue_work(cache->wq, &cache->deferred_writethrough_worker); +} -static void wake_worker(struct cache *cache) +static void wake_migration_worker(struct cache *cache) { - queue_work(cache->wq, &cache->worker); + if (passthrough_mode(&cache->features)) + return; + + queue_work(cache->wq, &cache->migration_worker); } /*----------------------------------------------------------------*/ -static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) +static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) { - /* FIXME: change to use a local slab. */ - return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); + return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); } -static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) +static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) { - dm_bio_prison_free_cell(cache->prison, cell); + dm_bio_prison_free_cell_v2(cache->prison, cell); } static struct dm_cache_migration *alloc_migration(struct cache *cache) @@ -424,146 +583,127 @@ static void free_migration(struct dm_cache_migration *mg) mempool_free(mg, cache->migration_pool); } -static int prealloc_data_structs(struct cache *cache, struct prealloc *p) -{ - if (!p->mg) { - p->mg = alloc_migration(cache); - if (!p->mg) - return -ENOMEM; - } - - if (!p->cell1) { - p->cell1 = alloc_prison_cell(cache); - if (!p->cell1) - return -ENOMEM; - } - - if (!p->cell2) { - p->cell2 = alloc_prison_cell(cache); - if (!p->cell2) - return -ENOMEM; - } - - return 0; -} +/*----------------------------------------------------------------*/ -static void prealloc_free_structs(struct cache *cache, struct prealloc *p) +static inline dm_oblock_t oblock_succ(dm_oblock_t b) { - if (p->cell2) - free_prison_cell(cache, p->cell2); - - if (p->cell1) - free_prison_cell(cache, p->cell1); - - if (p->mg) - free_migration(p->mg); + return to_oblock(from_oblock(b) + 1ull); } -static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) +static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) { - struct dm_cache_migration *mg = p->mg; - - BUG_ON(!mg); - p->mg = NULL; - - return mg; + key->virtual = 0; + key->dev = 0; + key->block_begin = from_oblock(begin); + key->block_end = from_oblock(end); } /* - * You must have a cell within the prealloc struct to return. If not this - * function will BUG() rather than returning NULL. + * We have two lock levels. Level 0, which is used to prevent WRITEs, and + * level 1 which prevents *both* READs and WRITEs. */ -static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) +#define WRITE_LOCK_LEVEL 0 +#define READ_WRITE_LOCK_LEVEL 1 + +static unsigned lock_level(struct bio *bio) { - struct dm_bio_prison_cell *r = NULL; + return bio_data_dir(bio) == WRITE ? + WRITE_LOCK_LEVEL : + READ_WRITE_LOCK_LEVEL; +} - if (p->cell1) { - r = p->cell1; - p->cell1 = NULL; +/*---------------------------------------------------------------- + * Per bio data + *--------------------------------------------------------------*/ - } else if (p->cell2) { - r = p->cell2; - p->cell2 = NULL; - } else - BUG(); +/* + * If using writeback, leave out struct per_bio_data's writethrough fields. + */ +#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) +#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) - return r; +static size_t get_per_bio_data_size(struct cache *cache) +{ + return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; } -/* - * You can't have more than two cells in a prealloc struct. BUG() will be - * called if you try and overfill. - */ -static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) +static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) { - if (!p->cell2) - p->cell2 = cell; + struct per_bio_data *pb = dm_per_bio_data(bio, data_size); + BUG_ON(!pb); + return pb; +} - else if (!p->cell1) - p->cell1 = cell; +static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) +{ + struct per_bio_data *pb = get_per_bio_data(bio, data_size); - else - BUG(); + pb->tick = false; + pb->req_nr = dm_bio_get_target_bio_nr(bio); + pb->cell = NULL; + pb->len = 0; + + return pb; } /*----------------------------------------------------------------*/ -static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) +static void defer_bio(struct cache *cache, struct bio *bio) { - key->virtual = 0; - key->dev = 0; - key->block_begin = from_oblock(begin); - key->block_end = from_oblock(end); -} + unsigned long flags; -/* - * The caller hands in a preallocated cell, and a free function for it. - * The cell will be freed if there's an error, or if it wasn't used because - * a cell with that key already exists. - */ -typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_deferred_bio_worker(cache); +} -static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, - cell_free_fn free_fn, void *free_context, - struct dm_bio_prison_cell **cell_result) +static void defer_bios(struct cache *cache, struct bio_list *bios) { - int r; - struct dm_cell_key key; + unsigned long flags; - build_key(oblock_begin, oblock_end, &key); - r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); - if (r) - free_fn(free_context, cell_prealloc); + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&cache->deferred_bios, bios); + bio_list_init(bios); + spin_unlock_irqrestore(&cache->lock, flags); - return r; + wake_deferred_bio_worker(cache); } -static int bio_detain(struct cache *cache, dm_oblock_t oblock, - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, - cell_free_fn free_fn, void *free_context, - struct dm_bio_prison_cell **cell_result) +/*----------------------------------------------------------------*/ + +static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) { + bool r; + size_t pb_size; + struct per_bio_data *pb; + struct dm_cell_key_v2 key; dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); - return bio_detain_range(cache, oblock, end, bio, - cell_prealloc, free_fn, free_context, cell_result); -} + struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; -static int get_cell(struct cache *cache, - dm_oblock_t oblock, - struct prealloc *structs, - struct dm_bio_prison_cell **cell_result) -{ - int r; - struct dm_cell_key key; - struct dm_bio_prison_cell *cell_prealloc; + cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ + if (!cell_prealloc) { + defer_bio(cache, bio); + return false; + } + + build_key(oblock, end, &key); + r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); + if (!r) { + /* + * Failed to get the lock. + */ + free_prison_cell(cache, cell_prealloc); + return r; + } - cell_prealloc = prealloc_get_cell(structs); + if (cell != cell_prealloc) + free_prison_cell(cache, cell_prealloc); - build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); - r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); - if (r) - prealloc_put_cell(structs, cell_prealloc); + pb_size = get_per_bio_data_size(cache); + pb = get_per_bio_data(bio, pb_size); + pb->cell = cell; return r; } @@ -575,21 +715,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b) return test_bit(from_cblock(b), cache->dirty_bitset); } -static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) +static void set_dirty(struct cache *cache, dm_cblock_t cblock) { if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { atomic_inc(&cache->nr_dirty); - policy_set_dirty(cache->policy, oblock); + policy_set_dirty(cache->policy, cblock); } } -static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) +/* + * These two are called when setting after migrations to force the policy + * and dirty bitset to be in sync. + */ +static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) +{ + if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) + atomic_inc(&cache->nr_dirty); + policy_set_dirty(cache->policy, cblock); +} + +static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) { if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { - policy_clear_dirty(cache->policy, oblock); if (atomic_dec_return(&cache->nr_dirty) == 0) dm_table_event(cache->ti->table); } + + policy_clear_dirty(cache->policy, cblock); } /*----------------------------------------------------------------*/ @@ -628,11 +780,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) oblocks_per_dblock(cache))); } -static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) -{ - return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); -} - static void set_discard(struct cache *cache, dm_dblock_t b) { unsigned long flags; @@ -679,89 +826,12 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) return r; } -/*----------------------------------------------------------------*/ - -static void load_stats(struct cache *cache) +/*---------------------------------------------------------------- + * Remapping + *--------------------------------------------------------------*/ +static void remap_to_origin(struct cache *cache, struct bio *bio) { - struct dm_cache_statistics stats; - - dm_cache_metadata_get_stats(cache->cmd, &stats); - atomic_set(&cache->stats.read_hit, stats.read_hits); - atomic_set(&cache->stats.read_miss, stats.read_misses); - atomic_set(&cache->stats.write_hit, stats.write_hits); - atomic_set(&cache->stats.write_miss, stats.write_misses); -} - -static void save_stats(struct cache *cache) -{ - struct dm_cache_statistics stats; - - if (get_cache_mode(cache) >= CM_READ_ONLY) - return; - - stats.read_hits = atomic_read(&cache->stats.read_hit); - stats.read_misses = atomic_read(&cache->stats.read_miss); - stats.write_hits = atomic_read(&cache->stats.write_hit); - stats.write_misses = atomic_read(&cache->stats.write_miss); - - dm_cache_metadata_set_stats(cache->cmd, &stats); -} - -/*---------------------------------------------------------------- - * Per bio data - *--------------------------------------------------------------*/ - -/* - * If using writeback, leave out struct per_bio_data's writethrough fields. - */ -#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) -#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) - -static bool writethrough_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_WRITETHROUGH; -} - -static bool writeback_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_WRITEBACK; -} - -static bool passthrough_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_PASSTHROUGH; -} - -static size_t get_per_bio_data_size(struct cache *cache) -{ - return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; -} - -static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) -{ - struct per_bio_data *pb = dm_per_bio_data(bio, data_size); - BUG_ON(!pb); - return pb; -} - -static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) -{ - struct per_bio_data *pb = get_per_bio_data(bio, data_size); - - pb->tick = false; - pb->req_nr = dm_bio_get_target_bio_nr(bio); - pb->all_io_entry = NULL; - pb->len = 0; - - return pb; -} - -/*---------------------------------------------------------------- - * Remapping - *--------------------------------------------------------------*/ -static void remap_to_origin(struct cache *cache, struct bio *bio) -{ - bio->bi_bdev = cache->origin_dev->bdev; + bio->bi_bdev = cache->origin_dev->bdev; } static void remap_to_cache(struct cache *cache, struct bio *bio, @@ -797,8 +867,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) } static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, - dm_oblock_t oblock) + dm_oblock_t oblock) { + // FIXME: this is called way too much. check_if_tick_bio_needed(cache, bio); remap_to_origin(cache, bio); if (bio_data_dir(bio) == WRITE) @@ -811,7 +882,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, check_if_tick_bio_needed(cache, bio); remap_to_cache(cache, bio, cblock); if (bio_data_dir(bio) == WRITE) { - set_dirty(cache, oblock, cblock); + set_dirty(cache, cblock); clear_discard(cache, oblock_to_dblock(cache, oblock)); } } @@ -828,22 +899,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) return to_oblock(block_nr); } -/* - * You must increment the deferred set whilst the prison cell is held. To - * encourage this, we ask for 'cell' to be passed in. - */ -static void inc_ds(struct cache *cache, struct bio *bio, - struct dm_bio_prison_cell *cell) -{ - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - - BUG_ON(!cell); - BUG_ON(pb->all_io_entry); - - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); -} - static bool accountable_bio(struct cache *cache, struct bio *bio) { return ((bio->bi_bdev == cache->origin_dev->bdev) && @@ -875,29 +930,10 @@ static void accounted_request(struct cache *cache, struct bio *bio) generic_make_request(bio); } -static void issue(struct cache *cache, struct bio *bio) -{ - unsigned long flags; - - if (!op_is_flush(bio->bi_opf)) { - accounted_request(cache, bio); - return; - } - - /* - * Batch together any bios that trigger commits and then issue a - * single commit for them in do_worker(). - */ - spin_lock_irqsave(&cache->lock, flags); - cache->commit_requested = true; - bio_list_add(&cache->deferred_flush_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); -} - -static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) +static void issue_op(struct bio *bio, void *context) { - inc_ds(cache, bio, cell); - issue(cache, bio); + struct cache *cache = context; + accounted_request(cache, bio); } static void defer_writethrough_bio(struct cache *cache, struct bio *bio) @@ -908,7 +944,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) bio_list_add(&cache->deferred_writethrough_bios, bio); spin_unlock_irqrestore(&cache->lock, flags); - wake_worker(cache); + wake_deferred_writethrough_worker(cache); } static void writethrough_endio(struct bio *bio) @@ -934,6 +970,7 @@ static void writethrough_endio(struct bio *bio) } /* + * FIXME: send in parallel, huge latency as is. * When running in writethrough mode we need to send writes to clean blocks * to both the cache and origin devices. In future we'd like to clone the * bio and send them in parallel, but for now we're doing them in @@ -1046,12 +1083,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r set_cache_mode(cache, CM_READ_ONLY); } +/*----------------------------------------------------------------*/ + +static void load_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + dm_cache_metadata_get_stats(cache->cmd, &stats); + atomic_set(&cache->stats.read_hit, stats.read_hits); + atomic_set(&cache->stats.read_miss, stats.read_misses); + atomic_set(&cache->stats.write_hit, stats.write_hits); + atomic_set(&cache->stats.write_miss, stats.write_misses); +} + +static void save_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + if (get_cache_mode(cache) >= CM_READ_ONLY) + return; + + stats.read_hits = atomic_read(&cache->stats.read_hit); + stats.read_misses = atomic_read(&cache->stats.read_miss); + stats.write_hits = atomic_read(&cache->stats.write_hit); + stats.write_misses = atomic_read(&cache->stats.write_miss); + + dm_cache_metadata_set_stats(cache->cmd, &stats); +} + +static void update_stats(struct cache_stats *stats, enum policy_operation op) +{ + switch (op) { + case POLICY_PROMOTE: + atomic_inc(&stats->promotion); + break; + + case POLICY_DEMOTE: + atomic_inc(&stats->demotion); + break; + + case POLICY_WRITEBACK: + atomic_inc(&stats->writeback); + break; + } +} + /*---------------------------------------------------------------- * Migration processing * * Migration covers moving data from the origin device to the cache, or * vice versa. *--------------------------------------------------------------*/ + static void inc_io_migrations(struct cache *cache) { atomic_inc(&cache->nr_io_migrations); @@ -1067,213 +1150,109 @@ static bool discard_or_flush(struct bio *bio) return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); } -static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) -{ - if (discard_or_flush(cell->holder)) { - /* - * We have to handle these bios individually. - */ - dm_cell_release(cache->prison, cell, &cache->deferred_bios); - free_prison_cell(cache, cell); - } else - list_add_tail(&cell->user_list, &cache->deferred_cells); -} - -static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder) +static void calc_discard_block_range(struct cache *cache, struct bio *bio, + dm_dblock_t *b, dm_dblock_t *e) { - unsigned long flags; - - if (!holder && dm_cell_promote_or_release(cache->prison, cell)) { - /* - * There was no prisoner to promote to holder, the - * cell has been released. - */ - free_prison_cell(cache, cell); - return; - } + sector_t sb = bio->bi_iter.bi_sector; + sector_t se = bio_end_sector(bio); - spin_lock_irqsave(&cache->lock, flags); - __cell_defer(cache, cell); - spin_unlock_irqrestore(&cache->lock, flags); + *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); - wake_worker(cache); + if (se - sb < cache->discard_block_size) + *e = *b; + else + *e = to_dblock(block_div(se, cache->discard_block_size)); } -static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) -{ - dm_cell_error(cache->prison, cell, err); - free_prison_cell(cache, cell); -} +/*----------------------------------------------------------------*/ -static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) +static void prevent_background_work(struct cache *cache) { - cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); + lockdep_off(); + down_write(&cache->background_work_lock); + lockdep_on(); } -static void free_io_migration(struct dm_cache_migration *mg) +static void allow_background_work(struct cache *cache) { - struct cache *cache = mg->cache; - - dec_io_migrations(cache); - free_migration(mg); - wake_worker(cache); + lockdep_off(); + up_write(&cache->background_work_lock); + lockdep_on(); } -static void migration_failure(struct dm_cache_migration *mg) +static bool background_work_begin(struct cache *cache) { - struct cache *cache = mg->cache; - const char *dev_name = cache_device_name(cache); - - if (mg->writeback) { - DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name); - set_dirty(cache, mg->old_oblock, mg->cblock); - cell_defer(cache, mg->old_ocell, false); - - } else if (mg->demote) { - DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name); - policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); + bool r; - cell_defer(cache, mg->old_ocell, mg->promote ? false : true); - if (mg->promote) - cell_defer(cache, mg->new_ocell, true); - } else { - DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name); - policy_remove_mapping(cache->policy, mg->new_oblock); - cell_defer(cache, mg->new_ocell, true); - } + lockdep_off(); + r = down_read_trylock(&cache->background_work_lock); + lockdep_on(); - free_io_migration(mg); + return r; } -static void migration_success_pre_commit(struct dm_cache_migration *mg) +static void background_work_end(struct cache *cache) { - int r; - unsigned long flags; - struct cache *cache = mg->cache; - - if (mg->writeback) { - clear_dirty(cache, mg->old_oblock, mg->cblock); - cell_defer(cache, mg->old_ocell, false); - free_io_migration(mg); - return; + lockdep_off(); + up_read(&cache->background_work_lock); + lockdep_on(); +} - } else if (mg->demote) { - r = dm_cache_remove_mapping(cache->cmd, mg->cblock); - if (r) { - DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata", - cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); - policy_force_mapping(cache->policy, mg->new_oblock, - mg->old_oblock); - if (mg->promote) - cell_defer(cache, mg->new_ocell, true); - free_io_migration(mg); - return; - } - } else { - r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock); - if (r) { - DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata", - cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_insert_mapping", r); - policy_remove_mapping(cache->policy, mg->new_oblock); - free_io_migration(mg); - return; - } - } +/*----------------------------------------------------------------*/ - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->need_commit_migrations); - cache->commit_requested = true; - spin_unlock_irqrestore(&cache->lock, flags); +static void quiesce(struct dm_cache_migration *mg, + void (*continuation)(struct work_struct *)) +{ + init_continuation(&mg->k, continuation); + dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); } -static void migration_success_post_commit(struct dm_cache_migration *mg) +static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) { - unsigned long flags; - struct cache *cache = mg->cache; - - if (mg->writeback) { - DMWARN_LIMIT("%s: writeback unexpectedly triggered commit", - cache_device_name(cache)); - return; - - } else if (mg->demote) { - cell_defer(cache, mg->old_ocell, mg->promote ? false : true); - - if (mg->promote) { - mg->demote = false; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->quiesced_migrations); - spin_unlock_irqrestore(&cache->lock, flags); - - } else { - if (mg->invalidate) - policy_remove_mapping(cache->policy, mg->old_oblock); - free_io_migration(mg); - } - - } else { - if (mg->requeue_holder) { - clear_dirty(cache, mg->new_oblock, mg->cblock); - cell_defer(cache, mg->new_ocell, true); - } else { - /* - * The block was promoted via an overwrite, so it's dirty. - */ - set_dirty(cache, mg->new_oblock, mg->cblock); - bio_endio(mg->new_ocell->holder); - cell_defer(cache, mg->new_ocell, false); - } - free_io_migration(mg); - } + struct continuation *k = container_of(ws, struct continuation, ws); + return container_of(k, struct dm_cache_migration, k); } static void copy_complete(int read_err, unsigned long write_err, void *context) { - unsigned long flags; - struct dm_cache_migration *mg = (struct dm_cache_migration *) context; - struct cache *cache = mg->cache; + struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); if (read_err || write_err) - mg->err = true; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->completed_migrations); - spin_unlock_irqrestore(&cache->lock, flags); + mg->k.input = -EIO; - wake_worker(cache); + queue_continuation(mg->cache->wq, &mg->k); } -static void issue_copy(struct dm_cache_migration *mg) +static int copy(struct dm_cache_migration *mg, bool promote) { int r; struct dm_io_region o_region, c_region; struct cache *cache = mg->cache; - sector_t cblock = from_cblock(mg->cblock); o_region.bdev = cache->origin_dev->bdev; + o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; o_region.count = cache->sectors_per_block; c_region.bdev = cache->cache_dev->bdev; - c_region.sector = cblock * cache->sectors_per_block; + c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; c_region.count = cache->sectors_per_block; - if (mg->writeback || mg->demote) { - /* demote */ - o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; - r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); - } else { - /* promote */ - o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; - r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); - } + if (promote) + r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); + else + r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); - if (r < 0) { - DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); - migration_failure(mg); - } + return r; +} + +static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) +{ + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) + free_prison_cell(cache, pb->cell); + pb->cell = NULL; } static void overwrite_endio(struct bio *bio) @@ -1282,368 +1261,475 @@ static void overwrite_endio(struct bio *bio) struct cache *cache = mg->cache; size_t pb_data_size = get_per_bio_data_size(cache); struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - unsigned long flags; dm_unhook_bio(&pb->hook_info, bio); if (bio->bi_error) - mg->err = true; - - mg->requeue_holder = false; + mg->k.input = bio->bi_error; - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->completed_migrations); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_worker(cache); + queue_continuation(mg->cache->wq, &mg->k); } -static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) +static void overwrite(struct dm_cache_migration *mg, + void (*continuation)(struct work_struct *)) { + struct bio *bio = mg->overwrite_bio; size_t pb_data_size = get_per_bio_data_size(mg->cache); struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); - remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); /* - * No need to inc_ds() here, since the cell will be held for the - * duration of the io. + * The overwrite bio is part of the copy operation, as such it does + * not set/clear discard or dirty flags. */ + if (mg->op->op == POLICY_PROMOTE) + remap_to_cache(mg->cache, bio, mg->op->cblock); + else + remap_to_origin(mg->cache, bio); + + init_continuation(&mg->k, continuation); accounted_request(mg->cache, bio); } -static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) +/* + * Migration steps: + * + * 1) exclusive lock preventing WRITEs + * 2) quiesce + * 3) copy or issue overwrite bio + * 4) upgrade to exclusive lock preventing READs and WRITEs + * 5) quiesce + * 6) update metadata and commit + * 7) unlock + */ +static void mg_complete(struct dm_cache_migration *mg, bool success) { - return (bio_data_dir(bio) == WRITE) && - (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); + struct bio_list bios; + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + dm_cblock_t cblock = op->cblock; + + if (success) + update_stats(&cache->stats, op->op); + + switch (op->op) { + case POLICY_PROMOTE: + clear_discard(cache, oblock_to_dblock(cache, op->oblock)); + policy_complete_background_work(cache->policy, op, success); + + if (mg->overwrite_bio) { + if (success) + force_set_dirty(cache, cblock); + else + mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); + bio_endio(mg->overwrite_bio); + } else { + if (success) + force_clear_dirty(cache, cblock); + dec_io_migrations(cache); + } + break; + + case POLICY_DEMOTE: + /* + * We clear dirty here to update the nr_dirty counter. + */ + if (success) + force_clear_dirty(cache, cblock); + policy_complete_background_work(cache->policy, op, success); + dec_io_migrations(cache); + break; + + case POLICY_WRITEBACK: + if (success) + force_clear_dirty(cache, cblock); + policy_complete_background_work(cache->policy, op, success); + dec_io_migrations(cache); + break; + } + + bio_list_init(&bios); + if (mg->cell) { + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + } + + free_migration(mg); + defer_bios(cache, &bios); + wake_migration_worker(cache); + + background_work_end(cache); } -static void avoid_copy(struct dm_cache_migration *mg) +static void mg_success(struct work_struct *ws) { - atomic_inc(&mg->cache->stats.copies_avoided); - migration_success_pre_commit(mg); + struct dm_cache_migration *mg = ws_to_mg(ws); + mg_complete(mg, mg->k.input == 0); } -static void calc_discard_block_range(struct cache *cache, struct bio *bio, - dm_dblock_t *b, dm_dblock_t *e) +static void mg_update_metadata(struct work_struct *ws) { - sector_t sb = bio->bi_iter.bi_sector; - sector_t se = bio_end_sector(bio); + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; - *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); + switch (op->op) { + case POLICY_PROMOTE: + r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); + if (r) { + DMERR_LIMIT("%s: migration failed; couldn't insert mapping", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_insert_mapping", r); - if (se - sb < cache->discard_block_size) - *e = *b; - else - *e = to_dblock(block_div(se, cache->discard_block_size)); -} + mg_complete(mg, false); + return; + } + mg_complete(mg, true); + break; -static void issue_discard(struct dm_cache_migration *mg) -{ - dm_dblock_t b, e; - struct bio *bio = mg->new_ocell->holder; - struct cache *cache = mg->cache; + case POLICY_DEMOTE: + r = dm_cache_remove_mapping(cache->cmd, op->cblock); + if (r) { + DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); - calc_discard_block_range(cache, bio, &b, &e); - while (b != e) { - set_discard(cache, b); - b = to_dblock(from_dblock(b) + 1); + mg_complete(mg, false); + return; + } + + /* + * It would be nice if we only had to commit when a REQ_FLUSH + * comes through. But there's one scenario that we have to + * look out for: + * + * - vblock x in a cache block + * - domotion occurs + * - cache block gets reallocated and over written + * - crash + * + * When we recover, because there was no commit the cache will + * rollback to having the data for vblock x in the cache block. + * But the cache block has since been overwritten, so it'll end + * up pointing to data that was never in 'x' during the history + * of the device. + * + * To avoid this issue we require a commit as part of the + * demotion operation. + */ + init_continuation(&mg->k, mg_success); + continue_after_commit(&cache->committer, &mg->k); + schedule_commit(&cache->committer); + break; + + case POLICY_WRITEBACK: + mg_complete(mg, true); + break; } +} - bio_endio(bio); - cell_defer(cache, mg->new_ocell, false); - free_migration(mg); - wake_worker(cache); +static void mg_update_metadata_after_copy(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + + /* + * Did the copy succeed? + */ + if (mg->k.input) + mg_complete(mg, false); + else + mg_update_metadata(ws); } -static void issue_copy_or_discard(struct dm_cache_migration *mg) +static void mg_upgrade_lock(struct work_struct *ws) { - bool avoid; - struct cache *cache = mg->cache; + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); - if (mg->discard) { - issue_discard(mg); - return; - } + /* + * Did the copy succeed? + */ + if (mg->k.input) + mg_complete(mg, false); - if (mg->writeback || mg->demote) - avoid = !is_dirty(cache, mg->cblock) || - is_discarded_oblock(cache, mg->old_oblock); else { - struct bio *bio = mg->new_ocell->holder; + /* + * Now we want the lock to prevent both reads and writes. + */ + r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, + READ_WRITE_LOCK_LEVEL); + if (r < 0) + mg_complete(mg, false); - avoid = is_discarded_oblock(cache, mg->new_oblock); + else if (r) + quiesce(mg, mg_update_metadata); - if (writeback_mode(&cache->features) && - !avoid && bio_writes_complete_block(cache, bio)) { - issue_overwrite(mg, bio); - return; - } + else + mg_update_metadata(ws); } - - avoid ? avoid_copy(mg) : issue_copy(mg); } -static void complete_migration(struct dm_cache_migration *mg) +static void mg_copy(struct work_struct *ws) { - if (mg->err) - migration_failure(mg); - else - migration_success_pre_commit(mg); -} + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); -static void process_migrations(struct cache *cache, struct list_head *head, - void (*fn)(struct dm_cache_migration *)) -{ - unsigned long flags; - struct list_head list; - struct dm_cache_migration *mg, *tmp; + if (mg->overwrite_bio) { + /* + * It's safe to do this here, even though it's new data + * because all IO has been locked out of the block. + * + * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL + * so _not_ using mg_upgrade_lock() as continutation. + */ + overwrite(mg, mg_update_metadata_after_copy); - INIT_LIST_HEAD(&list); - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(head, &list); - spin_unlock_irqrestore(&cache->lock, flags); + } else { + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + bool is_policy_promote = (op->op == POLICY_PROMOTE); - list_for_each_entry_safe(mg, tmp, &list, list) - fn(mg); -} + if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || + is_discarded_oblock(cache, op->oblock)) { + mg_upgrade_lock(ws); + return; + } -static void __queue_quiesced_migration(struct dm_cache_migration *mg) -{ - list_add_tail(&mg->list, &mg->cache->quiesced_migrations); + init_continuation(&mg->k, mg_upgrade_lock); + + r = copy(mg, is_policy_promote); + if (r) { + DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); + mg->k.input = -EIO; + mg_complete(mg, false); + } + } } -static void queue_quiesced_migration(struct dm_cache_migration *mg) +static int mg_lock_writes(struct dm_cache_migration *mg) { - unsigned long flags; + int r; + struct dm_cell_key_v2 key; struct cache *cache = mg->cache; + struct dm_bio_prison_cell_v2 *prealloc; - spin_lock_irqsave(&cache->lock, flags); - __queue_quiesced_migration(mg); - spin_unlock_irqrestore(&cache->lock, flags); + prealloc = alloc_prison_cell(cache); + if (!prealloc) { + DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); + mg_complete(mg, false); + return -ENOMEM; + } - wake_worker(cache); -} + /* + * Prevent writes to the block, but allow reads to continue. + * Unless we're using an overwrite bio, in which case we lock + * everything. + */ + build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); + r = dm_cell_lock_v2(cache->prison, &key, + mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, + prealloc, &mg->cell); + if (r < 0) { + free_prison_cell(cache, prealloc); + mg_complete(mg, false); + return r; + } -static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) -{ - unsigned long flags; - struct dm_cache_migration *mg, *tmp; + if (mg->cell != prealloc) + free_prison_cell(cache, prealloc); - spin_lock_irqsave(&cache->lock, flags); - list_for_each_entry_safe(mg, tmp, work, list) - __queue_quiesced_migration(mg); - spin_unlock_irqrestore(&cache->lock, flags); + if (r == 0) + mg_copy(&mg->k.ws); + else + quiesce(mg, mg_copy); - wake_worker(cache); + return 0; } -static void check_for_quiesced_migrations(struct cache *cache, - struct per_bio_data *pb) +static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) { - struct list_head work; + struct dm_cache_migration *mg; - if (!pb->all_io_entry) - return; + if (!background_work_begin(cache)) { + policy_complete_background_work(cache->policy, op, false); + return -EPERM; + } - INIT_LIST_HEAD(&work); - dm_deferred_entry_dec(pb->all_io_entry, &work); + mg = alloc_migration(cache); + if (!mg) { + policy_complete_background_work(cache->policy, op, false); + background_work_end(cache); + return -ENOMEM; + } - if (!list_empty(&work)) - queue_quiesced_migrations(cache, &work); -} + memset(mg, 0, sizeof(*mg)); -static void quiesce_migration(struct dm_cache_migration *mg) -{ - if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) - queue_quiesced_migration(mg); + mg->cache = cache; + mg->op = op; + mg->overwrite_bio = bio; + + if (!bio) + inc_io_migrations(cache); + + return mg_lock_writes(mg); } -static void promote(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) +/*---------------------------------------------------------------- + * invalidation processing + *--------------------------------------------------------------*/ + +static void invalidate_complete(struct dm_cache_migration *mg, bool success) { - struct dm_cache_migration *mg = prealloc_get_migration(structs); + struct bio_list bios; + struct cache *cache = mg->cache; - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = false; - mg->promote = true; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->new_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = NULL; - mg->new_ocell = cell; - mg->start_jiffies = jiffies; + bio_list_init(&bios); + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); - inc_io_migrations(cache); - quiesce_migration(mg); + if (!success && mg->overwrite_bio) + bio_io_error(mg->overwrite_bio); + + free_migration(mg); + defer_bios(cache, &bios); + + background_work_end(cache); } -static void writeback(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) +static void invalidate_completed(struct work_struct *ws) { - struct dm_cache_migration *mg = prealloc_get_migration(structs); + struct dm_cache_migration *mg = ws_to_mg(ws); + invalidate_complete(mg, !mg->k.input); +} - mg->err = false; - mg->discard = false; - mg->writeback = true; - mg->demote = false; - mg->promote = false; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->old_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = cell; - mg->new_ocell = NULL; - mg->start_jiffies = jiffies; - - inc_io_migrations(cache); - quiesce_migration(mg); -} - -static void demote_then_promote(struct cache *cache, struct prealloc *structs, - dm_oblock_t old_oblock, dm_oblock_t new_oblock, - dm_cblock_t cblock, - struct dm_bio_prison_cell *old_ocell, - struct dm_bio_prison_cell *new_ocell) -{ - struct dm_cache_migration *mg = prealloc_get_migration(structs); - - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = true; - mg->promote = true; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->old_oblock = old_oblock; - mg->new_oblock = new_oblock; - mg->cblock = cblock; - mg->old_ocell = old_ocell; - mg->new_ocell = new_ocell; - mg->start_jiffies = jiffies; +static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) +{ + int r = policy_invalidate_mapping(cache->policy, cblock); + if (!r) { + r = dm_cache_remove_mapping(cache->cmd, cblock); + if (r) { + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); + } - inc_io_migrations(cache); - quiesce_migration(mg); -} + } else if (r == -ENODATA) { + /* + * Harmless, already unmapped. + */ + r = 0; -/* - * Invalidate a cache entry. No writeback occurs; any changes in the cache - * block are thrown away. - */ -static void invalidate(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) -{ - struct dm_cache_migration *mg = prealloc_get_migration(structs); - - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = true; - mg->promote = false; - mg->requeue_holder = true; - mg->invalidate = true; - mg->cache = cache; - mg->old_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = cell; - mg->new_ocell = NULL; - mg->start_jiffies = jiffies; + } else + DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); - inc_io_migrations(cache); - quiesce_migration(mg); + return r; } -static void discard(struct cache *cache, struct prealloc *structs, - struct dm_bio_prison_cell *cell) +static void invalidate_remove(struct work_struct *ws) { - struct dm_cache_migration *mg = prealloc_get_migration(structs); + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; - mg->err = false; - mg->discard = true; - mg->writeback = false; - mg->demote = false; - mg->promote = false; - mg->requeue_holder = false; - mg->invalidate = false; - mg->cache = cache; - mg->old_ocell = NULL; - mg->new_ocell = cell; - mg->start_jiffies = jiffies; + r = invalidate_cblock(cache, mg->invalidate_cblock); + if (r) { + invalidate_complete(mg, false); + return; + } - quiesce_migration(mg); + init_continuation(&mg->k, invalidate_completed); + continue_after_commit(&cache->committer, &mg->k); + remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); + mg->overwrite_bio = NULL; + schedule_commit(&cache->committer); } -/*---------------------------------------------------------------- - * bio processing - *--------------------------------------------------------------*/ -static void defer_bio(struct cache *cache, struct bio *bio) +static int invalidate_lock(struct dm_cache_migration *mg) { - unsigned long flags; + int r; + struct dm_cell_key_v2 key; + struct cache *cache = mg->cache; + struct dm_bio_prison_cell_v2 *prealloc; - spin_lock_irqsave(&cache->lock, flags); - bio_list_add(&cache->deferred_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); + prealloc = alloc_prison_cell(cache); + if (!prealloc) { + invalidate_complete(mg, false); + return -ENOMEM; + } - wake_worker(cache); -} + build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); + r = dm_cell_lock_v2(cache->prison, &key, + READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); + if (r < 0) { + free_prison_cell(cache, prealloc); + invalidate_complete(mg, false); + return r; + } -static void process_flush_bio(struct cache *cache, struct bio *bio) -{ - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + if (mg->cell != prealloc) + free_prison_cell(cache, prealloc); - BUG_ON(bio->bi_iter.bi_size); - if (!pb->req_nr) - remap_to_origin(cache, bio); - else - remap_to_cache(cache, bio, 0); + if (r) + quiesce(mg, invalidate_remove); - /* - * REQ_PREFLUSH is not directed at any particular block so we don't - * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH - * by dm-core. - */ - issue(cache, bio); + else { + /* + * We can't call invalidate_remove() directly here because we + * might still be in request context. + */ + init_continuation(&mg->k, invalidate_remove); + queue_work(cache->wq, &mg->k.ws); + } + + return 0; } -static void process_discard_bio(struct cache *cache, struct prealloc *structs, - struct bio *bio) +static int invalidate_start(struct cache *cache, dm_cblock_t cblock, + dm_oblock_t oblock, struct bio *bio) { - int r; - dm_dblock_t b, e; - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; + struct dm_cache_migration *mg; - calc_discard_block_range(cache, bio, &b, &e); - if (b == e) { - bio_endio(bio); - return; + if (!background_work_begin(cache)) + return -EPERM; + + mg = alloc_migration(cache); + if (!mg) { + background_work_end(cache); + return -ENOMEM; } - cell_prealloc = prealloc_get_cell(structs); - r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - structs, &new_ocell); - if (r > 0) - return; + memset(mg, 0, sizeof(*mg)); + + mg->cache = cache; + mg->overwrite_bio = bio; + mg->invalidate_cblock = cblock; + mg->invalidate_oblock = oblock; - discard(cache, structs, new_ocell); + return invalidate_lock(mg); } -static bool spare_migration_bandwidth(struct cache *cache) +/*---------------------------------------------------------------- + * bio processing + *--------------------------------------------------------------*/ + +enum busy { + IDLE, + MODERATE, + BUSY +}; + +static enum busy spare_migration_bandwidth(struct cache *cache) { + bool idle = iot_idle_for(&cache->origin_tracker, HZ); sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; - return current_volume < cache->migration_threshold; + + if (current_volume <= cache->migration_threshold) + return idle ? IDLE : MODERATE; + else + return idle ? MODERATE : BUSY; } static void inc_hit_counter(struct cache *cache, struct bio *bio) @@ -1660,255 +1746,143 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) /*----------------------------------------------------------------*/ -struct inc_detail { - struct cache *cache; - struct bio_list bios_for_issue; - struct bio_list unhandled_bios; - bool any_writes; -}; - -static void inc_fn(void *context, struct dm_bio_prison_cell *cell) +static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) { - struct bio *bio; - struct inc_detail *detail = context; - struct cache *cache = detail->cache; - - inc_ds(cache, cell->holder, cell); - if (bio_data_dir(cell->holder) == WRITE) - detail->any_writes = true; - - while ((bio = bio_list_pop(&cell->bios))) { - if (discard_or_flush(bio)) { - bio_list_add(&detail->unhandled_bios, bio); - continue; - } - - if (bio_data_dir(bio) == WRITE) - detail->any_writes = true; - - bio_list_add(&detail->bios_for_issue, bio); - inc_ds(cache, bio, cell); - } + return (bio_data_dir(bio) == WRITE) && + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); } -// FIXME: refactor these two -static void remap_cell_to_origin_clear_discard(struct cache *cache, - struct dm_bio_prison_cell *cell, - dm_oblock_t oblock, bool issue_holder) +static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) { - struct bio *bio; - unsigned long flags; - struct inc_detail detail; - - detail.cache = cache; - bio_list_init(&detail.bios_for_issue); - bio_list_init(&detail.unhandled_bios); - detail.any_writes = false; - - spin_lock_irqsave(&cache->lock, flags); - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - remap_to_origin(cache, cell->holder); - if (issue_holder) - issue(cache, cell->holder); - else - accounted_begin(cache, cell->holder); - - if (detail.any_writes) - clear_discard(cache, oblock_to_dblock(cache, oblock)); - - while ((bio = bio_list_pop(&detail.bios_for_issue))) { - remap_to_origin(cache, bio); - issue(cache, bio); - } - - free_prison_cell(cache, cell); + return writeback_mode(&cache->features) && + (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); } -static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, - dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) +static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, + bool *commit_needed) { - struct bio *bio; - unsigned long flags; - struct inc_detail detail; - - detail.cache = cache; - bio_list_init(&detail.bios_for_issue); - bio_list_init(&detail.unhandled_bios); - detail.any_writes = false; - - spin_lock_irqsave(&cache->lock, flags); - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - remap_to_cache(cache, cell->holder, cblock); - if (issue_holder) - issue(cache, cell->holder); - else - accounted_begin(cache, cell->holder); + int r, data_dir; + bool rb, background_queued; + dm_cblock_t cblock; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - if (detail.any_writes) { - set_dirty(cache, oblock, cblock); - clear_discard(cache, oblock_to_dblock(cache, oblock)); - } + *commit_needed = false; - while ((bio = bio_list_pop(&detail.bios_for_issue))) { - remap_to_cache(cache, bio, cblock); - issue(cache, bio); + rb = bio_detain_shared(cache, block, bio); + if (!rb) { + /* + * An exclusive lock is held for this block, so we have to + * wait. We set the commit_needed flag so the current + * transaction will be committed asap, allowing this lock + * to be dropped. + */ + *commit_needed = true; + return DM_MAPIO_SUBMITTED; } - free_prison_cell(cache, cell); -} - -/*----------------------------------------------------------------*/ + data_dir = bio_data_dir(bio); -struct old_oblock_lock { - struct policy_locker locker; - struct cache *cache; - struct prealloc *structs; - struct dm_bio_prison_cell *cell; -}; + if (optimisable_bio(cache, bio, block)) { + struct policy_work *op = NULL; -static int null_locker(struct policy_locker *locker, dm_oblock_t b) -{ - /* This should never be called */ - BUG(); - return 0; -} + r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); + if (unlikely(r && r != -ENOENT)) { + DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", + cache_device_name(cache), r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } -static int cell_locker(struct policy_locker *locker, dm_oblock_t b) -{ - struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); - struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); + if (r == -ENOENT && op) { + bio_drop_shared_lock(cache, bio); + BUG_ON(op->op != POLICY_PROMOTE); + mg_start(cache, op, bio); + return DM_MAPIO_SUBMITTED; + } + } else { + r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); + if (unlikely(r && r != -ENOENT)) { + DMERR_LIMIT("%s: policy_lookup() failed with r = %d", + cache_device_name(cache), r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } - return bio_detain(l->cache, b, NULL, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - l->structs, &l->cell); -} + if (background_queued) + wake_migration_worker(cache); + } -static void process_cell(struct cache *cache, struct prealloc *structs, - struct dm_bio_prison_cell *new_ocell) -{ - int r; - bool release_cell = true; - struct bio *bio = new_ocell->holder; - dm_oblock_t block = get_bio_block(cache, bio); - struct policy_result lookup_result; - bool passthrough = passthrough_mode(&cache->features); - bool fast_promotion, can_migrate; - struct old_oblock_lock ool; - - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); - can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache)); - - ool.locker.fn = cell_locker; - ool.cache = cache; - ool.structs = structs; - ool.cell = NULL; - r = policy_map(cache->policy, block, true, can_migrate, fast_promotion, - bio, &ool.locker, &lookup_result); - - if (r == -EWOULDBLOCK) - /* migration has been denied */ - lookup_result.op = POLICY_MISS; - - switch (lookup_result.op) { - case POLICY_HIT: - if (passthrough) { - inc_miss_counter(cache, bio); + if (r == -ENOENT) { + /* + * Miss. + */ + inc_miss_counter(cache, bio); + if (pb->req_nr == 0) { + accounted_begin(cache, bio); + remap_to_origin_clear_discard(cache, bio, block); + } else { /* - * Passthrough always maps to the origin, - * invalidating any cache blocks that are written - * to. + * This is a duplicate writethrough io that is no + * longer needed because the block has been demoted. */ + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + } else { + /* + * Hit. + */ + inc_hit_counter(cache, bio); + /* + * Passthrough always maps to the origin, invalidating any + * cache blocks that are written to. + */ + if (passthrough_mode(&cache->features)) { if (bio_data_dir(bio) == WRITE) { + bio_drop_shared_lock(cache, bio); atomic_inc(&cache->stats.demotion); - invalidate(cache, structs, block, lookup_result.cblock, new_ocell); - release_cell = false; - - } else { - /* FIXME: factor out issue_origin() */ + invalidate_start(cache, cblock, block, bio); + } else remap_to_origin_clear_discard(cache, bio, block); - inc_and_issue(cache, bio, new_ocell); - } + } else { - inc_hit_counter(cache, bio); - - if (bio_data_dir(bio) == WRITE && - writethrough_mode(&cache->features) && - !is_dirty(cache, lookup_result.cblock)) { - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - inc_and_issue(cache, bio, new_ocell); - - } else { - remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); - release_cell = false; - } + if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && + !is_dirty(cache, cblock)) { + remap_to_origin_then_cache(cache, bio, block, cblock); + accounted_begin(cache, bio); + } else + remap_to_cache_dirty(cache, bio, block, cblock); } - - break; - - case POLICY_MISS: - inc_miss_counter(cache, bio); - remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); - release_cell = false; - break; - - case POLICY_NEW: - atomic_inc(&cache->stats.promotion); - promote(cache, structs, block, lookup_result.cblock, new_ocell); - release_cell = false; - break; - - case POLICY_REPLACE: - atomic_inc(&cache->stats.demotion); - atomic_inc(&cache->stats.promotion); - demote_then_promote(cache, structs, lookup_result.old_oblock, - block, lookup_result.cblock, - ool.cell, new_ocell); - release_cell = false; - break; - - default: - DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", - cache_device_name(cache), __func__, - (unsigned) lookup_result.op); - bio_io_error(bio); } - if (release_cell) - cell_defer(cache, new_ocell, false); -} - -static void process_bio(struct cache *cache, struct prealloc *structs, - struct bio *bio) -{ - int r; - dm_oblock_t block = get_bio_block(cache, bio); - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; - /* - * Check to see if that block is currently migrating. + * dm core turns FUA requests into a separate payload and FLUSH req. */ - cell_prealloc = prealloc_get_cell(structs); - r = bio_detain(cache, block, bio, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - structs, &new_ocell); - if (r > 0) - return; + if (bio->bi_opf & REQ_FUA) { + /* + * issue_after_commit will call accounted_begin a second time. So + * we call accounted_complete() to avoid double accounting. + */ + accounted_complete(cache, bio); + issue_after_commit(&cache->committer, bio); + *commit_needed = true; + return DM_MAPIO_SUBMITTED; + } - process_cell(cache, structs, new_ocell); + return DM_MAPIO_REMAPPED; } -static int need_commit_due_to_time(struct cache *cache) +static bool process_bio(struct cache *cache, struct bio *bio) { - return jiffies < cache->last_commit_jiffies || - jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; + bool commit_needed; + + if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) + generic_make_request(bio); + + return commit_needed; } /* @@ -1929,123 +1903,88 @@ static int commit(struct cache *cache, bool clean_shutdown) return r; } -static int commit_if_needed(struct cache *cache) -{ - int r = 0; - - if ((cache->commit_requested || need_commit_due_to_time(cache)) && - dm_cache_changed_this_transaction(cache->cmd)) { - r = commit(cache, false); - cache->commit_requested = false; - cache->last_commit_jiffies = jiffies; - } - - return r; -} - -static void process_deferred_bios(struct cache *cache) +/* + * Used by the batcher. + */ +static int commit_op(void *context) { - bool prealloc_used = false; - unsigned long flags; - struct bio_list bios; - struct bio *bio; - struct prealloc structs; + struct cache *cache = context; - memset(&structs, 0, sizeof(structs)); - bio_list_init(&bios); + if (dm_cache_changed_this_transaction(cache->cmd)) + return commit(cache, false); - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&bios, &cache->deferred_bios); - bio_list_init(&cache->deferred_bios); - spin_unlock_irqrestore(&cache->lock, flags); + return 0; +} - while (!bio_list_empty(&bios)) { - /* - * If we've got no free migration structs, and processing - * this bio might require one, we pause until there are some - * prepared mappings to process. - */ - prealloc_used = true; - if (prealloc_data_structs(cache, &structs)) { - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&cache->deferred_bios, &bios); - spin_unlock_irqrestore(&cache->lock, flags); - break; - } +/*----------------------------------------------------------------*/ - bio = bio_list_pop(&bios); +static bool process_flush_bio(struct cache *cache, struct bio *bio) +{ + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - if (bio->bi_opf & REQ_PREFLUSH) - process_flush_bio(cache, bio); - else if (bio_op(bio) == REQ_OP_DISCARD) - process_discard_bio(cache, &structs, bio); - else - process_bio(cache, &structs, bio); - } + if (!pb->req_nr) + remap_to_origin(cache, bio); + else + remap_to_cache(cache, bio, 0); - if (prealloc_used) - prealloc_free_structs(cache, &structs); + issue_after_commit(&cache->committer, bio); + return true; } -static void process_deferred_cells(struct cache *cache) +static bool process_discard_bio(struct cache *cache, struct bio *bio) { - bool prealloc_used = false; - unsigned long flags; - struct dm_bio_prison_cell *cell, *tmp; - struct list_head cells; - struct prealloc structs; - - memset(&structs, 0, sizeof(structs)); - - INIT_LIST_HEAD(&cells); - - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(&cache->deferred_cells, &cells); - spin_unlock_irqrestore(&cache->lock, flags); - - list_for_each_entry_safe(cell, tmp, &cells, user_list) { - /* - * If we've got no free migration structs, and processing - * this bio might require one, we pause until there are some - * prepared mappings to process. - */ - prealloc_used = true; - if (prealloc_data_structs(cache, &structs)) { - spin_lock_irqsave(&cache->lock, flags); - list_splice(&cells, &cache->deferred_cells); - spin_unlock_irqrestore(&cache->lock, flags); - break; - } + dm_dblock_t b, e; - process_cell(cache, &structs, cell); + // FIXME: do we need to lock the region? Or can we just assume the + // user wont be so foolish as to issue discard concurrently with + // other IO? + calc_discard_block_range(cache, bio, &b, &e); + while (b != e) { + set_discard(cache, b); + b = to_dblock(from_dblock(b) + 1); } - if (prealloc_used) - prealloc_free_structs(cache, &structs); + bio_endio(bio); + + return false; } -static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) +static void process_deferred_bios(struct work_struct *ws) { + struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); + unsigned long flags; + bool commit_needed = false; struct bio_list bios; struct bio *bio; bio_list_init(&bios); spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&bios, &cache->deferred_flush_bios); - bio_list_init(&cache->deferred_flush_bios); + bio_list_merge(&bios, &cache->deferred_bios); + bio_list_init(&cache->deferred_bios); spin_unlock_irqrestore(&cache->lock, flags); - /* - * These bios have already been through inc_ds() - */ - while ((bio = bio_list_pop(&bios))) - submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); + while ((bio = bio_list_pop(&bios))) { + if (bio->bi_opf & REQ_PREFLUSH) + commit_needed = process_flush_bio(cache, bio) || commit_needed; + + else if (bio_op(bio) == REQ_OP_DISCARD) + commit_needed = process_discard_bio(cache, bio) || commit_needed; + + else + commit_needed = process_bio(cache, bio) || commit_needed; + } + + if (commit_needed) + schedule_commit(&cache->committer); } -static void process_deferred_writethrough_bios(struct cache *cache) +static void process_deferred_writethrough_bios(struct work_struct *ws) { + struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); + unsigned long flags; struct bio_list bios; struct bio *bio; @@ -2058,153 +1997,15 @@ static void process_deferred_writethrough_bios(struct cache *cache) spin_unlock_irqrestore(&cache->lock, flags); /* - * These bios have already been through inc_ds() + * These bios have already been through accounted_begin() */ while ((bio = bio_list_pop(&bios))) - accounted_request(cache, bio); -} - -static void writeback_some_dirty_blocks(struct cache *cache) -{ - bool prealloc_used = false; - dm_oblock_t oblock; - dm_cblock_t cblock; - struct prealloc structs; - struct dm_bio_prison_cell *old_ocell; - bool busy = !iot_idle_for(&cache->origin_tracker, HZ); - - memset(&structs, 0, sizeof(structs)); - - while (spare_migration_bandwidth(cache)) { - if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) - break; /* no work to do */ - - prealloc_used = true; - if (prealloc_data_structs(cache, &structs) || - get_cell(cache, oblock, &structs, &old_ocell)) { - policy_set_dirty(cache->policy, oblock); - break; - } - - writeback(cache, &structs, oblock, cblock, old_ocell); - } - - if (prealloc_used) - prealloc_free_structs(cache, &structs); -} - -/*---------------------------------------------------------------- - * Invalidations. - * Dropping something from the cache *without* writing back. - *--------------------------------------------------------------*/ - -static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) -{ - int r = 0; - uint64_t begin = from_cblock(req->cblocks->begin); - uint64_t end = from_cblock(req->cblocks->end); - - while (begin != end) { - r = policy_remove_cblock(cache->policy, to_cblock(begin)); - if (!r) { - r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); - if (r) { - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); - break; - } - - } else if (r == -ENODATA) { - /* harmless, already unmapped */ - r = 0; - - } else { - DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); - break; - } - - begin++; - } - - cache->commit_requested = true; - - req->err = r; - atomic_set(&req->complete, 1); - - wake_up(&req->result_wait); -} - -static void process_invalidation_requests(struct cache *cache) -{ - struct list_head list; - struct invalidation_request *req, *tmp; - - INIT_LIST_HEAD(&list); - spin_lock(&cache->invalidation_lock); - list_splice_init(&cache->invalidation_requests, &list); - spin_unlock(&cache->invalidation_lock); - - list_for_each_entry_safe (req, tmp, &list, list) - process_invalidation_request(cache, req); + generic_make_request(bio); } /*---------------------------------------------------------------- * Main worker loop *--------------------------------------------------------------*/ -static bool is_quiescing(struct cache *cache) -{ - return atomic_read(&cache->quiescing); -} - -static void ack_quiescing(struct cache *cache) -{ - if (is_quiescing(cache)) { - atomic_inc(&cache->quiescing_ack); - wake_up(&cache->quiescing_wait); - } -} - -static void wait_for_quiescing_ack(struct cache *cache) -{ - wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); -} - -static void start_quiescing(struct cache *cache) -{ - atomic_inc(&cache->quiescing); - wait_for_quiescing_ack(cache); -} - -static void stop_quiescing(struct cache *cache) -{ - atomic_set(&cache->quiescing, 0); - atomic_set(&cache->quiescing_ack, 0); -} - -static void wait_for_migrations(struct cache *cache) -{ - wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); -} - -static void stop_worker(struct cache *cache) -{ - cancel_delayed_work(&cache->waker); - flush_workqueue(cache->wq); -} - -static void requeue_deferred_cells(struct cache *cache) -{ - unsigned long flags; - struct list_head cells; - struct dm_bio_prison_cell *cell, *tmp; - - INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(&cache->deferred_cells, &cells); - spin_unlock_irqrestore(&cache->lock, flags); - - list_for_each_entry_safe(cell, tmp, &cells, user_list) - cell_requeue(cache, cell); -} static void requeue_deferred_bios(struct cache *cache) { @@ -2221,53 +2022,6 @@ static void requeue_deferred_bios(struct cache *cache) } } -static int more_work(struct cache *cache) -{ - if (is_quiescing(cache)) - return !list_empty(&cache->quiesced_migrations) || - !list_empty(&cache->completed_migrations) || - !list_empty(&cache->need_commit_migrations); - else - return !bio_list_empty(&cache->deferred_bios) || - !list_empty(&cache->deferred_cells) || - !bio_list_empty(&cache->deferred_flush_bios) || - !bio_list_empty(&cache->deferred_writethrough_bios) || - !list_empty(&cache->quiesced_migrations) || - !list_empty(&cache->completed_migrations) || - !list_empty(&cache->need_commit_migrations) || - cache->invalidate; -} - -static void do_worker(struct work_struct *ws) -{ - struct cache *cache = container_of(ws, struct cache, worker); - - do { - if (!is_quiescing(cache)) { - writeback_some_dirty_blocks(cache); - process_deferred_writethrough_bios(cache); - process_deferred_bios(cache); - process_deferred_cells(cache); - process_invalidation_requests(cache); - } - - process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); - process_migrations(cache, &cache->completed_migrations, complete_migration); - - if (commit_if_needed(cache)) { - process_deferred_flush_bios(cache, false); - process_migrations(cache, &cache->need_commit_migrations, migration_failure); - } else { - process_deferred_flush_bios(cache, true); - process_migrations(cache, &cache->need_commit_migrations, - migration_success_post_commit); - } - - ack_quiescing(cache); - - } while (more_work(cache)); -} - /* * We want to commit periodically so that not too much * unwritten metadata builds up. @@ -2275,25 +2029,39 @@ static void do_worker(struct work_struct *ws) static void do_waker(struct work_struct *ws) { struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); + policy_tick(cache->policy, true); - wake_worker(cache); + wake_migration_worker(cache); + schedule_commit(&cache->committer); queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); } -/*----------------------------------------------------------------*/ - -static int is_congested(struct dm_dev *dev, int bdi_bits) +static void check_migrations(struct work_struct *ws) { - struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(q->backing_dev_info, bdi_bits); -} + int r; + struct policy_work *op; + struct cache *cache = container_of(ws, struct cache, migration_worker); + enum busy b; -static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) -{ - struct cache *cache = container_of(cb, struct cache, callbacks); + for (;;) { + b = spare_migration_bandwidth(cache); + if (b == BUSY) + break; - return is_congested(cache->origin_dev, bdi_bits) || - is_congested(cache->cache_dev, bdi_bits); + r = policy_get_background_work(cache->policy, b == IDLE, &op); + if (r == -ENODATA) + break; + + if (r) { + DMERR_LIMIT("%s: policy_background_work failed", + cache_device_name(cache)); + break; + } + + r = mg_start(cache, op, NULL); + if (r) + break; + } } /*---------------------------------------------------------------- @@ -2310,11 +2078,8 @@ static void destroy(struct cache *cache) mempool_destroy(cache->migration_pool); - if (cache->all_io_ds) - dm_deferred_set_destroy(cache->all_io_ds); - if (cache->prison) - dm_bio_prison_destroy(cache->prison); + dm_bio_prison_destroy_v2(cache->prison); if (cache->wq) destroy_workqueue(cache->wq); @@ -2707,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, return PTR_ERR(p); } cache->policy = p; + BUG_ON(!cache->policy); return 0; } @@ -2750,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size) cache->cache_size = size; } +static int is_congested(struct dm_dev *dev, int bdi_bits) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + return bdi_congested(q->backing_dev_info, bdi_bits); +} + +static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) +{ + struct cache *cache = container_of(cb, struct cache, callbacks); + + return is_congested(cache->origin_dev, bdi_bits) || + is_congested(cache->cache_dev, bdi_bits); +} + #define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) @@ -2788,7 +2568,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; - /* FIXME: factor out this whole section */ origin_blocks = cache->origin_sectors = ca->origin_sectors; origin_blocks = block_div(origin_blocks, ca->block_size); cache->origin_blocks = to_oblock(origin_blocks); @@ -2854,24 +2633,18 @@ static int cache_create(struct cache_args *ca, struct cache **result) r = -EINVAL; goto bad; } + + policy_allow_migrations(cache->policy, false); } spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->deferred_cells); bio_list_init(&cache->deferred_bios); - bio_list_init(&cache->deferred_flush_bios); bio_list_init(&cache->deferred_writethrough_bios); - INIT_LIST_HEAD(&cache->quiesced_migrations); - INIT_LIST_HEAD(&cache->completed_migrations); - INIT_LIST_HEAD(&cache->need_commit_migrations); atomic_set(&cache->nr_allocated_migrations, 0); atomic_set(&cache->nr_io_migrations, 0); init_waitqueue_head(&cache->migration_wait); - init_waitqueue_head(&cache->quiescing_wait); - atomic_set(&cache->quiescing, 0); - atomic_set(&cache->quiescing_ack, 0); - r = -ENOMEM; atomic_set(&cache->nr_dirty, 0); cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); @@ -2900,27 +2673,23 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); if (!cache->wq) { *error = "could not create workqueue for metadata object"; goto bad; } - INIT_WORK(&cache->worker, do_worker); + INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); + INIT_WORK(&cache->deferred_writethrough_worker, + process_deferred_writethrough_bios); + INIT_WORK(&cache->migration_worker, check_migrations); INIT_DELAYED_WORK(&cache->waker, do_waker); - cache->last_commit_jiffies = jiffies; - cache->prison = dm_bio_prison_create(); + cache->prison = dm_bio_prison_create_v2(cache->wq); if (!cache->prison) { *error = "could not create bio prison"; goto bad; } - cache->all_io_ds = dm_deferred_set_create(); - if (!cache->all_io_ds) { - *error = "could not create all_io deferred set"; - goto bad; - } - cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, migration_cache); if (!cache->migration_pool) { @@ -2947,11 +2716,15 @@ static int cache_create(struct cache_args *ca, struct cache **result) spin_lock_init(&cache->invalidation_lock); INIT_LIST_HEAD(&cache->invalidation_requests); + batcher_init(&cache->committer, commit_op, cache, + issue_op, cache, cache->wq); iot_init(&cache->origin_tracker); + init_rwsem(&cache->background_work_lock); + prevent_background_work(cache); + *result = cache; return 0; - bad: destroy(cache); return r; @@ -3009,7 +2782,6 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) } ti->private = cache; - out: destroy_cache_args(ca); return r; @@ -3022,17 +2794,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio) struct cache *cache = ti->private; int r; - struct dm_bio_prison_cell *cell = NULL; + bool commit_needed; dm_oblock_t block = get_bio_block(cache, bio); size_t pb_data_size = get_per_bio_data_size(cache); - bool can_migrate = false; - bool fast_promotion; - struct policy_result lookup_result; - struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); - struct old_oblock_lock ool; - - ool.locker.fn = null_locker; + init_per_bio_data(bio, pb_data_size); if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { /* * This can only occur if the io goes to a partial block at @@ -3049,101 +2815,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - /* - * Check to see if that block is currently migrating. - */ - cell = alloc_prison_cell(cache); - if (!cell) { - defer_bio(cache, bio); - return DM_MAPIO_SUBMITTED; - } - - r = bio_detain(cache, block, bio, cell, - (cell_free_fn) free_prison_cell, - cache, &cell); - if (r) { - if (r < 0) - defer_bio(cache, bio); - - return DM_MAPIO_SUBMITTED; - } - - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); - - r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, - bio, &ool.locker, &lookup_result); - if (r == -EWOULDBLOCK) { - cell_defer(cache, cell, true); - return DM_MAPIO_SUBMITTED; - - } else if (r) { - DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", - cache_device_name(cache), r); - cell_defer(cache, cell, false); - bio_io_error(bio); - return DM_MAPIO_SUBMITTED; - } - - r = DM_MAPIO_REMAPPED; - switch (lookup_result.op) { - case POLICY_HIT: - if (passthrough_mode(&cache->features)) { - if (bio_data_dir(bio) == WRITE) { - /* - * We need to invalidate this block, so - * defer for the worker thread. - */ - cell_defer(cache, cell, true); - r = DM_MAPIO_SUBMITTED; - - } else { - inc_miss_counter(cache, bio); - remap_to_origin_clear_discard(cache, bio, block); - accounted_begin(cache, bio); - inc_ds(cache, bio, cell); - // FIXME: we want to remap hits or misses straight - // away rather than passing over to the worker. - cell_defer(cache, cell, false); - } - - } else { - inc_hit_counter(cache, bio); - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && - !is_dirty(cache, lookup_result.cblock)) { - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - accounted_begin(cache, bio); - inc_ds(cache, bio, cell); - cell_defer(cache, cell, false); - - } else - remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); - } - break; - - case POLICY_MISS: - inc_miss_counter(cache, bio); - if (pb->req_nr != 0) { - /* - * This is a duplicate writethrough io that is no - * longer needed because the block has been demoted. - */ - bio_endio(bio); - // FIXME: remap everything as a miss - cell_defer(cache, cell, false); - r = DM_MAPIO_SUBMITTED; - - } else - remap_cell_to_origin_clear_discard(cache, cell, block, false); - break; - - default: - DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", - cache_device_name(cache), __func__, - (unsigned) lookup_result.op); - cell_defer(cache, cell, false); - bio_io_error(bio); - r = DM_MAPIO_SUBMITTED; - } + r = map_bio(cache, bio, block, &commit_needed); + if (commit_needed) + schedule_commit(&cache->committer); return r; } @@ -3163,7 +2837,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) spin_unlock_irqrestore(&cache->lock, flags); } - check_for_quiesced_migrations(cache, pb); + bio_drop_shared_lock(cache, bio); accounted_complete(cache, bio); return 0; @@ -3263,12 +2937,18 @@ static void cache_postsuspend(struct dm_target *ti) { struct cache *cache = ti->private; - start_quiescing(cache); - wait_for_migrations(cache); - stop_worker(cache); + prevent_background_work(cache); + BUG_ON(atomic_read(&cache->nr_io_migrations)); + + cancel_delayed_work(&cache->waker); + flush_workqueue(cache->wq); + WARN_ON(cache->origin_tracker.in_flight); + + /* + * If it's a flush suspend there won't be any deferred bios, so this + * call is harmless. + */ requeue_deferred_bios(cache); - requeue_deferred_cells(cache); - stop_quiescing(cache); if (get_cache_mode(cache) == CM_WRITE) (void) sync_metadata(cache); @@ -3280,15 +2960,10 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, int r; struct cache *cache = context; - r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); + r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); if (r) return r; - if (dirty) - set_dirty(cache, oblock, cblock); - else - clear_dirty(cache, oblock, cblock); - return 0; } @@ -3487,6 +3162,7 @@ static void cache_resume(struct dm_target *ti) struct cache *cache = ti->private; cache->need_tick_bio = true; + allow_background_work(cache); do_waker(&cache->waker.work); } @@ -3620,11 +3296,20 @@ err: DMEMIT("Error"); } +/* + * Defines a range of cblocks, begin to (end - 1) are in the range. end is + * the one-past-the-end value. + */ +struct cblock_range { + dm_cblock_t begin; + dm_cblock_t end; +}; + /* * A cache block range can take two forms: * * i) A single cblock, eg. '3456' - * ii) A begin and end cblock with dots between, eg. 123-234 + * ii) A begin and end cblock with a dash between, eg. 123-234 */ static int parse_cblock_range(struct cache *cache, const char *str, struct cblock_range *result) @@ -3690,23 +3375,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range return 0; } +static inline dm_cblock_t cblock_succ(dm_cblock_t b) +{ + return to_cblock(from_cblock(b) + 1); +} + static int request_invalidation(struct cache *cache, struct cblock_range *range) { - struct invalidation_request req; + int r = 0; - INIT_LIST_HEAD(&req.list); - req.cblocks = range; - atomic_set(&req.complete, 0); - req.err = 0; - init_waitqueue_head(&req.result_wait); + /* + * We don't need to do any locking here because we know we're in + * passthrough mode. There's is potential for a race between an + * invalidation triggered by an io and an invalidation message. This + * is harmless, we must not worry if the policy call fails. + */ + while (range->begin != range->end) { + r = invalidate_cblock(cache, range->begin); + if (r) + return r; - spin_lock(&cache->invalidation_lock); - list_add(&req.list, &cache->invalidation_requests); - spin_unlock(&cache->invalidation_lock); - wake_worker(cache); + range->begin = cblock_succ(range->begin); + } - wait_event(req.result_wait, atomic_read(&req.complete)); - return req.err; + cache->commit_requested = true; + return r; } static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, @@ -3816,7 +3509,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 10, 0}, + .version = {2, 0, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, -- cgit v1.2.3 From 7eada909bfd7ac90a4522e56aa3179d1fd68cd14 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 4 Jan 2017 20:23:53 +0100 Subject: dm: add integrity target The dm-integrity target emulates a block device that has additional per-sector tags that can be used for storing integrity information. A general problem with storing integrity tags with every sector is that writing the sector and the integrity tag must be atomic - i.e. in case of crash, either both sector and integrity tag or none of them is written. To guarantee write atomicity the dm-integrity target uses a journal. It writes sector data and integrity tags into a journal, commits the journal and then copies the data and integrity tags to their respective location. The dm-integrity target can be used with the dm-crypt target - in this situation the dm-crypt target creates the integrity data and passes them to the dm-integrity target via bio_integrity_payload attached to the bio. In this mode, the dm-crypt and dm-integrity targets provide authenticated disk encryption - if the attacker modifies the encrypted device, an I/O error is returned instead of random data. The dm-integrity target can also be used as a standalone target, in this mode it calculates and verifies the integrity tag internally. In this mode, the dm-integrity target can be used to detect silent data corruption on the disk or in the I/O path. Signed-off-by: Mikulas Patocka Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-integrity.txt | 189 ++ drivers/md/Kconfig | 10 + drivers/md/Makefile | 1 + drivers/md/dm-integrity.c | 3085 ++++++++++++++++++++++++++ 4 files changed, 3285 insertions(+) create mode 100644 Documentation/device-mapper/dm-integrity.txt create mode 100644 drivers/md/dm-integrity.c (limited to 'drivers/md/Makefile') diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt new file mode 100644 index 000000000000..2406f56501dc --- /dev/null +++ b/Documentation/device-mapper/dm-integrity.txt @@ -0,0 +1,189 @@ +The dm-integrity target emulates a block device that has additional +per-sector tags that can be used for storing integrity information. + +A general problem with storing integrity tags with every sector is that +writing the sector and the integrity tag must be atomic - i.e. in case of +crash, either both sector and integrity tag or none of them is written. + +To guarantee write atomicity, the dm-integrity target uses journal, it +writes sector data and integrity tags into a journal, commits the journal +and then copies the data and integrity tags to their respective location. + +The dm-integrity target can be used with the dm-crypt target - in this +situation the dm-crypt target creates the integrity data and passes them +to the dm-integrity target via bio_integrity_payload attached to the bio. +In this mode, the dm-crypt and dm-integrity targets provide authenticated +disk encryption - if the attacker modifies the encrypted device, an I/O +error is returned instead of random data. + +The dm-integrity target can also be used as a standalone target, in this +mode it calculates and verifies the integrity tag internally. In this +mode, the dm-integrity target can be used to detect silent data +corruption on the disk or in the I/O path. + + +When loading the target for the first time, the kernel driver will format +the device. But it will only format the device if the superblock contains +zeroes. If the superblock is neither valid nor zeroed, the dm-integrity +target can't be loaded. + +To use the target for the first time: +1. overwrite the superblock with zeroes +2. load the dm-integrity target with one-sector size, the kernel driver + will format the device +3. unload the dm-integrity target +4. read the "provided_data_sectors" value from the superblock +5. load the dm-integrity target with the the target size + "provided_data_sectors" +6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target + with the size "provided_data_sectors" + + +Target arguments: + +1. the underlying block device + +2. the number of reserved sector at the beginning of the device - the + dm-integrity won't read of write these sectors + +3. the size of the integrity tag (if "-" is used, the size is taken from + the internal-hash algorithm) + +4. mode: + D - direct writes (without journal) - in this mode, journaling is + not used and data sectors and integrity tags are written + separately. In case of crash, it is possible that the data + and integrity tag doesn't match. + J - journaled writes - data and integrity tags are written to the + journal and atomicity is guaranteed. In case of crash, + either both data and tag or none of them are written. The + journaled mode degrades write throughput twice because the + data have to be written twice. + +5. the number of additional arguments + +Additional arguments: + +journal-sectors:number + The size of journal, this argument is used only if formatting the + device. If the device is already formatted, the value from the + superblock is used. + +interleave-sectors:number + The number of interleaved sectors. This values is rounded down to + a power of two. If the device is already formatted, the value from + the superblock is used. + +buffer-sectors:number + The number of sectors in one buffer. The value is rounded down to + a power of two. + + The tag area is accessed using buffers, the buffer size is + configurable. The large buffer size means that the I/O size will + be larger, but there could be less I/Os issued. + +journal-watermark:number + The journal watermark in percents. When the size of the journal + exceeds this watermark, the thread that flushes the journal will + be started. + +commit-time:number + Commit time in milliseconds. When this time passes, the journal is + written. The journal is also written immediatelly if the FLUSH + request is received. + +internal-hash:algorithm(:key) (the key is optional) + Use internal hash or crc. + When this argument is used, the dm-integrity target won't accept + integrity tags from the upper target, but it will automatically + generate and verify the integrity tags. + + You can use a crc algorithm (such as crc32), then integrity target + will protect the data against accidental corruption. + You can also use a hmac algorithm (for example + "hmac(sha256):0123456789abcdef"), in this mode it will provide + cryptographic authentication of the data without encryption. + + When this argument is not used, the integrity tags are accepted + from an upper layer target, such as dm-crypt. The upper layer + target should check the validity of the integrity tags. + +journal-crypt:algorithm(:key) (the key is optional) + Encrypt the journal using given algorithm to make sure that the + attacker can't read the journal. You can use a block cipher here + (such as "cbc(aes)") or a stream cipher (for example "chacha20", + "salsa20", "ctr(aes)" or "ecb(arc4)"). + + The journal contains history of last writes to the block device, + an attacker reading the journal could see the last sector nubmers + that were written. From the sector numbers, the attacker can infer + the size of files that were written. To protect against this + situation, you can encrypt the journal. + +journal-mac:algorithm(:key) (the key is optional) + Protect sector numbers in the journal from accidental or malicious + modification. To protect against accidental modification, use a + crc algorithm, to protect against malicious modification, use a + hmac algorithm with a key. + + This option is not needed when using internal-hash because in this + mode, the integrity of journal entries is checked when replaying + the journal. Thus, modified sector number would be detected at + this stage. + + +The journal mode (D/J), buffer-sectors, journal-watermark, commit-time can +be changed when reloading the target (load an inactive table and swap the +tables with suspend and resume). The other arguments should not be changed +when reloading the target because the layout of disk data depend on them +and the reloaded target would be non-functional. + + +The layout of the formatted block device: +* reserved sectors (they are not used by this target, they can be used for + storing LUKS metadata or for other purpose), the size of the reserved + area is specified in the target arguments +* superblock (4kiB) + * magic string - identifies that the device was formatted + * version + * log2(interleave sectors) + * integrity tag size + * the number of journal sections + * provided data sectors - the number of sectors that this target + provides (i.e. the size of the device minus the size of all + metadata and padding). The user of this target should not send + bios that access data beyond the "provided data sectors" limit. + * flags - a flag is set if journal-mac is used +* journal + The journal is divided into sections, each section contains: + * metadata area (4kiB), it contains journal entries + every journal entry contains: + * logical sector (specifies where the data and tag should + be written) + * last 8 bytes of data + * integrity tag (the size is specified in the superblock) + every metadata sector ends with + * mac (8-bytes), all the macs in 8 metadata sectors form a + 64-byte value. It is used to store hmac of sector + numbers in the journal section, to protect against a + possibility that the attacker tampers with sector + numbers in the journal. + * commit id + * data area (the size is variable; it depends on how many journal + entries fit into the metadata area) + every sector in the data area contains: + * data (504 bytes of data, the last 8 bytes are stored in + the journal entry) + * commit id + To test if the whole journal section was written correctly, every + 512-byte sector of the journal ends with 8-byte commit id. If the + commit id matches on all sectors in a journal section, then it is + assumed that the section was written correctly. If the commit id + doesn't match, the section was written partially and it should not + be replayed. +* one or more runs of interleaved tags and data. Each run contains: + * tag area - it contains integrity tags. There is one tag for each + sector in the data area + * data area - it contains data sectors. The number of data sectors + in one run must be a power of two. log2 of this value is stored + in the superblock. diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 982cd0626bc7..5c5ed97c9fda 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -500,4 +500,14 @@ config DM_LOG_WRITES If unsure, say N. +config DM_INTEGRITY + tristate "Integrity target" + depends on BLK_DEV_DM + select BLK_DEV_INTEGRITY + select DM_BUFIO + select CRYPTO + select ASYNC_XOR + ---help--- + This is the integrity target. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 2801b2fb452d..39cf2a1b5f90 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o +obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c new file mode 100644 index 000000000000..ea779cca8b45 --- /dev/null +++ b/drivers/md/dm-integrity.c @@ -0,0 +1,3085 @@ +/* + * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved. + * Copyright (C) 2016-2017 Milan Broz + * Copyright (C) 2016-2017 Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dm-bufio.h" + +#define DM_MSG_PREFIX "integrity" + +#define DEFAULT_INTERLEAVE_SECTORS 32768 +#define DEFAULT_JOURNAL_SIZE_FACTOR 7 +#define DEFAULT_BUFFER_SECTORS 128 +#define DEFAULT_JOURNAL_WATERMARK 50 +#define DEFAULT_SYNC_MSEC 10000 +#define DEFAULT_MAX_JOURNAL_SECTORS 131072 +#define MIN_INTERLEAVE_SECTORS 3 +#define MAX_INTERLEAVE_SECTORS 31 +#define METADATA_WORKQUEUE_MAX_ACTIVE 16 + +/* + * Warning - DEBUG_PRINT prints security-sensitive data to the log, + * so it should not be enabled in the official kernel + */ +//#define DEBUG_PRINT +//#define INTERNAL_VERIFY + +/* + * On disk structures + */ + +#define SB_MAGIC "integrt" +#define SB_VERSION 1 +#define SB_SECTORS 8 + +struct superblock { + __u8 magic[8]; + __u8 version; + __u8 log2_interleave_sectors; + __u16 integrity_tag_size; + __u32 journal_sections; + __u64 provided_data_sectors; /* userspace uses this value */ + __u32 flags; +}; + +#define SB_FLAG_HAVE_JOURNAL_MAC 0x1 + +#define JOURNAL_ENTRY_ROUNDUP 8 + +typedef __u64 commit_id_t; +#define JOURNAL_MAC_PER_SECTOR 8 + +struct journal_entry { + union { + struct { + __u32 sector_lo; + __u32 sector_hi; + } s; + __u64 sector; + } u; + commit_id_t last_bytes; + __u8 tag[0]; +}; + +#if BITS_PER_LONG == 64 +#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0) +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) +#elif defined(CONFIG_LBDAF) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0) +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) +#else +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0) +#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) +#endif +#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) +#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0) +#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2)) +#define journal_entry_set_inprogress(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0) + +#define JOURNAL_BLOCK_SECTORS 8 +#define JOURNAL_SECTOR_DATA ((1 << SECTOR_SHIFT) - sizeof(commit_id_t)) +#define JOURNAL_MAC_SIZE (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS) + +struct journal_sector { + __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR]; + __u8 mac[JOURNAL_MAC_PER_SECTOR]; + commit_id_t commit_id; +}; + +#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, tag)) + +#define METADATA_PADDING_SECTORS 8 + +#define N_COMMIT_IDS 4 + +static unsigned char prev_commit_seq(unsigned char seq) +{ + return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS; +} + +static unsigned char next_commit_seq(unsigned char seq) +{ + return (seq + 1) % N_COMMIT_IDS; +} + +/* + * In-memory structures + */ + +struct journal_node { + struct rb_node node; + sector_t sector; +}; + +struct alg_spec { + char *alg_string; + char *key_string; + __u8 *key; + unsigned key_size; +}; + +struct dm_integrity_c { + struct dm_dev *dev; + unsigned tag_size; + __s8 log2_tag_size; + sector_t start; + mempool_t *journal_io_mempool; + struct dm_io_client *io; + struct dm_bufio_client *bufio; + struct workqueue_struct *metadata_wq; + struct superblock *sb; + unsigned journal_pages; + struct page_list *journal; + struct page_list *journal_io; + struct page_list *journal_xor; + + struct crypto_skcipher *journal_crypt; + struct scatterlist **journal_scatterlist; + struct scatterlist **journal_io_scatterlist; + struct skcipher_request **sk_requests; + + struct crypto_shash *journal_mac; + + struct journal_node *journal_tree; + struct rb_root journal_tree_root; + + sector_t provided_data_sectors; + + unsigned short journal_entry_size; + unsigned char journal_entries_per_sector; + unsigned char journal_section_entries; + unsigned char journal_section_sectors; + unsigned journal_sections; + unsigned journal_entries; + sector_t device_sectors; + unsigned initial_sectors; + unsigned metadata_run; + __s8 log2_metadata_run; + __u8 log2_buffer_sectors; + + unsigned char mode; + bool suspending; + + int failed; + + struct crypto_shash *internal_hash; + + /* these variables are locked with endio_wait.lock */ + struct rb_root in_progress; + wait_queue_head_t endio_wait; + struct workqueue_struct *wait_wq; + + unsigned char commit_seq; + commit_id_t commit_ids[N_COMMIT_IDS]; + + unsigned committed_section; + unsigned n_committed_sections; + + unsigned uncommitted_section; + unsigned n_uncommitted_sections; + + unsigned free_section; + unsigned char free_section_entry; + unsigned free_sectors; + + unsigned free_sectors_threshold; + + struct workqueue_struct *commit_wq; + struct work_struct commit_work; + + struct workqueue_struct *writer_wq; + struct work_struct writer_work; + + struct bio_list flush_bio_list; + + unsigned long autocommit_jiffies; + struct timer_list autocommit_timer; + unsigned autocommit_msec; + + wait_queue_head_t copy_to_journal_wait; + + struct completion crypto_backoff; + + bool journal_uptodate; + bool just_formatted; + + struct alg_spec internal_hash_alg; + struct alg_spec journal_crypt_alg; + struct alg_spec journal_mac_alg; +}; + +struct dm_integrity_range { + sector_t logical_sector; + unsigned n_sectors; + struct rb_node node; +}; + +struct dm_integrity_io { + struct work_struct work; + + struct dm_integrity_c *ic; + bool write; + bool fua; + + struct dm_integrity_range range; + + sector_t metadata_block; + unsigned metadata_offset; + + atomic_t in_flight; + int bi_error; + + struct completion *completion; + + struct block_device *orig_bi_bdev; + bio_end_io_t *orig_bi_end_io; + struct bio_integrity_payload *orig_bi_integrity; + struct bvec_iter orig_bi_iter; +}; + +struct journal_completion { + struct dm_integrity_c *ic; + atomic_t in_flight; + struct completion comp; +}; + +struct journal_io { + struct dm_integrity_range range; + struct journal_completion *comp; +}; + +static struct kmem_cache *journal_io_cache; + +#define JOURNAL_IO_MEMPOOL 32 + +#ifdef DEBUG_PRINT +#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__) +static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) +{ + va_list args; + va_start(args, msg); + vprintk(msg, args); + va_end(args); + if (len) + pr_cont(":"); + while (len) { + pr_cont(" %02x", *bytes); + bytes++; + len--; + } + pr_cont("\n"); +} +#define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__) +#else +#define DEBUG_print(x, ...) do { } while (0) +#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) +#endif + +/* + * DM Integrity profile, protection is performed layer above (dm-crypt) + */ +static struct blk_integrity_profile dm_integrity_profile = { + .name = "DM-DIF-EXT-TAG", + .generate_fn = NULL, + .verify_fn = NULL, +}; + +static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); +static void integrity_bio_wait(struct work_struct *w); +static void dm_integrity_dtr(struct dm_target *ti); + +static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err) +{ + if (!cmpxchg(&ic->failed, 0, err)) + DMERR("Error on %s: %d", msg, err); +} + +static int dm_integrity_failed(struct dm_integrity_c *ic) +{ + return ACCESS_ONCE(ic->failed); +} + +static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, + unsigned j, unsigned char seq) +{ + /* + * Xor the number with section and sector, so that if a piece of + * journal is written at wrong place, it is detected. + */ + return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j); +} + +static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector, + sector_t *area, sector_t *offset) +{ + __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors; + + *area = data_sector >> log2_interleave_sectors; + *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1); +} + +static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area, + sector_t offset, unsigned *metadata_offset) +{ + __u64 ms; + unsigned mo; + + ms = area << ic->sb->log2_interleave_sectors; + if (likely(ic->log2_metadata_run >= 0)) + ms += area << ic->log2_metadata_run; + else + ms += area * ic->metadata_run; + ms >>= ic->log2_buffer_sectors; + + if (likely(ic->log2_tag_size >= 0)) { + ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size); + mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); + } else { + ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors); + mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); + } + *metadata_offset = mo; + return ms; +} + +static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset) +{ + sector_t result; + + result = area << ic->sb->log2_interleave_sectors; + if (likely(ic->log2_metadata_run >= 0)) + result += (area + 1) << ic->log2_metadata_run; + else + result += (area + 1) * ic->metadata_run; + + result += (sector_t)ic->initial_sectors + offset; + return result; +} + +static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) +{ + if (unlikely(*sec_ptr >= ic->journal_sections)) + *sec_ptr -= ic->journal_sections; +} + +static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + + io_req.bi_op = op; + io_req.bi_op_flags = op_flags; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = ic->sb; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start; + io_loc.count = SB_SECTORS; + + return dm_io(&io_req, 1, &io_loc, NULL); +} + +static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset, + bool e, const char *function) +{ +#if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY) + unsigned limit = e ? ic->journal_section_entries : ic->journal_section_sectors; + + if (unlikely(section >= ic->journal_sections) || + unlikely(offset >= limit)) { + printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n", + function, section, offset, ic->journal_sections, limit); + BUG(); + } +#endif +} + +static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset, + unsigned *pl_index, unsigned *pl_offset) +{ + unsigned sector; + + access_journal_check(ic, section, offset, false, "access_journal"); + + sector = section * ic->journal_section_sectors + offset; + + *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); +} + +static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl, + unsigned section, unsigned offset, unsigned *n_sectors) +{ + unsigned pl_index, pl_offset; + char *va; + + page_list_location(ic, section, offset, &pl_index, &pl_offset); + + if (n_sectors) + *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT; + + va = lowmem_page_address(pl[pl_index].page); + + return (struct journal_sector *)(va + pl_offset); +} + +static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset) +{ + return access_page_list(ic, ic->journal, section, offset, NULL); +} + +static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n) +{ + unsigned rel_sector, offset; + struct journal_sector *js; + + access_journal_check(ic, section, n, true, "access_journal_entry"); + + rel_sector = n % JOURNAL_BLOCK_SECTORS; + offset = n / JOURNAL_BLOCK_SECTORS; + + js = access_journal(ic, section, rel_sector); + return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size); +} + +static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n) +{ + access_journal_check(ic, section, n, true, "access_journal_data"); + + return access_journal(ic, section, n + JOURNAL_BLOCK_SECTORS); +} + +static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE]) +{ + SHASH_DESC_ON_STACK(desc, ic->journal_mac); + int r; + unsigned j, size; + + desc->tfm = ic->journal_mac; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(desc); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + goto err; + } + + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, section, j); + r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + } + + size = crypto_shash_digestsize(ic->journal_mac); + + if (likely(size <= JOURNAL_MAC_SIZE)) { + r = crypto_shash_final(desc, result); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto err; + } + memset(result + size, 0, JOURNAL_MAC_SIZE - size); + } else { + __u8 digest[size]; + r = crypto_shash_final(desc, digest); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto err; + } + memcpy(result, digest, JOURNAL_MAC_SIZE); + } + + return; +err: + memset(result, 0, JOURNAL_MAC_SIZE); +} + +static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr) +{ + __u8 result[JOURNAL_MAC_SIZE]; + unsigned j; + + if (!ic->journal_mac) + return; + + section_mac(ic, section, result); + + for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) { + struct journal_sector *js = access_journal(ic, section, j); + + if (likely(wr)) + memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); + else { + if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) + dm_integrity_io_error(ic, "journal mac", -EILSEQ); + } + } +} + +static void complete_journal_op(void *context) +{ + struct journal_completion *comp = context; + BUG_ON(!atomic_read(&comp->in_flight)); + if (likely(atomic_dec_and_test(&comp->in_flight))) + complete(&comp->comp); +} + +static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct async_submit_ctl submit; + size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT; + unsigned pl_index, pl_offset, section_index; + struct page_list *source_pl, *target_pl; + + if (likely(encrypt)) { + source_pl = ic->journal; + target_pl = ic->journal_io; + } else { + source_pl = ic->journal_io; + target_pl = ic->journal; + } + + page_list_location(ic, section, 0, &pl_index, &pl_offset); + + atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight); + + init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL); + + section_index = pl_index; + + do { + size_t this_step; + struct page *src_pages[2]; + struct page *dst_page; + + while (unlikely(pl_index == section_index)) { + unsigned dummy; + if (likely(encrypt)) + rw_section_mac(ic, section, true); + section++; + n_sections--; + if (!n_sections) + break; + page_list_location(ic, section, 0, §ion_index, &dummy); + } + + this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset); + dst_page = target_pl[pl_index].page; + src_pages[0] = source_pl[pl_index].page; + src_pages[1] = ic->journal_xor[pl_index].page; + + async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit); + + pl_index++; + pl_offset = 0; + n_bytes -= this_step; + } while (n_bytes); + + BUG_ON(n_sections); + + async_tx_issue_pending_all(); +} + +static void complete_journal_encrypt(struct crypto_async_request *req, int err) +{ + struct journal_completion *comp = req->data; + if (unlikely(err)) { + if (likely(err == -EINPROGRESS)) { + complete(&comp->ic->crypto_backoff); + return; + } + dm_integrity_io_error(comp->ic, "asynchronous encrypt", err); + } + complete_journal_op(comp); +} + +static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp) +{ + int r; + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + complete_journal_encrypt, comp); + if (likely(encrypt)) + r = crypto_skcipher_encrypt(req); + else + r = crypto_skcipher_decrypt(req); + if (likely(!r)) + return false; + if (likely(r == -EINPROGRESS)) + return true; + if (likely(r == -EBUSY)) { + wait_for_completion(&comp->ic->crypto_backoff); + reinit_completion(&comp->ic->crypto_backoff); + return true; + } + dm_integrity_io_error(comp->ic, "encrypt", r); + return false; +} + +static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct scatterlist **source_sg; + struct scatterlist **target_sg; + + atomic_add(2, &comp->in_flight); + + if (likely(encrypt)) { + source_sg = ic->journal_scatterlist; + target_sg = ic->journal_io_scatterlist; + } else { + source_sg = ic->journal_io_scatterlist; + target_sg = ic->journal_scatterlist; + } + + do { + struct skcipher_request *req; + unsigned ivsize; + char *iv; + + if (likely(encrypt)) + rw_section_mac(ic, section, true); + + req = ic->sk_requests[section]; + ivsize = crypto_skcipher_ivsize(ic->journal_crypt); + iv = req->iv; + + memcpy(iv, iv + ivsize, ivsize); + + req->src = source_sg[section]; + req->dst = target_sg[section]; + + if (unlikely(do_crypt(encrypt, req, comp))) + atomic_inc(&comp->in_flight); + + section++; + n_sections--; + } while (n_sections); + + atomic_dec(&comp->in_flight); + complete_journal_op(comp); +} + +static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + if (ic->journal_xor) + return xor_journal(ic, encrypt, section, n_sections, comp); + else + return crypt_journal(ic, encrypt, section, n_sections, comp); +} + +static void complete_journal_io(unsigned long error, void *context) +{ + struct journal_completion *comp = context; + if (unlikely(error != 0)) + dm_integrity_io_error(comp->ic, "writing journal", -EIO); + complete_journal_op(comp); +} + +static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + unsigned sector, n_sectors, pl_index, pl_offset; + int r; + + if (unlikely(dm_integrity_failed(ic))) { + if (comp) + complete_journal_io(-1UL, comp); + return; + } + + sector = section * ic->journal_section_sectors; + n_sectors = n_sections * ic->journal_section_sectors; + + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + io_req.bi_op = op; + io_req.bi_op_flags = op_flags; + io_req.mem.type = DM_IO_PAGE_LIST; + if (ic->journal_io) + io_req.mem.ptr.pl = &ic->journal_io[pl_index]; + else + io_req.mem.ptr.pl = &ic->journal[pl_index]; + io_req.mem.offset = pl_offset; + if (likely(comp != NULL)) { + io_req.notify.fn = complete_journal_io; + io_req.notify.context = comp; + } else { + io_req.notify.fn = NULL; + } + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start + SB_SECTORS + sector; + io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r); + if (comp) { + WARN_ONCE(1, "asynchronous dm_io failed: %d", r); + complete_journal_io(-1UL, comp); + } + } +} + +static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) +{ + struct journal_completion io_comp; + struct journal_completion crypt_comp_1; + struct journal_completion crypt_comp_2; + unsigned i; + + io_comp.ic = ic; + io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp); + + if (commit_start + commit_sections <= ic->journal_sections) { + io_comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (ic->journal_io) { + crypt_comp_1.ic = ic; + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1); + wait_for_completion_io(&crypt_comp_1.comp); + } else { + for (i = 0; i < commit_sections; i++) + rw_section_mac(ic, commit_start + i, true); + } + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, commit_sections, &io_comp); + } else { + unsigned to_end; + io_comp.in_flight = (atomic_t)ATOMIC_INIT(2); + to_end = ic->journal_sections - commit_start; + if (ic->journal_io) { + crypt_comp_1.ic = ic; + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1); + if (try_wait_for_completion(&crypt_comp_1.comp)) { + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1); + wait_for_completion_io(&crypt_comp_1.comp); + } else { + crypt_comp_2.ic = ic; + crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp); + crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2); + wait_for_completion_io(&crypt_comp_1.comp); + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + wait_for_completion_io(&crypt_comp_2.comp); + } + } else { + for (i = 0; i < to_end; i++) + rw_section_mac(ic, commit_start + i, true); + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + for (i = 0; i < commit_sections - to_end; i++) + rw_section_mac(ic, i, true); + } + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp); + } + + wait_for_completion_io(&io_comp.comp); +} + +static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset, + unsigned n_sectors, sector_t target, io_notify_fn fn, void *data) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + int r; + unsigned sector, pl_index, pl_offset; + + if (unlikely(dm_integrity_failed(ic))) { + fn(-1UL, data); + return; + } + + sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset; + + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + io_req.bi_op = REQ_OP_WRITE; + io_req.bi_op_flags = 0; + io_req.mem.type = DM_IO_PAGE_LIST; + io_req.mem.ptr.pl = &ic->journal[pl_index]; + io_req.mem.offset = pl_offset; + io_req.notify.fn = fn; + io_req.notify.context = data; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start + target; + io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + WARN_ONCE(1, "asynchronous dm_io failed: %d", r); + fn(-1UL, data); + } +} + +static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + struct rb_node **n = &ic->in_progress.rb_node; + struct rb_node *parent; + + parent = NULL; + + while (*n) { + struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node); + + parent = *n; + if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) { + n = &range->node.rb_left; + } else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) { + n = &range->node.rb_right; + } else { + return false; + } + } + + rb_link_node(&new_range->node, parent, n); + rb_insert_color(&new_range->node, &ic->in_progress); + + return true; +} + +static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range) +{ + rb_erase(&range->node, &ic->in_progress); + wake_up_locked(&ic->endio_wait); +} + +static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range) +{ + unsigned long flags; + + spin_lock_irqsave(&ic->endio_wait.lock, flags); + remove_range_unlocked(ic, range); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); +} + +static void init_journal_node(struct journal_node *node) +{ + RB_CLEAR_NODE(&node->node); + node->sector = (sector_t)-1; +} + +static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector) +{ + struct rb_node **link; + struct rb_node *parent; + + node->sector = sector; + BUG_ON(!RB_EMPTY_NODE(&node->node)); + + link = &ic->journal_tree_root.rb_node; + parent = NULL; + + while (*link) { + struct journal_node *j; + parent = *link; + j = container_of(parent, struct journal_node, node); + if (sector < j->sector) + link = &j->node.rb_left; + else + link = &j->node.rb_right; + } + + rb_link_node(&node->node, parent, link); + rb_insert_color(&node->node, &ic->journal_tree_root); +} + +static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node) +{ + BUG_ON(RB_EMPTY_NODE(&node->node)); + rb_erase(&node->node, &ic->journal_tree_root); + init_journal_node(node); +} + +#define NOT_FOUND (-1U) + +static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector) +{ + struct rb_node *n = ic->journal_tree_root.rb_node; + unsigned found = NOT_FOUND; + *next_sector = (sector_t)-1; + while (n) { + struct journal_node *j = container_of(n, struct journal_node, node); + if (sector == j->sector) { + found = j - ic->journal_tree; + } + if (sector < j->sector) { + *next_sector = j->sector; + n = j->node.rb_left; + } else { + n = j->node.rb_right; + } + } + + return found; +} + +static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector) +{ + struct journal_node *node, *next_node; + struct rb_node *next; + + if (unlikely(pos >= ic->journal_entries)) + return false; + node = &ic->journal_tree[pos]; + if (unlikely(RB_EMPTY_NODE(&node->node))) + return false; + if (unlikely(node->sector != sector)) + return false; + + next = rb_next(&node->node); + if (unlikely(!next)) + return true; + + next_node = container_of(next, struct journal_node, node); + return next_node->sector != sector; +} + +static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node) +{ + struct rb_node *next; + struct journal_node *next_node; + unsigned next_section; + + BUG_ON(RB_EMPTY_NODE(&node->node)); + + next = rb_next(&node->node); + if (unlikely(!next)) + return false; + + next_node = container_of(next, struct journal_node, node); + + if (next_node->sector != node->sector) + return false; + + next_section = (unsigned)(next_node - ic->journal_tree) / ic->journal_section_entries; + if (next_section >= ic->committed_section && + next_section < ic->committed_section + ic->n_committed_sections) + return true; + if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections) + return true; + + return false; +} + +#define TAG_READ 0 +#define TAG_WRITE 1 +#define TAG_CMP 2 + +static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, + unsigned *metadata_offset, unsigned total_size, int op) +{ + do { + unsigned char *data, *dp; + struct dm_buffer *b; + unsigned to_copy; + int r; + + r = dm_integrity_failed(ic); + if (unlikely(r)) + return r; + + data = dm_bufio_read(ic->bufio, *metadata_block, &b); + if (unlikely(IS_ERR(data))) + return PTR_ERR(data); + + to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size); + dp = data + *metadata_offset; + if (op == TAG_READ) { + memcpy(tag, dp, to_copy); + } else if (op == TAG_WRITE) { + memcpy(dp, tag, to_copy); + dm_bufio_mark_buffer_dirty(b); + } else { + /* e.g.: op == TAG_CMP */ + if (unlikely(memcmp(dp, tag, to_copy))) { + unsigned i; + + for (i = 0; i < to_copy; i++) { + if (dp[i] != tag[i]) + break; + total_size--; + } + dm_bufio_release(b); + return total_size; + } + } + dm_bufio_release(b); + + tag += to_copy; + *metadata_offset += to_copy; + if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) { + (*metadata_block)++; + *metadata_offset = 0; + } + total_size -= to_copy; + } while (unlikely(total_size)); + + return 0; +} + +static void dm_integrity_flush_buffers(struct dm_integrity_c *ic) +{ + int r; + r = dm_bufio_write_dirty_buffers(ic->bufio); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing tags", r); +} + +static void sleep_on_endio_wait(struct dm_integrity_c *ic) +{ + DECLARE_WAITQUEUE(wait, current); + __add_wait_queue(&ic->endio_wait, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&ic->endio_wait.lock); + io_schedule(); + spin_lock_irq(&ic->endio_wait.lock); + __remove_wait_queue(&ic->endio_wait, &wait); +} + +static void autocommit_fn(unsigned long data) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)data; + + if (likely(!dm_integrity_failed(ic))) + queue_work(ic->commit_wq, &ic->commit_work); +} + +static void schedule_autocommit(struct dm_integrity_c *ic) +{ + if (!timer_pending(&ic->autocommit_timer)) + mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies); +} + +static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio; + spin_lock_irq(&ic->endio_wait.lock); + bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + bio_list_add(&ic->flush_bio_list, bio); + spin_unlock_irq(&ic->endio_wait.lock); + queue_work(ic->commit_wq, &ic->commit_work); +} + +static void do_endio(struct dm_integrity_c *ic, struct bio *bio) +{ + int r = dm_integrity_failed(ic); + if (unlikely(r) && !bio->bi_error) + bio->bi_error = r; + bio_endio(bio); +} + +static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic))) + submit_flush_bio(ic, dio); + else + do_endio(ic, bio); +} + +static void dec_in_flight(struct dm_integrity_io *dio) +{ + if (atomic_dec_and_test(&dio->in_flight)) { + struct dm_integrity_c *ic = dio->ic; + struct bio *bio; + + remove_range(ic, &dio->range); + + if (unlikely(dio->write)) + schedule_autocommit(ic); + + bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (unlikely(dio->bi_error) && !bio->bi_error) + bio->bi_error = dio->bi_error; + if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { + dio->range.logical_sector += dio->range.n_sectors; + bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + return; + } + do_endio_flush(ic, dio); + } +} + +static void integrity_end_io(struct bio *bio) +{ + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + bio->bi_iter = dio->orig_bi_iter; + bio->bi_bdev = dio->orig_bi_bdev; + if (dio->orig_bi_integrity) { + bio->bi_integrity = dio->orig_bi_integrity; + bio->bi_opf |= REQ_INTEGRITY; + } + bio->bi_end_io = dio->orig_bi_end_io; + + if (dio->completion) + complete(dio->completion); + + dec_in_flight(dio); +} + +static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector, + const char *data, char *result) +{ + __u64 sector_le = cpu_to_le64(sector); + SHASH_DESC_ON_STACK(req, ic->internal_hash); + int r; + unsigned digest_size; + + req->tfm = ic->internal_hash; + req->flags = 0; + + r = crypto_shash_init(req); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + goto failed; + } + + r = crypto_shash_update(req, (const __u8 *)§or_le, sizeof sector_le); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + + r = crypto_shash_update(req, data, 1 << SECTOR_SHIFT); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + + r = crypto_shash_final(req, result); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto failed; + } + + digest_size = crypto_shash_digestsize(ic->internal_hash); + if (unlikely(digest_size < ic->tag_size)) + memset(result + digest_size, 0, ic->tag_size - digest_size); + + return; + +failed: + /* this shouldn't happen anyway, the hash functions have no reason to fail */ + get_random_bytes(result, ic->tag_size); +} + +static void integrity_metadata(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + struct dm_integrity_c *ic = dio->ic; + + int r; + + if (ic->internal_hash) { + struct bvec_iter iter; + struct bio_vec bv; + unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + char *checksums; + unsigned extra_space = digest_size > ic->tag_size ? digest_size - ic->tag_size : 0; + char checksums_onstack[ic->tag_size + extra_space]; + unsigned sectors_to_process = dio->range.n_sectors; + sector_t sector = dio->range.logical_sector; + + checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT) * ic->tag_size + extra_space, + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); + if (!checksums) + checksums = checksums_onstack; + + __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) { + unsigned pos; + char *mem, *checksums_ptr; + +again: + mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset; + pos = 0; + checksums_ptr = checksums; + do { + integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr); + checksums_ptr += ic->tag_size; + sectors_to_process--; + pos += 1 << SECTOR_SHIFT; + sector++; + } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack); + kunmap_atomic(mem); + + r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, + checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE); + if (unlikely(r)) { + if (r > 0) { + DMERR("Checksum failed at sector 0x%llx", + (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size))); + r = -EILSEQ; + } + if (likely(checksums != checksums_onstack)) + kfree(checksums); + goto error; + } + + if (!sectors_to_process) + break; + + if (unlikely(pos < bv.bv_len)) { + bv.bv_offset += pos; + bv.bv_len -= pos; + goto again; + } + } + + if (likely(checksums != checksums_onstack)) + kfree(checksums); + } else { + struct bio_integrity_payload *bip = dio->orig_bi_integrity; + + if (bip) { + struct bio_vec biv; + struct bvec_iter iter; + unsigned data_to_process = dio->range.n_sectors * ic->tag_size; + + bip_for_each_vec(biv, bip, iter) { + unsigned char *tag; + unsigned this_len; + + BUG_ON(PageHighMem(biv.bv_page)); + tag = lowmem_page_address(biv.bv_page) + biv.bv_offset; + this_len = min(biv.bv_len, data_to_process); + r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset, + this_len, !dio->write ? TAG_READ : TAG_WRITE); + if (unlikely(r)) + goto error; + data_to_process -= this_len; + if (!data_to_process) + break; + } + } + } + dec_in_flight(dio); + return; +error: + dio->bi_error = r; + dec_in_flight(dio); +} + +static int dm_integrity_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_integrity_c *ic = ti->private; + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + sector_t area, offset; + + dio->ic = ic; + dio->bi_error = 0; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { + submit_flush_bio(ic, dio); + return DM_MAPIO_SUBMITTED; + } + + dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); + dio->write = bio_op(bio) == REQ_OP_WRITE; + dio->fua = dio->write && bio->bi_opf & REQ_FUA; + if (unlikely(dio->fua)) { + /* + * Don't pass down the FUA flag because we have to flush + * disk cache anyway. + */ + bio->bi_opf &= ~REQ_FUA; + } + if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { + DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", + (unsigned long long)dio->range.logical_sector, bio_sectors(bio), + (unsigned long long)ic->provided_data_sectors); + return -EIO; + } + + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); + dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); + bio->bi_iter.bi_sector = get_data_sector(ic, area, offset); + + dm_integrity_map_continue(dio, true); + return DM_MAPIO_SUBMITTED; +} + +static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio, + unsigned journal_section, unsigned journal_entry) +{ + struct dm_integrity_c *ic = dio->ic; + sector_t logical_sector; + unsigned n_sectors; + + logical_sector = dio->range.logical_sector; + n_sectors = dio->range.n_sectors; + do { + struct bio_vec bv = bio_iovec(bio); + char *mem; + + if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors)) + bv.bv_len = n_sectors << SECTOR_SHIFT; + n_sectors -= bv.bv_len >> SECTOR_SHIFT; + bio_advance_iter(bio, &bio->bi_iter, bv.bv_len); +retry_kmap: + mem = kmap_atomic(bv.bv_page); + if (likely(dio->write)) + flush_dcache_page(bv.bv_page); + + do { + struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry); + + if (unlikely(!dio->write)) { + struct journal_sector *js; + + if (unlikely(journal_entry_is_inprogress(je))) { + flush_dcache_page(bv.bv_page); + kunmap_atomic(mem); + + __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je)); + goto retry_kmap; + } + smp_rmb(); + BUG_ON(journal_entry_get_sector(je) != logical_sector); + js = access_journal_data(ic, journal_section, journal_entry); + memcpy(mem + bv.bv_offset, js, JOURNAL_SECTOR_DATA); + memcpy(mem + bv.bv_offset + JOURNAL_SECTOR_DATA, &je->last_bytes, sizeof je->last_bytes); +#ifdef INTERNAL_VERIFY + if (ic->internal_hash) { + char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; + + integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); + if (unlikely(memcmp(checksums_onstack, je->tag, ic->tag_size))) { + DMERR("Checksum failed when reading from journal, at sector 0x%llx", + (unsigned long long)logical_sector); + } + } +#endif + } + + if (!ic->internal_hash) { + struct bio_integrity_payload *bip = bio_integrity(bio); + unsigned tag_todo = ic->tag_size; + char *tag_ptr = je->tag; + + if (bip) do { + struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + unsigned tag_now = min(biv.bv_len, tag_todo); + char *tag_addr; + BUG_ON(PageHighMem(biv.bv_page)); + tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset; + if (likely(dio->write)) + memcpy(tag_ptr, tag_addr, tag_now); + else + memcpy(tag_addr, tag_ptr, tag_now); + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now); + tag_ptr += tag_now; + tag_todo -= tag_now; + } while (unlikely(tag_todo)); else { + if (likely(dio->write)) + memset(tag_ptr, 0, tag_todo); + } + } + + if (likely(dio->write)) { + struct journal_sector *js; + + js = access_journal_data(ic, journal_section, journal_entry); + memcpy(js, mem + bv.bv_offset, 1 << SECTOR_SHIFT); + je->last_bytes = js->commit_id; + + if (ic->internal_hash) { + unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); + if (unlikely(digest_size > ic->tag_size)) { + char checksums_onstack[digest_size]; + integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack); + memcpy(je->tag, checksums_onstack, ic->tag_size); + } else + integrity_sector_checksum(ic, logical_sector, (char *)js, je->tag); + } + + journal_entry_set_sector(je, logical_sector); + } + logical_sector++; + + journal_entry++; + if (unlikely(journal_entry == ic->journal_section_entries)) { + journal_entry = 0; + journal_section++; + wraparound_section(ic, &journal_section); + } + + bv.bv_offset += 1 << SECTOR_SHIFT; + } while (bv.bv_len -= 1 << SECTOR_SHIFT); + + if (unlikely(!dio->write)) + flush_dcache_page(bv.bv_page); + kunmap_atomic(mem); + } while (n_sectors); + + if (likely(dio->write)) { + smp_mb(); + if (unlikely(waitqueue_active(&ic->copy_to_journal_wait))) + wake_up(&ic->copy_to_journal_wait); + if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { + queue_work(ic->commit_wq, &ic->commit_work); + } else { + schedule_autocommit(ic); + } + } else { + remove_range(ic, &dio->range); + } + + if (unlikely(bio->bi_iter.bi_size)) { + sector_t area, offset; + + dio->range.logical_sector = logical_sector; + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); + dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); + return true; + } + + return false; +} + +static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map) +{ + struct dm_integrity_c *ic = dio->ic; + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + unsigned journal_section, journal_entry; + unsigned journal_read_pos; + struct completion read_comp; + bool need_sync_io = ic->internal_hash && !dio->write; + + if (need_sync_io && from_map) { + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->metadata_wq, &dio->work); + return; + } + +lock_retry: + spin_lock_irq(&ic->endio_wait.lock); +retry: + if (unlikely(dm_integrity_failed(ic))) { + spin_unlock_irq(&ic->endio_wait.lock); + do_endio(ic, bio); + return; + } + dio->range.n_sectors = bio_sectors(bio); + journal_read_pos = NOT_FOUND; + if (likely(ic->mode == 'J')) { + if (dio->write) { + unsigned next_entry, i, pos; + unsigned ws, we; + + dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors); + if (unlikely(!dio->range.n_sectors)) + goto sleep; + ic->free_sectors -= dio->range.n_sectors; + journal_section = ic->free_section; + journal_entry = ic->free_section_entry; + + next_entry = ic->free_section_entry + dio->range.n_sectors; + ic->free_section_entry = next_entry % ic->journal_section_entries; + ic->free_section += next_entry / ic->journal_section_entries; + ic->n_uncommitted_sections += next_entry / ic->journal_section_entries; + wraparound_section(ic, &ic->free_section); + + pos = journal_section * ic->journal_section_entries + journal_entry; + ws = journal_section; + we = journal_entry; + for (i = 0; i < dio->range.n_sectors; i++) { + struct journal_entry *je; + + add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i); + pos++; + if (unlikely(pos >= ic->journal_entries)) + pos = 0; + + je = access_journal_entry(ic, ws, we); + BUG_ON(!journal_entry_is_unused(je)); + journal_entry_set_inprogress(je); + we++; + if (unlikely(we == ic->journal_section_entries)) { + we = 0; + ws++; + wraparound_section(ic, &ws); + } + } + + spin_unlock_irq(&ic->endio_wait.lock); + goto journal_read_write; + } else { + sector_t next_sector; + journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); + if (likely(journal_read_pos == NOT_FOUND)) { + if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector)) + dio->range.n_sectors = next_sector - dio->range.logical_sector; + } else { + unsigned i; + for (i = 1; i < dio->range.n_sectors; i++) { + if (!test_journal_node(ic, journal_read_pos + i, dio->range.logical_sector + i)) + break; + } + dio->range.n_sectors = i; + } + } + } + if (unlikely(!add_new_range(ic, &dio->range))) { + /* + * We must not sleep in the request routine because it could + * stall bios on current->bio_list. + * So, we offload the bio to a workqueue if we have to sleep. + */ +sleep: + if (from_map) { + spin_unlock_irq(&ic->endio_wait.lock); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + return; + } else { + sleep_on_endio_wait(ic); + goto retry; + } + } + spin_unlock_irq(&ic->endio_wait.lock); + + if (unlikely(journal_read_pos != NOT_FOUND)) { + journal_section = journal_read_pos / ic->journal_section_entries; + journal_entry = journal_read_pos % ic->journal_section_entries; + goto journal_read_write; + } + + dio->in_flight = (atomic_t)ATOMIC_INIT(2); + + if (need_sync_io) { + read_comp = COMPLETION_INITIALIZER_ONSTACK(read_comp); + dio->completion = &read_comp; + } else + dio->completion = NULL; + + dio->orig_bi_iter = bio->bi_iter; + + dio->orig_bi_bdev = bio->bi_bdev; + bio->bi_bdev = ic->dev->bdev; + + dio->orig_bi_integrity = bio_integrity(bio); + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; + + dio->orig_bi_end_io = bio->bi_end_io; + bio->bi_end_io = integrity_end_io; + + bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; + bio->bi_iter.bi_sector += ic->start; + generic_make_request(bio); + + if (need_sync_io) { + wait_for_completion_io(&read_comp); + integrity_metadata(&dio->work); + } else { + INIT_WORK(&dio->work, integrity_metadata); + queue_work(ic->metadata_wq, &dio->work); + } + + return; + +journal_read_write: + if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry))) + goto lock_retry; + + do_endio_flush(ic, dio); +} + + +static void integrity_bio_wait(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + + dm_integrity_map_continue(dio, false); +} + +static void pad_uncommitted(struct dm_integrity_c *ic) +{ + if (ic->free_section_entry) { + ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry; + ic->free_section_entry = 0; + ic->free_section++; + wraparound_section(ic, &ic->free_section); + ic->n_uncommitted_sections++; + } +} + +static void integrity_commit(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work); + unsigned commit_start, commit_sections; + unsigned i, j, n; + struct bio *flushes; + + del_timer(&ic->autocommit_timer); + + spin_lock_irq(&ic->endio_wait.lock); + flushes = bio_list_get(&ic->flush_bio_list); + if (unlikely(ic->mode != 'J')) { + spin_unlock_irq(&ic->endio_wait.lock); + dm_integrity_flush_buffers(ic); + goto release_flush_bios; + } + + pad_uncommitted(ic); + commit_start = ic->uncommitted_section; + commit_sections = ic->n_uncommitted_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (!commit_sections) + goto release_flush_bios; + + i = commit_start; + for (n = 0; n < commit_sections; n++) { + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je; + je = access_journal_entry(ic, i, j); + io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je)); + } + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js; + js = access_journal(ic, i, j); + js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq); + } + i++; + if (unlikely(i >= ic->journal_sections)) + ic->commit_seq = next_commit_seq(ic->commit_seq); + wraparound_section(ic, &i); + } + smp_rmb(); + + write_journal(ic, commit_start, commit_sections); + + spin_lock_irq(&ic->endio_wait.lock); + ic->uncommitted_section += commit_sections; + wraparound_section(ic, &ic->uncommitted_section); + ic->n_uncommitted_sections -= commit_sections; + ic->n_committed_sections += commit_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) + queue_work(ic->writer_wq, &ic->writer_work); + +release_flush_bios: + while (flushes) { + struct bio *next = flushes->bi_next; + flushes->bi_next = NULL; + do_endio(ic, flushes); + flushes = next; + } +} + +static void complete_copy_from_journal(unsigned long error, void *context) +{ + struct journal_io *io = context; + struct journal_completion *comp = io->comp; + struct dm_integrity_c *ic = comp->ic; + remove_range(ic, &io->range); + mempool_free(io, ic->journal_io_mempool); + if (unlikely(error != 0)) + dm_integrity_io_error(ic, "copying from journal", -EIO); + complete_journal_op(comp); +} + +static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, + unsigned write_sections, bool from_replay) +{ + unsigned i, j, n; + struct journal_completion comp; + + comp.ic = ic; + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + + i = write_start; + for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) { +#ifndef INTERNAL_VERIFY + if (unlikely(from_replay)) +#endif + rw_section_mac(ic, i, false); + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + sector_t sec, area, offset; + unsigned k, l, next_loop; + sector_t metadata_block; + unsigned metadata_offset; + struct journal_io *io; + + if (journal_entry_is_unused(je)) + continue; + BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay); + sec = journal_entry_get_sector(je); + get_area_and_offset(ic, sec, &area, &offset); + access_journal_data(ic, i, j)->commit_id = je->last_bytes; + for (k = j + 1; k < ic->journal_section_entries; k++) { + struct journal_entry *je2 = access_journal_entry(ic, i, k); + sector_t sec2, area2, offset2; + if (journal_entry_is_unused(je2)) + break; + BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay); + sec2 = journal_entry_get_sector(je2); + get_area_and_offset(ic, sec2, &area2, &offset2); + if (area2 != area || offset2 != offset + (k - j)) + break; + access_journal_data(ic, i, k)->commit_id = je2->last_bytes; + } + next_loop = k - 1; + + io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO); + io->comp = ∁ + io->range.logical_sector = sec; + io->range.n_sectors = k - j; + + spin_lock_irq(&ic->endio_wait.lock); + while (unlikely(!add_new_range(ic, &io->range))) + sleep_on_endio_wait(ic); + + if (likely(!from_replay)) { + struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; + + /* don't write if there is newer committed sector */ + while (j < k && find_newer_committed_node(ic, §ion_node[j])) { + struct journal_entry *je2 = access_journal_entry(ic, i, j); + + journal_entry_set_unused(je2); + remove_journal_node(ic, §ion_node[j]); + j++; + sec++; + offset++; + } + while (j < k && find_newer_committed_node(ic, §ion_node[k - 1])) { + struct journal_entry *je2 = access_journal_entry(ic, i, k - 1); + + journal_entry_set_unused(je2); + remove_journal_node(ic, §ion_node[k - 1]); + k--; + } + if (j == k) { + remove_range_unlocked(ic, &io->range); + spin_unlock_irq(&ic->endio_wait.lock); + mempool_free(io, ic->journal_io_mempool); + goto skip_io; + } + for (l = j; l < k; l++) { + remove_journal_node(ic, §ion_node[l]); + } + } + spin_unlock_irq(&ic->endio_wait.lock); + + metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); + for (l = j; l < k; l++) { + int r; + struct journal_entry *je2 = access_journal_entry(ic, i, l); + + if ( +#ifndef INTERNAL_VERIFY + unlikely(from_replay) && +#endif + ic->internal_hash) { + unsigned char test_tag[ic->tag_size]; + + integrity_sector_checksum(ic, sec + (l - j), + (char *)access_journal_data(ic, i, l), test_tag); + if (unlikely(memcmp(test_tag, je2->tag, ic->tag_size))) + dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); + } + + journal_entry_set_unused(je2); + r = dm_integrity_rw_tag(ic, je2->tag, &metadata_block, &metadata_offset, + ic->tag_size, TAG_WRITE); + if (unlikely(r)) { + dm_integrity_io_error(ic, "reading tags", r); + } + } + + atomic_inc(&comp.in_flight); + copy_from_journal(ic, i, j, k - j, get_data_sector(ic, area, offset), + complete_copy_from_journal, io); +skip_io: + j = next_loop; + } + } + + dm_bufio_write_dirty_buffers_async(ic->bufio); + + complete_journal_op(&comp); + wait_for_completion_io(&comp.comp); + + dm_integrity_flush_buffers(ic); +} + +static void integrity_writer(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work); + unsigned write_start, write_sections; + + unsigned prev_free_sectors; + + /* the following test is not needed, but it tests the replay code */ + if (ACCESS_ONCE(ic->suspending)) + return; + + spin_lock_irq(&ic->endio_wait.lock); + write_start = ic->committed_section; + write_sections = ic->n_committed_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (!write_sections) + return; + + do_journal_write(ic, write_start, write_sections, false); + + spin_lock_irq(&ic->endio_wait.lock); + + ic->committed_section += write_sections; + wraparound_section(ic, &ic->committed_section); + ic->n_committed_sections -= write_sections; + + prev_free_sectors = ic->free_sectors; + ic->free_sectors += write_sections * ic->journal_section_entries; + if (unlikely(!prev_free_sectors)) + wake_up_locked(&ic->endio_wait); + + spin_unlock_irq(&ic->endio_wait.lock); +} + +static void init_journal(struct dm_integrity_c *ic, unsigned start_section, + unsigned n_sections, unsigned char commit_seq) +{ + unsigned i, j, n; + + if (!n_sections) + return; + + for (n = 0; n < n_sections; n++) { + i = start_section + n; + wraparound_section(ic, &i); + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js = access_journal(ic, i, j); + memset(&js->entries, 0, JOURNAL_SECTOR_DATA); + js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq); + } + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + journal_entry_set_unused(je); + } + } + + write_journal(ic, start_section, n_sections); +} + +static int find_commit_seq(struct dm_integrity_c *ic, unsigned i, unsigned j, commit_id_t id) +{ + unsigned char k; + for (k = 0; k < N_COMMIT_IDS; k++) { + if (dm_integrity_commit_id(ic, i, j, k) == id) + return k; + } + dm_integrity_io_error(ic, "journal commit id", -EIO); + return -EIO; +} + +static void replay_journal(struct dm_integrity_c *ic) +{ + unsigned i, j; + bool used_commit_ids[N_COMMIT_IDS]; + unsigned max_commit_id_sections[N_COMMIT_IDS]; + unsigned write_start, write_sections; + unsigned continue_section; + bool journal_empty; + unsigned char unused, last_used, want_commit_seq; + + if (ic->journal_uptodate) + return; + + last_used = 0; + write_start = 0; + + if (!ic->just_formatted) { + DEBUG_print("reading journal\n"); + rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL); + if (ic->journal_io) + DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal"); + if (ic->journal_io) { + struct journal_completion crypt_comp; + crypt_comp.ic = ic; + crypt_comp.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp.comp); + crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp); + wait_for_completion(&crypt_comp.comp); + } + DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal"); + } + + if (dm_integrity_failed(ic)) + goto clear_journal; + + journal_empty = true; + memset(used_commit_ids, 0, sizeof used_commit_ids); + memset(max_commit_id_sections, 0, sizeof max_commit_id_sections); + for (i = 0; i < ic->journal_sections; i++) { + for (j = 0; j < ic->journal_section_sectors; j++) { + int k; + struct journal_sector *js = access_journal(ic, i, j); + k = find_commit_seq(ic, i, j, js->commit_id); + if (k < 0) + goto clear_journal; + used_commit_ids[k] = true; + max_commit_id_sections[k] = i; + } + if (journal_empty) { + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + if (!journal_entry_is_unused(je)) { + journal_empty = false; + break; + } + } + } + } + + if (!used_commit_ids[N_COMMIT_IDS - 1]) { + unused = N_COMMIT_IDS - 1; + while (unused && !used_commit_ids[unused - 1]) + unused--; + } else { + for (unused = 0; unused < N_COMMIT_IDS; unused++) + if (!used_commit_ids[unused]) + break; + if (unused == N_COMMIT_IDS) { + dm_integrity_io_error(ic, "journal commit ids", -EIO); + goto clear_journal; + } + } + DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n", + unused, used_commit_ids[0], used_commit_ids[1], + used_commit_ids[2], used_commit_ids[3]); + + last_used = prev_commit_seq(unused); + want_commit_seq = prev_commit_seq(last_used); + + if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)]) + journal_empty = true; + + write_start = max_commit_id_sections[last_used] + 1; + if (unlikely(write_start >= ic->journal_sections)) + want_commit_seq = next_commit_seq(want_commit_seq); + wraparound_section(ic, &write_start); + + i = write_start; + for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) { + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js = access_journal(ic, i, j); + + if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) { + /* + * This could be caused by crash during writing. + * We won't replay the inconsistent part of the + * journal. + */ + DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n", + i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq); + goto brk; + } + } + i++; + if (unlikely(i >= ic->journal_sections)) + want_commit_seq = next_commit_seq(want_commit_seq); + wraparound_section(ic, &i); + } +brk: + + if (!journal_empty) { + DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n", + write_sections, write_start, want_commit_seq); + do_journal_write(ic, write_start, write_sections, true); + } + + if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) { + continue_section = write_start; + ic->commit_seq = want_commit_seq; + DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq); + } else { + unsigned s; + unsigned char erase_seq; +clear_journal: + DEBUG_print("clearing journal\n"); + + erase_seq = prev_commit_seq(prev_commit_seq(last_used)); + s = write_start; + init_journal(ic, s, 1, erase_seq); + s++; + wraparound_section(ic, &s); + if (ic->journal_sections >= 2) { + init_journal(ic, s, ic->journal_sections - 2, erase_seq); + s += ic->journal_sections - 2; + wraparound_section(ic, &s); + init_journal(ic, s, 1, erase_seq); + } + + continue_section = 0; + ic->commit_seq = next_commit_seq(erase_seq); + } + + ic->committed_section = continue_section; + ic->n_committed_sections = 0; + + ic->uncommitted_section = continue_section; + ic->n_uncommitted_sections = 0; + + ic->free_section = continue_section; + ic->free_section_entry = 0; + ic->free_sectors = ic->journal_entries; + + ic->journal_tree_root = RB_ROOT; + for (i = 0; i < ic->journal_entries; i++) + init_journal_node(&ic->journal_tree[i]); +} + +static void dm_integrity_postsuspend(struct dm_target *ti) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + + del_timer_sync(&ic->autocommit_timer); + + ic->suspending = true; + + queue_work(ic->commit_wq, &ic->commit_work); + drain_workqueue(ic->commit_wq); + + if (ic->mode == 'J') { + drain_workqueue(ic->writer_wq); + dm_integrity_flush_buffers(ic); + } + + ic->suspending = false; + + BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); + + ic->journal_uptodate = true; +} + +static void dm_integrity_resume(struct dm_target *ti) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + + replay_journal(ic); +} + +static void dm_integrity_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + unsigned arg_count; + size_t sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: { + __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; + watermark_percentage += ic->journal_entries / 2; + do_div(watermark_percentage, ic->journal_entries); + arg_count = 5; + arg_count += !!ic->internal_hash_alg.alg_string; + arg_count += !!ic->journal_crypt_alg.alg_string; + arg_count += !!ic->journal_mac_alg.alg_string; + DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, + ic->tag_size, ic->mode, arg_count); + DMEMIT(" journal-sectors:%u", ic->initial_sectors - SB_SECTORS); + DMEMIT(" interleave-sectors:%u", 1U << ic->sb->log2_interleave_sectors); + DMEMIT(" buffer-sectors:%u", 1U << ic->log2_buffer_sectors); + DMEMIT(" journal-watermark:%u", (unsigned)watermark_percentage); + DMEMIT(" commit-time:%u", ic->autocommit_msec); + +#define EMIT_ALG(a, n) \ + do { \ + if (ic->a.alg_string) { \ + DMEMIT(" %s:%s", n, ic->a.alg_string); \ + if (ic->a.key_string) \ + DMEMIT(":%s", ic->a.key_string);\ + } \ + } while (0) + EMIT_ALG(internal_hash_alg, "internal-hash"); + EMIT_ALG(journal_crypt_alg, "journal-crypt"); + EMIT_ALG(journal_mac_alg, "journal-mac"); + break; + } + } +} + +static int dm_integrity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_integrity_c *ic = ti->private; + + return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data); +} + +static void calculate_journal_section_size(struct dm_integrity_c *ic) +{ + unsigned sector_space = JOURNAL_SECTOR_DATA; + + ic->journal_sections = le32_to_cpu(ic->sb->journal_sections); + ic->journal_entry_size = roundup(offsetof(struct journal_entry, tag) + ic->tag_size, + JOURNAL_ENTRY_ROUNDUP); + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) + sector_space -= JOURNAL_MAC_PER_SECTOR; + ic->journal_entries_per_sector = sector_space / ic->journal_entry_size; + ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS; + ic->journal_section_sectors = ic->journal_section_entries + JOURNAL_BLOCK_SECTORS; + ic->journal_entries = ic->journal_section_entries * ic->journal_sections; +} + +static int calculate_device_limits(struct dm_integrity_c *ic) +{ + __u64 initial_sectors; + sector_t last_sector, last_area, last_offset; + + calculate_journal_section_size(ic); + initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections; + if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX) + return -EINVAL; + ic->initial_sectors = initial_sectors; + + ic->metadata_run = roundup((__u64)ic->tag_size << ic->sb->log2_interleave_sectors, + (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; + if (!(ic->metadata_run & (ic->metadata_run - 1))) + ic->log2_metadata_run = __ffs(ic->metadata_run); + else + ic->log2_metadata_run = -1; + + get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset); + last_sector = get_data_sector(ic, last_area, last_offset); + + if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors) + return -EINVAL; + + return 0; +} + +static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors) +{ + unsigned journal_sections; + int test_bit; + + memcpy(ic->sb->magic, SB_MAGIC, 8); + ic->sb->version = SB_VERSION; + ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); + if (ic->journal_mac_alg.alg_string) + ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC); + + calculate_journal_section_size(ic); + journal_sections = journal_sectors / ic->journal_section_sectors; + if (!journal_sections) + journal_sections = 1; + ic->sb->journal_sections = cpu_to_le32(journal_sections); + + ic->sb->log2_interleave_sectors = __fls(interleave_sectors); + ic->sb->log2_interleave_sectors = max((__u8)MIN_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + ic->sb->log2_interleave_sectors = min((__u8)MAX_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + + ic->provided_data_sectors = 0; + for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) { + __u64 prev_data_sectors = ic->provided_data_sectors; + + ic->provided_data_sectors |= (sector_t)1 << test_bit; + if (calculate_device_limits(ic)) + ic->provided_data_sectors = prev_data_sectors; + } + + if (!le64_to_cpu(ic->provided_data_sectors)) + return -EINVAL; + + ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); + + return 0; +} + +static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) +{ + struct gendisk *disk = dm_disk(dm_table_get_md(ti->table)); + struct blk_integrity bi; + + memset(&bi, 0, sizeof(bi)); + bi.profile = &dm_integrity_profile; + bi.tuple_size = ic->tag_size * (queue_logical_block_size(disk->queue) >> SECTOR_SHIFT); + bi.tag_size = ic->tag_size; + + blk_integrity_register(disk, &bi); + blk_queue_max_integrity_segments(disk->queue, UINT_MAX); +} + +/* FIXME: use new kvmalloc */ +static void *dm_integrity_kvmalloc(size_t size, gfp_t gfp) +{ + void *ptr = NULL; + + if (size <= PAGE_SIZE) + ptr = kmalloc(size, GFP_KERNEL | gfp); + if (!ptr && size <= KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | gfp); + if (!ptr) + ptr = __vmalloc(size, GFP_KERNEL | gfp, PAGE_KERNEL); + + return ptr; +} + +static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl) +{ + unsigned i; + + if (!pl) + return; + for (i = 0; i < ic->journal_pages; i++) + if (pl[i].page) + __free_page(pl[i].page); + kvfree(pl); +} + +static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic) +{ + size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list); + struct page_list *pl; + unsigned i; + + pl = dm_integrity_kvmalloc(page_list_desc_size, __GFP_ZERO); + if (!pl) + return NULL; + + for (i = 0; i < ic->journal_pages; i++) { + pl[i].page = alloc_page(GFP_KERNEL); + if (!pl[i].page) { + dm_integrity_free_page_list(ic, pl); + return NULL; + } + if (i) + pl[i - 1].next = &pl[i]; + } + + return pl; +} + +static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl) +{ + unsigned i; + for (i = 0; i < ic->journal_sections; i++) + kvfree(sl[i]); + kfree(sl); +} + +static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl) +{ + struct scatterlist **sl; + unsigned i; + + sl = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), __GFP_ZERO); + if (!sl) + return NULL; + + for (i = 0; i < ic->journal_sections; i++) { + struct scatterlist *s; + unsigned start_index, start_offset; + unsigned end_index, end_offset; + unsigned n_pages; + unsigned idx; + + page_list_location(ic, i, 0, &start_index, &start_offset); + page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset); + + n_pages = (end_index - start_index + 1); + + s = dm_integrity_kvmalloc(n_pages * sizeof(struct scatterlist), 0); + if (!s) { + dm_integrity_free_journal_scatterlist(ic, sl); + return NULL; + } + + sg_init_table(s, n_pages); + for (idx = start_index; idx <= end_index; idx++) { + char *va = lowmem_page_address(pl[idx].page); + unsigned start = 0, end = PAGE_SIZE; + if (idx == start_index) + start = start_offset; + if (idx == end_index) + end = end_offset + (1 << SECTOR_SHIFT); + sg_set_buf(&s[idx - start_index], va + start, end - start); + } + + sl[i] = s; + } + + return sl; +} + +static void free_alg(struct alg_spec *a) +{ + kzfree(a->alg_string); + kzfree(a->key); + memset(a, 0, sizeof *a); +} + +static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval) +{ + char *k; + + free_alg(a); + + a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL); + if (!a->alg_string) + goto nomem; + + k = strchr(a->alg_string, ':'); + if (k) { + unsigned i; + + *k = 0; + a->key_string = k + 1; + if (strlen(a->key_string) & 1) + goto inval; + + a->key_size = strlen(a->key_string) / 2; + a->key = kmalloc(a->key_size, GFP_KERNEL); + if (!a->key) + goto nomem; + for (i = 0; i < a->key_size; i++) { + char digit[3]; + digit[0] = a->key_string[i * 2]; + digit[1] = a->key_string[i * 2 + 1]; + digit[2] = 0; + if (strspn(digit, "0123456789abcdefABCDEF") != 2) + goto inval; + if (kstrtou8(digit, 16, &a->key[i])) + goto inval; + } + } + + return 0; +inval: + *error = error_inval; + return -EINVAL; +nomem: + *error = "Out of memory for an argument"; + return -ENOMEM; +} + +static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, + char *error_alg, char *error_key) +{ + int r; + + if (a->alg_string) { + *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(*hash)) { + *error = error_alg; + r = PTR_ERR(*hash); + *hash = NULL; + return r; + } + + if (a->key) { + r = crypto_shash_setkey(*hash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } + } + + return 0; +} + +/* + * Construct a integrity mapping: + * + * Arguments: + * device + * offset from the start of the device + * tag size + * D - direct writes, J - journal writes + * number of optional arguments + * optional arguments: + * journal-sectors + * interleave-sectors + * buffer-sectors + * journal-watermark + * commit-time + * internal-hash + * journal-crypt + * journal-mac + */ +static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + struct dm_integrity_c *ic; + char dummy; + int r; + unsigned i; + unsigned extra_args; + struct dm_arg_set as; + static struct dm_arg _args[] = { + {0, 7, "Invalid number of feature args"}, + }; + unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; + bool should_write_sb; + __u64 journal_pages, journal_desc_size, journal_tree_size; + __u64 threshold; + unsigned long long start; + +#define DIRECT_ARGUMENTS 4 + + if (argc <= DIRECT_ARGUMENTS) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL); + if (!ic) { + ti->error = "Cannot allocate integrity context"; + return -ENOMEM; + } + ti->private = ic; + ti->per_io_data_size = sizeof(struct dm_integrity_io); + + ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL); + ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL); + ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL); + ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL); + + ic->in_progress = RB_ROOT; + init_waitqueue_head(&ic->endio_wait); + bio_list_init(&ic->flush_bio_list); + init_waitqueue_head(&ic->copy_to_journal_wait); + init_completion(&ic->crypto_backoff); + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); + if (r) { + ti->error = "Device lookup failed"; + goto bad; + } + + if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) { + ti->error = "Invalid starting offset"; + r = -EINVAL; + goto bad; + } + ic->start = start; + + if (strcmp(argv[2], "-")) { + if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) { + ti->error = "Invalid tag size"; + r = -EINVAL; + goto bad; + } + } + + if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D")) + ic->mode = argv[3][0]; + else { + ti->error = "Invalid mode (expecting J or D)"; + r = -EINVAL; + goto bad; + } + + ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT; + journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, + ic->device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; + buffer_sectors = DEFAULT_BUFFER_SECTORS; + journal_watermark = DEFAULT_JOURNAL_WATERMARK; + sync_msec = DEFAULT_SYNC_MSEC; + + as.argc = argc - DIRECT_ARGUMENTS; + as.argv = argv + DIRECT_ARGUMENTS; + r = dm_read_arg_group(_args, &as, &extra_args, &ti->error); + if (r) + goto bad; + + while (extra_args--) { + const char *opt_string; + unsigned val; + opt_string = dm_shift_arg(&as); + if (!opt_string) { + r = -EINVAL; + ti->error = "Not enough feature arguments"; + goto bad; + } + if (sscanf(opt_string, "journal-sectors:%u%c", &val, &dummy) == 1) + journal_sectors = val; + else if (sscanf(opt_string, "interleave-sectors:%u%c", &val, &dummy) == 1) + interleave_sectors = val; + else if (sscanf(opt_string, "buffer-sectors:%u%c", &val, &dummy) == 1) + buffer_sectors = val; + else if (sscanf(opt_string, "journal-watermark:%u%c", &val, &dummy) == 1 && val <= 100) + journal_watermark = val; + else if (sscanf(opt_string, "commit-time:%u%c", &val, &dummy) == 1) + sync_msec = val; + else if (!memcmp(opt_string, "internal-hash:", strlen("internal-hash:"))) { + r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, + "Invalid internal-hash argument"); + if (r) + goto bad; + } else if (!memcmp(opt_string, "journal-crypt:", strlen("journal-crypt:"))) { + r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error, + "Invalid journal-crypt argument"); + if (r) + goto bad; + } else if (!memcmp(opt_string, "journal-mac:", strlen("journal-mac:"))) { + r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error, + "Invalid journal-mac argument"); + if (r) + goto bad; + } else { + r = -EINVAL; + ti->error = "Invalid argument"; + goto bad; + } + } + + r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, + "Invalid internal hash", "Error setting internal hash key"); + if (r) + goto bad; + + r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error, + "Invalid journal mac", "Error setting journal mac key"); + if (r) + goto bad; + + if (!ic->tag_size) { + if (!ic->internal_hash) { + ti->error = "Unknown tag size"; + r = -EINVAL; + goto bad; + } + ic->tag_size = crypto_shash_digestsize(ic->internal_hash); + } + if (ic->tag_size > MAX_TAG_SIZE) { + ti->error = "Too big tag size"; + r = -EINVAL; + goto bad; + } + if (!(ic->tag_size & (ic->tag_size - 1))) + ic->log2_tag_size = __ffs(ic->tag_size); + else + ic->log2_tag_size = -1; + + ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); + ic->autocommit_msec = sync_msec; + setup_timer(&ic->autocommit_timer, autocommit_fn, (unsigned long)ic); + + ic->io = dm_io_client_create(); + if (IS_ERR(ic->io)) { + r = PTR_ERR(ic->io); + ic->io = NULL; + ti->error = "Cannot allocate dm io"; + goto bad; + } + + ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache); + if (!ic->journal_io_mempool) { + r = -ENOMEM; + ti->error = "Cannot allocate mempool"; + goto bad; + } + + ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", + WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); + if (!ic->metadata_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + /* + * If this workqueue were percpu, it would cause bio reordering + * and reduced performance. + */ + ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!ic->wait_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1); + if (!ic->commit_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->commit_work, integrity_commit); + + if (ic->mode == 'J') { + ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); + if (!ic->writer_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->writer_work, integrity_writer); + } + + ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL); + if (!ic->sb) { + r = -ENOMEM; + ti->error = "Cannot allocate superblock area"; + goto bad; + } + + r = sync_rw_sb(ic, REQ_OP_READ, 0); + if (r) { + ti->error = "Error reading superblock"; + goto bad; + } + if (!memcmp(ic->sb->magic, SB_MAGIC, 8)) { + should_write_sb = false; + } else { + for (i = 0; i < 512; i += 8) { + if (*(__u64 *)((__u8 *)ic->sb + i)) { + r = -EINVAL; + ti->error = "The device is not initialized"; + goto bad; + } + } + + r = initialize_superblock(ic, journal_sectors, interleave_sectors); + if (r) { + ti->error = "Could not initialize superblock"; + goto bad; + } + should_write_sb = true; + } + + if (ic->sb->version != SB_VERSION) { + r = -EINVAL; + ti->error = "Unknown version"; + goto bad; + } + if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) { + r = -EINVAL; + ti->error = "Invalid tag size"; + goto bad; + } + /* make sure that ti->max_io_len doesn't overflow */ + if (ic->sb->log2_interleave_sectors < MIN_INTERLEAVE_SECTORS || + ic->sb->log2_interleave_sectors > MAX_INTERLEAVE_SECTORS) { + r = -EINVAL; + ti->error = "Invalid interleave_sectors in the superblock"; + goto bad; + } + ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); + if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) { + /* test for overflow */ + r = -EINVAL; + ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors"; + goto bad; + } + if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) { + r = -EINVAL; + ti->error = "Journal mac mismatch"; + goto bad; + } + r = calculate_device_limits(ic); + if (r) { + ti->error = "The device is too small"; + goto bad; + } + + if (!buffer_sectors) + buffer_sectors = 1; + ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT); + + threshold = (__u64)ic->journal_entries * (100 - journal_watermark); + threshold += 50; + do_div(threshold, 100); + ic->free_sectors_threshold = threshold; + + DEBUG_print("initialized:\n"); + DEBUG_print(" integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size)); + DEBUG_print(" journal_entry_size %u\n", ic->journal_entry_size); + DEBUG_print(" journal_entries_per_sector %u\n", ic->journal_entries_per_sector); + DEBUG_print(" journal_section_entries %u\n", ic->journal_section_entries); + DEBUG_print(" journal_section_sectors %u\n", ic->journal_section_sectors); + DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections)); + DEBUG_print(" journal_entries %u\n", ic->journal_entries); + DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); + DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors); + DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); + DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); + DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); + DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors, + (unsigned long long)ic->provided_data_sectors); + DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); + + ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), + 1, 0, NULL, NULL); + if (IS_ERR(ic->bufio)) { + r = PTR_ERR(ic->bufio); + ti->error = "Cannot initialize dm-bufio"; + ic->bufio = NULL; + goto bad; + } + dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); + + journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors, + PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT); + journal_desc_size = journal_pages * sizeof(struct page_list); + if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) { + ti->error = "Journal doesn't fit into memory"; + r = -ENOMEM; + goto bad; + } + ic->journal_pages = journal_pages; + + ic->journal = dm_integrity_alloc_page_list(ic); + if (!ic->journal) { + ti->error = "Could not allocate memory for journal"; + r = -ENOMEM; + goto bad; + } + if (ic->journal_crypt_alg.alg_string) { + unsigned ivsize, blocksize; + struct journal_completion comp; + comp.ic = ic; + + ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0); + if (IS_ERR(ic->journal_crypt)) { + ti->error = "Invalid journal cipher"; + r = PTR_ERR(ic->journal_crypt); + ic->journal_crypt = NULL; + goto bad; + } + ivsize = crypto_skcipher_ivsize(ic->journal_crypt); + blocksize = crypto_skcipher_blocksize(ic->journal_crypt); + + if (ic->journal_crypt_alg.key) { + r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key, + ic->journal_crypt_alg.key_size); + if (r) { + ti->error = "Error setting encryption key"; + goto bad; + } + } + DEBUG_print("cipher %s, block size %u iv size %u\n", + ic->journal_crypt_alg.alg_string, blocksize, ivsize); + + ic->journal_io = dm_integrity_alloc_page_list(ic); + if (!ic->journal_io) { + ti->error = "Could not allocate memory for journal io"; + r = -ENOMEM; + goto bad; + } + + if (blocksize == 1) { + struct scatterlist *sg; + SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); + unsigned char iv[ivsize]; + skcipher_request_set_tfm(req, ic->journal_crypt); + + ic->journal_xor = dm_integrity_alloc_page_list(ic); + if (!ic->journal_xor) { + ti->error = "Could not allocate memory for journal xor"; + r = -ENOMEM; + goto bad; + } + + sg = dm_integrity_kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), 0); + if (!sg) { + ti->error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + sg_init_table(sg, ic->journal_pages + 1); + for (i = 0; i < ic->journal_pages; i++) { + char *va = lowmem_page_address(ic->journal_xor[i].page); + clear_page(va); + sg_set_buf(&sg[i], va, PAGE_SIZE); + } + sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); + memset(iv, 0x00, ivsize); + + skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (do_crypt(true, req, &comp)) + wait_for_completion(&comp.comp); + kvfree(sg); + if ((r = dm_integrity_failed(ic))) { + ti->error = "Unable to encrypt journal"; + goto bad; + } + DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data"); + + crypto_free_skcipher(ic->journal_crypt); + ic->journal_crypt = NULL; + } else { + SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); + unsigned char iv[ivsize]; + unsigned crypt_len = roundup(ivsize, blocksize); + unsigned char crypt_data[crypt_len]; + + skcipher_request_set_tfm(req, ic->journal_crypt); + + ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal); + if (!ic->journal_scatterlist) { + ti->error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io); + if (!ic->journal_io_scatterlist) { + ti->error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + ic->sk_requests = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), __GFP_ZERO); + if (!ic->sk_requests) { + ti->error = "Unable to allocate sk requests"; + r = -ENOMEM; + goto bad; + } + for (i = 0; i < ic->journal_sections; i++) { + struct scatterlist sg; + struct skcipher_request *section_req; + __u32 section_le = cpu_to_le32(i); + + memset(iv, 0x00, ivsize); + memset(crypt_data, 0x00, crypt_len); + memcpy(crypt_data, §ion_le, min((size_t)crypt_len, sizeof(section_le))); + + sg_init_one(&sg, crypt_data, crypt_len); + skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (do_crypt(true, req, &comp)) + wait_for_completion(&comp.comp); + + if ((r = dm_integrity_failed(ic))) { + ti->error = "Unable to generate iv"; + goto bad; + } + + section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); + if (!section_req) { + ti->error = "Unable to allocate crypt request"; + r = -ENOMEM; + goto bad; + } + section_req->iv = kmalloc(ivsize * 2, GFP_KERNEL); + if (!section_req->iv) { + skcipher_request_free(section_req); + ti->error = "Unable to allocate iv"; + r = -ENOMEM; + goto bad; + } + memcpy(section_req->iv + ivsize, crypt_data, ivsize); + section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT; + ic->sk_requests[i] = section_req; + DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i); + } + } + } + + for (i = 0; i < N_COMMIT_IDS; i++) { + unsigned j; +retest_commit_id: + for (j = 0; j < i; j++) { + if (ic->commit_ids[j] == ic->commit_ids[i]) { + ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1); + goto retest_commit_id; + } + } + DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]); + } + + journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node); + if (journal_tree_size > ULONG_MAX) { + ti->error = "Journal doesn't fit into memory"; + r = -ENOMEM; + goto bad; + } + ic->journal_tree = dm_integrity_kvmalloc(journal_tree_size, 0); + if (!ic->journal_tree) { + ti->error = "Could not allocate memory for journal tree"; + r = -ENOMEM; + goto bad; + } + + if (should_write_sb) { + int r; + + init_journal(ic, 0, ic->journal_sections, 0); + r = dm_integrity_failed(ic); + if (unlikely(r)) { + ti->error = "Error initializing journal"; + goto bad; + } + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (r) { + ti->error = "Error initializing superblock"; + goto bad; + } + ic->just_formatted = true; + } + + r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); + if (r) + goto bad; + + if (!ic->internal_hash) + dm_integrity_set(ti, ic); + + ti->num_flush_bios = 1; + ti->flush_supported = true; + + return 0; +bad: + dm_integrity_dtr(ti); + return r; +} + +static void dm_integrity_dtr(struct dm_target *ti) +{ + struct dm_integrity_c *ic = ti->private; + + BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); + + if (ic->metadata_wq) + destroy_workqueue(ic->metadata_wq); + if (ic->wait_wq) + destroy_workqueue(ic->wait_wq); + if (ic->commit_wq) + destroy_workqueue(ic->commit_wq); + if (ic->writer_wq) + destroy_workqueue(ic->writer_wq); + if (ic->bufio) + dm_bufio_client_destroy(ic->bufio); + mempool_destroy(ic->journal_io_mempool); + if (ic->io) + dm_io_client_destroy(ic->io); + if (ic->dev) + dm_put_device(ti, ic->dev); + dm_integrity_free_page_list(ic, ic->journal); + dm_integrity_free_page_list(ic, ic->journal_io); + dm_integrity_free_page_list(ic, ic->journal_xor); + if (ic->journal_scatterlist) + dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist); + if (ic->journal_io_scatterlist) + dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist); + if (ic->sk_requests) { + unsigned i; + + for (i = 0; i < ic->journal_sections; i++) { + struct skcipher_request *req = ic->sk_requests[i]; + if (req) { + kzfree(req->iv); + skcipher_request_free(req); + } + } + kvfree(ic->sk_requests); + } + kvfree(ic->journal_tree); + if (ic->sb) + free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT); + + if (ic->internal_hash) + crypto_free_shash(ic->internal_hash); + free_alg(&ic->internal_hash_alg); + + if (ic->journal_crypt) + crypto_free_skcipher(ic->journal_crypt); + free_alg(&ic->journal_crypt_alg); + + if (ic->journal_mac) + crypto_free_shash(ic->journal_mac); + free_alg(&ic->journal_mac_alg); + + kfree(ic); +} + +static struct target_type integrity_target = { + .name = "integrity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, + .ctr = dm_integrity_ctr, + .dtr = dm_integrity_dtr, + .map = dm_integrity_map, + .postsuspend = dm_integrity_postsuspend, + .resume = dm_integrity_resume, + .status = dm_integrity_status, + .iterate_devices = dm_integrity_iterate_devices, +}; + +int __init dm_integrity_init(void) +{ + int r; + + journal_io_cache = kmem_cache_create("integrity_journal_io", + sizeof(struct journal_io), 0, 0, NULL); + if (!journal_io_cache) { + DMERR("can't allocate journal io cache"); + return -ENOMEM; + } + + r = dm_register_target(&integrity_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +void dm_integrity_exit(void) +{ + dm_unregister_target(&integrity_target); + kmem_cache_destroy(journal_io_cache); +} + +module_init(dm_integrity_init); +module_exit(dm_integrity_exit); + +MODULE_AUTHOR("Milan Broz"); +MODULE_AUTHOR("Mikulas Patocka"); +MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3