Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/alloc.c        |   2
-rw-r--r--  drivers/md/bcache/bcache.h       |   4
-rw-r--r--  drivers/md/bcache/bset.c         |   2
-rw-r--r--  drivers/md/bcache/bset.h         |   2
-rw-r--r--  drivers/md/bcache/btree.c        |  50
-rw-r--r--  drivers/md/bcache/btree.h        |   5
-rw-r--r--  drivers/md/bcache/extents.c      |  13
-rw-r--r--  drivers/md/bcache/extents.h      |   1
-rw-r--r--  drivers/md/bcache/journal.c      |  24
-rw-r--r--  drivers/md/bcache/request.c      |   3
-rw-r--r--  drivers/md/bcache/super.c        |  57
-rw-r--r--  drivers/md/bcache/util.h         |   4
-rw-r--r--  drivers/md/bcache/writeback.c    |  14
-rw-r--r--  drivers/md/bcache/writeback.h    |   3
-rw-r--r--  drivers/md/dm-cache-metadata.c   |   4
-rw-r--r--  drivers/md/dm-cache-metadata.h   |   8
-rw-r--r--  drivers/md/dm-cache-target.c     | 132
-rw-r--r--  drivers/md/dm-crypt.c            |  62
-rw-r--r--  drivers/md/dm-io.c               |  77
-rw-r--r--  drivers/md/dm-mpath.c            |   6
-rw-r--r--  drivers/md/dm-switch.c           |  67
-rw-r--r--  drivers/md/dm-table.c            |  86
-rw-r--r--  drivers/md/dm-thin.c             | 181
-rw-r--r--  drivers/md/dm.h                  |   1
-rw-r--r--  drivers/md/md.c                  |  13
-rw-r--r--  drivers/md/raid0.c               |   6
-rw-r--r--  drivers/md/raid1.c               |  48
-rw-r--r--  drivers/md/raid10.c              |  18
-rw-r--r--  drivers/md/raid5.c               |  22
29 files changed, 602 insertions, 313 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 443d03fbac47..8eeab72b93e2 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -331,7 +331,7 @@ static int bch_allocator_thread(void *arg)
mutex_unlock(&ca->set->bucket_lock);
blkdev_issue_discard(ca->bdev,
bucket_to_sector(ca->set, bucket),
- ca->sb.block_size, GFP_KERNEL, 0);
+ ca->sb.bucket_size, GFP_KERNEL, 0);
mutex_lock(&ca->set->bucket_lock);
}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d2ebcf323094..04f7bc28ef83 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -477,9 +477,13 @@ struct gc_stat {
* CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
* we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
* flushing dirty data).
+ *
+ * CACHE_SET_RUNNING means all cache devices have been registered and journal
+ * replay is complete.
*/
#define CACHE_SET_UNREGISTERING 0
#define CACHE_SET_STOPPING 1
+#define CACHE_SET_RUNNING 2
struct cache_set {
struct closure cl;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 545416415305..646fe85261c1 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1182,7 +1182,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
{
uint64_t start_time;
bool used_mempool = false;
- struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
+ struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT,
order);
if (!out) {
struct page *outp;
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 5f6728d5d4dd..ae964624efb2 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -453,7 +453,7 @@ static inline bool bch_bkey_equal_header(const struct bkey *l,
{
return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
KEY_PTRS(l) == KEY_PTRS(r) &&
- KEY_CSUM(l) == KEY_CSUM(l));
+ KEY_CSUM(l) == KEY_CSUM(r));
}
/* Keylists */
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7347b6100961..00cde40db572 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -117,9 +117,9 @@
({ \
int _r, l = (b)->level - 1; \
bool _w = l <= (op)->lock; \
- struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
+ struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \
+ _w, b); \
if (!IS_ERR(_child)) { \
- _child->parent = (b); \
_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
rw_unlock(_w, _child); \
} else \
@@ -142,7 +142,6 @@
rw_lock(_w, _b, _b->level); \
if (_b == (c)->root && \
_w == insert_lock(op, _b)) { \
- _b->parent = NULL; \
_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
} \
rw_unlock(_w, _b); \
@@ -202,7 +201,7 @@ void bch_btree_node_read_done(struct btree *b)
struct bset *i = btree_bset_first(b);
struct btree_iter *iter;
- iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+ iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;
@@ -421,7 +420,7 @@ static void do_btree_node_write(struct btree *b)
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
bset_sector_offset(&b->keys, i));
- if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
+ if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
int j;
struct bio_vec *bv;
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -967,7 +966,8 @@ err:
* level and op->lock.
*/
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
- struct bkey *k, int level, bool write)
+ struct bkey *k, int level, bool write,
+ struct btree *parent)
{
int i = 0;
struct btree *b;
@@ -1002,6 +1002,7 @@ retry:
BUG_ON(b->level != level);
}
+ b->parent = parent;
b->accessed = 1;
for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
@@ -1022,15 +1023,16 @@ retry:
return b;
}
-static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
+static void btree_node_prefetch(struct btree *parent, struct bkey *k)
{
struct btree *b;
- mutex_lock(&c->bucket_lock);
- b = mca_alloc(c, NULL, k, level);
- mutex_unlock(&c->bucket_lock);
+ mutex_lock(&parent->c->bucket_lock);
+ b = mca_alloc(parent->c, NULL, k, parent->level - 1);
+ mutex_unlock(&parent->c->bucket_lock);
if (!IS_ERR_OR_NULL(b)) {
+ b->parent = parent;
bch_btree_node_read(b);
rw_unlock(true, b);
}
@@ -1060,15 +1062,16 @@ static void btree_node_free(struct btree *b)
mutex_unlock(&b->c->bucket_lock);
}
-struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
- int level)
+struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+ int level, bool wait,
+ struct btree *parent)
{
BKEY_PADDED(key) k;
struct btree *b = ERR_PTR(-EAGAIN);
mutex_lock(&c->bucket_lock);
retry:
- if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
+ if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
goto err;
bkey_put(c, &k.key);
@@ -1085,6 +1088,7 @@ retry:
}
b->accessed = 1;
+ b->parent = parent;
bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
mutex_unlock(&c->bucket_lock);
@@ -1096,14 +1100,21 @@ err_free:
err:
mutex_unlock(&c->bucket_lock);
- trace_bcache_btree_node_alloc_fail(b);
+ trace_bcache_btree_node_alloc_fail(c);
return b;
}
+static struct btree *bch_btree_node_alloc(struct cache_set *c,
+ struct btree_op *op, int level,
+ struct btree *parent)
+{
+ return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
+}
+
static struct btree *btree_node_alloc_replacement(struct btree *b,
struct btree_op *op)
{
- struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
+ struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
if (!IS_ERR_OR_NULL(n)) {
mutex_lock(&n->write_lock);
bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
@@ -1403,6 +1414,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
BUG_ON(btree_bset_first(new_nodes[0])->keys);
btree_node_free(new_nodes[0]);
rw_unlock(true, new_nodes[0]);
+ new_nodes[0] = NULL;
for (i = 0; i < nodes; i++) {
if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
@@ -1516,7 +1528,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
if (k) {
r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
- true);
+ true, b);
if (IS_ERR(r->b)) {
ret = PTR_ERR(r->b);
break;
@@ -1811,7 +1823,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
k = bch_btree_iter_next_filter(&iter, &b->keys,
bch_ptr_bad);
if (k)
- btree_node_prefetch(b->c, k, b->level - 1);
+ btree_node_prefetch(b, k);
if (p)
ret = btree(check_recurse, p, b, op);
@@ -1976,12 +1988,12 @@ static int btree_split(struct btree *b, struct btree_op *op,
trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
- n2 = bch_btree_node_alloc(b->c, op, b->level);
+ n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
if (IS_ERR(n2))
goto err_free1;
if (!b->parent) {
- n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
+ n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
if (IS_ERR(n3))
goto err_free2;
}
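
These hunks move maintenance of the ->parent back-pointer into bch_btree_node_get() and the node allocators themselves, so the link is set while the node lock is still held rather than patched up by each caller. A minimal sketch of the resulting call pattern, assuming the in-tree bcache headers (descend_one_level() is a hypothetical helper, not part of the patch):

	#include "bcache.h"
	#include "btree.h"

	/*
	 * Hypothetical helper showing the updated call pattern: the parent is
	 * passed to bch_btree_node_get(), which now refreshes the child's
	 * ->parent link itself while the node lock is held.
	 */
	static int descend_one_level(struct btree *b, struct btree_op *op,
				     struct bkey *k, bool write)
	{
		struct btree *child = bch_btree_node_get(b->c, op, k,
							 b->level - 1, write, b);
		if (IS_ERR(child))
			return PTR_ERR(child);

		/* ... operate on the child node ... */
		rw_unlock(write, child);
		return 0;
	}
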
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 91dfa5e69685..5c391fa01bed 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -242,9 +242,10 @@ void __bch_btree_node_write(struct btree *, struct closure *);
void bch_btree_node_write(struct btree *, struct closure *);
void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *,
+ int, bool, struct btree *);
struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
- struct bkey *, int, bool);
+ struct bkey *, int, bool, struct btree *);
int bch_btree_insert_check_key(struct btree *, struct btree_op *,
struct bkey *);
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 3a0de4cf9771..243de0bf15cd 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -474,9 +474,8 @@ out:
return false;
}
-static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k)
{
- struct btree *b = container_of(bk, struct btree, keys);
char buf[80];
if (!KEY_SIZE(k))
@@ -485,16 +484,22 @@ static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
if (KEY_SIZE(k) > KEY_OFFSET(k))
goto bad;
- if (__ptr_invalid(b->c, k))
+ if (__ptr_invalid(c, k))
goto bad;
return false;
bad:
bch_extent_to_text(buf, sizeof(buf), k);
- cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
+ cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
return true;
}
+static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ return __bch_extent_invalid(b->c, k);
+}
+
static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
unsigned ptr)
{
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
index e4e23409782d..e2ed54054e7a 100644
--- a/drivers/md/bcache/extents.h
+++ b/drivers/md/bcache/extents.h
@@ -9,5 +9,6 @@ struct cache_set;
void bch_extent_to_text(char *, size_t, const struct bkey *);
bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
+bool __bch_extent_invalid(struct cache_set *, const struct bkey *);
#endif /* _BCACHE_EXTENTS_H */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 59e82021b5bb..fe080ad0e558 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -7,6 +7,7 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
+#include "extents.h"
#include <trace/events/bcache.h>
@@ -189,11 +190,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
if (read_bucket(l))
goto bsearch;
- if (list_empty(list))
+ /* no journal entries on this device? */
+ if (l == ca->sb.njournal_buckets)
continue;
bsearch:
+ BUG_ON(list_empty(list));
+
/* Binary search */
- m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+ m = l;
+ r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
pr_debug("starting binary search, l %u r %u", l, r);
while (l + 1 < r) {
@@ -291,15 +296,16 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
for (k = i->j.start;
k < bset_bkey_last(&i->j);
- k = bkey_next(k)) {
- unsigned j;
+ k = bkey_next(k))
+ if (!__bch_extent_invalid(c, k)) {
+ unsigned j;
- for (j = 0; j < KEY_PTRS(k); j++)
- if (ptr_available(c, k, j))
- atomic_inc(&PTR_BUCKET(c, k, j)->pin);
+ for (j = 0; j < KEY_PTRS(k); j++)
+ if (ptr_available(c, k, j))
+ atomic_inc(&PTR_BUCKET(c, k, j)->pin);
- bch_initial_mark_key(c, 0, k);
- }
+ bch_initial_mark_key(c, 0, k);
+ }
}
}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 15fff4f68a7c..62e6e98186b5 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,7 +311,8 @@ void bch_data_insert(struct closure *cl)
{
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- trace_bcache_write(op->bio, op->writeback, op->bypass);
+ trace_bcache_write(op->c, op->inode, op->bio,
+ op->writeback, op->bypass);
bch_keylist_init(&op->insert_keys);
bio_get(op->bio);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 926ded8ccbf5..d4713d098a39 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -733,8 +733,6 @@ static void bcache_device_detach(struct bcache_device *d)
static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
unsigned id)
{
- BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));
-
d->id = id;
d->c = c;
c->devices[id] = d;
@@ -927,6 +925,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
list_move(&dc->list, &uncached_devices);
clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
+ clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
mutex_unlock(&bch_register_lock);
@@ -1041,6 +1040,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
*/
atomic_set(&dc->count, 1);
+ if (bch_cached_dev_writeback_start(dc))
+ return -ENOMEM;
+
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
bch_sectors_dirty_init(dc);
atomic_set(&dc->has_dirty, 1);
@@ -1070,7 +1072,8 @@ static void cached_dev_free(struct closure *cl)
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
cancel_delayed_work_sync(&dc->writeback_rate_update);
- kthread_stop(dc->writeback_thread);
+ if (!IS_ERR_OR_NULL(dc->writeback_thread))
+ kthread_stop(dc->writeback_thread);
mutex_lock(&bch_register_lock);
@@ -1081,12 +1084,8 @@ static void cached_dev_free(struct closure *cl)
mutex_unlock(&bch_register_lock);
- if (!IS_ERR_OR_NULL(dc->bdev)) {
- if (dc->bdev->bd_disk)
- blk_sync_queue(bdev_get_queue(dc->bdev));
-
+ if (!IS_ERR_OR_NULL(dc->bdev))
blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
- }
wake_up(&unregister_wait);
@@ -1213,7 +1212,9 @@ void bch_flash_dev_release(struct kobject *kobj)
static void flash_dev_free(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+ mutex_lock(&bch_register_lock);
bcache_device_free(d);
+ mutex_unlock(&bch_register_lock);
kobject_put(&d->kobj);
}
@@ -1221,7 +1222,9 @@ static void flash_dev_flush(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+ mutex_lock(&bch_register_lock);
bcache_device_unlink(d);
+ mutex_unlock(&bch_register_lock);
kobject_del(&d->kobj);
continue_at(cl, flash_dev_free, system_wq);
}
@@ -1277,6 +1280,9 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
if (test_bit(CACHE_SET_STOPPING, &c->flags))
return -EINTR;
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+ return -EPERM;
+
u = uuid_find_empty(c);
if (!u) {
pr_err("Can't create volume, no room for UUID");
@@ -1346,8 +1352,11 @@ static void cache_set_free(struct closure *cl)
bch_journal_free(c);
for_each_cache(ca, c, i)
- if (ca)
+ if (ca) {
+ ca->set = NULL;
+ c->cache[ca->sb.nr_this_dev] = NULL;
kobject_put(&ca->kobj);
+ }
bch_bset_sort_state_free(&c->sort);
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
@@ -1405,9 +1414,11 @@ static void cache_set_flush(struct closure *cl)
if (ca->alloc_thread)
kthread_stop(ca->alloc_thread);
- cancel_delayed_work_sync(&c->journal.work);
- /* flush last journal entry if needed */
- c->journal.work.work.func(&c->journal.work.work);
+ if (c->journal.cur) {
+ cancel_delayed_work_sync(&c->journal.work);
+ /* flush last journal entry if needed */
+ c->journal.work.work.func(&c->journal.work.work);
+ }
closure_return(cl);
}
@@ -1586,7 +1597,7 @@ static void run_cache_set(struct cache_set *c)
goto err;
err = "error reading btree root";
- c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
+ c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
if (IS_ERR_OR_NULL(c->root))
goto err;
@@ -1661,7 +1672,7 @@ static void run_cache_set(struct cache_set *c)
goto err;
err = "cannot allocate new btree root";
- c->root = bch_btree_node_alloc(c, NULL, 0);
+ c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
if (IS_ERR_OR_NULL(c->root))
goto err;
@@ -1697,6 +1708,7 @@ static void run_cache_set(struct cache_set *c)
flash_devs_run(c);
+ set_bit(CACHE_SET_RUNNING, &c->flags);
return;
err:
closure_sync(&cl);
@@ -1760,6 +1772,7 @@ found:
pr_debug("set version = %llu", c->sb.version);
}
+ kobject_get(&ca->kobj);
ca->set = c;
ca->set->cache[ca->sb.nr_this_dev] = ca;
c->cache_by_alloc[c->caches_loaded++] = ca;
@@ -1780,8 +1793,10 @@ void bch_cache_release(struct kobject *kobj)
struct cache *ca = container_of(kobj, struct cache, kobj);
unsigned i;
- if (ca->set)
+ if (ca->set) {
+ BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
ca->set->cache[ca->sb.nr_this_dev] = NULL;
+ }
bio_split_pool_free(&ca->bio_split_hook);
@@ -1798,10 +1813,8 @@ void bch_cache_release(struct kobject *kobj)
if (ca->sb_bio.bi_inline_vecs[0].bv_page)
put_page(ca->sb_bio.bi_io_vec[0].bv_page);
- if (!IS_ERR_OR_NULL(ca->bdev)) {
- blk_sync_queue(bdev_get_queue(ca->bdev));
+ if (!IS_ERR_OR_NULL(ca->bdev))
blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
- }
kfree(ca);
module_put(THIS_MODULE);
@@ -1844,7 +1857,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
}
static void register_cache(struct cache_sb *sb, struct page *sb_page,
- struct block_device *bdev, struct cache *ca)
+ struct block_device *bdev, struct cache *ca)
{
char name[BDEVNAME_SIZE];
const char *err = "cannot allocate memory";
@@ -1877,10 +1890,12 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page,
goto err;
pr_info("registered cache device %s", bdevname(bdev, name));
+out:
+ kobject_put(&ca->kobj);
return;
err:
pr_notice("error opening %s: %s", bdevname(bdev, name), err);
- kobject_put(&ca->kobj);
+ goto out;
}
/* Global interfaces/init */
@@ -1945,10 +1960,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (IS_ERR(bdev)) {
if (bdev == ERR_PTR(-EBUSY)) {
bdev = lookup_bdev(strim(path));
+ mutex_lock(&bch_register_lock);
if (!IS_ERR(bdev) && bch_is_open(bdev))
err = "device already registered";
else
err = "device busy";
+ mutex_unlock(&bch_register_lock);
}
goto err;
}
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ac7d0d1f70d7..98df7572b5f7 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -416,8 +416,8 @@ do { \
average_frequency, frequency_units); \
__print_time_stat(stats, name, \
average_duration, duration_units); \
- __print_time_stat(stats, name, \
- max_duration, duration_units); \
+ sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
+ div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\
\
sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
? div_s64(local_clock() - (stats)->last, \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index f4300e4c0114..f1986bcd1bf0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -239,7 +239,7 @@ static void read_dirty(struct cached_dev *dc)
if (KEY_START(&w->key) != dc->last_read ||
jiffies_to_msecs(delay) > 50)
while (!kthread_should_stop() && delay)
- delay = schedule_timeout_uninterruptible(delay);
+ delay = schedule_timeout_interruptible(delay);
dc->last_read = KEY_OFFSET(&w->key);
@@ -436,7 +436,7 @@ static int bch_writeback_thread(void *arg)
while (delay &&
!kthread_should_stop() &&
!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
- delay = schedule_timeout_uninterruptible(delay);
+ delay = schedule_timeout_interruptible(delay);
}
}
@@ -478,7 +478,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
}
-int bch_cached_dev_writeback_init(struct cached_dev *dc)
+void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
sema_init(&dc->in_flight, 64);
init_rwsem(&dc->writeback_lock);
@@ -494,14 +494,20 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_rate_d_term = 30;
dc->writeback_rate_p_term_inverse = 6000;
+ INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+}
+
+int bch_cached_dev_writeback_start(struct cached_dev *dc)
+{
dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
"bcache_writeback");
if (IS_ERR(dc->writeback_thread))
return PTR_ERR(dc->writeback_thread);
- INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
schedule_delayed_work(&dc->writeback_rate_update,
dc->writeback_rate_update_seconds * HZ);
+ bch_writeback_queue(dc);
+
return 0;
}
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index e2f8598937ac..0a9dab187b79 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -85,6 +85,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
void bch_sectors_dirty_init(struct cached_dev *dc);
-int bch_cached_dev_writeback_init(struct cached_dev *);
+void bch_cached_dev_writeback_init(struct cached_dev *);
+int bch_cached_dev_writeback_start(struct cached_dev *);
#endif
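
Together with the super.c hunk above, writeback setup is now two-phase: bch_cached_dev_writeback_init() only initializes fields (and so returns void), while bch_cached_dev_writeback_start() creates the kthread, arms the rate-update work and kicks the queue, and is called from bch_cached_dev_attach(). A hedged sketch of the intended ordering (register_one_cached_dev() is a hypothetical stand-in for the registration path):

	#include "bcache.h"
	#include "writeback.h"

	/* Hypothetical outline, not part of the patch: initialize writeback
	 * state when the backing device is set up, start the thread only
	 * once it is attached to a cache set. */
	static int register_one_cached_dev(struct cached_dev *dc,
					   struct cache_set *c)
	{
		bch_cached_dev_writeback_init(dc);	/* cannot fail any more */

		/* ... superblock checks, sysfs registration, etc. ... */

		return bch_cached_dev_attach(dc, c);	/* calls bch_cached_dev_writeback_start() */
	}
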
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index d2899e7eb3aa..06709257adde 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -330,7 +330,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
- disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+ disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
disk_super->cache_blocks = cpu_to_le32(0);
@@ -478,7 +478,7 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
bool may_format_device)
{
int r;
- cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
+ cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
CACHE_METADATA_CACHE_SIZE,
CACHE_MAX_CONCURRENT_LOCKS);
if (IS_ERR(cmd->bm)) {
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index cd70a78623a3..7383c90ccdb8 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -9,19 +9,17 @@
#include "dm-cache-block-types.h"
#include "dm-cache-policy-internal.h"
+#include "persistent-data/dm-space-map-metadata.h"
/*----------------------------------------------------------------*/
-#define DM_CACHE_METADATA_BLOCK_SIZE 4096
+#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
/* FIXME: remove this restriction */
/*
* The metadata device is currently limited in size.
- *
- * We have one block of index, which can hold 255 index entries. Each
- * index entry contains allocation info about 16k metadata blocks.
*/
-#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
/*
* A metadata device larger than 16GB triggers a warning.
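
The net effect of the dm-cache-metadata changes is a change of units only: DM_CACHE_METADATA_BLOCK_SIZE is now expressed in 512-byte sectors (it equals DM_SM_METADATA_BLOCK_SIZE) rather than bytes, so the superblock field drops its '>> SECTOR_SHIFT' while dm_block_manager_create() gains a '<< SECTOR_SHIFT'; the on-disk value is unchanged. A standalone sketch of that arithmetic, assuming the usual 4 KiB metadata block and 512-byte sector:

	#include <stdio.h>

	#define SECTOR_SHIFT 9
	#define METADATA_BLOCK_SECTORS 8		/* 4096 bytes / 512 */

	int main(void)
	{
		/* value written to disk_super->metadata_block_size (sectors) */
		unsigned on_disk = METADATA_BLOCK_SECTORS;
		/* value handed to dm_block_manager_create() (bytes) */
		unsigned bm_block_bytes = METADATA_BLOCK_SECTORS << SECTOR_SHIFT;

		printf("on-disk field: %u sectors, block manager: %u bytes\n",
		       on_disk, bm_block_bytes);
		return 0;
	}
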
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 2c63326638b6..7130505c2425 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -718,6 +718,22 @@ static int bio_triggers_commit(struct cache *cache, struct bio *bio)
return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
}
+/*
+ * You must increment the deferred set whilst the prison cell is held. To
+ * encourage this, we ask for 'cell' to be passed in.
+ */
+static void inc_ds(struct cache *cache, struct bio *bio,
+ struct dm_bio_prison_cell *cell)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ BUG_ON(!cell);
+ BUG_ON(pb->all_io_entry);
+
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+}
+
static void issue(struct cache *cache, struct bio *bio)
{
unsigned long flags;
@@ -737,6 +753,12 @@ static void issue(struct cache *cache, struct bio *bio)
spin_unlock_irqrestore(&cache->lock, flags);
}
+static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
+{
+ inc_ds(cache, bio, cell);
+ issue(cache, bio);
+}
+
static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
{
unsigned long flags;
@@ -873,8 +895,8 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
struct cache *cache = mg->cache;
if (mg->writeback) {
- cell_defer(cache, mg->old_ocell, false);
clear_dirty(cache, mg->old_oblock, mg->cblock);
+ cell_defer(cache, mg->old_ocell, false);
cleanup_migration(mg);
return;
@@ -929,13 +951,13 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
}
} else {
+ clear_dirty(cache, mg->new_oblock, mg->cblock);
if (mg->requeue_holder)
cell_defer(cache, mg->new_ocell, true);
else {
bio_endio(mg->new_ocell->holder, 0);
cell_defer(cache, mg->new_ocell, false);
}
- clear_dirty(cache, mg->new_oblock, mg->cblock);
cleanup_migration(mg);
}
}
@@ -1015,6 +1037,11 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
+
+ /*
+ * No need to inc_ds() here, since the cell will be held for the
+ * duration of the io.
+ */
generic_make_request(bio);
}
@@ -1115,8 +1142,7 @@ static void check_for_quiesced_migrations(struct cache *cache,
return;
INIT_LIST_HEAD(&work);
- if (pb->all_io_entry)
- dm_deferred_entry_dec(pb->all_io_entry, &work);
+ dm_deferred_entry_dec(pb->all_io_entry, &work);
if (!list_empty(&work))
queue_quiesced_migrations(cache, &work);
@@ -1252,6 +1278,11 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
else
remap_to_cache(cache, bio, 0);
+ /*
+ * REQ_FLUSH is not directed at any particular block so we don't
+ * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH
+ * by dm-core.
+ */
issue(cache, bio);
}
@@ -1301,15 +1332,6 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
&cache->stats.read_miss : &cache->stats.write_miss);
}
-static void issue_cache_bio(struct cache *cache, struct bio *bio,
- struct per_bio_data *pb,
- dm_oblock_t oblock, dm_cblock_t cblock)
-{
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
- remap_to_cache_dirty(cache, bio, oblock, cblock);
- issue(cache, bio);
-}
-
static void process_bio(struct cache *cache, struct prealloc *structs,
struct bio *bio)
{
@@ -1318,8 +1340,6 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
dm_oblock_t block = get_bio_block(cache, bio);
struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
struct policy_result lookup_result;
- size_t pb_data_size = get_per_bio_data_size(cache);
- struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
bool discarded_block = is_discarded_oblock(cache, block);
bool passthrough = passthrough_mode(&cache->features);
bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
@@ -1359,9 +1379,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
} else {
/* FIXME: factor out issue_origin() */
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
remap_to_origin_clear_discard(cache, bio, block);
- issue(cache, bio);
+ inc_and_issue(cache, bio, new_ocell);
}
} else {
inc_hit_counter(cache, bio);
@@ -1369,20 +1388,21 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
if (bio_data_dir(bio) == WRITE &&
writethrough_mode(&cache->features) &&
!is_dirty(cache, lookup_result.cblock)) {
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- issue(cache, bio);
- } else
- issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
+ inc_and_issue(cache, bio, new_ocell);
+
+ } else {
+ remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+ inc_and_issue(cache, bio, new_ocell);
+ }
}
break;
case POLICY_MISS:
inc_miss_counter(cache, bio);
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
remap_to_origin_clear_discard(cache, bio, block);
- issue(cache, bio);
+ inc_and_issue(cache, bio, new_ocell);
break;
case POLICY_NEW:
@@ -1501,6 +1521,9 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
bio_list_init(&cache->deferred_flush_bios);
spin_unlock_irqrestore(&cache->lock, flags);
+ /*
+ * These bios have already been through inc_ds()
+ */
while ((bio = bio_list_pop(&bios)))
submit_bios ? generic_make_request(bio) : bio_io_error(bio);
}
@@ -1518,6 +1541,9 @@ static void process_deferred_writethrough_bios(struct cache *cache)
bio_list_init(&cache->deferred_writethrough_bios);
spin_unlock_irqrestore(&cache->lock, flags);
+ /*
+ * These bios have already been through inc_ds()
+ */
while ((bio = bio_list_pop(&bios)))
generic_make_request(bio);
}
@@ -1694,6 +1720,7 @@ static void do_worker(struct work_struct *ws)
if (commit_if_needed(cache)) {
process_deferred_flush_bios(cache, false);
+ process_migrations(cache, &cache->need_commit_migrations, migration_failure);
/*
* FIXME: rollback metadata or just go into a
@@ -2406,16 +2433,13 @@ out:
return r;
}
-static int cache_map(struct dm_target *ti, struct bio *bio)
+static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
{
- struct cache *cache = ti->private;
-
int r;
dm_oblock_t block = get_bio_block(cache, bio);
size_t pb_data_size = get_per_bio_data_size(cache);
bool can_migrate = false;
bool discarded_block;
- struct dm_bio_prison_cell *cell;
struct policy_result lookup_result;
struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
@@ -2437,15 +2461,15 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
/*
* Check to see if that block is currently migrating.
*/
- cell = alloc_prison_cell(cache);
- if (!cell) {
+ *cell = alloc_prison_cell(cache);
+ if (!*cell) {
defer_bio(cache, bio);
return DM_MAPIO_SUBMITTED;
}
- r = bio_detain(cache, block, bio, cell,
+ r = bio_detain(cache, block, bio, *cell,
(cell_free_fn) free_prison_cell,
- cache, &cell);
+ cache, cell);
if (r) {
if (r < 0)
defer_bio(cache, bio);
@@ -2458,11 +2482,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
bio, &lookup_result);
if (r == -EWOULDBLOCK) {
- cell_defer(cache, cell, true);
+ cell_defer(cache, *cell, true);
return DM_MAPIO_SUBMITTED;
} else if (r) {
DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
+ cell_defer(cache, *cell, false);
bio_io_error(bio);
return DM_MAPIO_SUBMITTED;
}
@@ -2476,52 +2501,44 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
* We need to invalidate this block, so
* defer for the worker thread.
*/
- cell_defer(cache, cell, true);
+ cell_defer(cache, *cell, true);
r = DM_MAPIO_SUBMITTED;
} else {
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
inc_miss_counter(cache, bio);
remap_to_origin_clear_discard(cache, bio, block);
-
- cell_defer(cache, cell, false);
}
} else {
inc_hit_counter(cache, bio);
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
!is_dirty(cache, lookup_result.cblock))
remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
else
remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
-
- cell_defer(cache, cell, false);
}
break;
case POLICY_MISS:
inc_miss_counter(cache, bio);
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
if (pb->req_nr != 0) {
/*
* This is a duplicate writethrough io that is no
* longer needed because the block has been demoted.
*/
bio_endio(bio, 0);
- cell_defer(cache, cell, false);
- return DM_MAPIO_SUBMITTED;
- } else {
+ cell_defer(cache, *cell, false);
+ r = DM_MAPIO_SUBMITTED;
+
+ } else
remap_to_origin_clear_discard(cache, bio, block);
- cell_defer(cache, cell, false);
- }
+
break;
default:
DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
(unsigned) lookup_result.op);
+ cell_defer(cache, *cell, false);
bio_io_error(bio);
r = DM_MAPIO_SUBMITTED;
}
@@ -2529,6 +2546,21 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
return r;
}
+static int cache_map(struct dm_target *ti, struct bio *bio)
+{
+ int r;
+ struct dm_bio_prison_cell *cell;
+ struct cache *cache = ti->private;
+
+ r = __cache_map(cache, bio, &cell);
+ if (r == DM_MAPIO_REMAPPED) {
+ inc_ds(cache, bio, cell);
+ cell_defer(cache, cell, false);
+ }
+
+ return r;
+}
+
static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
struct cache *cache = ti->private;
@@ -2808,7 +2840,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
residency = policy_residency(cache->policy);
DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
- (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
+ (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
(unsigned long long)nr_blocks_metadata,
cache->sectors_per_block,
@@ -3062,7 +3094,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
*/
if (io_opt_sectors < cache->sectors_per_block ||
do_div(io_opt_sectors, cache->sectors_per_block)) {
- blk_limits_io_min(limits, 0);
+ blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
}
set_discard_limits(cache, limits);
@@ -3072,7 +3104,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
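
The comment introduced with inc_ds() states the invariant this refactoring enforces: the deferred-set entry is taken only while the bio's prison cell is held, either via inc_and_issue() on the fast paths or via the new cache_map() wrapper. A condensed restatement of that wrapper's pattern (names from the patch, error paths elided):

	/* Condensed restatement of the cache_map() pattern added above. */
	static int map_one_bio(struct cache *cache, struct bio *bio)
	{
		struct dm_bio_prison_cell *cell;
		int r = __cache_map(cache, bio, &cell);

		if (r == DM_MAPIO_REMAPPED) {
			inc_ds(cache, bio, cell);	/* while the cell is still held */
			cell_defer(cache, cell, false);	/* now safe to release it */
		}
		return r;
	}
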
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4cba2d808afb..cd15e0801228 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -59,7 +59,7 @@ struct dm_crypt_io {
int error;
sector_t sector;
struct dm_crypt_io *base_io;
-};
+} CRYPTO_MINALIGN_ATTR;
struct dm_crypt_request {
struct convert_context *ctx;
@@ -162,6 +162,8 @@ struct crypt_config {
*/
unsigned int dmreq_start;
+ unsigned int per_bio_data_size;
+
unsigned long flags;
unsigned int key_size;
unsigned int key_parts; /* independent parts in key buffer */
@@ -895,6 +897,15 @@ static void crypt_alloc_req(struct crypt_config *cc,
kcryptd_async_done, dmreq_of_req(cc, ctx->req));
}
+static void crypt_free_req(struct crypt_config *cc,
+ struct ablkcipher_request *req, struct bio *base_bio)
+{
+ struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+
+ if ((struct ablkcipher_request *)(io + 1) != req)
+ mempool_free(req, cc->req_pool);
+}
+
/*
* Encrypt / decrypt data from one bio to another one (can be the same one)
*/
@@ -1008,12 +1019,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
}
}
-static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
- struct bio *bio, sector_t sector)
+static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
+ struct bio *bio, sector_t sector)
{
- struct dm_crypt_io *io;
-
- io = mempool_alloc(cc->io_pool, GFP_NOIO);
io->cc = cc;
io->base_bio = bio;
io->sector = sector;
@@ -1021,8 +1029,6 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
io->base_io = NULL;
io->ctx.req = NULL;
atomic_set(&io->io_pending, 0);
-
- return io;
}
static void crypt_inc_pending(struct dm_crypt_io *io)
@@ -1046,8 +1052,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
return;
if (io->ctx.req)
- mempool_free(io->ctx.req, cc->req_pool);
- mempool_free(io, cc->io_pool);
+ crypt_free_req(cc, io->ctx.req, base_bio);
+ if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
+ mempool_free(io, cc->io_pool);
if (likely(!base_io))
bio_endio(base_bio, error);
@@ -1255,8 +1262,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
* between fragments, so switch to a new dm_crypt_io structure.
*/
if (unlikely(!crypt_finished && remaining)) {
- new_io = crypt_io_alloc(io->cc, io->base_bio,
- sector);
+ new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
+ crypt_io_init(new_io, io->cc, io->base_bio, sector);
crypt_inc_pending(new_io);
crypt_convert_init(cc, &new_io->ctx, NULL,
io->base_bio, sector);
@@ -1325,7 +1332,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
if (error < 0)
io->error = -EIO;
- mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
+ crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
if (!atomic_dec_and_test(&ctx->cc_pending))
return;
@@ -1681,6 +1688,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
unsigned int key_size, opt_params;
unsigned long long tmpll;
int ret;
+ size_t iv_size_padding;
struct dm_arg_set as;
const char *opt_string;
char dummy;
@@ -1717,17 +1725,33 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->dmreq_start = sizeof(struct ablkcipher_request);
cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
- cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
- cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
- ~(crypto_tfm_ctx_alignment() - 1);
+ cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
+
+ if (crypto_ablkcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) {
+ /* Allocate the padding exactly */
+ iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
+ & crypto_ablkcipher_alignmask(any_tfm(cc));
+ } else {
+ /*
+ * If the cipher requires greater alignment than kmalloc
+ * alignment, we don't know the exact position of the
+ * initialization vector. We must assume worst case.
+ */
+ iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc));
+ }
cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
- sizeof(struct dm_crypt_request) + cc->iv_size);
+ sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
if (!cc->req_pool) {
ti->error = "Cannot allocate crypt request mempool";
goto bad;
}
+ cc->per_bio_data_size = ti->per_bio_data_size =
+ ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start +
+ sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
+ ARCH_KMALLOC_MINALIGN);
+
cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
if (!cc->page_pool) {
ti->error = "Cannot allocate page mempool";
@@ -1824,7 +1848,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
- io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+ io = dm_per_bio_data(bio, cc->per_bio_data_size);
+ crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+ io->ctx.req = (struct ablkcipher_request *)(io + 1);
if (bio_data_dir(io->base_bio) == READ) {
if (kcryptd_io_read(io, GFP_NOWAIT))
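
The per-bio data added here packs, in order: struct dm_crypt_io, the ablkcipher request plus its transform context, iv_size_padding, struct dm_crypt_request, and the IV; that is why crypt_map() can point io->ctx.req at (io + 1) and crypt_free_req() only returns a request to the mempool when it is not the inline one. A standalone sketch of the exact-padding branch of the sizing arithmetic, with made-up sizes standing in for crypto_ablkcipher_reqsize() and the cipher alignmask:

	#include <stdio.h>
	#include <stddef.h>

	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

	int main(void)
	{
		size_t dmreq_start  = 64 + 96;	/* sizeof(ablkcipher_request) + reqsize (hypothetical) */
		size_t dm_crypt_req = 56;	/* sizeof(struct dm_crypt_request) (hypothetical) */
		size_t iv_size      = 16;
		size_t alignmask    = 15;	/* cipher wants a 16-byte-aligned IV */

		dmreq_start = ALIGN_UP(dmreq_start, 8);	/* __alignof__(struct dm_crypt_request) */

		/* Pad exactly so the IV, which follows dm_crypt_request, lands aligned. */
		size_t iv_size_padding = -(dmreq_start + dm_crypt_req) & alignmask;

		/* per_bio_data_size additionally prepends sizeof(struct dm_crypt_io)
		 * and rounds up to ARCH_KMALLOC_MINALIGN. */
		printf("request area: %zu bytes\n",
		       dmreq_start + dm_crypt_req + iv_size_padding + iv_size);
		return 0;
	}
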
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index db404a0f7e2c..c09359db3a90 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -33,7 +33,6 @@ struct dm_io_client {
struct io {
unsigned long error_bits;
atomic_t count;
- struct completion *wait;
struct dm_io_client *client;
io_notify_fn callback;
void *context;
@@ -112,28 +111,27 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
* We need an io object to keep track of the number of bios that
* have been dispatched for a particular io.
*---------------------------------------------------------------*/
-static void dec_count(struct io *io, unsigned int region, int error)
+static void complete_io(struct io *io)
{
- if (error)
- set_bit(region, &io->error_bits);
+ unsigned long error_bits = io->error_bits;
+ io_notify_fn fn = io->callback;
+ void *context = io->context;
- if (atomic_dec_and_test(&io->count)) {
- if (io->vma_invalidate_size)
- invalidate_kernel_vmap_range(io->vma_invalidate_address,
- io->vma_invalidate_size);
+ if (io->vma_invalidate_size)
+ invalidate_kernel_vmap_range(io->vma_invalidate_address,
+ io->vma_invalidate_size);
- if (io->wait)
- complete(io->wait);
+ mempool_free(io, io->client->pool);
+ fn(error_bits, context);
+}
- else {
- unsigned long r = io->error_bits;
- io_notify_fn fn = io->callback;
- void *context = io->context;
+static void dec_count(struct io *io, unsigned int region, int error)
+{
+ if (error)
+ set_bit(region, &io->error_bits);
- mempool_free(io, io->client->pool);
- fn(r, context);
- }
- }
+ if (atomic_dec_and_test(&io->count))
+ complete_io(io);
}
static void endio(struct bio *bio, int error)
@@ -376,41 +374,51 @@ static void dispatch_io(int rw, unsigned int num_regions,
dec_count(io, 0, 0);
}
+struct sync_io {
+ unsigned long error_bits;
+ struct completion wait;
+};
+
+static void sync_io_complete(unsigned long error, void *context)
+{
+ struct sync_io *sio = context;
+
+ sio->error_bits = error;
+ complete(&sio->wait);
+}
+
static int sync_io(struct dm_io_client *client, unsigned int num_regions,
struct dm_io_region *where, int rw, struct dpages *dp,
unsigned long *error_bits)
{
- /*
- * gcc <= 4.3 can't do the alignment for stack variables, so we must
- * align it on our own.
- * volatile prevents the optimizer from removing or reusing
- * "io_" field from the stack frame (allowed in ANSI C).
- */
- volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
- struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
- DECLARE_COMPLETION_ONSTACK(wait);
+ struct io *io;
+ struct sync_io sio;
if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
WARN_ON(1);
return -EIO;
}
+ init_completion(&sio.wait);
+
+ io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->wait = &wait;
io->client = client;
+ io->callback = sync_io_complete;
+ io->context = &sio;
io->vma_invalidate_address = dp->vma_invalidate_address;
io->vma_invalidate_size = dp->vma_invalidate_size;
dispatch_io(rw, num_regions, where, dp, io, 1);
- wait_for_completion_io(&wait);
+ wait_for_completion_io(&sio.wait);
if (error_bits)
- *error_bits = io->error_bits;
+ *error_bits = sio.error_bits;
- return io->error_bits ? -EIO : 0;
+ return sio.error_bits ? -EIO : 0;
}
static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -428,7 +436,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->wait = NULL;
io->client = client;
io->callback = fn;
io->context = context;
@@ -481,9 +488,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
* New collapsed (a)synchronous interface.
*
* If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
- * the queue with blk_unplug() some time later or set REQ_SYNC in
-io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
- * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
+ * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw.
+ * If you fail to do one of these, the IO will be submitted to the disk after
+ * q->unplug_delay, which defaults to 3ms in blk-settings.c.
*/
int dm_io(struct dm_io_request *io_req, unsigned num_regions,
struct dm_io_region *where, unsigned long *sync_error_bits)
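
After this rework sync_io() is simply the async path plus an on-stack completion: the struct io always comes from the client's mempool and always finishes through complete_io(), with sync_io_complete() as the callback in the synchronous case. Callers are unaffected; a hedged sketch of a synchronous dm_io() call (read_first_4k() is a hypothetical helper; leaving notify.fn NULL selects the sync path):

	#include <linux/dm-io.h>

	/* Hypothetical helper: synchronously read 4 KiB from sector 0 of bdev
	 * into a kernel buffer via the reworked sync path. */
	static int read_first_4k(struct dm_io_client *client,
				 struct block_device *bdev, void *buf)
	{
		unsigned long error_bits = 0;
		struct dm_io_region where = {
			.bdev	= bdev,
			.sector	= 0,
			.count	= 8,			/* sectors */
		};
		struct dm_io_request io_req = {
			.bi_rw		= READ,
			.mem.type	= DM_IO_KMEM,
			.mem.ptr.addr	= buf,
			.notify.fn	= NULL,		/* synchronous */
			.client		= client,
		};

		return dm_io(&io_req, 1, &where, &error_bits);
	}
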
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f4167b013d99..833d7e752f06 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -373,8 +373,6 @@ static int __must_push_back(struct multipath *m)
dm_noflush_suspending(m->ti)));
}
-#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
-
/*
* Map cloned requests
*/
@@ -402,11 +400,11 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
if (!__must_push_back(m))
r = -EIO; /* Failed */
goto out_unlock;
- }
- if (!pg_ready(m)) {
+ } else if (m->queue_io || m->pg_init_required) {
__pg_init_all_paths(m);
goto out_unlock;
}
+
if (set_mapinfo(m, map_context) < 0)
/* ENOMEM, requeue */
goto out_unlock;
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index 09a688b3d48c..50fca469cafd 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -137,13 +137,23 @@ static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr
*bit *= sctx->region_table_entry_bits;
}
+static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
+{
+ unsigned long region_index;
+ unsigned bit;
+
+ switch_get_position(sctx, region_nr, &region_index, &bit);
+
+ return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+ ((1 << sctx->region_table_entry_bits) - 1);
+}
+
/*
* Find which path to use at given offset.
*/
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
- unsigned long region_index;
- unsigned bit, path_nr;
+ unsigned path_nr;
sector_t p;
p = offset;
@@ -152,9 +162,7 @@ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
else
sector_div(p, sctx->region_size);
- switch_get_position(sctx, p, &region_index, &bit);
- path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
- ((1 << sctx->region_table_entry_bits) - 1);
+ path_nr = switch_region_table_read(sctx, p);
/* This can only happen if the processor uses non-atomic stores. */
if (unlikely(path_nr >= sctx->nr_paths))
@@ -363,7 +371,7 @@ static __always_inline unsigned long parse_hex(const char **string)
}
static int process_set_region_mappings(struct switch_ctx *sctx,
- unsigned argc, char **argv)
+ unsigned argc, char **argv)
{
unsigned i;
unsigned long region_index = 0;
@@ -372,6 +380,51 @@ static int process_set_region_mappings(struct switch_ctx *sctx,
unsigned long path_nr;
const char *string = argv[i];
+ if ((*string & 0xdf) == 'R') {
+ unsigned long cycle_length, num_write;
+
+ string++;
+ if (unlikely(*string == ',')) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+ cycle_length = parse_hex(&string);
+ if (unlikely(*string != ',')) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+ string++;
+ if (unlikely(!*string)) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+ num_write = parse_hex(&string);
+ if (unlikely(*string)) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+
+ if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
+ DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
+ cycle_length - 1, region_index);
+ return -EINVAL;
+ }
+ if (unlikely(region_index + num_write < region_index) ||
+ unlikely(region_index + num_write >= sctx->nr_regions)) {
+ DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
+ region_index, num_write, sctx->nr_regions);
+ return -EINVAL;
+ }
+
+ while (num_write--) {
+ region_index++;
+ path_nr = switch_region_table_read(sctx, region_index - cycle_length);
+ switch_region_table_write(sctx, region_index, path_nr);
+ }
+
+ continue;
+ }
+
if (*string == ':')
region_index++;
else {
@@ -500,7 +553,7 @@ static int switch_iterate_devices(struct dm_target *ti,
static struct target_type switch_target = {
.name = "switch",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = switch_ctr,
.dtr = switch_dtr,
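
The new 'R<cycle_length>,<num_write>' argument to the set_region_mappings message (both values hexadecimal, like the existing arguments) repeats the previous <cycle_length> table entries across the next <num_write> regions, e.g. something like 'dmsetup message <dev> 0 set_region_mappings 0:0 :1 :2 R3,6' for a periodic layout. A standalone sketch of the expansion performed by the while loop above, using a plain array in place of the bit-packed region table:

	#include <stdio.h>

	int main(void)
	{
		/* regions 0..2 already mapped to paths 0,1,2; then "R3,6":
		 * repeat the last 3 entries for the next 6 regions. */
		unsigned table[16] = { 0, 1, 2 };
		unsigned long region_index = 2, cycle_length = 3, num_write = 6;

		while (num_write--) {
			region_index++;
			table[region_index] = table[region_index - cycle_length];
		}

		for (unsigned long i = 0; i <= region_index; i++)
			printf("region %lu -> path %u\n", i, table[i]);
		return 0;
	}
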
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5f59f1e3e5b1..f9c6cb8dbcf8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1386,6 +1386,14 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
return q && !blk_queue_add_random(q);
}
+static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
+}
+
static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
@@ -1430,6 +1438,43 @@ static bool dm_table_supports_write_same(struct dm_table *t)
return true;
}
+static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && blk_queue_discard(q);
+}
+
+static bool dm_table_supports_discards(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ /*
+ * Unless any target used by the table set discards_supported,
+ * require at least one underlying device to support discards.
+ * t->devices includes internal dm devices such as mirror logs
+ * so we need to use iterate_devices here, which targets
+ * supporting discard selectively must provide.
+ */
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (!ti->num_discard_bios)
+ continue;
+
+ if (ti->discards_supported)
+ return 1;
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, device_discard_capable, NULL))
+ return 1;
+ }
+
+ return 0;
+}
+
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
@@ -1464,6 +1509,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (!dm_table_supports_write_same(t))
q->limits.max_write_same_sectors = 0;
+ if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
+ queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+
dm_table_set_integrity(t);
/*
@@ -1636,39 +1686,3 @@ void dm_table_run_md_queue_async(struct dm_table *t)
}
EXPORT_SYMBOL(dm_table_run_md_queue_async);
-static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
-{
- struct request_queue *q = bdev_get_queue(dev->bdev);
-
- return q && blk_queue_discard(q);
-}
-
-bool dm_table_supports_discards(struct dm_table *t)
-{
- struct dm_target *ti;
- unsigned i = 0;
-
- /*
- * Unless any target used by the table set discards_supported,
- * require at least one underlying device to support discards.
- * t->devices includes internal dm devices such as mirror logs
- * so we need to use iterate_devices here, which targets
- * supporting discard selectively must provide.
- */
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
-
- if (!ti->num_discard_bios)
- continue;
-
- if (ti->discards_supported)
- return 1;
-
- if (ti->type->iterate_devices &&
- ti->type->iterate_devices(ti, device_discard_capable, NULL))
- return 1;
- }
-
- return 0;
-}
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index fc9c848a60c9..4843801173fe 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -227,6 +227,7 @@ struct thin_c {
struct list_head list;
struct dm_dev *pool_dev;
struct dm_dev *origin_dev;
+ sector_t origin_size;
dm_thin_id dev_id;
struct pool *pool;
@@ -554,11 +555,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
struct dm_thin_new_mapping {
struct list_head list;
- bool quiesced:1;
- bool prepared:1;
bool pass_discard:1;
bool definitely_not_shared:1;
+ /*
+ * Track quiescing, copying and zeroing preparation actions. When this
+ * counter hits zero the block is prepared and can be inserted into the
+ * btree.
+ */
+ atomic_t prepare_actions;
+
int err;
struct thin_c *tc;
dm_block_t virt_block;
@@ -575,43 +581,41 @@ struct dm_thin_new_mapping {
bio_end_io_t *saved_bi_end_io;
};
-static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
+static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
struct pool *pool = m->tc->pool;
- if (m->quiesced && m->prepared) {
+ if (atomic_dec_and_test(&m->prepare_actions)) {
list_add_tail(&m->list, &pool->prepared_mappings);
wake_worker(pool);
}
}
-static void copy_complete(int read_err, unsigned long write_err, void *context)
+static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
unsigned long flags;
- struct dm_thin_new_mapping *m = context;
struct pool *pool = m->tc->pool;
- m->err = read_err || write_err ? -EIO : 0;
-
spin_lock_irqsave(&pool->lock, flags);
- m->prepared = true;
- __maybe_add_mapping(m);
+ __complete_mapping_preparation(m);
spin_unlock_irqrestore(&pool->lock, flags);
}
+static void copy_complete(int read_err, unsigned long write_err, void *context)
+{
+ struct dm_thin_new_mapping *m = context;
+
+ m->err = read_err || write_err ? -EIO : 0;
+ complete_mapping_preparation(m);
+}
+
static void overwrite_endio(struct bio *bio, int err)
{
- unsigned long flags;
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct dm_thin_new_mapping *m = h->overwrite_mapping;
- struct pool *pool = m->tc->pool;
m->err = err;
-
- spin_lock_irqsave(&pool->lock, flags);
- m->prepared = true;
- __maybe_add_mapping(m);
- spin_unlock_irqrestore(&pool->lock, flags);
+ complete_mapping_preparation(m);
}
/*----------------------------------------------------------------*/
@@ -821,10 +825,31 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
return m;
}
+static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
+ sector_t begin, sector_t end)
+{
+ int r;
+ struct dm_io_region to;
+
+ to.bdev = tc->pool_dev->bdev;
+ to.sector = begin;
+ to.count = end - begin;
+
+ r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
+ if (r < 0) {
+ DMERR_LIMIT("dm_kcopyd_zero() failed");
+ copy_complete(1, 1, m);
+ }
+}
+
+/*
+ * A partial copy also needs to zero the uncopied region.
+ */
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct dm_dev *origin, dm_block_t data_origin,
dm_block_t data_dest,
- struct dm_bio_prison_cell *cell, struct bio *bio)
+ struct dm_bio_prison_cell *cell, struct bio *bio,
+ sector_t len)
{
int r;
struct pool *pool = tc->pool;
@@ -835,8 +860,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
m->data_block = data_dest;
m->cell = cell;
+ /*
+ * quiesce action + copy action + an extra reference held for the
+ * duration of this function (we may need to inc later for a
+ * partial zero).
+ */
+ atomic_set(&m->prepare_actions, 3);
+
if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
- m->quiesced = true;
+ complete_mapping_preparation(m); /* already quiesced */
/*
* IO to pool_dev remaps to the pool target's data_dev.
@@ -857,20 +889,38 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
from.bdev = origin->bdev;
from.sector = data_origin * pool->sectors_per_block;
- from.count = pool->sectors_per_block;
+ from.count = len;
to.bdev = tc->pool_dev->bdev;
to.sector = data_dest * pool->sectors_per_block;
- to.count = pool->sectors_per_block;
+ to.count = len;
r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
0, copy_complete, m);
if (r < 0) {
- mempool_free(m, pool->mapping_pool);
DMERR_LIMIT("dm_kcopyd_copy() failed");
- cell_error(pool, cell);
+ copy_complete(1, 1, m);
+
+ /*
+ * We allow the zero to be issued, to simplify the
+ * error path. Otherwise we'd need to start
+ * worrying about decrementing the prepare_actions
+ * counter.
+ */
+ }
+
+ /*
+ * Do we need to zero a tail region?
+ */
+ if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
+ atomic_inc(&m->prepare_actions);
+ ll_zero(tc, m,
+ data_dest * pool->sectors_per_block + len,
+ (data_dest + 1) * pool->sectors_per_block);
}
}
+
+ complete_mapping_preparation(m); /* drop our ref */
}
static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -878,15 +928,8 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
struct dm_bio_prison_cell *cell, struct bio *bio)
{
schedule_copy(tc, virt_block, tc->pool_dev,
- data_origin, data_dest, cell, bio);
-}
-
-static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
- dm_block_t data_dest,
- struct dm_bio_prison_cell *cell, struct bio *bio)
-{
- schedule_copy(tc, virt_block, tc->origin_dev,
- virt_block, data_dest, cell, bio);
+ data_origin, data_dest, cell, bio,
+ tc->pool->sectors_per_block);
}
static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
@@ -896,8 +939,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
struct pool *pool = tc->pool;
struct dm_thin_new_mapping *m = get_next_mapping(pool);
- m->quiesced = true;
- m->prepared = false;
+ atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
m->tc = tc;
m->virt_block = virt_block;
m->data_block = data_block;
@@ -919,21 +961,33 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
inc_all_io_entry(pool, bio);
remap_and_issue(tc, bio, data_block);
- } else {
- int r;
- struct dm_io_region to;
- to.bdev = tc->pool_dev->bdev;
- to.sector = data_block * pool->sectors_per_block;
- to.count = pool->sectors_per_block;
+ } else
+ ll_zero(tc, m,
+ data_block * pool->sectors_per_block,
+ (data_block + 1) * pool->sectors_per_block);
+}
- r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
- if (r < 0) {
- mempool_free(m, pool->mapping_pool);
- DMERR_LIMIT("dm_kcopyd_zero() failed");
- cell_error(pool, cell);
- }
- }
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_dest,
+ struct dm_bio_prison_cell *cell, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ sector_t virt_block_begin = virt_block * pool->sectors_per_block;
+ sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
+
+ if (virt_block_end <= tc->origin_size)
+ schedule_copy(tc, virt_block, tc->origin_dev,
+ virt_block, data_dest, cell, bio,
+ pool->sectors_per_block);
+
+ else if (virt_block_begin < tc->origin_size)
+ schedule_copy(tc, virt_block, tc->origin_dev,
+ virt_block, data_dest, cell, bio,
+ tc->origin_size - virt_block_begin);
+
+ else
+ schedule_zero(tc, virt_block, data_dest, cell, bio);
}
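The three-way decision above hinges on where the virtual block falls relative to the external origin's size: wholly inside it (full copy), straddling the end (partial copy with the tail zeroed), or wholly beyond it (zero only). A standalone sketch of that arithmetic with made-up sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t sectors_per_block = 128;	/* invented pool block size */
	uint64_t origin_size = 1000;		/* origin ends partway into block 7 */

	for (uint64_t block = 6; block <= 8; block++) {
		uint64_t begin = block * sectors_per_block;
		uint64_t end = begin + sectors_per_block;

		if (end <= origin_size)
			printf("block %llu: full copy\n",
			       (unsigned long long)block);
		else if (begin < origin_size)
			printf("block %llu: copy %llu sectors, zero the tail\n",
			       (unsigned long long)block,
			       (unsigned long long)(origin_size - begin));
		else
			printf("block %llu: zero only\n",
			       (unsigned long long)block);
	}
	return 0;
}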
/*
@@ -1315,7 +1369,18 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
inc_all_io_entry(pool, bio);
cell_defer_no_holder(tc, cell);
- remap_to_origin_and_issue(tc, bio);
+ if (bio_end_sector(bio) <= tc->origin_size)
+ remap_to_origin_and_issue(tc, bio);
+
+ else if (bio->bi_iter.bi_sector < tc->origin_size) {
+ zero_fill_bio(bio);
+ bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
+ remap_to_origin_and_issue(tc, bio);
+
+ } else {
+ zero_fill_bio(bio);
+ bio_endio(bio, 0);
+ }
} else
provision_block(tc, bio, block, cell);
break;
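A read that straddles the end of the origin is first zero-filled, then shrunk so that only the sectors below origin_size are reissued to the origin; everything past the boundary stays zeroed. A hedged sketch of just the size arithmetic (SECTOR_SHIFT is the usual 9 for 512-byte sectors, the other values are invented):

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	uint64_t origin_size = 1000;		/* sectors */
	uint64_t bi_sector = 992;		/* read starts 8 sectors before the end */
	uint64_t bi_size = 16 << SECTOR_SHIFT;	/* a 16-sector read, in bytes */

	if (bi_sector < origin_size &&
	    bi_sector + (bi_size >> SECTOR_SHIFT) > origin_size)
		bi_size = (origin_size - bi_sector) << SECTOR_SHIFT;

	printf("reissue %llu bytes (%llu sectors) to the origin, rest stays zeroed\n",
	       (unsigned long long)bi_size,
	       (unsigned long long)(bi_size >> SECTOR_SHIFT));
	return 0;
}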
@@ -3112,7 +3177,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
*/
if (io_opt_sectors < pool->sectors_per_block ||
do_div(io_opt_sectors, pool->sectors_per_block)) {
- blk_limits_io_min(limits, 0);
+ blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
}
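Previously a mismatch here advertised io_min = 0; now both io_min and io_opt fall back to the pool's block size in bytes whenever the stacked optimal size is smaller than, or not a multiple of, the block size. A quick sketch of that check with invented values (plain % stands in for the kernel's do_div):

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	uint64_t io_opt_sectors = 192;		/* hint from the data device */
	uint64_t sectors_per_block = 128;	/* thin-pool block size */

	if (io_opt_sectors < sectors_per_block ||
	    io_opt_sectors % sectors_per_block)
		printf("io_min = io_opt = %llu bytes\n",
		       (unsigned long long)(sectors_per_block << SECTOR_SHIFT));
	else
		printf("keep the stacked hints\n");
	return 0;
}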
@@ -3141,7 +3206,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 12, 0},
+ .version = {1, 13, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -3361,8 +3426,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry_safe(m, tmp, &work, list) {
list_del(&m->list);
- m->quiesced = true;
- __maybe_add_mapping(m);
+ __complete_mapping_preparation(m);
}
spin_unlock_irqrestore(&pool->lock, flags);
}
@@ -3401,6 +3465,16 @@ static void thin_postsuspend(struct dm_target *ti)
noflush_work(tc, do_noflush_stop);
}
+static int thin_preresume(struct dm_target *ti)
+{
+ struct thin_c *tc = ti->private;
+
+ if (tc->origin_dev)
+ tc->origin_size = get_dev_size(tc->origin_dev->bdev);
+
+ return 0;
+}
+
/*
* <nr mapped sectors> <highest mapped sector>
*/
@@ -3483,12 +3557,13 @@ static int thin_iterate_devices(struct dm_target *ti,
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 12, 0},
+ .version = {1, 13, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
.map = thin_map,
.end_io = thin_endio,
+ .preresume = thin_preresume,
.presuspend = thin_presuspend,
.postsuspend = thin_postsuspend,
.status = thin_status,
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index ed76126aac54..e81d2152fa68 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -72,7 +72,6 @@ int dm_table_any_busy_target(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
-bool dm_table_supports_discards(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 32fc19c540d4..1294238610df 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5961,7 +5961,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
int err = 0;
if (mddev->pers) {
- if (!mddev->pers->quiesce)
+ if (!mddev->pers->quiesce || !mddev->thread)
return -EBUSY;
if (mddev->recovery || mddev->sync_thread)
return -EBUSY;
@@ -6263,7 +6263,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
rv = update_raid_disks(mddev, info->raid_disks);
if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
- if (mddev->pers->quiesce == NULL)
+ if (mddev->pers->quiesce == NULL || mddev->thread == NULL)
return -EINVAL;
if (mddev->recovery || mddev->sync_thread)
return -EBUSY;
@@ -7376,7 +7376,7 @@ void md_do_sync(struct md_thread *thread)
struct mddev *mddev2;
unsigned int currspeed = 0,
window;
- sector_t max_sectors,j, io_sectors;
+ sector_t max_sectors,j, io_sectors, recovery_done;
unsigned long mark[SYNC_MARKS];
unsigned long update_time;
sector_t mark_cnt[SYNC_MARKS];
@@ -7652,7 +7652,8 @@ void md_do_sync(struct md_thread *thread)
*/
cond_resched();
- currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
+ recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
+ currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
/((jiffies-mddev->resync_mark)/HZ +1) +1;
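The estimate now subtracts the sectors still in flight (recovery_active) before comparing against the last mark, so queued but incomplete I/O no longer inflates the rate. A worked example of the same arithmetic with invented numbers (sectors are 512 bytes, hence the /2 to convert to KiB per second):

#include <stdio.h>

int main(void)
{
	unsigned long io_sectors = 2000000;	/* sectors submitted so far */
	unsigned long recovery_active = 40000;	/* sectors still in flight */
	unsigned long resync_mark_cnt = 1200000;/* sectors done at the last mark */
	unsigned long elapsed_sec = 8;		/* (jiffies - resync_mark) / HZ */

	unsigned long recovery_done = io_sectors - recovery_active;
	unsigned long currspeed = (recovery_done - resync_mark_cnt) / 2
				  / (elapsed_sec + 1) + 1;

	printf("current resync speed: %lu KiB/s\n", currspeed);	/* 42223 */
	return 0;
}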
if (currspeed > speed_min(mddev)) {
@@ -8592,7 +8593,7 @@ static int __init md_init(void)
goto err_mdp;
mdp_major = ret;
- blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
+ blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
md_probe, NULL, NULL);
blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
md_probe, NULL, NULL);
@@ -8687,7 +8688,7 @@ static __exit void md_exit(void)
struct list_head *tmp;
int delay = 1;
- blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
+ blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
unregister_blkdev(MD_MAJOR,"md");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 407a99e46f69..cf91f5910c7c 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -685,6 +685,12 @@ static void *raid0_takeover(struct mddev *mddev)
* raid10 - assuming we have all necessary active disks
* raid1 - with (N -1) mirror drives faulty
*/
+
+ if (mddev->bitmap) {
+ printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n",
+ mdname(mddev));
+ return ERR_PTR(-EBUSY);
+ }
if (mddev->level == 4)
return raid0_takeover_raid45(mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 56e24c072b62..55de4f6f7eaf 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -540,11 +540,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
has_nonrot_disk = 0;
choose_next_idle = 0;
- if (conf->mddev->recovery_cp < MaxSector &&
- (this_sector + sectors >= conf->next_resync))
- choose_first = 1;
- else
- choose_first = 0;
+ choose_first = (conf->mddev->recovery_cp < this_sector + sectors);
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
sector_t dist;
@@ -831,7 +827,7 @@ static void flush_pending_writes(struct r1conf *conf)
 * there is no normal IO happening. It must arrange to call
* lower_barrier when the particular background IO completes.
*/
-static void raise_barrier(struct r1conf *conf)
+static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
{
spin_lock_irq(&conf->resync_lock);
@@ -841,6 +837,7 @@ static void raise_barrier(struct r1conf *conf)
/* block any new IO from starting */
conf->barrier++;
+ conf->next_resync = sector_nr;
/* For these conditions we must wait:
* A: while the array is in frozen state
@@ -849,14 +846,17 @@ static void raise_barrier(struct r1conf *conf)
* C: next_resync + RESYNC_SECTORS > start_next_window, meaning
 * next resync will reach the window that normal bios are
* handling.
+ * D: while there are any active requests in the current window.
*/
wait_event_lock_irq(conf->wait_barrier,
!conf->array_frozen &&
conf->barrier < RESYNC_DEPTH &&
+ conf->current_window_requests == 0 &&
(conf->start_next_window >=
conf->next_resync + RESYNC_SECTORS),
conf->resync_lock);
+ conf->nr_pending++;
spin_unlock_irq(&conf->resync_lock);
}
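The added condition is the "D" case in the comment: resync must also wait until no normal writes remain in the current window. Restating the full wait predicate as a standalone check (field names mirror r1conf, but the constants below are placeholders rather than the driver's values):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RESYNC_DEPTH	32	/* placeholder */
#define RESYNC_SECTORS	256	/* placeholder */

struct barrier_state {
	bool array_frozen;		/* array frozen for error handling */
	int barrier;			/* resync requests already in flight */
	int current_window_requests;	/* normal writes in the current window */
	uint64_t start_next_window;	/* where normal I/O moves to next */
	uint64_t next_resync;
};

static bool resync_may_proceed(const struct barrier_state *c)
{
	return !c->array_frozen &&
	       c->barrier < RESYNC_DEPTH &&
	       c->current_window_requests == 0 &&
	       c->start_next_window >= c->next_resync + RESYNC_SECTORS;
}

int main(void)
{
	struct barrier_state c = {
		.barrier = 1,
		.start_next_window = 4096,
		.next_resync = 1024,
	};

	printf("resync may proceed: %s\n",
	       resync_may_proceed(&c) ? "yes" : "no");
	return 0;
}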
@@ -866,6 +866,7 @@ static void lower_barrier(struct r1conf *conf)
BUG_ON(conf->barrier <= 0);
spin_lock_irqsave(&conf->resync_lock, flags);
conf->barrier--;
+ conf->nr_pending--;
spin_unlock_irqrestore(&conf->resync_lock, flags);
wake_up(&conf->wait_barrier);
}
@@ -877,12 +878,10 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
if (conf->array_frozen || !bio)
wait = true;
else if (conf->barrier && bio_data_dir(bio) == WRITE) {
- if (conf->next_resync < RESYNC_WINDOW_SECTORS)
- wait = true;
- else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
- >= bio_end_sector(bio)) ||
- (conf->next_resync + NEXT_NORMALIO_DISTANCE
- <= bio->bi_iter.bi_sector))
+ if ((conf->mddev->curr_resync_completed
+ >= bio_end_sector(bio)) ||
+ (conf->next_resync + NEXT_NORMALIO_DISTANCE
+ <= bio->bi_iter.bi_sector))
wait = false;
else
wait = true;
@@ -919,8 +918,8 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
}
if (bio && bio_data_dir(bio) == WRITE) {
- if (conf->next_resync + NEXT_NORMALIO_DISTANCE
- <= bio->bi_iter.bi_sector) {
+ if (bio->bi_iter.bi_sector >=
+ conf->mddev->curr_resync_completed) {
if (conf->start_next_window == MaxSector)
conf->start_next_window =
conf->next_resync +
@@ -1186,6 +1185,7 @@ read_again:
atomic_read(&bitmap->behind_writes) == 0);
}
r1_bio->read_disk = rdisk;
+ r1_bio->start_next_window = 0;
read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
@@ -1501,12 +1501,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
mddev->degraded++;
set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * if recovery is running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
} else
set_bit(Faulty, &rdev->flags);
+ /*
+ * if recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
@@ -1548,8 +1548,13 @@ static void close_sync(struct r1conf *conf)
mempool_destroy(conf->r1buf_pool);
conf->r1buf_pool = NULL;
+ spin_lock_irq(&conf->resync_lock);
conf->next_resync = 0;
conf->start_next_window = MaxSector;
+ conf->current_window_requests +=
+ conf->next_window_requests;
+ conf->next_window_requests = 0;
+ spin_unlock_irq(&conf->resync_lock);
}
static int raid1_spare_active(struct mddev *mddev)
@@ -2150,7 +2155,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags))
+ !test_bit(Faulty, &rdev->flags))
r1_sync_page_io(rdev, sect, s,
conf->tmppage, WRITE);
}
@@ -2162,7 +2167,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
+ !test_bit(Faulty, &rdev->flags)) {
if (r1_sync_page_io(rdev, sect, s,
conf->tmppage, READ)) {
atomic_add(s, &rdev->corrected_errors);
@@ -2541,9 +2546,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
- raise_barrier(conf);
- conf->next_resync = sector_nr;
+ raise_barrier(conf, sector_nr);
rcu_read_lock();
/*
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index cb882aae9e20..6703751d87d7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1684,13 +1684,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
spin_unlock_irqrestore(&conf->device_lock, flags);
return;
}
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
- /*
- * if recovery is running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- }
+ /*
+ * If recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -2954,6 +2953,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
*/
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
+ close_sync(conf);
return 0;
}
@@ -3082,6 +3082,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
}
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ r10_bio->state = 0;
raise_barrier(conf, rb2 != NULL);
atomic_set(&r10_bio->remaining, 0);
@@ -3270,6 +3271,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (sync_blocks < max_sync)
max_sync = sync_blocks;
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ r10_bio->state = 0;
r10_bio->mddev = mddev;
atomic_set(&r10_bio->remaining, 0);
@@ -4385,6 +4387,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
read_more:
/* Now schedule reads for blocks from sector_nr to last */
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ r10_bio->state = 0;
raise_barrier(conf, sectors_done != 0);
atomic_set(&r10_bio->remaining, 0);
r10_bio->mddev = mddev;
@@ -4399,6 +4402,7 @@ read_more:
* on all the target devices.
*/
// FIXME
+ mempool_free(r10_bio, conf->r10buf_pool);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return sectors_done;
}
@@ -4411,7 +4415,7 @@ read_more:
read_bio->bi_private = r10_bio;
read_bio->bi_end_io = end_sync_read;
read_bio->bi_rw = READ;
- read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
read_bio->bi_flags |= 1 << BIO_UPTODATE;
read_bio->bi_vcnt = 0;
read_bio->bi_iter.bi_size = 0;
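The replacement mask keeps every flag bit at or above the reset boundary and clears only the per-bio state bits below it (BIO_UPTODATE is then re-set explicitly). A tiny standalone demo of the same bit arithmetic; the bit position is a placeholder, not the real BIO_RESET_BITS:

#include <stdio.h>

int main(void)
{
	unsigned long flags = 0xdeadbeefUL;	/* arbitrary starting flags */
	unsigned int reset_bits = 13;		/* placeholder bit position */

	flags &= (~0UL << reset_bits);		/* clear bits 0..reset_bits-1 */
	printf("flags after reset: %#lx\n", flags);	/* 0xdeada000 */
	return 0;
}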
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6234b2e84587..9f0fbecd1eb5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -64,6 +64,10 @@
#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE
+static bool devices_handle_discard_safely = false;
+module_param(devices_handle_discard_safely, bool, 0644);
+MODULE_PARM_DESC(devices_handle_discard_safely,
+ "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
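Because the parameter is declared with mode 0644 it should also appear as a writable file under /sys/module/raid456/parameters/ once the module is loaded (the path below follows the standard module_param layout and is assumed, not taken from this patch). A minimal user-space check of its current value:

#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/module/raid456/parameters/devices_handle_discard_safely";
	char val[8] = "";
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("open (is raid456 loaded?)");
		return 1;
	}
	if (fgets(val, sizeof(val), f))
		printf("devices_handle_discard_safely = %s", val);
	fclose(f);
	return 0;
}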
static struct workqueue_struct *raid5_wq;
/*
* Stripe cache
@@ -2922,7 +2926,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
(!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
!test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
(sh->raid_conf->level == 6 && s->failed && s->to_write &&
- s->to_write < sh->raid_conf->raid_disks - 2 &&
+ s->to_write - s->non_overwrite < sh->raid_conf->raid_disks - 2 &&
(!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
@@ -3817,6 +3821,8 @@ static void handle_stripe(struct stripe_head *sh)
set_bit(R5_Wantwrite, &dev->flags);
if (prexor)
continue;
+ if (s.failed > 1)
+ continue;
if (!test_bit(R5_Insync, &dev->flags) ||
((i == sh->pd_idx || i == sh->qd_idx) &&
s.failed == 0))
@@ -6206,7 +6212,7 @@ static int run(struct mddev *mddev)
mddev->queue->limits.discard_granularity = stripe;
/*
* unaligned part of discard request will be ignored, so can't
- * guarantee discard_zerors_data
+ * guarantee discard_zeroes_data
*/
mddev->queue->limits.discard_zeroes_data = 0;
@@ -6231,6 +6237,18 @@ static int run(struct mddev *mddev)
!bdev_get_queue(rdev->bdev)->
limits.discard_zeroes_data)
discard_supported = false;
+ /* Unfortunately, discard_zeroes_data is not currently
+ * a guarantee - just a hint. So we only allow DISCARD
+ * if the sysadmin has confirmed that only safe devices
+ * are in use by setting a module parameter.
+ */
+ if (!devices_handle_discard_safely) {
+ if (discard_supported) {
+ pr_info("md/raid456: discard support disabled due to uncertainty.\n");
+ pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
+ }
+ discard_supported = false;
+ }
}
if (discard_supported &&