61 files changed, 5665 insertions, 616 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 906103c168ea..4a249ee86364 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -521,6 +521,23 @@ config DM_INTEGRITY
 	  To compile this code as a module, choose M here: the module will
 	  be called dm-integrity.
 
+config DM_ZONED
+	tristate "Drive-managed zoned block device target support"
+	depends on BLK_DEV_DM
+	depends on BLK_DEV_ZONED
+	---help---
+	  This device-mapper target takes a host-managed or host-aware zoned
+	  block device and exposes most of its capacity as a regular block
+	  device (drive-managed zoned block device) without any write
+	  constraints. This is mainly intended for use with file systems that
+	  do not natively support zoned block devices but still want to
+	  benefit from the increased capacity offered by SMR disks. Other uses
+	  by applications using raw block devices (for example object stores)
+	  are also possible.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-zoned.
+
 	  If unsure, say N.
 
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 913720bd81c1..786ec9e86d65 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -20,6 +20,7 @@ dm-era-y	+= dm-era-target.o
 dm-verity-y	+= dm-verity-target.o
 md-mod-y	+= md.o bitmap.o
 raid456-y	+= raid5.o raid5-cache.o raid5-ppl.o
+dm-zoned-y	+= dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise 
@@ -60,6 +61,7 @@ obj-$(CONFIG_DM_CACHE_SMQ)	+= dm-cache-smq.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
 obj-$(CONFIG_DM_INTEGRITY)	+= dm-integrity.o
+obj-$(CONFIG_DM_ZONED)		+= dm-zoned.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index c3ea03c9a1a8..dee542fff68e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -849,10 +849,11 @@ static inline void wake_up_allocators(struct cache_set *c)
 
 /* Forward declarations */
 
-void bch_count_io_errors(struct cache *, int, const char *);
+void bch_count_io_errors(struct cache *, blk_status_t, const char *);
 void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
-			      int, const char *);
-void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
+			      blk_status_t, const char *);
+void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t,
+		const char *);
 void bch_bbio_free(struct bio *, struct cache_set *);
 struct bio *bch_bbio_alloc(struct cache_set *);
 
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 450d0e848ae4..866dcf78ff8e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -307,7 +307,7 @@ static void bch_btree_node_read(struct btree *b)
 	bch_submit_bbio(bio, b->c, &b->key, 0);
 	closure_sync(&cl);
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		set_btree_node_io_error(b);
 
 	bch_bbio_free(bio, b->c);
@@ -374,10 +374,10 @@ static void btree_node_write_endio(struct bio *bio)
 	struct closure *cl = bio->bi_private;
 	struct btree *b = container_of(cl, struct btree, io);
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		set_btree_node_io_error(b);
 
-	bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree");
+	bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree");
 	closure_put(cl);
 }
 
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 9b80417cd547..73da1f5626cb 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -207,7 +207,7 @@ void bkey_put(struct cache_set *c, struct bkey *k);
 
 struct btree_op {
 	/* for waiting on btree reserve in btree_split() */
-	wait_queue_t		wait;
+	wait_queue_entry_t		wait;
 
 	/* Btree level at which we start taking write locks */
 	short			lock;
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 06f55056aaae..35a5a7210e51 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -110,7 +110,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 	struct bio_vec bv, cbv;
 	struct bvec_iter iter, citer = { 0 };
 
-	check = bio_clone(bio, GFP_NOIO);
+	check = bio_clone_kmalloc(bio, GFP_NOIO);
 	if (!check)
 		return;
 	check->bi_opf = REQ_OP_READ;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index db45a88c0ce9..6a9b85095e7b 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -50,7 +50,7 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
 
 /* IO errors */
 
-void bch_count_io_errors(struct cache *ca, int error, const char *m)
+void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
 {
 	/*
 	 * The halflife of an error is:
@@ -103,7 +103,7 @@ void bch_count_io_errors(struct cache *ca, int error, const char *m)
 }
 
 void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
-			      int error, const char *m)
+			      blk_status_t error, const char *m)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
 	struct cache *ca = PTR_CACHE(c, &b->key, 0);
@@ -132,7 +132,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
 }
 
 void bch_bbio_endio(struct cache_set *c, struct bio *bio,
-		    int error, const char *m)
+		    blk_status_t error, const char *m)
 {
 	struct closure *cl = bio->bi_private;
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 1198e53d5670..0352d05e495c 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -549,7 +549,7 @@ static void journal_write_endio(struct bio *bio)
 {
 	struct journal_write *w = bio->bi_private;
 
-	cache_set_err_on(bio->bi_error, w->c, "journal io error");
+	cache_set_err_on(bio->bi_status, w->c, "journal io error");
 	closure_put(&w->c->journal.io);
 }
 
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 13b8a907006d..f633b30c962e 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -63,14 +63,14 @@ static void read_moving_endio(struct bio *bio)
 	struct moving_io *io = container_of(bio->bi_private,
 					    struct moving_io, cl);
 
-	if (bio->bi_error)
-		io->op.error = bio->bi_error;
+	if (bio->bi_status)
+		io->op.status = bio->bi_status;
 	else if (!KEY_DIRTY(&b->key) &&
 		 ptr_stale(io->op.c, &b->key, 0)) {
-		io->op.error = -EINTR;
+		io->op.status = BLK_STS_IOERR;
 	}
 
-	bch_bbio_endio(io->op.c, bio, bio->bi_error, "reading data to move");
+	bch_bbio_endio(io->op.c, bio, bio->bi_status, "reading data to move");
 }
 
 static void moving_init(struct moving_io *io)
@@ -92,7 +92,7 @@ static void write_moving(struct closure *cl)
 	struct moving_io *io = container_of(cl, struct moving_io, cl);
 	struct data_insert_op *op = &io->op;
 
-	if (!op->error) {
+	if (!op->status) {
 		moving_init(io);
 
 		io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 709c9cc34369..019b3df9f1c6 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -81,7 +81,7 @@ static void bch_data_insert_keys(struct closure *cl)
 	if (ret == -ESRCH) {
 		op->replace_collision = true;
 	} else if (ret) {
-		op->error		= -ENOMEM;
+		op->status		= BLK_STS_RESOURCE;
 		op->insert_data_done	= true;
 	}
 
@@ -178,17 +178,17 @@ static void bch_data_insert_endio(struct bio *bio)
 	struct closure *cl = bio->bi_private;
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		/* TODO: We could try to recover from this. */
 		if (op->writeback)
-			op->error = bio->bi_error;
+			op->status = bio->bi_status;
 		else if (!op->replace)
 			set_closure_fn(cl, bch_data_insert_error, op->wq);
 		else
 			set_closure_fn(cl, NULL, NULL);
 	}
 
-	bch_bbio_endio(op->c, bio, bio->bi_error, "writing data to cache");
+	bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
 }
 
 static void bch_data_insert_start(struct closure *cl)
@@ -488,15 +488,15 @@ static void bch_cache_read_endio(struct bio *bio)
 	 * from the backing device.
 	 */
 
-	if (bio->bi_error)
-		s->iop.error = bio->bi_error;
+	if (bio->bi_status)
+		s->iop.status = bio->bi_status;
 	else if (!KEY_DIRTY(&b->key) &&
 		 ptr_stale(s->iop.c, &b->key, 0)) {
 		atomic_long_inc(&s->iop.c->cache_read_races);
-		s->iop.error = -EINTR;
+		s->iop.status = BLK_STS_IOERR;
 	}
 
-	bch_bbio_endio(s->iop.c, bio, bio->bi_error, "reading from cache");
+	bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache");
 }
 
 /*
@@ -593,9 +593,9 @@ static void request_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		struct search *s = container_of(cl, struct search, cl);
-		s->iop.error = bio->bi_error;
+		s->iop.status = bio->bi_status;
 		/* Only cache read errors are recoverable */
 		s->recoverable = false;
 	}
@@ -611,7 +611,7 @@ static void bio_complete(struct search *s)
 				    &s->d->disk->part0, s->start_time);
 
 		trace_bcache_request_end(s->d, s->orig_bio);
-		s->orig_bio->bi_error = s->iop.error;
+		s->orig_bio->bi_status = s->iop.status;
 		bio_endio(s->orig_bio);
 		s->orig_bio = NULL;
 	}
@@ -664,7 +664,7 @@ static inline struct search *search_alloc(struct bio *bio,
 	s->iop.inode		= d->id;
 	s->iop.write_point	= hash_long((unsigned long) current, 16);
 	s->iop.write_prio	= 0;
-	s->iop.error		= 0;
+	s->iop.status		= 0;
 	s->iop.flags		= 0;
 	s->iop.flush_journal	= op_is_flush(bio->bi_opf);
 	s->iop.wq		= bcache_wq;
@@ -707,7 +707,7 @@ static void cached_dev_read_error(struct closure *cl)
 		/* Retry from the backing device: */
 		trace_bcache_read_retry(s->orig_bio);
 
-		s->iop.error = 0;
+		s->iop.status = 0;
 		do_bio_hook(s, s->orig_bio);
 
 		/* XXX: invalidate cache */
@@ -767,7 +767,7 @@ static void cached_dev_read_done_bh(struct closure *cl)
 				  !s->cache_miss, s->iop.bypass);
 	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
 
-	if (s->iop.error)
+	if (s->iop.status)
 		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
 	else if (s->iop.bio || verify(dc, &s->bio.bio))
 		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 1ff36875c2b3..7689176951ce 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -10,7 +10,7 @@ struct data_insert_op {
 	unsigned		inode;
 	uint16_t		write_point;
 	uint16_t		write_prio;
-	short			error;
+	blk_status_t		status;
 
 	union {
 		uint16_t	flags;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e57353e39168..8352fad765f6 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -271,7 +271,7 @@ static void write_super_endio(struct bio *bio)
 {
 	struct cache *ca = bio->bi_private;
 
-	bch_count_io_errors(ca, bio->bi_error, "writing superblock");
+	bch_count_io_errors(ca, bio->bi_status, "writing superblock");
 	closure_put(&ca->set->sb_write);
 }
 
@@ -321,7 +321,7 @@ static void uuid_endio(struct bio *bio)
 	struct closure *cl = bio->bi_private;
 	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
 
-	cache_set_err_on(bio->bi_error, c, "accessing uuids");
+	cache_set_err_on(bio->bi_status, c, "accessing uuids");
 	bch_bbio_free(bio, c);
 	closure_put(cl);
 }
@@ -494,7 +494,7 @@ static void prio_endio(struct bio *bio)
 {
 	struct cache *ca = bio->bi_private;
 
-	cache_set_err_on(bio->bi_error, ca->set, "accessing priorities");
+	cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
 	bch_bbio_free(bio, ca->set);
 	closure_put(&ca->prio);
 }
@@ -782,7 +782,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 
 	minor *= BCACHE_MINORS;
 
-	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
+					   BIOSET_NEED_BVECS |
+					   BIOSET_NEED_RESCUER)) ||
 	    !(d->disk = alloc_disk(BCACHE_MINORS))) {
 		ida_simple_remove(&bcache_minor, minor);
 		return -ENOMEM;
@@ -1516,7 +1518,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 				sizeof(struct bbio) + sizeof(struct bio_vec) *
 				bucket_pages(c))) ||
 	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
-	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio),
+					   BIOSET_NEED_BVECS |
+					   BIOSET_NEED_RESCUER)) ||
 	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
 	    !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
 						WQ_MEM_RECLAIM, 0)) ||
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 6ac2e48b9235..42c66e76f05e 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -167,7 +167,7 @@ static void dirty_endio(struct bio *bio)
 	struct keybuf_key *w = bio->bi_private;
 	struct dirty_io *io = w->private;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		SET_KEY_DIRTY(&w->key, false);
 
 	closure_put(&io->cl);
@@ -195,7 +195,7 @@ static void read_dirty_endio(struct bio *bio)
 	struct dirty_io *io = w->private;
 
 	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
-			    bio->bi_error, "reading dirty data from cache");
+			    bio->bi_status, "reading dirty data from cache");
 
 	dirty_endio(bio);
 }
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index bf7419a56454..f4eace5ea184 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -485,10 +485,10 @@ void bitmap_print_sb(struct bitmap *bitmap)
 	pr_debug("         magic: %08x\n", le32_to_cpu(sb->magic));
 	pr_debug("       version: %d\n", le32_to_cpu(sb->version));
 	pr_debug("          uuid: %08x.%08x.%08x.%08x\n",
-		 *(__u32 *)(sb->uuid+0),
-		 *(__u32 *)(sb->uuid+4),
-		 *(__u32 *)(sb->uuid+8),
-		 *(__u32 *)(sb->uuid+12));
+		 le32_to_cpu(*(__u32 *)(sb->uuid+0)),
+		 le32_to_cpu(*(__u32 *)(sb->uuid+4)),
+		 le32_to_cpu(*(__u32 *)(sb->uuid+8)),
+		 le32_to_cpu(*(__u32 *)(sb->uuid+12)));
 	pr_debug("        events: %llu\n",
 		 (unsigned long long) le64_to_cpu(sb->events));
 	pr_debug("events cleared: %llu\n",
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index ae7da2c30a57..874841f0fc83 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -116,7 +116,7 @@ static int __bio_detain(struct dm_bio_prison *prison,
 
 	while (*new) {
 		struct dm_bio_prison_cell *cell =
-			container_of(*new, struct dm_bio_prison_cell, node);
+			rb_entry(*new, struct dm_bio_prison_cell, node);
 
 		r = cmp_keys(key, &cell->key);
 
@@ -229,7 +229,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
 
 void dm_cell_error(struct dm_bio_prison *prison,
-		   struct dm_bio_prison_cell *cell, int error)
+		   struct dm_bio_prison_cell *cell, blk_status_t error)
 {
 	struct bio_list bios;
 	struct bio *bio;
@@ -238,7 +238,7 @@ void dm_cell_error(struct dm_bio_prison *prison,
 	dm_cell_release(prison, cell, &bios);
 
 	while ((bio = bio_list_pop(&bios))) {
-		bio->bi_error = error;
+		bio->bi_status = error;
 		bio_endio(bio);
 	}
 }
diff --git a/drivers/md/dm-bio-prison-v1.h b/drivers/md/dm-bio-prison-v1.h
index cddd4ac07e2c..cec52ac5e1ae 100644
--- a/drivers/md/dm-bio-prison-v1.h
+++ b/drivers/md/dm-bio-prison-v1.h
@@ -91,7 +91,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
 			       struct dm_bio_prison_cell *cell,
 			       struct bio_list *inmates);
 void dm_cell_error(struct dm_bio_prison *prison,
-		   struct dm_bio_prison_cell *cell, int error);
+		   struct dm_bio_prison_cell *cell, blk_status_t error);
 
 /*
  * Visits the cell and then releases.  Guarantees no new inmates are
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index c9b11f799cd8..8ce3a1a588cf 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -120,7 +120,7 @@ static bool __find_or_insert(struct dm_bio_prison_v2 *prison,
 
 	while (*new) {
 		struct dm_bio_prison_cell_v2 *cell =
-			container_of(*new, struct dm_bio_prison_cell_v2, node);
+			rb_entry(*new, struct dm_bio_prison_cell_v2, node);
 
 		r = cmp_keys(key, &cell->key);
 
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 5db11a405129..850ff6c67994 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -145,8 +145,8 @@ struct dm_buffer {
 	enum data_mode data_mode;
 	unsigned char list_mode;		/* LIST_* */
 	unsigned hold_count;
-	int read_error;
-	int write_error;
+	blk_status_t read_error;
+	blk_status_t write_error;
 	unsigned long state;
 	unsigned long last_accessed;
 	struct dm_bufio_client *c;
@@ -218,7 +218,7 @@ static DEFINE_SPINLOCK(param_spinlock);
  * Buffers are freed after this timeout
  */
 static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
-static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
+static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
 
 static unsigned long dm_bufio_peak_allocated;
 static unsigned long dm_bufio_allocated_kmem_cache;
@@ -555,7 +555,7 @@ static void dmio_complete(unsigned long error, void *context)
 {
 	struct dm_buffer *b = context;
 
-	b->bio.bi_error = error ? -EIO : 0;
+	b->bio.bi_status = error ? BLK_STS_IOERR : 0;
 	b->bio.bi_end_io(&b->bio);
 }
 
@@ -588,7 +588,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
 
 	r = dm_io(&io_req, 1, &region, NULL);
 	if (r) {
-		b->bio.bi_error = r;
+		b->bio.bi_status = errno_to_blk_status(r);
 		end_io(&b->bio);
 	}
 }
@@ -596,7 +596,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
 static void inline_endio(struct bio *bio)
 {
 	bio_end_io_t *end_fn = bio->bi_private;
-	int error = bio->bi_error;
+	blk_status_t status = bio->bi_status;
 
 	/*
 	 * Reset the bio to free any attached resources
@@ -604,7 +604,7 @@ static void inline_endio(struct bio *bio)
 	 */
 	bio_reset(bio);
 
-	bio->bi_error = error;
+	bio->bi_status = status;
 	end_fn(bio);
 }
 
@@ -685,11 +685,12 @@ static void write_endio(struct bio *bio)
 {
 	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 
-	b->write_error = bio->bi_error;
-	if (unlikely(bio->bi_error)) {
+	b->write_error = bio->bi_status;
+	if (unlikely(bio->bi_status)) {
 		struct dm_bufio_client *c = b->c;
-		int error = bio->bi_error;
-		(void)cmpxchg(&c->async_write_error, 0, error);
+
+		(void)cmpxchg(&c->async_write_error, 0,
+				blk_status_to_errno(bio->bi_status));
 	}
 
 	BUG_ON(!test_bit(B_WRITING, &b->state));
@@ -1063,7 +1064,7 @@ static void read_endio(struct bio *bio)
 {
 	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 
-	b->read_error = bio->bi_error;
+	b->read_error = bio->bi_status;
 
 	BUG_ON(!test_bit(B_READING, &b->state));
 
@@ -1107,7 +1108,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
 
 	if (b->read_error) {
-		int error = b->read_error;
+		int error = blk_status_to_errno(b->read_error);
 
 		dm_bufio_release(b);
 
@@ -1257,7 +1258,8 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
  */
 int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
 {
-	int a, f;
+	blk_status_t a;
+	int f;
 	unsigned long buffers_processed = 0;
 	struct dm_buffer *b, *tmp;
 
@@ -1334,7 +1336,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c)
 {
 	struct dm_io_request io_req = {
 		.bi_op = REQ_OP_WRITE,
-		.bi_op_flags = REQ_PREFLUSH,
+		.bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.addr = NULL,
 		.client = c->dm_io,
@@ -1558,10 +1560,10 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
 	return true;
 }
 
-static unsigned get_retain_buffers(struct dm_bufio_client *c)
+static unsigned long get_retain_buffers(struct dm_bufio_client *c)
 {
-        unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
-        return retain_bytes / c->block_size;
+        unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+        return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT);
 }
 
 static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
@@ -1571,7 +1573,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
 	struct dm_buffer *b, *tmp;
 	unsigned long freed = 0;
 	unsigned long count = nr_to_scan;
-	unsigned retain_target = get_retain_buffers(c);
+	unsigned long retain_target = get_retain_buffers(c);
 
 	for (l = 0; l < LIST_SIZE; l++) {
 		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
@@ -1794,8 +1796,8 @@ static bool older_than(struct dm_buffer *b, unsigned long age_hz)
 static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
 {
 	struct dm_buffer *b, *tmp;
-	unsigned retain_target = get_retain_buffers(c);
-	unsigned count;
+	unsigned long retain_target = get_retain_buffers(c);
+	unsigned long count;
 	LIST_HEAD(write_list);
 
 	dm_bufio_lock(c);
@@ -1955,7 +1957,7 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
 module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
 
-module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
 
 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
index 9b1afdfb13f0..707233891291 100644
--- a/drivers/md/dm-cache-background-tracker.c
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -33,6 +33,11 @@ struct background_tracker *btracker_create(unsigned max_work)
 {
 	struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
 
+	if (!b) {
+		DMERR("couldn't create background_tracker");
+		return NULL;
+	}
+
 	b->max_work = max_work;
 	atomic_set(&b->pending_promotes, 0);
 	atomic_set(&b->pending_writebacks, 0);
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 72479bd61e11..e5eb9c9b4bc8 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1120,8 +1120,6 @@ static bool clean_target_met(struct smq_policy *mq, bool idle)
 	 * Cache entries may not be populated.  So we cannot rely on the
 	 * size of the clean queue.
 	 */
-	unsigned nr_clean;
-
 	if (idle) {
 		/*
 		 * We'd like to clean everything.
@@ -1129,18 +1127,16 @@ static bool clean_target_met(struct smq_policy *mq, bool idle)
 		return q_size(&mq->dirty) == 0u;
 	}
 
-	nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
-	return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >=
-		percent_to_target(mq, CLEAN_TARGET);
+	/*
+	 * If we're busy we don't worry about cleaning at all.
+	 */
+	return true;
 }
 
-static bool free_target_met(struct smq_policy *mq, bool idle)
+static bool free_target_met(struct smq_policy *mq)
 {
 	unsigned nr_free;
 
-	if (!idle)
-		return true;
-
 	nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
 	return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
 		percent_to_target(mq, FREE_TARGET);
@@ -1190,9 +1186,9 @@ static void queue_demotion(struct smq_policy *mq)
 	if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
 		return;
 
-	e = q_peek(&mq->clean, mq->clean.nr_levels, true);
+	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
 	if (!e) {
-		if (!clean_target_met(mq, false))
+		if (!clean_target_met(mq, true))
 			queue_writeback(mq);
 		return;
 	}
@@ -1220,7 +1216,7 @@ static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
 		 * We always claim to be 'idle' to ensure some demotions happen
 		 * with continuous loads.
 		 */
-		if (!free_target_met(mq, true))
+		if (!free_target_met(mq))
 			queue_demotion(mq);
 		return;
 	}
@@ -1421,14 +1417,10 @@ static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
 	spin_lock_irqsave(&mq->lock, flags);
 	r = btracker_issue(mq->bg_work, result);
 	if (r == -ENODATA) {
-		/* find some writeback work to do */
-		if (mq->migrations_allowed && !free_target_met(mq, idle))
-			queue_demotion(mq);
-
-		else if (!clean_target_met(mq, idle))
+		if (!clean_target_met(mq, idle)) {
 			queue_writeback(mq);
-
-		r = btracker_issue(mq->bg_work, result);
+			r = btracker_issue(mq->bg_work, result);
+		}
 	}
 	spin_unlock_irqrestore(&mq->lock, flags);
 
@@ -1452,6 +1444,7 @@ static void __complete_background_work(struct smq_policy *mq,
 		clear_pending(mq, e);
 		if (success) {
 			e->oblock = work->oblock;
+			e->level = NR_CACHE_LEVELS - 1;
 			push(mq, e);
 			// h, q, a
 		} else {
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1db375f50a13..c5ea03fc7ee1 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -94,6 +94,9 @@ static void iot_io_begin(struct io_tracker *iot, sector_t len)
 
 static void __iot_io_end(struct io_tracker *iot, sector_t len)
 {
+	if (!len)
+		return;
+
 	iot->in_flight -= len;
 	if (!iot->in_flight)
 		iot->idle_time = jiffies;
@@ -116,7 +119,7 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
  */
 struct continuation {
 	struct work_struct ws;
-	int input;
+	blk_status_t input;
 };
 
 static inline void init_continuation(struct continuation *k,
@@ -142,7 +145,7 @@ struct batcher {
 	/*
 	 * The operation that everyone is waiting for.
 	 */
-	int (*commit_op)(void *context);
+	blk_status_t (*commit_op)(void *context);
 	void *commit_context;
 
 	/*
@@ -168,8 +171,7 @@ struct batcher {
 static void __commit(struct work_struct *_ws)
 {
 	struct batcher *b = container_of(_ws, struct batcher, commit_work);
-
-	int r;
+	blk_status_t r;
 	unsigned long flags;
 	struct list_head work_items;
 	struct work_struct *ws, *tmp;
@@ -202,7 +204,7 @@ static void __commit(struct work_struct *_ws)
 
 	while ((bio = bio_list_pop(&bios))) {
 		if (r) {
-			bio->bi_error = r;
+			bio->bi_status = r;
 			bio_endio(bio);
 		} else
 			b->issue_op(bio, b->issue_context);
@@ -210,7 +212,7 @@ static void __commit(struct work_struct *_ws)
 }
 
 static void batcher_init(struct batcher *b,
-			 int (*commit_op)(void *),
+			 blk_status_t (*commit_op)(void *),
 			 void *commit_context,
 			 void (*issue_op)(struct bio *bio, void *),
 			 void *issue_context,
@@ -474,7 +476,7 @@ struct cache {
 	spinlock_t invalidation_lock;
 	struct list_head invalidation_requests;
 
-	struct io_tracker origin_tracker;
+	struct io_tracker tracker;
 
 	struct work_struct commit_ws;
 	struct batcher committer;
@@ -901,8 +903,7 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 
 static bool accountable_bio(struct cache *cache, struct bio *bio)
 {
-	return ((bio->bi_bdev == cache->origin_dev->bdev) &&
-		bio_op(bio) != REQ_OP_DISCARD);
+	return bio_op(bio) != REQ_OP_DISCARD;
 }
 
 static void accounted_begin(struct cache *cache, struct bio *bio)
@@ -912,7 +913,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
 
 	if (accountable_bio(cache, bio)) {
 		pb->len = bio_sectors(bio);
-		iot_io_begin(&cache->origin_tracker, pb->len);
+		iot_io_begin(&cache->tracker, pb->len);
 	}
 }
 
@@ -921,7 +922,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio)
 	size_t pb_data_size = get_per_bio_data_size(cache);
 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 
-	iot_io_end(&cache->origin_tracker, pb->len);
+	iot_io_end(&cache->tracker, pb->len);
 }
 
 static void accounted_request(struct cache *cache, struct bio *bio)
@@ -953,7 +954,7 @@ static void writethrough_endio(struct bio *bio)
 
 	dm_unhook_bio(&pb->hook_info, bio);
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		bio_endio(bio);
 		return;
 	}
@@ -1218,7 +1219,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
 	struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
 
 	if (read_err || write_err)
-		mg->k.input = -EIO;
+		mg->k.input = BLK_STS_IOERR;
 
 	queue_continuation(mg->cache->wq, &mg->k);
 }
@@ -1264,8 +1265,8 @@ static void overwrite_endio(struct bio *bio)
 
 	dm_unhook_bio(&pb->hook_info, bio);
 
-	if (bio->bi_error)
-		mg->k.input = bio->bi_error;
+	if (bio->bi_status)
+		mg->k.input = bio->bi_status;
 
 	queue_continuation(mg->cache->wq, &mg->k);
 }
@@ -1321,8 +1322,10 @@ static void mg_complete(struct dm_cache_migration *mg, bool success)
 		if (mg->overwrite_bio) {
 			if (success)
 				force_set_dirty(cache, cblock);
+			else if (mg->k.input)
+				mg->overwrite_bio->bi_status = mg->k.input;
 			else
-				mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO);
+				mg->overwrite_bio->bi_status = BLK_STS_IOERR;
 			bio_endio(mg->overwrite_bio);
 		} else {
 			if (success)
@@ -1502,7 +1505,7 @@ static void mg_copy(struct work_struct *ws)
 		r = copy(mg, is_policy_promote);
 		if (r) {
 			DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
-			mg->k.input = -EIO;
+			mg->k.input = BLK_STS_IOERR;
 			mg_complete(mg, false);
 		}
 	}
@@ -1716,20 +1719,19 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
 
 enum busy {
 	IDLE,
-	MODERATE,
 	BUSY
 };
 
 static enum busy spare_migration_bandwidth(struct cache *cache)
 {
-	bool idle = iot_idle_for(&cache->origin_tracker, HZ);
+	bool idle = iot_idle_for(&cache->tracker, HZ);
 	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
 		cache->sectors_per_block;
 
-	if (current_volume <= cache->migration_threshold)
-		return idle ? IDLE : MODERATE;
+	if (idle && current_volume <= cache->migration_threshold)
+		return IDLE;
 	else
-		return idle ? MODERATE : BUSY;
+		return BUSY;
 }
 
 static void inc_hit_counter(struct cache *cache, struct bio *bio)
@@ -1906,12 +1908,12 @@ static int commit(struct cache *cache, bool clean_shutdown)
 /*
  * Used by the batcher.
  */
-static int commit_op(void *context)
+static blk_status_t commit_op(void *context)
 {
 	struct cache *cache = context;
 
 	if (dm_cache_changed_this_transaction(cache->cmd))
-		return commit(cache, false);
+		return errno_to_blk_status(commit(cache, false));
 
 	return 0;
 }
@@ -2017,7 +2019,7 @@ static void requeue_deferred_bios(struct cache *cache)
 	bio_list_init(&cache->deferred_bios);
 
 	while ((bio = bio_list_pop(&bios))) {
-		bio->bi_error = DM_ENDIO_REQUEUE;
+		bio->bi_status = BLK_STS_DM_REQUEUE;
 		bio_endio(bio);
 	}
 }
@@ -2045,8 +2047,6 @@ static void check_migrations(struct work_struct *ws)
 
 	for (;;) {
 		b = spare_migration_bandwidth(cache);
-		if (b == BUSY)
-			break;
 
 		r = policy_get_background_work(cache->policy, b == IDLE, &op);
 		if (r == -ENODATA)
@@ -2717,7 +2717,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 
 	batcher_init(&cache->committer, commit_op, cache,
 		     issue_op, cache, cache->wq);
-	iot_init(&cache->origin_tracker);
+	iot_init(&cache->tracker);
 
 	init_rwsem(&cache->background_work_lock);
 	prevent_background_work(cache);
@@ -2821,7 +2821,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	return r;
 }
 
-static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int cache_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	struct cache *cache = ti->private;
 	unsigned long flags;
@@ -2839,7 +2840,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 	bio_drop_shared_lock(cache, bio);
 	accounted_complete(cache, bio);
 
-	return 0;
+	return DM_ENDIO_DONE;
 }
 
 static int write_dirty_bitset(struct cache *cache)
@@ -2941,7 +2942,7 @@ static void cache_postsuspend(struct dm_target *ti)
 
 	cancel_delayed_work(&cache->waker);
 	flush_workqueue(cache->wq);
-	WARN_ON(cache->origin_tracker.in_flight);
+	WARN_ON(cache->tracker.in_flight);
 
 	/*
 	 * If it's a flush suspend there won't be any deferred bios, so this
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 52ca8d059e82..24eddbdf2ab4 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -147,4 +147,7 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen
 	return !maxlen || strlen(result) + 1 >= maxlen;
 }
 
+extern atomic_t dm_global_event_nr;
+extern wait_queue_head_t dm_global_eventq;
+
 #endif
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ebf9e72d479b..cdf6b1e12460 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -71,7 +71,7 @@ struct dm_crypt_io {
 	struct convert_context ctx;
 
 	atomic_t io_pending;
-	int error;
+	blk_status_t error;
 	sector_t sector;
 
 	struct rb_node rb_node;
@@ -246,6 +246,9 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
  * plain64: the initial vector is the 64-bit little-endian version of the sector
  *        number, padded with zeros if necessary.
  *
+ * plain64be: the initial vector is the 64-bit big-endian version of the sector
+ *        number, padded with zeros if necessary.
+ *
  * essiv: "encrypted sector|salt initial vector", the sector number is
  *        encrypted with the bulk cipher using a salt as key. The salt
  *        should be derived from the bulk cipher's key via hashing.
@@ -302,6 +305,16 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
 	return 0;
 }
 
+static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv,
+				  struct dm_crypt_request *dmreq)
+{
+	memset(iv, 0, cc->iv_size);	/* bytes before the sector stay zero */
+	/* iv_size is at least of size u64; usually it is 16 bytes */
+	*(__be64 *)&iv[cc->iv_size - sizeof(u64)] = cpu_to_be64(dmreq->iv_sector);
+
+	return 0;
+}
+
 /* Initialise ESSIV - compute salt but no local memory allocations */
 static int crypt_iv_essiv_init(struct crypt_config *cc)
 {
@@ -835,6 +848,10 @@ static const struct crypt_iv_operations crypt_iv_plain64_ops = {
 	.generator = crypt_iv_plain64_gen
 };
 
+static const struct crypt_iv_operations crypt_iv_plain64be_ops = {
+	.generator = crypt_iv_plain64be_gen	/* used for ivmode "plain64be" */
+};
+
 static const struct crypt_iv_operations crypt_iv_essiv_ops = {
 	.ctr       = crypt_iv_essiv_ctr,
 	.dtr       = crypt_iv_essiv_dtr,
@@ -1292,7 +1309,7 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_
 /*
  * Encrypt / decrypt data from one bio to another one (can be the same one)
  */
-static int crypt_convert(struct crypt_config *cc,
+static blk_status_t crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx)
 {
 	unsigned int tag_offset = 0;
@@ -1343,13 +1360,13 @@ static int crypt_convert(struct crypt_config *cc,
 		 */
 		case -EBADMSG:
 			atomic_dec(&ctx->cc_pending);
-			return -EILSEQ;
+			return BLK_STS_PROTECTION;
 		/*
 		 * There was an error while processing the request.
 		 */
 		default:
 			atomic_dec(&ctx->cc_pending);
-			return -EIO;
+			return BLK_STS_IOERR;
 		}
 	}
 
@@ -1463,7 +1480,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *base_bio = io->base_bio;
-	int error = io->error;
+	blk_status_t error = io->error;
 
 	if (!atomic_dec_and_test(&io->io_pending))
 		return;
@@ -1476,7 +1493,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 	else
 		kfree(io->integrity_metadata);
 
-	base_bio->bi_error = error;
+	base_bio->bi_status = error;
 	bio_endio(base_bio);
 }
 
@@ -1502,7 +1519,7 @@ static void crypt_endio(struct bio *clone)
 	struct dm_crypt_io *io = clone->bi_private;
 	struct crypt_config *cc = io->cc;
 	unsigned rw = bio_data_dir(clone);
-	int error;
+	blk_status_t error;
 
 	/*
 	 * free the processed pages
@@ -1510,7 +1527,7 @@ static void crypt_endio(struct bio *clone)
 	if (rw == WRITE)
 		crypt_free_buffer_pages(cc, clone);
 
-	error = clone->bi_error;
+	error = clone->bi_status;
 	bio_put(clone);
 
 	if (rw == READ && !error) {
@@ -1570,7 +1587,7 @@ static void kcryptd_io_read_work(struct work_struct *work)
 
 	crypt_inc_pending(io);
 	if (kcryptd_io_read(io, GFP_NOIO))
-		io->error = -ENOMEM;
+		io->error = BLK_STS_RESOURCE;
 	crypt_dec_pending(io);
 }
 
@@ -1656,7 +1673,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 	sector_t sector;
 	struct rb_node **rbp, *parent;
 
-	if (unlikely(io->error < 0)) {
+	if (unlikely(io->error)) {
 		crypt_free_buffer_pages(cc, clone);
 		bio_put(clone);
 		crypt_dec_pending(io);
@@ -1697,7 +1714,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 	struct bio *clone;
 	int crypt_finished;
 	sector_t sector = io->sector;
-	int r;
+	blk_status_t r;
 
 	/*
 	 * Prevent io from disappearing until this function completes.
@@ -1707,7 +1724,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 
 	clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
 	if (unlikely(!clone)) {
-		io->error = -EIO;
+		io->error = BLK_STS_IOERR;
 		goto dec;
 	}
 
@@ -1718,7 +1735,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 
 	crypt_inc_pending(io);
 	r = crypt_convert(cc, &io->ctx);
-	if (r < 0)
+	if (r)
 		io->error = r;
 	crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
 
@@ -1740,7 +1757,7 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
 static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
-	int r = 0;
+	blk_status_t r;
 
 	crypt_inc_pending(io);
 
@@ -1748,7 +1765,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 			   io->sector);
 
 	r = crypt_convert(cc, &io->ctx);
-	if (r < 0)
+	if (r)
 		io->error = r;
 
 	if (atomic_dec_and_test(&io->ctx.cc_pending))
@@ -1781,9 +1798,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	if (error == -EBADMSG) {
 		DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
 			    (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)));
-		io->error = -EILSEQ;
+		io->error = BLK_STS_PROTECTION;
 	} else if (error < 0)
-		io->error = -EIO;
+		io->error = BLK_STS_IOERR;
 
 	crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
@@ -2208,6 +2225,8 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
 		cc->iv_gen_ops = &crypt_iv_plain_ops;
 	else if (strcmp(ivmode, "plain64") == 0)
 		cc->iv_gen_ops = &crypt_iv_plain64_ops;
+	else if (strcmp(ivmode, "plain64be") == 0)
+		cc->iv_gen_ops = &crypt_iv_plain64be_ops;
 	else if (strcmp(ivmode, "essiv") == 0)
 		cc->iv_gen_ops = &crypt_iv_essiv_ops;
 	else if (strcmp(ivmode, "benbi") == 0)
@@ -2677,7 +2696,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, 0);
+	cc->bs = bioset_create(MIN_IOS, 0, (BIOSET_NEED_BVECS |
+					    BIOSET_NEED_RESCUER));
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad;
@@ -2795,10 +2815,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 	 * and is aligned to this size as defined in IO hints.
 	 */
 	if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0))
-		return -EIO;
+		return DM_MAPIO_KILL;
 
 	if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1)))
-		return -EIO;
+		return DM_MAPIO_KILL;
 
 	io = dm_per_bio_data(bio, cc->per_bio_data_size);
 	crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
@@ -2986,7 +3006,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 17, 0},
+	.version = {1, 18, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 13305a182611..e2c7234931bc 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -275,7 +275,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
 	struct flakey_c *fc = ti->private;
 
 	bio->bi_bdev = fc->dev->bdev;
-	if (bio_sectors(bio))
+	if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
 		bio->bi_iter.bi_sector =
 			flakey_map_sector(ti, bio->bi_iter.bi_sector);
 }
@@ -306,6 +306,14 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
 	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
 	pb->bio_submitted = false;
 
+	/* Do not fail reset zone */
+	if (bio_op(bio) == REQ_OP_ZONE_RESET)
+		goto map_bio;
+
+	/* Do not fail zone reports; zones are remapped in flakey_end_io() */
+	if (bio_op(bio) == REQ_OP_ZONE_REPORT)
+		goto map_bio;
+
 	/* Are we alive ? */
 	elapsed = (jiffies - fc->start_time) / HZ;
 	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
@@ -321,7 +329,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
 		if (bio_data_dir(bio) == READ) {
 			if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags) &&
 			    !test_bit(ERROR_WRITES, &fc->flags))
-				return -EIO;
+				return DM_MAPIO_KILL;
 			goto map_bio;
 		}
 
@@ -349,7 +357,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
 		/*
 		 * By default, error all I/O.
 		 */
-		return -EIO;
+		return DM_MAPIO_KILL;
 	}
 
 map_bio:
@@ -358,12 +366,21 @@ map_bio:
 	return DM_MAPIO_REMAPPED;
 }
 
-static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int flakey_end_io(struct dm_target *ti, struct bio *bio,
+			 blk_status_t *error)
 {
 	struct flakey_c *fc = ti->private;
 	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
 
-	if (!error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
+	if (bio_op(bio) == REQ_OP_ZONE_RESET)
+		return DM_ENDIO_DONE;
+
+	if (bio_op(bio) == REQ_OP_ZONE_REPORT) {
+		dm_remap_zone_report(ti, bio, fc->start);
+		return DM_ENDIO_DONE;
+	}
+
+	if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
 		if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == READ) &&
 		    all_corrupt_bio_flags_match(bio, fc)) {
 			/*
@@ -377,11 +394,11 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
 			 * Error read during the down_interval if drop_writes
 			 * and error_writes were not configured.
 			 */
-			return -EIO;
+			*error = BLK_STS_IOERR;
 		}
 	}
 
-	return error;
+	return DM_ENDIO_DONE;
 }
 
 static void flakey_status(struct dm_target *ti, status_type_t type,
@@ -445,7 +462,8 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
 
 static struct target_type flakey_target = {
 	.name   = "flakey",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
+	.features = DM_TARGET_ZONED_HM,
 	.module = THIS_MODULE,
 	.ctr    = flakey_ctr,
 	.dtr    = flakey_dtr,
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index c7f7c8d76576..1b224aa9cf15 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -246,7 +246,7 @@ struct dm_integrity_io {
 	unsigned metadata_offset;
 
 	atomic_t in_flight;
-	int bi_error;
+	blk_status_t bi_status;
 
 	struct completion *completion;
 
@@ -783,7 +783,8 @@ static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsi
 			for (i = 0; i < commit_sections; i++)
 				rw_section_mac(ic, commit_start + i, true);
 		}
-		rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, commit_sections, &io_comp);
+		rw_journal(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, commit_start,
+			   commit_sections, &io_comp);
 	} else {
 		unsigned to_end;
 		io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
@@ -1104,18 +1105,21 @@ static void schedule_autocommit(struct dm_integrity_c *ic)
 static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
 {
 	struct bio *bio;
-	spin_lock_irq(&ic->endio_wait.lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ic->endio_wait.lock, flags);
 	bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
 	bio_list_add(&ic->flush_bio_list, bio);
-	spin_unlock_irq(&ic->endio_wait.lock);
+	spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
+
 	queue_work(ic->commit_wq, &ic->commit_work);
 }
 
 static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
 {
 	int r = dm_integrity_failed(ic);
-	if (unlikely(r) && !bio->bi_error)
-		bio->bi_error = r;
+	if (unlikely(r) && !bio->bi_status)
+		bio->bi_status = errno_to_blk_status(r);
 	bio_endio(bio);
 }
 
@@ -1123,7 +1127,7 @@ static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *di
 {
 	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
 
-	if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic)))
+	if (unlikely(dio->fua) && likely(!bio->bi_status) && likely(!dm_integrity_failed(ic)))
 		submit_flush_bio(ic, dio);
 	else
 		do_endio(ic, bio);
@@ -1142,9 +1146,9 @@ static void dec_in_flight(struct dm_integrity_io *dio)
 
 		bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
 
-		if (unlikely(dio->bi_error) && !bio->bi_error)
-			bio->bi_error = dio->bi_error;
-		if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) {
+		if (unlikely(dio->bi_status) && !bio->bi_status)
+			bio->bi_status = dio->bi_status;
+		if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) {
 			dio->range.logical_sector += dio->range.n_sectors;
 			bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
 			INIT_WORK(&dio->work, integrity_bio_wait);
@@ -1318,7 +1322,7 @@ skip_io:
 	dec_in_flight(dio);
 	return;
 error:
-	dio->bi_error = r;
+	dio->bi_status = errno_to_blk_status(r);
 	dec_in_flight(dio);
 }
 
@@ -1331,7 +1335,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 	sector_t area, offset;
 
 	dio->ic = ic;
-	dio->bi_error = 0;
+	dio->bi_status = 0;
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		submit_flush_bio(ic, dio);
@@ -1352,13 +1356,13 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 		DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
 		      (unsigned long long)dio->range.logical_sector, bio_sectors(bio),
 		      (unsigned long long)ic->provided_data_sectors);
-		return -EIO;
+		return DM_MAPIO_KILL;
 	}
 	if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) {
 		DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
 		      ic->sectors_per_block,
 		      (unsigned long long)dio->range.logical_sector, bio_sectors(bio));
-		return -EIO;
+		return DM_MAPIO_KILL;
 	}
 
 	if (ic->sectors_per_block > 1) {
@@ -1368,7 +1372,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 			if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
 				DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
 					bv.bv_offset, bv.bv_len, ic->sectors_per_block);
-				return -EIO;
+				return DM_MAPIO_KILL;
 			}
 		}
 	}
@@ -1383,18 +1387,18 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 				wanted_tag_size *= ic->tag_size;
 			if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
 				DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size);
-				return -EIO;
+				return DM_MAPIO_KILL;
 			}
 		}
 	} else {
 		if (unlikely(bip != NULL)) {
 			DMERR("Unexpected integrity data when using internal hash");
-			return -EIO;
+			return DM_MAPIO_KILL;
 		}
 	}
 
 	if (unlikely(ic->mode == 'R') && unlikely(dio->write))
-		return -EIO;
+		return DM_MAPIO_KILL;
 
 	get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
 	dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
@@ -2374,21 +2378,6 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
 	blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
 }
 
-/* FIXME: use new kvmalloc */
-static void *dm_integrity_kvmalloc(size_t size, gfp_t gfp)
-{
-	void *ptr = NULL;
-
-	if (size <= PAGE_SIZE)
-		ptr = kmalloc(size, GFP_KERNEL | gfp);
-	if (!ptr && size <= KMALLOC_MAX_SIZE)
-		ptr = kmalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | gfp);
-	if (!ptr)
-		ptr = __vmalloc(size, GFP_KERNEL | gfp, PAGE_KERNEL);
-
-	return ptr;
-}
-
 static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl)
 {
 	unsigned i;
@@ -2407,7 +2396,7 @@ static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic)
 	struct page_list *pl;
 	unsigned i;
 
-	pl = dm_integrity_kvmalloc(page_list_desc_size, __GFP_ZERO);
+	pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO);
 	if (!pl)
 		return NULL;
 
@@ -2437,7 +2426,7 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
 	struct scatterlist **sl;
 	unsigned i;
 
-	sl = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), __GFP_ZERO);
+	sl = kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), GFP_KERNEL | __GFP_ZERO);
 	if (!sl)
 		return NULL;
 
@@ -2453,7 +2442,7 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
 
 		n_pages = (end_index - start_index + 1);
 
-		s = dm_integrity_kvmalloc(n_pages * sizeof(struct scatterlist), 0);
+		s = kvmalloc(n_pages * sizeof(struct scatterlist), GFP_KERNEL);
 		if (!s) {
 			dm_integrity_free_journal_scatterlist(ic, sl);
 			return NULL;
@@ -2617,7 +2606,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
 				goto bad;
 			}
 
-			sg = dm_integrity_kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), 0);
+			sg = kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), GFP_KERNEL);
 			if (!sg) {
 				*error = "Unable to allocate sg list";
 				r = -ENOMEM;
@@ -2673,7 +2662,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
 				r = -ENOMEM;
 				goto bad;
 			}
-			ic->sk_requests = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), __GFP_ZERO);
+			ic->sk_requests = kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), GFP_KERNEL | __GFP_ZERO);
 			if (!ic->sk_requests) {
 				*error = "Unable to allocate sk requests";
 				r = -ENOMEM;
@@ -2740,7 +2729,7 @@ retest_commit_id:
 		r = -ENOMEM;
 		goto bad;
 	}
-	ic->journal_tree = dm_integrity_kvmalloc(journal_tree_size, 0);
+	ic->journal_tree = kvmalloc(journal_tree_size, GFP_KERNEL);
 	if (!ic->journal_tree) {
 		*error = "Could not allocate memory for journal tree";
 		r = -ENOMEM;
@@ -3054,6 +3043,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		ti->error = "The device is too small";
 		goto bad;
 	}
+	if (ti->len > ic->provided_data_sectors) {
+		r = -EINVAL;
+		ti->error = "Not enough provided sectors for requested mapping size";
+		goto bad;
+	}
 
 	if (!buffer_sectors)
 		buffer_sectors = 1;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3702e502466d..25039607f3cb 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -58,7 +58,8 @@ struct dm_io_client *dm_io_client_create(void)
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(min_ios, 0);
+	client->bios = bioset_create(min_ios, 0, (BIOSET_NEED_BVECS |
+						  BIOSET_NEED_RESCUER));
 	if (!client->bios)
 		goto bad;
 
@@ -124,7 +125,7 @@ static void complete_io(struct io *io)
 	fn(error_bits, context);
 }
 
-static void dec_count(struct io *io, unsigned int region, int error)
+static void dec_count(struct io *io, unsigned int region, blk_status_t error)
 {
 	if (error)
 		set_bit(region, &io->error_bits);
@@ -137,9 +138,9 @@ static void endio(struct bio *bio)
 {
 	struct io *io;
 	unsigned region;
-	int error;
+	blk_status_t error;
 
-	if (bio->bi_error && bio_data_dir(bio) == READ)
+	if (bio->bi_status && bio_data_dir(bio) == READ)
 		zero_fill_bio(bio);
 
 	/*
@@ -147,7 +148,7 @@ static void endio(struct bio *bio)
 	 */
 	retrieve_io_and_region_from_bio(bio, &io, &region);
 
-	error = bio->bi_error;
+	error = bio->bi_status;
 	bio_put(bio);
 
 	dec_count(io, region, error);
@@ -317,9 +318,9 @@ static void do_region(int op, int op_flags, unsigned region,
 	else if (op == REQ_OP_WRITE_SAME)
 		special_cmd_max_sectors = q->limits.max_write_same_sectors;
 	if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
-	     op == REQ_OP_WRITE_SAME)  &&
-	    special_cmd_max_sectors == 0) {
-		dec_count(io, region, -EOPNOTSUPP);
+	     op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) {
+		atomic_inc(&io->count);
+		dec_count(io, region, BLK_STS_NOTSUPP);
 		return;
 	}
 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 0555b4410e05..e06f0ef7d2ec 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -23,6 +23,14 @@
 #define DM_MSG_PREFIX "ioctl"
 #define DM_DRIVER_EMAIL "dm-devel@redhat.com"
 
+struct dm_file {
+	/*
+	 * Snapshot of dm_global_event_nr, taken at open and re-taken by
+	 * dev_arm_poll(); dm_poll() reports POLLIN once the counter passes it.
+	 */
+	volatile unsigned global_event_nr;
+};
+
 /*-----------------------------------------------------------------
  * The ioctl interface needs to be able to look up devices by
  * name or uuid.
@@ -456,9 +464,9 @@ void dm_deferred_remove(void)
  * All the ioctl commands get dispatched to functions with this
  * prototype.
  */
-typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
+typedef int (*ioctl_fn)(struct file *filp, struct dm_ioctl *param, size_t param_size);
 
-static int remove_all(struct dm_ioctl *param, size_t param_size)
+static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false);
 	param->data_size = 0;
@@ -491,13 +499,14 @@ static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
 	return ((void *) param) + param->data_start;
 }
 
-static int list_devices(struct dm_ioctl *param, size_t param_size)
+static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	unsigned int i;
 	struct hash_cell *hc;
 	size_t len, needed = 0;
 	struct gendisk *disk;
 	struct dm_name_list *nl, *old_nl = NULL;
+	uint32_t *event_nr;
 
 	down_write(&_hash_lock);
 
@@ -510,6 +519,7 @@ static int list_devices(struct dm_ioctl *param, size_t param_size)
 			needed += sizeof(struct dm_name_list);
 			needed += strlen(hc->name) + 1;
 			needed += ALIGN_MASK;
+			needed += (sizeof(uint32_t) + ALIGN_MASK) & ~ALIGN_MASK;
 		}
 	}
 
@@ -539,7 +549,9 @@ static int list_devices(struct dm_ioctl *param, size_t param_size)
 			strcpy(nl->name, hc->name);
 
 			old_nl = nl;
-			nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
+			event_nr = align_ptr(((void *) (nl + 1)) + strlen(hc->name) + 1);
+			*event_nr = dm_get_event_nr(hc->md);
+			nl = align_ptr(event_nr + 1);
 		}
 	}
 
@@ -582,7 +594,7 @@ static void list_version_get_info(struct target_type *tt, void *param)
     info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1);
 }
 
-static int list_versions(struct dm_ioctl *param, size_t param_size)
+static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	size_t len, needed = 0;
 	struct dm_target_versions *vers;
@@ -724,7 +736,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 	}
 }
 
-static int dev_create(struct dm_ioctl *param, size_t param_size)
+static int dev_create(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	int r, m = DM_ANY_MINOR;
 	struct mapped_device *md;
@@ -816,7 +828,7 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
 	return md;
 }
 
-static int dev_remove(struct dm_ioctl *param, size_t param_size)
+static int dev_remove(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	struct hash_cell *hc;
 	struct mapped_device *md;
@@ -881,7 +893,7 @@ static int invalid_str(char *str, void *end)
 	return -EINVAL;
 }
 
-static int dev_rename(struct dm_ioctl *param, size_t param_size)
+static int dev_rename(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	int r;
 	char *new_data = (char *) param + param->data_start;
@@ -911,7 +923,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 	return 0;
 }
 
-static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
+static int dev_set_geometry(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	int r = -EINVAL, x;
 	struct mapped_device *md;
@@ -1060,7 +1072,7 @@ static int do_resume(struct dm_ioctl *param)
  * Set or unset the suspension state of a device.
  * If the device already is in the requested state we just return its status.
  */
-static int dev_suspend(struct dm_ioctl *param, size_t param_size)
+static int dev_suspend(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	if (param->flags & DM_SUSPEND_FLAG)
 		return do_suspend(param);
@@ -1072,7 +1084,7 @@ static int dev_suspend(struct dm_ioctl *param, size_t param_size)
  * Copies device info back to user space, used by
  * the create and info ioctls.
  */
-static int dev_status(struct dm_ioctl *param, size_t param_size)
+static int dev_status(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	struct mapped_device *md;
 
@@ -1163,7 +1175,7 @@ static void retrieve_status(struct dm_table *table,
 /*
  * Wait for a device to report an event
  */
-static int dev_wait(struct dm_ioctl *param, size_t param_size)
+static int dev_wait(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	int r = 0;
 	struct mapped_device *md;
@@ -1200,6 +1212,19 @@ out:
 	return r;
 }
 
+/*
+ * Re-arm polling on this open file: store the current dm_global_event_nr so
+ * dm_poll() reports POLLIN only once the counter advances past it. Returns 0.
+ */
+static int dev_arm_poll(struct file *filp, struct dm_ioctl *param, size_t param_size)
+{
+	struct dm_file *priv = filp->private_data;
+
+	priv->global_event_nr = atomic_read(&dm_global_event_nr);
+
+	return 0;
+}
+
 static inline fmode_t get_mode(struct dm_ioctl *param)
 {
 	fmode_t mode = FMODE_READ | FMODE_WRITE;
@@ -1269,7 +1294,7 @@ static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new)
 	return false;
 }
 
-static int table_load(struct dm_ioctl *param, size_t param_size)
+static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	int r;
 	struct hash_cell *hc;
@@ -1356,7 +1381,7 @@ err:
 	return r;
 }
 
-static int table_clear(struct dm_ioctl *param, size_t param_size)
+static int table_clear(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	struct hash_cell *hc;
 	struct mapped_device *md;
@@ -1430,7 +1455,7 @@ static void retrieve_deps(struct dm_table *table,
 	param->data_size = param->data_start + needed;
 }
 
-static int table_deps(struct dm_ioctl *param, size_t param_size)
+static int table_deps(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	struct mapped_device *md;
 	struct dm_table *table;
@@ -1456,7 +1481,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
  * Return the status of a device as a text string for each
  * target.
  */
-static int table_status(struct dm_ioctl *param, size_t param_size)
+static int table_status(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	struct mapped_device *md;
 	struct dm_table *table;
@@ -1511,7 +1536,7 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
 /*
  * Pass a message to the target that's at the supplied device offset.
  */
-static int target_message(struct dm_ioctl *param, size_t param_size)
+static int target_message(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
 	int r, argc;
 	char **argv;
@@ -1628,7 +1653,8 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
 		{DM_LIST_VERSIONS_CMD, 0, list_versions},
 
 		{DM_TARGET_MSG_CMD, 0, target_message},
-		{DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}
+		{DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry},
+		{DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll},
 	};
 
 	if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
@@ -1710,12 +1736,13 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
 	}
 
 	/*
-	 * Try to avoid low memory issues when a device is suspended.
+	 * Use __GFP_HIGH to avoid low memory issues when a device is
+	 * suspended and the ioctl is needed to resume it.
 	 * Use kmalloc() rather than vmalloc() when we can.
 	 */
 	dmi = NULL;
 	noio_flag = memalloc_noio_save();
-	dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL);
+	dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL | __GFP_HIGH);
 	memalloc_noio_restore(noio_flag);
 
 	if (!dmi) {
@@ -1782,7 +1809,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
 	return 0;
 }
 
-static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
+static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *user)
 {
 	int r = 0;
 	int ioctl_flags;
@@ -1836,7 +1863,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 		goto out;
 
 	param->data_size = offsetof(struct dm_ioctl, data);
-	r = fn(param, input_param_size);
+	r = fn(file, param, input_param_size);
 
 	if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
 	    unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS))
@@ -1855,7 +1882,7 @@ out:
 
 static long dm_ctl_ioctl(struct file *file, uint command, ulong u)
 {
-	return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u);
+	return (long)ctl_ioctl(file, command, (struct dm_ioctl __user *)u);
 }
 
 #ifdef CONFIG_COMPAT
@@ -1867,8 +1894,47 @@ static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
 #define dm_compat_ctl_ioctl NULL
 #endif
 
+static int dm_open(struct inode *inode, struct file *filp)
+{
+	int r;
+	struct dm_file *priv;
+
+	r = nonseekable_open(inode, filp);
+	if (unlikely(r))
+		return r;
+	/* Per-open poll state; freed by dm_release(). */
+	priv = filp->private_data = kmalloc(sizeof(struct dm_file), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+	/* Start from the current global event number. */
+	priv->global_event_nr = atomic_read(&dm_global_event_nr);
+
+	return 0;
+}
+
+static int dm_release(struct inode *inode, struct file *filp)
+{
+	kfree(filp->private_data);	/* struct dm_file set up by dm_open() */
+	return 0;
+}
+
+static unsigned dm_poll(struct file *filp, poll_table *wait)
+{
+	struct dm_file *priv = filp->private_data;
+	unsigned mask = 0;
+
+	poll_wait(filp, &dm_global_eventq, wait);
+	/* Signed difference stays correct if the counter wraps. */
+	if ((int)(atomic_read(&dm_global_event_nr) - priv->global_event_nr) > 0)
+		mask |= POLLIN;
+
+	return mask;
+}
+
 static const struct file_operations _ctl_fops = {
-	.open = nonseekable_open,
+	.open    = dm_open,
+	.release = dm_release,
+	.poll    = dm_poll,
 	.unlocked_ioctl	 = dm_ctl_ioctl,
 	.compat_ioctl = dm_compat_ctl_ioctl,
 	.owner	 = THIS_MODULE,
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index f85846741d50..cf2c67e35eaf 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -356,6 +356,7 @@ struct kcopyd_job {
 	struct mutex lock;
 	atomic_t sub_jobs;
 	sector_t progress;
+	sector_t write_offset;
 
 	struct kcopyd_job *master_job;
 };
@@ -386,6 +387,31 @@ void dm_kcopyd_exit(void)
  * Functions to push and pop a job onto the head of a given job
  * list.
  */
+static struct kcopyd_job *pop_io_job(struct list_head *jobs,
+				     struct dm_kcopyd_client *kc)
+{
+	struct kcopyd_job *job;
+
+	/*
+	 * Pop the first job that is a read, a write without DM_KCOPYD_WRITE_SEQ,
+	 * or a sequential write at the master job's current write_offset.
+	 */
+	list_for_each_entry(job, jobs, list) {
+		if (job->rw == READ || !test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
+			list_del(&job->list);
+			return job;
+		}
+		/* In-order sequential write: advance the master's offset. */
+		if (job->write_offset == job->master_job->write_offset) {
+			job->master_job->write_offset += job->source.count;
+			list_del(&job->list);
+			return job;
+		}
+	}
+	/* No job on the list may be issued yet. */
+	return NULL;
+}
+
 static struct kcopyd_job *pop(struct list_head *jobs,
 			      struct dm_kcopyd_client *kc)
 {
@@ -395,8 +421,12 @@ static struct kcopyd_job *pop(struct list_head *jobs,
 	spin_lock_irqsave(&kc->job_lock, flags);
 
 	if (!list_empty(jobs)) {
-		job = list_entry(jobs->next, struct kcopyd_job, list);
-		list_del(&job->list);
+		if (jobs == &kc->io_jobs)
+			job = pop_io_job(jobs, kc);
+		else {
+			job = list_entry(jobs->next, struct kcopyd_job, list);
+			list_del(&job->list);
+		}
 	}
 	spin_unlock_irqrestore(&kc->job_lock, flags);
 
@@ -506,6 +536,14 @@ static int run_io_job(struct kcopyd_job *job)
 		.client = job->kc->io_client,
 	};
 
+	/*
+	 * If we need to write sequentially and some reads or writes failed,
+	 * no point in continuing.
+	 */
+	if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
+	    job->master_job->write_err)
+		return -EIO;
+
 	io_job_start(job->kc->throttle);
 
 	if (job->rw == READ)
@@ -655,6 +693,7 @@ static void segment_complete(int read_err, unsigned long write_err,
 		int i;
 
 		*sub_job = *job;
+		sub_job->write_offset = progress;
 		sub_job->source.sector += progress;
 		sub_job->source.count = count;
 
@@ -723,6 +762,27 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	job->num_dests = num_dests;
 	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
 
+	/*
+	 * If one of the destinations is a host-managed zoned block device,
+	 * we need to write sequentially. If one of the destinations is a
+	 * host-aware device, then leave it to the caller to choose what to do.
+	 */
+	if (!test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
+		for (i = 0; i < job->num_dests; i++) {
+			if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
+				set_bit(DM_KCOPYD_WRITE_SEQ, &job->flags);
+				break;
+			}
+		}
+	}
+
+	/*
+	 * If we need to write sequentially, errors cannot be ignored.
+	 */
+	if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
+	    test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags))
+		clear_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags);
+
 	if (from) {
 		job->source = *from;
 		job->pages = NULL;
@@ -746,6 +806,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	job->fn = fn;
 	job->context = context;
 	job->master_job = job;
+	job->write_offset = 0;
 
 	if (job->source.count <= SUB_JOB_SIZE)
 		dispatch_job(job);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 7d42a9d9f406..41971a090e34 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -89,7 +89,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
 	struct linear_c *lc = ti->private;
 
 	bio->bi_bdev = lc->dev->bdev;
-	if (bio_sectors(bio))
+	if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
 		bio->bi_iter.bi_sector =
 			linear_map_sector(ti, bio->bi_iter.bi_sector);
 }
@@ -101,6 +101,17 @@ static int linear_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_REMAPPED;
 }
 
+static int linear_end_io(struct dm_target *ti, struct bio *bio,
+			 blk_status_t *error)
+{
+	const struct linear_c *linear = ti->private;
+
+	/* Successful zone reports are handed to dm_remap_zone_report(). */
+	if (bio_op(bio) == REQ_OP_ZONE_REPORT && *error == BLK_STS_OK)
+		dm_remap_zone_report(ti, bio, linear->start);
+	return DM_ENDIO_DONE;
+}
+
 static void linear_status(struct dm_target *ti, status_type_t type,
 			  unsigned status_flags, char *result, unsigned maxlen)
 {
@@ -159,18 +170,49 @@ static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 	return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
 }
 
+static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct linear_c *lc = ti->private;
+	struct dax_device *dax_dev = lc->dev->dax_dev;
+	sector_t dev_sector = linear_map_sector(ti, pgoff * PAGE_SECTORS);
+	size_t span = ALIGN(bytes, PAGE_SIZE);
+
+	/* Return 0 if bdev_dax_pgoff() fails. */
+	if (bdev_dax_pgoff(lc->dev->bdev, dev_sector, span, &pgoff))
+		return 0;
+	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
+}
+
+static void linear_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr,
+		size_t size)
+{
+	struct linear_c *lc = ti->private;
+	struct dax_device *dax_dev = lc->dev->dax_dev;
+	sector_t dev_sector = linear_map_sector(ti, pgoff * PAGE_SECTORS);
+	size_t span = ALIGN(size, PAGE_SIZE);
+
+	/* Skip the flush if bdev_dax_pgoff() fails. */
+	if (bdev_dax_pgoff(lc->dev->bdev, dev_sector, span, &pgoff))
+		return;
+	dax_flush(dax_dev, pgoff, addr, size);
+}
+
 static struct target_type linear_target = {
 	.name   = "linear",
-	.version = {1, 3, 0},
-	.features = DM_TARGET_PASSES_INTEGRITY,
+	.version = {1, 4, 0},
+	.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
 	.module = THIS_MODULE,
 	.ctr    = linear_ctr,
 	.dtr    = linear_dtr,
 	.map    = linear_map,
+	.end_io = linear_end_io,
 	.status = linear_status,
 	.prepare_ioctl = linear_prepare_ioctl,
 	.iterate_devices = linear_iterate_devices,
 	.direct_access = linear_dax_direct_access,
+	.dax_copy_from_iter = linear_dax_copy_from_iter,
+	.dax_flush = linear_dax_flush,
 };
 
 int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 4dfe38655a49..a1da0eb58a93 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -150,10 +150,10 @@ static void log_end_io(struct bio *bio)
 {
 	struct log_writes_c *lc = bio->bi_private;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		unsigned long flags;
 
-		DMERR("Error writing log block, error=%d", bio->bi_error);
+		DMERR("Error writing log block, error=%d", bio->bi_status);
 		spin_lock_irqsave(&lc->blocks_lock, flags);
 		lc->logging_enabled = false;
 		spin_unlock_irqrestore(&lc->blocks_lock, flags);
@@ -586,7 +586,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio)
 		spin_lock_irq(&lc->blocks_lock);
 		lc->logging_enabled = false;
 		spin_unlock_irq(&lc->blocks_lock);
-		return -ENOMEM;
+		return DM_MAPIO_KILL;
 	}
 	INIT_LIST_HEAD(&block->list);
 	pb->block = block;
@@ -639,7 +639,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio)
 			spin_lock_irq(&lc->blocks_lock);
 			lc->logging_enabled = false;
 			spin_unlock_irq(&lc->blocks_lock);
-			return -ENOMEM;
+			return DM_MAPIO_KILL;
 		}
 
 		src = kmap_atomic(bv.bv_page);
@@ -664,7 +664,8 @@ map_bio:
 	return DM_MAPIO_REMAPPED;
 }
 
-static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int normal_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	struct log_writes_c *lc = ti->private;
 	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
@@ -686,7 +687,7 @@ static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
 		spin_unlock_irqrestore(&lc->blocks_lock, flags);
 	}
 
-	return error;
+	return DM_ENDIO_DONE;
 }
 
 /*
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 926a6bcb32c8..0e8ab5bb3575 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -447,7 +447,7 @@ failed:
  * it has been invoked.
  */
 #define dm_report_EIO(m)						\
-({									\
+do {									\
 	struct mapped_device *md = dm_table_get_md((m)->ti->table);	\
 									\
 	pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \
@@ -455,8 +455,7 @@ failed:
 		 test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags),	\
 		 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags),	\
 		 dm_noflush_suspending((m)->ti));			\
-	-EIO;								\
-})
+} while (0)
 
 /*
  * Map cloned requests (request-based multipath)
@@ -481,7 +480,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
 	if (!pgpath) {
 		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
 			return DM_MAPIO_DELAY_REQUEUE;
-		return dm_report_EIO(m);	/* Failed */
+		dm_report_EIO(m);	/* Failed */
+		return DM_MAPIO_KILL;
 	} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
 		   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
 		if (pg_init_all_paths(m))
@@ -558,13 +558,14 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
 	if (!pgpath) {
 		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
 			return DM_MAPIO_REQUEUE;
-		return dm_report_EIO(m);
+		dm_report_EIO(m);
+		return DM_MAPIO_KILL;
 	}
 
 	mpio->pgpath = pgpath;
 	mpio->nr_bytes = nr_bytes;
 
-	bio->bi_error = 0;
+	bio->bi_status = 0;
 	bio->bi_bdev = pgpath->path.dev->bdev;
 	bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
 
@@ -620,11 +621,19 @@ static void process_queued_bios(struct work_struct *work)
 	blk_start_plug(&plug);
 	while ((bio = bio_list_pop(&bios))) {
 		r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio));
-		if (r < 0 || r == DM_MAPIO_REQUEUE) {
-			bio->bi_error = r;
+		switch (r) {
+		case DM_MAPIO_KILL:
+			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
-		} else if (r == DM_MAPIO_REMAPPED)
+			break;
+		case DM_MAPIO_REQUEUE:
+			bio->bi_status = BLK_STS_DM_REQUEUE;
+			bio_endio(bio);
+			break;
+		case DM_MAPIO_REMAPPED:
 			generic_make_request(bio);
+			break;
+		}
 	}
 	blk_finish_plug(&plug);
 }
@@ -1441,22 +1450,15 @@ static void activate_path_work(struct work_struct *work)
 	activate_or_offline_path(pgpath);
 }
 
-static int noretry_error(int error)
+static int noretry_error(blk_status_t error)
 {
 	switch (error) {
-	case -EBADE:
-		/*
-		 * EBADE signals an reservation conflict.
-		 * We shouldn't fail the path here as we can communicate with
-		 * the target.  We should failover to the next path, but in
-		 * doing so we might be causing a ping-pong between paths.
-		 * So just return the reservation conflict error.
-		 */
-	case -EOPNOTSUPP:
-	case -EREMOTEIO:
-	case -EILSEQ:
-	case -ENODATA:
-	case -ENOSPC:
+	case BLK_STS_NOTSUPP:
+	case BLK_STS_NOSPC:
+	case BLK_STS_TARGET:
+	case BLK_STS_NEXUS:
+	case BLK_STS_MEDIUM:
+	case BLK_STS_RESOURCE:
 		return 1;
 	}
 
@@ -1465,7 +1467,7 @@ static int noretry_error(int error)
 }
 
 static int multipath_end_io(struct dm_target *ti, struct request *clone,
-			    int error, union map_info *map_context)
+			    blk_status_t error, union map_info *map_context)
 {
 	struct dm_mpath_io *mpio = get_mpio(map_context);
 	struct pgpath *pgpath = mpio->pgpath;
@@ -1492,8 +1494,8 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 
 		if (atomic_read(&m->nr_valid_paths) == 0 &&
 		    !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-			if (error == -EIO)
-				error = dm_report_EIO(m);
+			if (error == BLK_STS_IOERR)
+				dm_report_EIO(m);
 			/* complete with the original error */
 			r = DM_ENDIO_DONE;
 		}
@@ -1509,23 +1511,27 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 	return r;
 }
 
-static int do_end_io_bio(struct multipath *m, struct bio *clone,
-			 int error, struct dm_mpath_io *mpio)
+static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
+		blk_status_t *error)
 {
+	struct multipath *m = ti->private;
+	struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
+	struct pgpath *pgpath = mpio->pgpath;
 	unsigned long flags;
+	int r = DM_ENDIO_DONE;
 
-	if (!error)
-		return 0;	/* I/O complete */
-
-	if (noretry_error(error))
-		return error;
+	if (!*error || noretry_error(*error))
+		goto done;
 
-	if (mpio->pgpath)
-		fail_path(mpio->pgpath);
+	if (pgpath)
+		fail_path(pgpath);
 
 	if (atomic_read(&m->nr_valid_paths) == 0 &&
-	    !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
-		return dm_report_EIO(m);
+	    !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+		dm_report_EIO(m);
+		*error = BLK_STS_IOERR;
+		goto done;
+	}
 
 	/* Queue for the daemon to resubmit */
 	dm_bio_restore(get_bio_details_from_bio(clone), clone);
@@ -1536,23 +1542,11 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone,
 	if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
 		queue_work(kmultipathd, &m->process_queued_bios);
 
-	return DM_ENDIO_INCOMPLETE;
-}
-
-static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error)
-{
-	struct multipath *m = ti->private;
-	struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
-	struct pgpath *pgpath;
-	struct path_selector *ps;
-	int r;
-
-	BUG_ON(!mpio);
-
-	r = do_end_io_bio(m, clone, error, mpio);
-	pgpath = mpio->pgpath;
+	r = DM_ENDIO_INCOMPLETE;
+done:
 	if (pgpath) {
-		ps = &pgpath->pg->ps;
+		struct path_selector *ps = &pgpath->pg->ps;
+
 		if (ps->type->end_io)
 			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
 	}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 7d893228c40f..2e10c2f13a34 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1571,7 +1571,7 @@ static sector_t __rdev_sectors(struct raid_set *rs)
 			return rdev->sectors;
 	}
 
-	BUG(); /* Constructor ensures we got some. */
+	return 0;
 }
 
 /* Calculate the sectors per device and per array used for @rs */
@@ -1927,7 +1927,7 @@ struct dm_raid_superblock {
 	/********************************************************************
 	 * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
 	 *
-	 * FEATURE_FLAG_SUPPORTS_V190 in the features member indicates that those exist
+	 * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
 	 */
 
 	__le32 flags; /* Flags defining array states for reshaping */
@@ -2092,6 +2092,11 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 	sb->layout = cpu_to_le32(mddev->layout);
 	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
 
+	/********************************************************************
+	 * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
+	 *
+	 * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
+	 */
 	sb->new_level = cpu_to_le32(mddev->new_level);
 	sb->new_layout = cpu_to_le32(mddev->new_layout);
 	sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);
@@ -2438,8 +2443,14 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 	mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
 
 	if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
-		/* Retrieve device size stored in superblock to be prepared for shrink */
-		rdev->sectors = le64_to_cpu(sb->sectors);
+		/*
+		 * Retrieve rdev size stored in superblock to be prepared for shrink.
+		 * Check extended superblock members are present otherwise the size
+		 * will not be set!
+		 */
+		if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190)
+			rdev->sectors = le64_to_cpu(sb->sectors);
+
 		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
 		if (rdev->recovery_offset == MaxSector)
 			set_bit(In_sync, &rdev->flags);
@@ -2930,7 +2941,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	bool resize;
 	struct raid_type *rt;
 	unsigned int num_raid_params, num_raid_devs;
-	sector_t calculated_dev_sectors;
+	sector_t calculated_dev_sectors, rdev_sectors;
 	struct raid_set *rs = NULL;
 	const char *arg;
 	struct rs_layout rs_layout;
@@ -3006,7 +3017,14 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	if (r)
 		goto bad;
 
-	resize = calculated_dev_sectors != __rdev_sectors(rs);
+	rdev_sectors = __rdev_sectors(rs);
+	if (!rdev_sectors) {
+		ti->error = "Invalid rdev size";
+		r = -EINVAL;
+		goto bad;
+	}
+
+	resize = calculated_dev_sectors != rdev_sectors;
 
 	INIT_WORK(&rs->md.event_work, do_table_event);
 	ti->private = rs;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index a95cbb80fb34..a4fbd911d566 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -145,6 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list)
 
 struct dm_raid1_bio_record {
 	struct mirror *m;
+	/* if details.bi_bdev == NULL, details were not saved */
 	struct dm_bio_details details;
 	region_t write_region;
 };
@@ -260,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
 	struct mirror *m;
 	struct dm_io_request io_req = {
 		.bi_op = REQ_OP_WRITE,
-		.bi_op_flags = REQ_PREFLUSH,
+		.bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.addr = NULL,
 		.client = ms->io_client,
@@ -490,9 +491,9 @@ static void hold_bio(struct mirror_set *ms, struct bio *bio)
 		 * If device is suspended, complete the bio.
 		 */
 		if (dm_noflush_suspending(ms->ti))
-			bio->bi_error = DM_ENDIO_REQUEUE;
+			bio->bi_status = BLK_STS_DM_REQUEUE;
 		else
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 
 		bio_endio(bio);
 		return;
@@ -626,7 +627,7 @@ static void write_callback(unsigned long error, void *context)
 	 * degrade the array.
 	 */
 	if (bio_op(bio) == REQ_OP_DISCARD) {
-		bio->bi_error = -EOPNOTSUPP;
+		bio->bi_status = BLK_STS_NOTSUPP;
 		bio_endio(bio);
 		return;
 	}
@@ -1198,6 +1199,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
 	struct dm_raid1_bio_record *bio_record =
 	  dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
 
+	bio_record->details.bi_bdev = NULL;
+
 	if (rw == WRITE) {
 		/* Save region for mirror_end_io() handler */
 		bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio);
@@ -1207,14 +1210,14 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
 
 	r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0);
 	if (r < 0 && r != -EWOULDBLOCK)
-		return r;
+		return DM_MAPIO_KILL;
 
 	/*
 	 * If region is not in-sync queue the bio.
 	 */
 	if (!r || (r == -EWOULDBLOCK)) {
 		if (bio->bi_opf & REQ_RAHEAD)
-			return -EWOULDBLOCK;
+			return DM_MAPIO_KILL;
 
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
@@ -1226,7 +1229,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
 	 */
 	m = choose_mirror(ms, bio->bi_iter.bi_sector);
 	if (unlikely(!m))
-		return -EIO;
+		return DM_MAPIO_KILL;
 
 	dm_bio_record(&bio_record->details, bio);
 	bio_record->m = m;
@@ -1236,7 +1239,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_REMAPPED;
 }
 
-static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int mirror_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	int rw = bio_data_dir(bio);
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
@@ -1252,16 +1256,26 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 		if (!(bio->bi_opf & REQ_PREFLUSH) &&
 		    bio_op(bio) != REQ_OP_DISCARD)
 			dm_rh_dec(ms->rh, bio_record->write_region);
-		return error;
+		return DM_ENDIO_DONE;
 	}
 
-	if (error == -EOPNOTSUPP)
-		return error;
+	if (*error == BLK_STS_NOTSUPP)
+		goto out;
+
+	if (bio->bi_opf & REQ_RAHEAD)
+		goto out;
 
-	if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD))
-		return error;
+	if (unlikely(*error)) {
+		if (!bio_record->details.bi_bdev) {
+			/*
+			 * There wasn't enough memory to record necessary
+			 * information for a retry or there was no other
+			 * mirror in-sync.
+			 */
+			DMERR_LIMIT("Mirror read failed.");
+			return DM_ENDIO_DONE;
+		}
 
-	if (unlikely(error)) {
 		m = bio_record->m;
 
 		DMERR("Mirror read failed from %s. Trying alternative device.",
@@ -1277,7 +1291,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 			bd = &bio_record->details;
 
 			dm_bio_restore(bd, bio);
-			bio->bi_error = 0;
+			bio_record->details.bi_bdev = NULL;
+			bio->bi_status = 0;
 
 			queue_bio(ms, bio, rw);
 			return DM_ENDIO_INCOMPLETE;
@@ -1285,7 +1300,10 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 		DMERR("All replicated volumes dead, failing I/O");
 	}
 
-	return error;
+out:
+	bio_record->details.bi_bdev = NULL;
+
+	return DM_ENDIO_DONE;
 }
 
 static void mirror_presuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 2af27026aa2e..c6ebc5b1e00e 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -71,7 +71,7 @@ static void dm_old_start_queue(struct request_queue *q)
 
 static void dm_mq_start_queue(struct request_queue *q)
 {
-	blk_mq_start_stopped_hw_queues(q, true);
+	blk_mq_unquiesce_queue(q);
 	blk_mq_kick_requeue_list(q);
 }
 
@@ -119,7 +119,7 @@ static void end_clone_bio(struct bio *clone)
 	struct dm_rq_target_io *tio = info->tio;
 	struct bio *bio = info->orig;
 	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
-	int error = clone->bi_error;
+	blk_status_t error = clone->bi_status;
 
 	bio_put(clone);
 
@@ -158,7 +158,7 @@ static void end_clone_bio(struct bio *clone)
 	 * Do not use blk_end_request() here, because it may complete
 	 * the original request before the clone, and break the ordering.
 	 */
-	blk_update_request(tio->orig, 0, nr_bytes);
+	blk_update_request(tio->orig, BLK_STS_OK, nr_bytes);
 }
 
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
@@ -216,7 +216,7 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
  * Must be called without clone's queue lock held,
  * see end_clone_request() for more details.
  */
-static void dm_end_request(struct request *clone, int error)
+static void dm_end_request(struct request *clone, blk_status_t error)
 {
 	int rw = rq_data_dir(clone);
 	struct dm_rq_target_io *tio = clone->end_io_data;
@@ -285,7 +285,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
 	rq_completed(md, rw, false);
 }
 
-static void dm_done(struct request *clone, int error, bool mapped)
+static void dm_done(struct request *clone, blk_status_t error, bool mapped)
 {
 	int r = DM_ENDIO_DONE;
 	struct dm_rq_target_io *tio = clone->end_io_data;
@@ -298,7 +298,7 @@ static void dm_done(struct request *clone, int error, bool mapped)
 			r = rq_end_io(tio->ti, clone, error, &tio->info);
 	}
 
-	if (unlikely(error == -EREMOTEIO)) {
+	if (unlikely(error == BLK_STS_TARGET)) {
 		if (req_op(clone) == REQ_OP_WRITE_SAME &&
 		    !clone->q->limits.max_write_same_sectors)
 			disable_write_same(tio->md);
@@ -358,7 +358,7 @@ static void dm_softirq_done(struct request *rq)
  * Complete the clone and the original request with the error status
  * through softirq context.
  */
-static void dm_complete_request(struct request *rq, int error)
+static void dm_complete_request(struct request *rq, blk_status_t error)
 {
 	struct dm_rq_target_io *tio = tio_from_request(rq);
 
@@ -375,7 +375,7 @@ static void dm_complete_request(struct request *rq, int error)
  * Target's rq_end_io() function isn't called.
  * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
  */
-static void dm_kill_unmapped_request(struct request *rq, int error)
+static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
 {
 	rq->rq_flags |= RQF_FAILED;
 	dm_complete_request(rq, error);
@@ -384,7 +384,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
 /*
  * Called with the clone's queue lock held (in the case of .request_fn)
  */
-static void end_clone_request(struct request *clone, int error)
+static void end_clone_request(struct request *clone, blk_status_t error)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
@@ -401,7 +401,7 @@ static void end_clone_request(struct request *clone, int error)
 
 static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 {
-	int r;
+	blk_status_t r;
 
 	if (blk_queue_io_stat(clone->q))
 		clone->rq_flags |= RQF_IO_STAT;
@@ -506,7 +506,8 @@ static int map_request(struct dm_rq_target_io *tio)
 		break;
 	case DM_MAPIO_KILL:
 		/* The target wants to complete the I/O */
-		dm_kill_unmapped_request(rq, -EIO);
+		dm_kill_unmapped_request(rq, BLK_STS_IOERR);
+		break;
 	default:
 		DMWARN("unimplemented target map return value: %d", r);
 		BUG();
@@ -726,7 +727,7 @@ static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
 	return __dm_rq_init_rq(set->driver_data, rq);
 }
 
-static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 			  const struct blk_mq_queue_data *bd)
 {
 	struct request *rq = bd->rq;
@@ -743,7 +744,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}
 
 	if (ti->type->busy && ti->type->busy(ti))
-		return BLK_MQ_RQ_QUEUE_BUSY;
+		return BLK_STS_RESOURCE;
 
 	dm_start_request(md, rq);
 
@@ -761,10 +762,10 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 		rq_end_stats(md, rq);
 		rq_completed(md, rq_data_dir(rq), false);
 		blk_mq_delay_run_hw_queue(hctx, 100/*ms*/);
-		return BLK_MQ_RQ_QUEUE_BUSY;
+		return BLK_STS_RESOURCE;
 	}
 
-	return BLK_MQ_RQ_QUEUE_OK;
+	return BLK_STS_OK;
 }
 
 static const struct blk_mq_ops dm_mq_ops = {
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
index f0020d21b95f..9813922e4fe5 100644
--- a/drivers/md/dm-rq.h
+++ b/drivers/md/dm-rq.h
@@ -24,7 +24,7 @@ struct dm_rq_target_io {
 	struct dm_target *ti;
 	struct request *orig, *clone;
 	struct kthread_work work;
-	int error;
+	blk_status_t error;
 	union map_info info;
 	struct dm_stats_aux stats_aux;
 	unsigned long duration_jiffies;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index b93476c3ba3f..c5534d294773 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -741,7 +741,8 @@ static void persistent_commit_exception(struct dm_exception_store *store,
 	/*
 	 * Commit exceptions to disk.
 	 */
-	if (ps->valid && area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA))
+	if (ps->valid && area_io(ps, REQ_OP_WRITE,
+				 REQ_PREFLUSH | REQ_FUA | REQ_SYNC))
 		ps->valid = 0;
 
 	/*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index e152d9817c81..1ba41048b438 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1590,7 +1590,7 @@ static void full_bio_end_io(struct bio *bio)
 {
 	void *callback_data = bio->bi_private;
 
-	dm_kcopyd_do_callback(callback_data, 0, bio->bi_error ? 1 : 0);
+	dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
 }
 
 static void start_full_bio(struct dm_snap_pending_exception *pe,
@@ -1690,7 +1690,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 	/* Full snapshots are not usable */
 	/* To get here the table must be live so s->active is always set. */
 	if (!s->valid)
-		return -EIO;
+		return DM_MAPIO_KILL;
 
 	/* FIXME: should only take write lock if we need
 	 * to copy an exception */
@@ -1698,7 +1698,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 
 	if (!s->valid || (unlikely(s->snapshot_overflowed) &&
 	    bio_data_dir(bio) == WRITE)) {
-		r = -EIO;
+		r = DM_MAPIO_KILL;
 		goto out_unlock;
 	}
 
@@ -1723,7 +1723,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 
 			if (!s->valid || s->snapshot_overflowed) {
 				free_pending_exception(pe);
-				r = -EIO;
+				r = DM_MAPIO_KILL;
 				goto out_unlock;
 			}
 
@@ -1741,7 +1741,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 					DMERR("Snapshot overflowed: Unable to allocate exception.");
 				} else
 					__invalidate_snapshot(s, -ENOMEM);
-				r = -EIO;
+				r = DM_MAPIO_KILL;
 				goto out_unlock;
 			}
 		}
@@ -1851,14 +1851,15 @@ out_unlock:
 	return r;
 }
 
-static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	struct dm_snapshot *s = ti->private;
 
 	if (is_bio_tracked(bio))
 		stop_tracking_chunk(s, bio);
 
-	return 0;
+	return DM_ENDIO_DONE;
 }
 
 static void snapshot_merge_presuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 75152482f3ad..a0375530b07f 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -332,6 +332,44 @@ static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 	return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
 }
 
+static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct stripe_c *sc = ti->private;
+	sector_t stripe_sector;
+	uint32_t idx;
+
+	/* stripe_map_sector() picks the stripe index and sector to use. */
+	stripe_map_sector(sc, pgoff * PAGE_SECTORS, &idx, &stripe_sector);
+	stripe_sector += sc->stripe[idx].physical_start;
+
+	if (bdev_dax_pgoff(sc->stripe[idx].dev->bdev, stripe_sector,
+			   ALIGN(bytes, PAGE_SIZE), &pgoff))
+		return 0;
+
+	return dax_copy_from_iter(sc->stripe[idx].dev->dax_dev, pgoff, addr,
+				  bytes, i);
+}
+
+static void stripe_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr,
+		size_t size)
+{
+	struct stripe_c *sc = ti->private;
+	struct dax_device *dax_dev;
+	sector_t stripe_sector;
+	uint32_t idx;
+
+	/* stripe_map_sector() picks the stripe index and sector to use. */
+	stripe_map_sector(sc, pgoff * PAGE_SECTORS, &idx, &stripe_sector);
+	stripe_sector += sc->stripe[idx].physical_start;
+	dax_dev = sc->stripe[idx].dev->dax_dev;
+
+	if (bdev_dax_pgoff(sc->stripe[idx].dev->bdev, stripe_sector,
+			   ALIGN(size, PAGE_SIZE), &pgoff))
+		return;
+	dax_flush(dax_dev, pgoff, addr, size);
+}
+
 /*
  * Stripe status:
  *
@@ -375,20 +413,21 @@ static void stripe_status(struct dm_target *ti, status_type_t type,
 	}
 }
 
-static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
+static int stripe_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	unsigned i;
 	char major_minor[16];
 	struct stripe_c *sc = ti->private;
 
-	if (!error)
-		return 0; /* I/O complete */
+	if (!*error)
+		return DM_ENDIO_DONE; /* I/O complete */
 
-	if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD))
-		return error;
+	if (bio->bi_opf & REQ_RAHEAD)
+		return DM_ENDIO_DONE;
 
-	if (error == -EOPNOTSUPP)
-		return error;
+	if (*error == BLK_STS_NOTSUPP)
+		return DM_ENDIO_DONE;
 
 	memset(major_minor, 0, sizeof(major_minor));
 	sprintf(major_minor, "%d:%d",
@@ -409,7 +448,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
 				schedule_work(&sc->trigger_event);
 		}
 
-	return error;
+	return DM_ENDIO_DONE;
 }
 
 static int stripe_iterate_devices(struct dm_target *ti,
@@ -451,6 +490,8 @@ static struct target_type stripe_target = {
 	.iterate_devices = stripe_iterate_devices,
 	.io_hints = stripe_io_hints,
 	.direct_access = stripe_dax_direct_access,
+	.dax_copy_from_iter = stripe_dax_copy_from_iter,
+	.dax_flush = stripe_dax_flush,
 };
 
 int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5f5eae41f804..a39bcd9b982a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -319,6 +319,39 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 		return 1;
 	}
 
+	/*
+	 * If the target is mapped to zoned block device(s), check
+	 * that the zones are not partially mapped.
+	 */
+	if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) {
+		unsigned int zone_sectors = bdev_zone_sectors(bdev);
+
+		if (start & (zone_sectors - 1)) {
+			DMWARN("%s: start=%llu not aligned to h/w zone size %u of %s",
+			       dm_device_name(ti->table->md),
+			       (unsigned long long)start,
+			       zone_sectors, bdevname(bdev, b));
+			return 1;
+		}
+
+		/*
+		 * Note: The last zone of a zoned block device may be smaller
+		 * than other zones. So for a target mapping the end of a
+		 * zoned block device with such a zone, len would not be zone
+		 * aligned. We do not allow such last smaller zone to be part
+		 * of the mapping here to ensure that mappings with multiple
+		 * devices do not end up with a smaller zone in the middle of
+		 * the sector range.
+		 */
+		if (len & (zone_sectors - 1)) {
+			DMWARN("%s: len=%llu not aligned to h/w zone size %u of %s",
+			       dm_device_name(ti->table->md),
+			       (unsigned long long)len,
+			       zone_sectors, bdevname(bdev, b));
+			return 1;
+		}
+	}
+
 	if (logical_block_size_sectors <= 1)
 		return 0;
 
@@ -456,6 +489,8 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 		       q->limits.alignment_offset,
 		       (unsigned long long) start << SECTOR_SHIFT);
 
+	limits->zoned = blk_queue_zoned_model(q);
+
 	return 0;
 }
 
@@ -1346,6 +1381,88 @@ bool dm_table_has_no_data_devices(struct dm_table *table)
 	return true;
 }
 
+/* iterate_devices callout: true if dev's queue has the zoned model *data */
+static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
+				 sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && blk_queue_zoned_model(q) == *(enum blk_zoned_model *)data;
+}
+
+static bool dm_table_supports_zoned_model(struct dm_table *t,
+					  enum blk_zoned_model zoned_model)
+{
+	unsigned i;
+
+	/* Fail unless each target's device callout reports a matching model */
+	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+		struct dm_target *ti = dm_table_get_target(t, i);
+
+		if (zoned_model == BLK_ZONED_HM &&
+		    !dm_target_supports_zoned_hm(ti->type))
+			return false;
+
+		if (!ti->type->iterate_devices ||
+		    !ti->type->iterate_devices(ti, device_is_zoned_model, &zoned_model))
+			return false;
+	}
+
+	return true;
+}
+
+/* iterate_devices callout: true if dev's queue zone size equals *data */
+static int device_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev,
+				       sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && blk_queue_zone_sectors(q) == *(unsigned int *)data;
+}
+
+static bool dm_table_matches_zone_sectors(struct dm_table *t,
+					  unsigned int zone_sectors)
+{
+	unsigned i;
+
+	/* Fail unless each target's device callout reports this zone size */
+	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+		struct dm_target *ti = dm_table_get_target(t, i);
+
+		if (!ti->type->iterate_devices ||
+		    !ti->type->iterate_devices(ti, device_matches_zone_sectors, &zone_sectors))
+			return false;
+	}
+
+	return true;
+}
+
+static int validate_hardware_zoned_model(struct dm_table *table,
+					 enum blk_zoned_model zoned_model,
+					 unsigned int zone_sectors)
+{
+	if (zoned_model == BLK_ZONED_NONE)
+		return 0;
+
+	if (!dm_table_supports_zoned_model(table, zoned_model)) {
+		DMERR("%s: zoned model is not consistent across all devices",
+		      dm_device_name(table->md));
+		return -EINVAL;
+	}
+
+	/* Zone size must be a non-zero power of 2 matched by the table devices */
+	if (!zone_sectors || !is_power_of_2(zone_sectors))
+		return -EINVAL;
+
+	if (!dm_table_matches_zone_sectors(table, zone_sectors)) {
+		DMERR("%s: zone sectors is not consistent across all devices",
+		      dm_device_name(table->md));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /*
  * Establish the new table's queue_limits and validate them.
  */
@@ -1355,6 +1472,8 @@ int dm_calculate_queue_limits(struct dm_table *table,
 	struct dm_target *ti;
 	struct queue_limits ti_limits;
 	unsigned i;
+	enum blk_zoned_model zoned_model = BLK_ZONED_NONE;
+	unsigned int zone_sectors = 0;
 
 	blk_set_stacking_limits(limits);
 
@@ -1372,6 +1491,15 @@ int dm_calculate_queue_limits(struct dm_table *table,
 		ti->type->iterate_devices(ti, dm_set_device_limits,
 					  &ti_limits);
 
+		if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
+			/*
+			 * After stacking all limits, validate all devices
+			 * in table support this zoned model and zone sectors.
+			 */
+			zoned_model = ti_limits.zoned;
+			zone_sectors = ti_limits.chunk_sectors;
+		}
+
 		/* Set I/O hints portion of queue limits */
 		if (ti->type->io_hints)
 			ti->type->io_hints(ti, &ti_limits);
@@ -1396,8 +1524,42 @@ combine_limits:
 			       dm_device_name(table->md),
 			       (unsigned long long) ti->begin,
 			       (unsigned long long) ti->len);
+
+		/*
+		 * FIXME: this should likely be moved to blk_stack_limits(), would
+		 * also eliminate limits->zoned stacking hack in dm_set_device_limits()
+		 */
+		if (limits->zoned == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
+			/*
+			 * By default, the stacked limits zoned model is set to
+			 * BLK_ZONED_NONE in blk_set_stacking_limits(). Update
+			 * this model using the first target model reported
+			 * that is not BLK_ZONED_NONE. This will be either the
+			 * first target device zoned model or the model reported
+			 * by the target .io_hints.
+			 */
+			limits->zoned = ti_limits.zoned;
+		}
 	}
 
+	/*
+	 * Verify that the zoned model and zone sectors, as determined before
+	 * any .io_hints override, are the same across all devices in the table.
+	 * - this is especially relevant if .io_hints is emulating a disk-managed
+	 *   zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices.
+	 * BUT...
+	 */
+	if (limits->zoned != BLK_ZONED_NONE) {
+		/*
+		 * ...IF the above limits stacking determined a zoned model
+		 * validate that all of the table's devices conform to it.
+		 */
+		zoned_model = limits->zoned;
+		zone_sectors = limits->chunk_sectors;
+	}
+	if (validate_hardware_zoned_model(table, zoned_model, zone_sectors))
+		return -EINVAL;
+
 	return validate_hardware_logical_block_alignment(table, limits);
 }
 
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index b242b750542f..c0d7e60820c4 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -128,7 +128,7 @@ static void io_err_dtr(struct dm_target *tt)
 
 static int io_err_map(struct dm_target *tt, struct bio *bio)
 {
-	return -EIO;
+	return DM_MAPIO_KILL;
 }
 
 static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 0f0251d0d337..d31d18d9727c 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -484,11 +484,11 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
 	if (r < 0)
 		return r;
 
-	r = save_sm_roots(pmd);
+	r = dm_tm_pre_commit(pmd->tm);
 	if (r < 0)
 		return r;
 
-	r = dm_tm_pre_commit(pmd->tm);
+	r = save_sm_roots(pmd);
 	if (r < 0)
 		return r;
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 17ad50daed08..9dec2f8cc739 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -383,8 +383,8 @@ static void end_discard(struct discard_op *op, int r)
 	 * Even if r is set, there could be sub discards in flight that we
 	 * need to wait for.
 	 */
-	if (r && !op->parent_bio->bi_error)
-		op->parent_bio->bi_error = r;
+	if (r && !op->parent_bio->bi_status)
+		op->parent_bio->bi_status = errno_to_blk_status(r);
 	bio_endio(op->parent_bio);
 }
 
@@ -450,22 +450,20 @@ static void cell_release_no_holder(struct pool *pool,
 }
 
 static void cell_error_with_code(struct pool *pool,
-				 struct dm_bio_prison_cell *cell, int error_code)
+		struct dm_bio_prison_cell *cell, blk_status_t error_code)
 {
 	dm_cell_error(pool->prison, cell, error_code);
 	dm_bio_prison_free_cell(pool->prison, cell);
 }
 
-static int get_pool_io_error_code(struct pool *pool)
+static blk_status_t get_pool_io_error_code(struct pool *pool)
 {
-	return pool->out_of_data_space ? -ENOSPC : -EIO;
+	return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
 }
 
 static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
 {
-	int error = get_pool_io_error_code(pool);
-
-	cell_error_with_code(pool, cell, error);
+	cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
 }
 
 static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
@@ -475,7 +473,7 @@ static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
 
 static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
 {
-	cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
+	cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
 }
 
 /*----------------------------------------------------------------*/
@@ -555,17 +553,18 @@ static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
 	bio_list_init(master);
 }
 
-static void error_bio_list(struct bio_list *bios, int error)
+static void error_bio_list(struct bio_list *bios, blk_status_t error)
 {
 	struct bio *bio;
 
 	while ((bio = bio_list_pop(bios))) {
-		bio->bi_error = error;
+		bio->bi_status = error;
 		bio_endio(bio);
 	}
 }
 
-static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
+static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
+		blk_status_t error)
 {
 	struct bio_list bios;
 	unsigned long flags;
@@ -608,11 +607,11 @@ static void requeue_io(struct thin_c *tc)
 	__merge_bio_list(&bios, &tc->retry_on_resume_list);
 	spin_unlock_irqrestore(&tc->lock, flags);
 
-	error_bio_list(&bios, DM_ENDIO_REQUEUE);
+	error_bio_list(&bios, BLK_STS_DM_REQUEUE);
 	requeue_deferred_cells(tc);
 }
 
-static void error_retry_list_with_code(struct pool *pool, int error)
+static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
 {
 	struct thin_c *tc;
 
@@ -624,9 +623,7 @@ static void error_retry_list_with_code(struct pool *pool, int error)
 
 static void error_retry_list(struct pool *pool)
 {
-	int error = get_pool_io_error_code(pool);
-
-	error_retry_list_with_code(pool, error);
+	error_retry_list_with_code(pool, get_pool_io_error_code(pool));
 }
 
 /*
@@ -774,7 +771,7 @@ struct dm_thin_new_mapping {
 	 */
 	atomic_t prepare_actions;
 
-	int err;
+	blk_status_t status;
 	struct thin_c *tc;
 	dm_block_t virt_begin, virt_end;
 	dm_block_t data_block;
@@ -814,7 +811,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
 {
 	struct dm_thin_new_mapping *m = context;
 
-	m->err = read_err || write_err ? -EIO : 0;
+	m->status = read_err || write_err ? BLK_STS_IOERR : 0;
 	complete_mapping_preparation(m);
 }
 
@@ -825,7 +822,7 @@ static void overwrite_endio(struct bio *bio)
 
 	bio->bi_end_io = m->saved_bi_end_io;
 
-	m->err = bio->bi_error;
+	m->status = bio->bi_status;
 	complete_mapping_preparation(m);
 }
 
@@ -925,7 +922,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	struct bio *bio = m->bio;
 	int r;
 
-	if (m->err) {
+	if (m->status) {
 		cell_error(pool, m->cell);
 		goto out;
 	}
@@ -1094,6 +1091,19 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
 		return;
 	}
 
+	/*
+	 * Increment the unmapped blocks.  This prevents a race between the
+	 * passdown io and reallocation of freed blocks.
+	 */
+	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
+	if (r) {
+		metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
+		bio_io_error(m->bio);
+		cell_defer_no_holder(tc, m->cell);
+		mempool_free(m, pool->mapping_pool);
+		return;
+	}
+
 	discard_parent = bio_alloc(GFP_NOIO, 1);
 	if (!discard_parent) {
 		DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
@@ -1114,19 +1124,6 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
 			end_discard(&op, r);
 		}
 	}
-
-	/*
-	 * Increment the unmapped blocks.  This prevents a race between the
-	 * passdown io and reallocation of freed blocks.
-	 */
-	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
-	if (r) {
-		metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
-		bio_io_error(m->bio);
-		cell_defer_no_holder(tc, m->cell);
-		mempool_free(m, pool->mapping_pool);
-		return;
-	}
 }
 
 static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
@@ -1495,7 +1492,7 @@ static void retry_on_resume(struct bio *bio)
 	spin_unlock_irqrestore(&tc->lock, flags);
 }
 
-static int should_error_unserviceable_bio(struct pool *pool)
+static blk_status_t should_error_unserviceable_bio(struct pool *pool)
 {
 	enum pool_mode m = get_pool_mode(pool);
 
@@ -1503,27 +1500,27 @@ static int should_error_unserviceable_bio(struct pool *pool)
 	case PM_WRITE:
 		/* Shouldn't get here */
 		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
-		return -EIO;
+		return BLK_STS_IOERR;
 
 	case PM_OUT_OF_DATA_SPACE:
-		return pool->pf.error_if_no_space ? -ENOSPC : 0;
+		return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
 
 	case PM_READ_ONLY:
 	case PM_FAIL:
-		return -EIO;
+		return BLK_STS_IOERR;
 	default:
 		/* Shouldn't get here */
 		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
-		return -EIO;
+		return BLK_STS_IOERR;
 	}
 }
 
 static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
 {
-	int error = should_error_unserviceable_bio(pool);
+	blk_status_t error = should_error_unserviceable_bio(pool);
 
 	if (error) {
-		bio->bi_error = error;
+		bio->bi_status = error;
 		bio_endio(bio);
 	} else
 		retry_on_resume(bio);
@@ -1533,7 +1530,7 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 {
 	struct bio *bio;
 	struct bio_list bios;
-	int error;
+	blk_status_t error;
 
 	error = should_error_unserviceable_bio(pool);
 	if (error) {
@@ -2071,7 +2068,8 @@ static void process_thin_deferred_bios(struct thin_c *tc)
 	unsigned count = 0;
 
 	if (tc->requeue_mode) {
-		error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE);
+		error_thin_bio_list(tc, &tc->deferred_bio_list,
+				BLK_STS_DM_REQUEUE);
 		return;
 	}
 
@@ -2322,7 +2320,7 @@ static void do_no_space_timeout(struct work_struct *ws)
 	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
 		pool->pf.error_if_no_space = true;
 		notify_of_pool_mode_change_to_oods(pool);
-		error_retry_list_with_code(pool, -ENOSPC);
+		error_retry_list_with_code(pool, BLK_STS_NOSPC);
 	}
 }
 
@@ -2624,7 +2622,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 	thin_hook_bio(tc, bio);
 
 	if (tc->requeue_mode) {
-		bio->bi_error = DM_ENDIO_REQUEUE;
+		bio->bi_status = BLK_STS_DM_REQUEUE;
 		bio_endio(bio);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -4177,7 +4175,8 @@ static int thin_map(struct dm_target *ti, struct bio *bio)
 	return thin_bio_map(ti, bio);
 }
 
-static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
+static int thin_endio(struct dm_target *ti, struct bio *bio,
+		blk_status_t *err)
 {
 	unsigned long flags;
 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -4212,7 +4211,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
 	if (h->cell)
 		cell_defer_no_holder(h->tc, h->cell);
 
-	return 0;
+	return DM_ENDIO_DONE;
 }
 
 static void thin_presuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 97de961a3bfc..b46705ebf01f 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -166,7 +166,7 @@ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
 		return r;
 	}
 
-	if (likely(v->version >= 1))
+	if (likely(v->salt_size && (v->version >= 1)))
 		r = verity_hash_update(v, req, v->salt, v->salt_size, res);
 
 	return r;
@@ -177,7 +177,7 @@ static int verity_hash_final(struct dm_verity *v, struct ahash_request *req,
 {
 	int r;
 
-	if (unlikely(!v->version)) {
+	if (unlikely(v->salt_size && (!v->version))) {
 		r = verity_hash_update(v, req, v->salt, v->salt_size, res);
 
 		if (r < 0) {
@@ -538,13 +538,13 @@ static int verity_verify_io(struct dm_verity_io *io)
 /*
  * End one "io" structure with a given error.
  */
-static void verity_finish_io(struct dm_verity_io *io, int error)
+static void verity_finish_io(struct dm_verity_io *io, blk_status_t status)
 {
 	struct dm_verity *v = io->v;
 	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
 
 	bio->bi_end_io = io->orig_bi_end_io;
-	bio->bi_error = error;
+	bio->bi_status = status;
 
 	verity_fec_finish_io(io);
 
@@ -555,15 +555,15 @@ static void verity_work(struct work_struct *w)
 {
 	struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
 
-	verity_finish_io(io, verity_verify_io(io));
+	verity_finish_io(io, errno_to_blk_status(verity_verify_io(io)));
 }
 
 static void verity_end_io(struct bio *bio)
 {
 	struct dm_verity_io *io = bio->bi_private;
 
-	if (bio->bi_error && !verity_fec_is_enabled(io->v)) {
-		verity_finish_io(io, bio->bi_error);
+	if (bio->bi_status && !verity_fec_is_enabled(io->v)) {
+		verity_finish_io(io, bio->bi_status);
 		return;
 	}
 
@@ -643,17 +643,17 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
 	if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
 	    ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
 		DMERR_LIMIT("unaligned io");
-		return -EIO;
+		return DM_MAPIO_KILL;
 	}
 
 	if (bio_end_sector(bio) >>
 	    (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
 		DMERR_LIMIT("io out of range");
-		return -EIO;
+		return DM_MAPIO_KILL;
 	}
 
 	if (bio_data_dir(bio) == WRITE)
-		return -EIO;
+		return DM_MAPIO_KILL;
 
 	io = dm_per_bio_data(bio, ti->per_io_data_size);
 	io->v = v;
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index b616f11d8473..b65ca8dcfbdc 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -39,7 +39,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio)
 	case REQ_OP_READ:
 		if (bio->bi_opf & REQ_RAHEAD) {
 			/* readahead of null bytes only wastes buffer cache */
-			return -EIO;
+			return DM_MAPIO_KILL;
 		}
 		zero_fill_bio(bio);
 		break;
@@ -47,7 +47,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio)
 		/* writes get silently dropped */
 		break;
 	default:
-		return -EIO;
+		return DM_MAPIO_KILL;
 	}
 
 	bio_endio(bio);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
new file mode 100644
index 000000000000..884ff7c170a0
--- /dev/null
+++ b/drivers/md/dm-zoned-metadata.c
@@ -0,0 +1,2509 @@
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-zoned.h"
+
+#include <linux/module.h>
+#include <linux/crc32.h>
+
+#define	DM_MSG_PREFIX		"zoned metadata"
+
+/*
+ * Metadata version.
+ */
+#define DMZ_META_VER	1
+
+/*
+ * On-disk super block magic.
+ */
+#define DMZ_MAGIC	((((unsigned int)('D')) << 24) | \
+			 (((unsigned int)('Z')) << 16) | \
+			 (((unsigned int)('B')) <<  8) | \
+			 ((unsigned int)('D')))
+
+/*
+ * On disk super block.
+ * Only 512 B are used, but the super block occupies a full 4KB block on
+ * disk. This block is followed on disk by the mapping table of chunks to
+ * zones and the bitmap blocks indicating zone block validity.
+ * The overall resulting metadata format is:
+ *    (1) Super block (1 block)
+ *    (2) Chunk mapping table (nr_map_blocks)
+ *    (3) Bitmap blocks (nr_bitmap_blocks)
+ * All metadata blocks are stored in conventional zones, starting from the
+ * first conventional zone found on disk.
+ */
+struct dmz_super {
+	/* Magic number */
+	__le32		magic;			/*   4 */
+
+	/* Metadata version number */
+	__le32		version;		/*   8 */
+
+	/* Generation number */
+	__le64		gen;			/*  16 */
+
+	/* This block number */
+	__le64		sb_block;		/*  24 */
+
+	/* The number of metadata blocks, including this super block */
+	__le32		nr_meta_blocks;		/*  28 */
+
+	/* The number of sequential zones reserved for reclaim */
+	__le32		nr_reserved_seq;	/*  32 */
+
+	/* The number of entries in the mapping table */
+	__le32		nr_chunks;		/*  36 */
+
+	/* The number of blocks used for the chunk mapping table */
+	__le32		nr_map_blocks;		/*  40 */
+
+	/* The number of blocks used for the block bitmaps */
+	__le32		nr_bitmap_blocks;	/*  44 */
+
+	/* Checksum */
+	__le32		crc;			/*  48 */
+
+	/* Padding to full 512B sector */
+	u8		reserved[464];		/* 512 */
+};
+
+/*
+ * Chunk mapping entry: entries are indexed by chunk number
+ * and give the zone ID (dzone_id) mapping the chunk on disk.
+ * This zone may be sequential or random. If it is a sequential
+ * zone, a second zone (bzone_id) used as a write buffer may
+ * also be specified. This second zone will always be a randomly
+ * writeable zone.
+ */
+struct dmz_map {
+	__le32			dzone_id;
+	__le32			bzone_id;
+};
+
+/*
+ * Chunk mapping table metadata: 512 8-bytes entries per 4KB block.
+ */
+#define DMZ_MAP_ENTRIES		(DMZ_BLOCK_SIZE / sizeof(struct dmz_map))
+#define DMZ_MAP_ENTRIES_SHIFT	(ilog2(DMZ_MAP_ENTRIES))
+#define DMZ_MAP_ENTRIES_MASK	(DMZ_MAP_ENTRIES - 1)
+#define DMZ_MAP_UNMAPPED	UINT_MAX
+
+/*
+ * Meta data block descriptor (for cached metadata blocks).
+ */
+struct dmz_mblock {
+	struct rb_node		node;
+	struct list_head	link;
+	sector_t		no;
+	atomic_t		ref;
+	unsigned long		state;
+	struct page		*page;
+	void			*data;
+};
+
+/*
+ * Metadata block state flags.
+ */
+enum {
+	DMZ_META_DIRTY,
+	DMZ_META_READING,
+	DMZ_META_WRITING,
+	DMZ_META_ERROR,
+};
+
+/*
+ * Super block information (one per metadata set).
+ */
+struct dmz_sb {
+	sector_t		block;
+	struct dmz_mblock	*mblk;
+	struct dmz_super	*sb;
+};
+
+/*
+ * In-memory metadata.
+ */
+struct dmz_metadata {
+	struct dmz_dev		*dev;
+
+	sector_t		zone_bitmap_size;
+	unsigned int		zone_nr_bitmap_blocks;
+
+	unsigned int		nr_bitmap_blocks;
+	unsigned int		nr_map_blocks;
+
+	unsigned int		nr_useable_zones;
+	unsigned int		nr_meta_blocks;
+	unsigned int		nr_meta_zones;
+	unsigned int		nr_data_zones;
+	unsigned int		nr_rnd_zones;
+	unsigned int		nr_reserved_seq;
+	unsigned int		nr_chunks;
+
+	/* Zone information array */
+	struct dm_zone		*zones;
+
+	struct dm_zone		*sb_zone;
+	struct dmz_sb		sb[2];
+	unsigned int		mblk_primary;
+	u64			sb_gen;
+	unsigned int		min_nr_mblks;
+	unsigned int		max_nr_mblks;
+	atomic_t		nr_mblks;
+	struct rw_semaphore	mblk_sem;
+	struct mutex		mblk_flush_lock;
+	spinlock_t		mblk_lock;
+	struct rb_root		mblk_rbtree;
+	struct list_head	mblk_lru_list;
+	struct list_head	mblk_dirty_list;
+	struct shrinker		mblk_shrinker;
+
+	/* Zone allocation management */
+	struct mutex		map_lock;
+	struct dmz_mblock	**map_mblk;
+	unsigned int		nr_rnd;
+	atomic_t		unmap_nr_rnd;
+	struct list_head	unmap_rnd_list;
+	struct list_head	map_rnd_list;
+
+	unsigned int		nr_seq;
+	atomic_t		unmap_nr_seq;
+	struct list_head	unmap_seq_list;
+	struct list_head	map_seq_list;
+
+	atomic_t		nr_reserved_seq_zones;
+	struct list_head	reserved_seq_zones_list;
+
+	wait_queue_head_t	free_wq;
+};
+
+/*
+ * Various accessors
+ */
+unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	return ((unsigned int)(zone - zmd->zones));
+}
+
+sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift;
+}
+
+sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift;
+}
+
+unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
+{
+	return zmd->nr_chunks;
+}
+
+unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd)
+{
+	return zmd->nr_rnd;
+}
+
+unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd)
+{
+	return atomic_read(&zmd->unmap_nr_rnd);
+}
+
+/*
+ * Lock/unlock mapping table.
+ * The map lock also protects all the zone lists.
+ */
+void dmz_lock_map(struct dmz_metadata *zmd)
+{
+	mutex_lock(&zmd->map_lock);
+}
+
+void dmz_unlock_map(struct dmz_metadata *zmd)
+{
+	mutex_unlock(&zmd->map_lock);
+}
+
+/*
+ * Lock/unlock metadata access. This is a "read" lock on a semaphore
+ * that prevents metadata flush from running while metadata are being
+ * modified. The actual metadata write mutual exclusion is achieved with
+ * the map lock and zone state management (active and reclaim state are
+ * mutually exclusive).
+ */
+void dmz_lock_metadata(struct dmz_metadata *zmd)
+{
+	down_read(&zmd->mblk_sem);
+}
+
+void dmz_unlock_metadata(struct dmz_metadata *zmd)
+{
+	up_read(&zmd->mblk_sem);
+}
+
+/*
+ * Lock/unlock flush: prevent concurrent executions
+ * of dmz_flush_metadata as well as metadata modification in reclaim
+ * while flush is being executed.
+ */
+void dmz_lock_flush(struct dmz_metadata *zmd)
+{
+	mutex_lock(&zmd->mblk_flush_lock);
+}
+
+void dmz_unlock_flush(struct dmz_metadata *zmd)
+{
+	mutex_unlock(&zmd->mblk_flush_lock);
+}
+
+/*
+ * Allocate a metadata block, reusing the first LRU block if over the limit.
+ */
+static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd,
+					   sector_t mblk_no)
+{
+	struct dmz_mblock *mblk = NULL;
+
+	/* Over max_nr_mblks: try to take a block off the LRU list instead */
+	if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) {
+		spin_lock(&zmd->mblk_lock);
+		mblk = list_first_entry_or_null(&zmd->mblk_lru_list,
+						struct dmz_mblock, link);
+		if (mblk) {
+			list_del_init(&mblk->link);
+			rb_erase(&mblk->node, &zmd->mblk_rbtree);
+			mblk->no = mblk_no;
+		}
+		spin_unlock(&zmd->mblk_lock);
+		if (mblk)
+			return mblk;
+	}
+
+	/* Allocate a new descriptor and its data page; NULL if either fails */
+	mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO);
+	if (!mblk)
+		return NULL;
+
+	mblk->page = alloc_page(GFP_NOIO);
+	if (!mblk->page) {
+		kfree(mblk);
+		return NULL;
+	}
+
+	RB_CLEAR_NODE(&mblk->node);
+	INIT_LIST_HEAD(&mblk->link);
+	atomic_set(&mblk->ref, 0);
+	mblk->state = 0;
+	mblk->no = mblk_no;
+	mblk->data = page_address(mblk->page);
+
+	atomic_inc(&zmd->nr_mblks);
+
+	return mblk;
+}
+
+/*
+ * Free a metadata block and its page, and decrement the cached block count.
+ */
+static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
+{
+	__free_pages(mblk->page, 0);
+	kfree(mblk);
+
+	atomic_dec(&zmd->nr_mblks);
+}
+
+/*
+ * Insert a metadata block in the rbtree, keyed by its block number (no).
+ */
+static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
+{
+	struct rb_root *root = &zmd->mblk_rbtree;
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct dmz_mblock *b;
+
+	/* Find the link: go left when the visited node has a smaller number */
+	while (*new) {
+		b = container_of(*new, struct dmz_mblock, node);
+		parent = *new;
+		new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right);
+	}
+
+	/* Add new node and rebalance tree */
+	rb_link_node(&mblk->node, parent, new);
+	rb_insert_color(&mblk->node, root);
+}
+
+/*
+ * Lookup a metadata block in the rbtree. Returns NULL if it is not cached.
+ */
+static struct dmz_mblock *dmz_lookup_mblock(struct dmz_metadata *zmd,
+					    sector_t mblk_no)
+{
+	struct rb_root *root = &zmd->mblk_rbtree;
+	struct rb_node *node = root->rb_node;
+	struct dmz_mblock *mblk;
+
+	while (node) {
+		mblk = container_of(node, struct dmz_mblock, node);
+		if (mblk->no == mblk_no)
+			return mblk;
+		node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Metadata block BIO end callback: record any I/O error in the block state,
+ * then clear the READING or WRITING bit of the operation and wake waiters.
+ */
+static void dmz_mblock_bio_end_io(struct bio *bio)
+{
+	struct dmz_mblock *mblk = bio->bi_private;
+	int io_bit;
+
+	if (bio->bi_status)
+		set_bit(DMZ_META_ERROR, &mblk->state);
+
+	io_bit = (bio_op(bio) == REQ_OP_WRITE) ? DMZ_META_WRITING :
+						  DMZ_META_READING;
+
+	/* The error bit is set before this point so woken waiters see it */
+	clear_bit_unlock(io_bit, &mblk->state);
+	smp_mb__after_atomic();
+	wake_up_bit(&mblk->state, io_bit);
+
+	bio_put(bio);
+}
+
+/*
+ * Read a metadata block from disk. Returns the referenced block with its
+ * read BIO in flight (DMZ_META_READING set), or NULL on allocation failure.
+ */
+static struct dmz_mblock *dmz_fetch_mblock(struct dmz_metadata *zmd,
+					   sector_t mblk_no)
+{
+	struct dmz_mblock *mblk;
+	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
+	struct bio *bio;
+
+	/* Get the block and its BIO before the block goes in the rbtree */
+	mblk = dmz_alloc_mblock(zmd, mblk_no);
+	if (!mblk)
+		return NULL;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (!bio) {
+		dmz_free_mblock(zmd, mblk);
+		return NULL;
+	}
+
+	spin_lock(&zmd->mblk_lock);
+	atomic_inc(&mblk->ref);
+	set_bit(DMZ_META_READING, &mblk->state);
+	dmz_insert_mblock(zmd, mblk);
+	spin_unlock(&zmd->mblk_lock);
+
+	bio->bi_iter.bi_sector = dmz_blk2sect(block);
+	bio->bi_bdev = zmd->dev->bdev;
+	bio->bi_private = mblk;
+	bio->bi_end_io = dmz_mblock_bio_end_io;
+	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
+	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
+	submit_bio(bio);
+	return mblk;
+}
+
+/*
+ * Free at most limit blocks from the LRU list and return the number freed.
+ */
+static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd,
+					     unsigned long limit)
+{
+	struct dmz_mblock *mblk;
+	unsigned long count = 0;
+
+	if (!zmd->max_nr_mblks)
+		return 0;
+
+	while (!list_empty(&zmd->mblk_lru_list) &&
+	       atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks &&
+	       count < limit) {
+		mblk = list_first_entry(&zmd->mblk_lru_list,
+					struct dmz_mblock, link);
+		list_del_init(&mblk->link);
+		rb_erase(&mblk->node, &zmd->mblk_rbtree);
+		dmz_free_mblock(zmd, mblk);
+		count++;
+	}
+
+	return count;
+}
+
+/*
+ * For mblock shrinker: get the number of metadata blocks in the cache.
+ */
+static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
+					       struct shrink_control *sc)
+{
+	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
+
+	return atomic_read(&zmd->nr_mblks);
+}
+
+/*
+ * For mblock shrinker: free up to sc->nr_to_scan blocks from the LRU list.
+ */
+static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink,
+					      struct shrink_control *sc)
+{
+	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
+	unsigned long count;
+
+	spin_lock(&zmd->mblk_lock);
+	count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan);
+	spin_unlock(&zmd->mblk_lock);
+
+	return count ? count : SHRINK_STOP;
+}
+
+/*
+ * Release a reference on a metadata block (a NULL block is ignored).
+ */
+static void dmz_release_mblock(struct dmz_metadata *zmd,
+			       struct dmz_mblock *mblk)
+{
+
+	if (!mblk)
+		return;
+
+	spin_lock(&zmd->mblk_lock);
+
+	if (atomic_dec_and_test(&mblk->ref)) {
+		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
+			rb_erase(&mblk->node, &zmd->mblk_rbtree);
+			dmz_free_mblock(zmd, mblk);
+		} else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) {
+			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
+			dmz_shrink_mblock_cache(zmd, 1);
+		}
+	}
+
+	spin_unlock(&zmd->mblk_lock);
+}
+
+/*
+ * Get a referenced metadata block from the rbtree, reading it from disk
+ * if it is not cached. Returns ERR_PTR(-ENOMEM) or ERR_PTR(-EIO) on error.
+ */
+static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
+					 sector_t mblk_no)
+{
+	struct dmz_mblock *mblk;
+
+	/* Check rbtree */
+	spin_lock(&zmd->mblk_lock);
+	mblk = dmz_lookup_mblock(zmd, mblk_no);
+	if (mblk) {
+		/* Cache hit: remove block from LRU list */
+		if (atomic_inc_return(&mblk->ref) == 1 &&
+		    !test_bit(DMZ_META_DIRTY, &mblk->state))
+			list_del_init(&mblk->link);
+	}
+	spin_unlock(&zmd->mblk_lock);
+
+	if (!mblk) {
+		/* Cache miss. NOTE(review): concurrent misses may both insert */
+		mblk = dmz_fetch_mblock(zmd, mblk_no);
+		if (!mblk)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	/* Wait for on-going read I/O and check for error */
+	wait_on_bit_io(&mblk->state, DMZ_META_READING,
+		       TASK_UNINTERRUPTIBLE);
+	if (test_bit(DMZ_META_ERROR, &mblk->state)) {
+		dmz_release_mblock(zmd, mblk);
+		return ERR_PTR(-EIO);
+	}
+
+	return mblk;
+}
+
+/*
+ * Mark a metadata block dirty, adding it to the dirty list on first marking.
+ */
+static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
+{
+	spin_lock(&zmd->mblk_lock);
+	if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state))
+		list_add_tail(&mblk->link, &zmd->mblk_dirty_list);
+	spin_unlock(&zmd->mblk_lock);
+}
+
+/*
+ * Issue a metadata block write BIO to the given set, without waiting for it.
+ */
+static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
+			     unsigned int set)
+{
+	sector_t block = zmd->sb[set].block + mblk->no;
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (!bio) {
+		set_bit(DMZ_META_ERROR, &mblk->state);
+		return;
+	}
+
+	set_bit(DMZ_META_WRITING, &mblk->state);
+
+	bio->bi_iter.bi_sector = dmz_blk2sect(block);
+	bio->bi_bdev = zmd->dev->bdev;
+	bio->bi_private = mblk;
+	bio->bi_end_io = dmz_mblock_bio_end_io;
+	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
+	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
+	submit_bio(bio);
+}
+
+/*
+ * Read/write a metadata block and wait for the I/O; returns 0 or an error.
+ */
+static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
+			  struct page *page)
+{
+	struct bio *bio;
+	int ret;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_iter.bi_sector = dmz_blk2sect(block);
+	bio->bi_bdev = zmd->dev->bdev;
+	bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
+	bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
+	ret = submit_bio_wait(bio);
+	bio_put(bio);
+
+	return ret;
+}
+
+/*
+ * Write super block of the specified metadata set, stamped with the next
+ * generation number and its CRC, then issue a flush to the device.
+ * Returns 0 on success or the error of the failed block write or flush.
+ */
+static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
+{
+	sector_t block = zmd->sb[set].block;
+	struct dmz_mblock *mblk = zmd->sb[set].mblk;
+	struct dmz_super *sb = zmd->sb[set].sb;
+	u64 sb_gen = zmd->sb_gen + 1;
+	int ret;
+
+	sb->magic = cpu_to_le32(DMZ_MAGIC);
+	sb->version = cpu_to_le32(DMZ_META_VER);
+	sb->gen = cpu_to_le64(sb_gen);
+	sb->sb_block = cpu_to_le64(block);
+	sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
+	sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
+	sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);
+	sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks);
+	sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks);
+
+	sb->crc = 0;
+	sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));
+
+	ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
+	/* GFP_NOIO, as for metadata BIOs: do not start I/O to get memory */
+	if (ret == 0)
+		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+
+	return ret;
+}
+
+/*
+ * Write the blocks of @write_list to set @set, then flush the drive cache.
+ */
+static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
+				   struct list_head *write_list,
+				   unsigned int set)
+{
+	struct dmz_mblock *mblk;
+	struct blk_plug plug;
+	int ret = 0;
+
+	/* Issue writes */
+	blk_start_plug(&plug);
+	list_for_each_entry(mblk, write_list, link)
+		dmz_write_mblock(zmd, mblk, set);
+	blk_finish_plug(&plug);
+
+	/* Wait for completion: any block in error fails the whole write */
+	list_for_each_entry(mblk, write_list, link) {
+		wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
+			       TASK_UNINTERRUPTIBLE);
+		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
+			clear_bit(DMZ_META_ERROR, &mblk->state);
+			ret = -EIO;
+		}
+	}
+
+	/* Flush drive cache (this will also sync data), without doing I/O */
+	if (ret == 0)
+		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+
+	return ret;
+}
+
+/*
+ * Log dirty metadata blocks: the metadata set that is not the current
+ * primary one is used as the log.
+ */
+static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd,
+				 struct list_head *write_list)
+{
+	unsigned int secondary = zmd->mblk_primary ^ 0x1;
+	int err;
+
+	/* First, write all the dirty blocks to the log set */
+	err = dmz_write_dirty_mblocks(zmd, write_list, secondary);
+	if (err)
+		goto done;
+
+	/*
+	 * The blocks were written: validate the log by writing the log
+	 * set super block.
+	 */
+	err = dmz_write_sb(zmd, secondary);
+done:
+	/* err is 0 on success or the error of the step that failed */
+	return err;
+}
+
+/*
+ * Flush dirty metadata blocks to both metadata sets (log, then primary).
+ */
+int dmz_flush_metadata(struct dmz_metadata *zmd)
+{
+	struct dmz_mblock *mblk;
+	struct list_head write_list;
+	int ret;
+
+	if (WARN_ON(!zmd))
+		return 0;
+
+	INIT_LIST_HEAD(&write_list);
+
+	/*
+	 * Make sure that metadata blocks are stable before logging: take
+	 * the write lock on the metadata semaphore to prevent target BIOs
+	 * from modifying metadata.
+	 */
+	down_write(&zmd->mblk_sem);
+
+	/*
+	 * This is called from the target flush work and reclaim work.
+	 * Concurrent execution is not allowed.
+	 */
+	dmz_lock_flush(zmd);
+
+	/* Get dirty blocks */
+	spin_lock(&zmd->mblk_lock);
+	list_splice_init(&zmd->mblk_dirty_list, &write_list);
+	spin_unlock(&zmd->mblk_lock);
+
+	/* If there are no dirty metadata blocks, just flush the device cache */
+	if (list_empty(&write_list)) {
+		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+		goto out;
+	}
+
+	/*
+	 * The primary metadata set is still clean. Keep it this way until
+	 * all updates are successful in the secondary set. That is, use
+	 * the secondary set as a log.
+	 */
+	ret = dmz_log_dirty_mblocks(zmd, &write_list);
+	if (ret)
+		goto out;
+
+	/*
+	 * The log is on disk. It is now safe to update in place
+	 * in the primary metadata set.
+	 */
+	ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
+	if (ret)
+		goto out;
+
+	ret = dmz_write_sb(zmd, zmd->mblk_primary);
+	if (ret)
+		goto out;
+	/* All written: blocks are clean, unreferenced ones go to the LRU */
+	while (!list_empty(&write_list)) {
+		mblk = list_first_entry(&write_list, struct dmz_mblock, link);
+		list_del_init(&mblk->link);
+
+		spin_lock(&zmd->mblk_lock);
+		clear_bit(DMZ_META_DIRTY, &mblk->state);
+		if (atomic_read(&mblk->ref) == 0)
+			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
+		spin_unlock(&zmd->mblk_lock);
+	}
+
+	zmd->sb_gen++;
+out:
+	if (ret && !list_empty(&write_list)) {
+		spin_lock(&zmd->mblk_lock);
+		list_splice(&write_list, &zmd->mblk_dirty_list);
+		spin_unlock(&zmd->mblk_lock);
+	}
+
+	dmz_unlock_flush(zmd);
+	up_write(&zmd->mblk_sem);
+
+	return ret;
+}
+
+/*
+ * Check @sb (zeroing its crc field) and, if valid, load it into @zmd.
+ */
+static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
+{
+	unsigned int nr_meta_zones, nr_data_zones;
+	struct dmz_dev *dev = zmd->dev;
+	u32 crc, stored_crc;
+	u64 gen;
+
+	gen = le64_to_cpu(sb->gen);
+	stored_crc = le32_to_cpu(sb->crc);
+	sb->crc = 0;
+	crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE);
+	if (crc != stored_crc) {
+		dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)",
+			    crc, stored_crc);
+		return -ENXIO;
+	}
+
+	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
+		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
+			    DMZ_MAGIC, le32_to_cpu(sb->magic));
+		return -ENXIO;
+	}
+
+	if (le32_to_cpu(sb->version) != DMZ_META_VER) {
+		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
+			    DMZ_META_VER, le32_to_cpu(sb->version));
+		return -ENXIO;
+	}
+
+	nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + dev->zone_nr_blocks - 1)
+		>> dev->zone_nr_blocks_shift;
+	if (!nr_meta_zones ||
+	    nr_meta_zones >= zmd->nr_rnd_zones) {
+		dmz_dev_err(dev, "Invalid number of metadata blocks");
+		return -ENXIO;
+	}
+
+	if (!le32_to_cpu(sb->nr_reserved_seq) ||
+	    le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) {
+		dmz_dev_err(dev, "Invalid number of reserved sequential zones");
+		return -ENXIO;
+	}
+
+	nr_data_zones = zmd->nr_useable_zones -
+		(nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq));
+	if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) {
+		dmz_dev_err(dev, "Invalid number of chunks %u / %u",
+			    le32_to_cpu(sb->nr_chunks), nr_data_zones);
+		return -ENXIO;
+	}
+
+	/* OK: adopt the super block values and the derived zone counts */
+	zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks);
+	zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq);
+	zmd->nr_chunks = le32_to_cpu(sb->nr_chunks);
+	zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks);
+	zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks);
+	zmd->nr_meta_zones = nr_meta_zones;
+	zmd->nr_data_zones = nr_data_zones;
+
+	return 0;
+}
+
+/*
+ * Read the super block of metadata set @set into its metadata block page.
+ */
+static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set)
+{
+	return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block,
+			      zmd->sb[set].mblk->page);
+}
+
+/*
+ * Search for the secondary super block, one zone size at a time after
+ * the primary super block, looking for the magic number. This is used
+ * only if a corruption of the primary super block is detected.
+ */
+static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
+{
+	unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
+	struct dmz_mblock *mblk;
+	int i;
+
+	/* Allocate a block */
+	mblk = dmz_alloc_mblock(zmd, 0);
+	if (!mblk)
+		return -ENOMEM;
+
+	zmd->sb[1].mblk = mblk;
+	zmd->sb[1].sb = mblk->data;
+
+	/* Bad first super block: search for the second one */
+	zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
+	for (i = 0; i < zmd->nr_rnd_zones - 1; i++) {
+		if (dmz_read_sb(zmd, 1) != 0)
+			break;
+		if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
+			return 0;
+		zmd->sb[1].block += zone_nr_blocks;
+	}
+
+	dmz_free_mblock(zmd, mblk);
+	zmd->sb[1].mblk = NULL;
+
+	return -EIO;
+}
+
+/*
+ * Allocate a metadata block for super block @set and read it from disk.
+ */
+static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set)
+{
+	struct dmz_mblock *mblk;
+	int ret;
+
+	/* Allocate a block */
+	mblk = dmz_alloc_mblock(zmd, 0);
+	if (!mblk)
+		return -ENOMEM;
+
+	zmd->sb[set].mblk = mblk;
+	zmd->sb[set].sb = mblk->data;
+
+	/* Read super block: on failure, the block allocated above is freed */
+	ret = dmz_read_sb(zmd, set);
+	if (ret) {
+		dmz_free_mblock(zmd, mblk);
+		zmd->sb[set].mblk = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Recover metadata set @dst_set by copying the other set over it.
+ */
+static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
+{
+	unsigned int src_set = dst_set ^ 0x1;
+	struct page *page;
+	int i, ret;
+
+	dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set);
+
+	if (dst_set == 0)
+		zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
+	else {
+		zmd->sb[1].block = zmd->sb[0].block +
+			(zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
+	}
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	/* Copy metadata blocks, skipping block 0 (the super block) */
+	for (i = 1; i < zmd->nr_meta_blocks; i++) {
+		ret = dmz_rdwr_block(zmd, REQ_OP_READ,
+				     zmd->sb[src_set].block + i, page);
+		if (ret)
+			goto out;
+		ret = dmz_rdwr_block(zmd, REQ_OP_WRITE,
+				     zmd->sb[dst_set].block + i, page);
+		if (ret)
+			goto out;
+	}
+
+	/* Finalize with the super block, allocating its block if needed */
+	if (!zmd->sb[dst_set].mblk) {
+		zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0);
+		if (!zmd->sb[dst_set].mblk) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data;
+	}
+
+	ret = dmz_write_sb(zmd, dst_set);
+out:
+	__free_pages(page, 0);
+
+	return ret;
+}
+
+/*
+ * Load both super blocks, recover an invalid set and select the primary.
+ */
+static int dmz_load_sb(struct dmz_metadata *zmd)
+{
+	bool sb_good[2] = {false, false};
+	u64 sb_gen[2] = {0, 0};
+	int ret;
+
+	/* Read and check the primary super block */
+	zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
+	ret = dmz_get_sb(zmd, 0);
+	if (ret) {
+		dmz_dev_err(zmd->dev, "Read primary super block failed");
+		return ret;
+	}
+
+	ret = dmz_check_sb(zmd, zmd->sb[0].sb);
+
+	/* Read and check secondary super block (search it if primary is bad) */
+	if (ret == 0) {
+		sb_good[0] = true;
+		zmd->sb[1].block = zmd->sb[0].block +
+			(zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
+		ret = dmz_get_sb(zmd, 1);
+	} else
+		ret = dmz_lookup_secondary_sb(zmd);
+
+	if (ret) {
+		dmz_dev_err(zmd->dev, "Read secondary super block failed");
+		return ret;
+	}
+
+	ret = dmz_check_sb(zmd, zmd->sb[1].sb);
+	if (ret == 0)
+		sb_good[1] = true;
+
+	/* At least one valid sb is needed: an invalid set is recovered */
+	if (!sb_good[0] && !sb_good[1]) {
+		dmz_dev_err(zmd->dev, "No valid super block found");
+		return -EIO;
+	}
+
+	if (sb_good[0])
+		sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
+	else
+		ret = dmz_recover_mblocks(zmd, 0);
+
+	if (sb_good[1])
+		sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
+	else
+		ret = dmz_recover_mblocks(zmd, 1);
+
+	if (ret) {
+		dmz_dev_err(zmd->dev, "Recovery failed");
+		return -EIO;
+	}
+
+	if (sb_gen[0] >= sb_gen[1]) {
+		zmd->sb_gen = sb_gen[0];
+		zmd->mblk_primary = 0;
+	} else {
+		zmd->sb_gen = sb_gen[1];
+		zmd->mblk_primary = 1;
+	}
+
+	dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)",
+		      zmd->mblk_primary, zmd->sb_gen);
+
+	return 0;
+}
+
+/*
+ * Initialize a zone descriptor from its zone report entry @blkz.
+ */
+static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
+			 struct blk_zone *blkz)
+{
+	struct dmz_dev *dev = zmd->dev;
+
+	/* Ignore the eventual last runt (smaller) zone */
+	if (blkz->len != dev->zone_nr_sectors) {
+		if (blkz->start + blkz->len == dev->capacity)
+			return 0;
+		return -ENXIO;
+	}
+
+	INIT_LIST_HEAD(&zone->link);
+	atomic_set(&zone->refcount, 0);
+	zone->chunk = DMZ_MAP_UNMAPPED;
+
+	if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) {
+		/* Counted once in nr_rnd_zones below, only if useable */
+		set_bit(DMZ_RND, &zone->flags);
+	} else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ ||
+		   blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) {
+		set_bit(DMZ_SEQ, &zone->flags);
+	} else
+		return -ENXIO;
+
+	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
+		set_bit(DMZ_OFFLINE, &zone->flags);
+	else if (blkz->cond == BLK_ZONE_COND_READONLY)
+		set_bit(DMZ_READ_ONLY, &zone->flags);
+
+	if (dmz_is_rnd(zone))
+		zone->wp_block = 0;
+	else
+		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
+
+	if (!dmz_is_offline(zone) && !dmz_is_readonly(zone)) {
+		zmd->nr_useable_zones++;
+		if (dmz_is_rnd(zone)) {
+			zmd->nr_rnd_zones++;
+			if (!zmd->sb_zone) {
+				/* Super block zone */
+				zmd->sb_zone = zone;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Free the zone descriptors array and clear the pointer to it.
+ */
+static void dmz_drop_zones(struct dmz_metadata *zmd)
+{
+	kfree(zmd->zones);
+	zmd->zones = NULL;
+}
+
+/*
+ * The maximum number of zones requested in a single zone report.
+ * This results in 4096*64B=256KB report zones commands.
+ */
+#define DMZ_REPORT_NR_ZONES	4096
+
+/*
+ * Allocate and initialize zone descriptors using the zone
+ * information from disk.
+ */
+static int dmz_init_zones(struct dmz_metadata *zmd)
+{
+	struct dmz_dev *dev = zmd->dev;
+	struct dm_zone *zone;
+	struct blk_zone *blkz;
+	unsigned int nr_blkz;
+	sector_t sector = 0;
+	int i, ret = 0;
+
+	/* One bit per block: zone bitmap size in bytes, then in blocks */
+	zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
+	zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT;
+
+	/* Allocate zone array */
+	zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
+	if (!zmd->zones)
+		return -ENOMEM;
+
+	dmz_dev_info(dev, "Using %zu B for zone information",
+		     sizeof(struct dm_zone) * dev->nr_zones);
+
+	/* Allocate the zone report buffer */
+	blkz = kcalloc(DMZ_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
+	if (!blkz) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * Get zone information and initialize zone descriptors.
+	 * At the same time, determine where the super block
+	 * should be: first block of the first randomly writable
+	 * zone.
+	 */
+	zone = zmd->zones;
+	while (sector < dev->capacity) {
+		nr_blkz = DMZ_REPORT_NR_ZONES;
+		ret = blkdev_report_zones(dev->bdev, sector, blkz,
+					  &nr_blkz, GFP_KERNEL);
+		if (ret) {
+			dmz_dev_err(dev, "Report zones failed %d", ret);
+			goto out;
+		}
+		if (!nr_blkz)
+			break;	/* Empty report: no progress, fail below */
+
+		/* Process report */
+		for (i = 0; i < nr_blkz; i++) {
+			ret = dmz_init_zone(zmd, zone, &blkz[i]);
+			if (ret)
+				goto out;
+			sector += dev->zone_nr_sectors;
+			zone++;
+		}
+	}
+
+	/* The entire zone configuration of the disk should now be known */
+	if (sector < dev->capacity) {
+		dmz_dev_err(dev, "Failed to get correct zone information");
+		ret = -ENXIO;
+	}
+out:
+	kfree(blkz);
+	if (ret)
+		dmz_drop_zones(zmd);
+
+	return ret;
+}
+
+/*
+ * Update the condition and write pointer of a zone from a zone report.
+ */
+static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	unsigned int nr_blkz = 1;
+	struct blk_zone blkz;
+	int ret;
+
+	/* Get zone information from disk: an empty report is an error */
+	ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
+				  &blkz, &nr_blkz, GFP_NOIO);
+	if (ret || !nr_blkz) {
+		dmz_dev_err(zmd->dev, "Get zone %u report failed",
+			    dmz_id(zmd, zone));
+		return ret ? ret : -EIO;
+	}
+
+	clear_bit(DMZ_OFFLINE, &zone->flags);
+	clear_bit(DMZ_READ_ONLY, &zone->flags);
+	if (blkz.cond == BLK_ZONE_COND_OFFLINE)
+		set_bit(DMZ_OFFLINE, &zone->flags);
+	else if (blkz.cond == BLK_ZONE_COND_READONLY)
+		set_bit(DMZ_READ_ONLY, &zone->flags);
+
+	if (dmz_is_seq(zone))
+		zone->wp_block = dmz_sect2blk(blkz.wp - blkz.start);
+	else
+		zone->wp_block = 0;
+
+	return 0;
+}
+
+/*
+ * Refresh the write pointer of a zone marked with the sequential write
+ * error flag and invalidate the blocks from the new to the old position.
+ */
+static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
+				    struct dm_zone *zone)
+{
+	unsigned int wp = 0;
+	int ret;
+
+	wp = zone->wp_block;
+	ret = dmz_update_zone(zmd, zone);
+	if (ret)
+		return ret;
+
+	dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)",
+		     dmz_id(zmd, zone), zone->wp_block, wp);
+
+	if (zone->wp_block < wp) {
+		dmz_invalidate_blocks(zmd, zone, zone->wp_block,
+				      wp - zone->wp_block);
+	}
+
+	return 0;
+}
+
+static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
+{
+	return zmd->zones + zone_id;	/* descriptor of zone zone_id */
+}
+
+/*
+ * Reset a zone write pointer (not done for an already empty zone).
+ */
+static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	int ret;
+
+	/*
+	 * Ignore offline zones, read only zones,
+	 * and conventional zones.
+	 */
+	if (dmz_is_offline(zone) ||
+	    dmz_is_readonly(zone) ||
+	    dmz_is_rnd(zone))
+		return 0;
+
+	if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
+		struct dmz_dev *dev = zmd->dev;
+
+		ret = blkdev_reset_zones(dev->bdev,
+					 dmz_start_sect(zmd, zone),
+					 dev->zone_nr_sectors, GFP_NOIO);
+		if (ret) {
+			dmz_dev_err(dev, "Reset zone %u failed %d",
+				    dmz_id(zmd, zone), ret);
+			return ret;
+		}
+	}
+
+	/* Clear write error bit and rewind write pointer position */
+	clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
+	zone->wp_block = 0;
+
+	return 0;
+}
+
+static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
+
+/*
+ * Load the chunk mapping table and initialize zone mapping and lists.
+ */
+static int dmz_load_mapping(struct dmz_metadata *zmd)
+{
+	struct dmz_dev *dev = zmd->dev;
+	struct dm_zone *dzone, *bzone;
+	struct dmz_mblock *dmap_mblk = NULL;
+	struct dmz_map *dmap;
+	unsigned int i = 0, e = 0, chunk = 0;
+	unsigned int dzone_id;
+	unsigned int bzone_id;
+
+	/* Metadata block array for the chunk mapping table */
+	zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
+				sizeof(struct dmz_mblock *), GFP_KERNEL);
+	if (!zmd->map_mblk)
+		return -ENOMEM;
+
+	/* Get chunk mapping table blocks and initialize zone mapping */
+	while (chunk < zmd->nr_chunks) {
+		if (!dmap_mblk) {
+			/* Get mapping block */
+			dmap_mblk = dmz_get_mblock(zmd, i + 1);
+			if (IS_ERR(dmap_mblk))
+				return PTR_ERR(dmap_mblk);
+			zmd->map_mblk[i] = dmap_mblk;
+			dmap = (struct dmz_map *) dmap_mblk->data;
+			i++;
+			e = 0;
+		}
+
+		/* Check data zone */
+		dzone_id = le32_to_cpu(dmap[e].dzone_id);
+		if (dzone_id == DMZ_MAP_UNMAPPED)
+			goto next;
+
+		if (dzone_id >= dev->nr_zones) {
+			dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u",
+				    chunk, dzone_id);
+			return -EIO;
+		}
+
+		dzone = dmz_get(zmd, dzone_id);
+		set_bit(DMZ_DATA, &dzone->flags);
+		dzone->chunk = chunk;
+		dmz_get_zone_weight(zmd, dzone);
+
+		if (dmz_is_rnd(dzone))
+			list_add_tail(&dzone->link, &zmd->map_rnd_list);
+		else
+			list_add_tail(&dzone->link, &zmd->map_seq_list);
+
+		/* Check buffer zone */
+		bzone_id = le32_to_cpu(dmap[e].bzone_id);
+		if (bzone_id == DMZ_MAP_UNMAPPED)
+			goto next;
+
+		if (bzone_id >= dev->nr_zones) {
+			dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u",
+				    chunk, bzone_id);
+			return -EIO;
+		}
+
+		bzone = dmz_get(zmd, bzone_id);
+		if (!dmz_is_rnd(bzone)) {
+			dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u",
+				    chunk, bzone_id);
+			return -EIO;
+		}
+
+		set_bit(DMZ_DATA, &bzone->flags);
+		set_bit(DMZ_BUF, &bzone->flags);
+		bzone->chunk = chunk;
+		bzone->bzone = dzone;
+		dzone->bzone = bzone;
+		dmz_get_zone_weight(zmd, bzone);
+		list_add_tail(&bzone->link, &zmd->map_rnd_list);
+next:
+		chunk++;
+		e++;
+		if (e >= DMZ_MAP_ENTRIES)
+			dmap_mblk = NULL;
+	}
+
+	/*
+	 * At this point, only meta zones and mapped data zones were
+	 * fully initialized. All remaining zones are unmapped data
+	 * zones. Finish initializing those here.
+	 */
+	for (i = 0; i < dev->nr_zones; i++) {
+		dzone = dmz_get(zmd, i);
+		if (dmz_is_meta(dzone))
+			continue;
+
+		if (dmz_is_rnd(dzone))
+			zmd->nr_rnd++;
+		else
+			zmd->nr_seq++;
+
+		if (dmz_is_data(dzone)) {
+			/* Already initialized */
+			continue;
+		}
+
+		/* Unmapped data zone */
+		set_bit(DMZ_DATA, &dzone->flags);
+		dzone->chunk = DMZ_MAP_UNMAPPED;
+		if (dmz_is_rnd(dzone)) {
+			list_add_tail(&dzone->link, &zmd->unmap_rnd_list);
+			atomic_inc(&zmd->unmap_nr_rnd);
+		} else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
+			list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
+			atomic_inc(&zmd->nr_reserved_seq_zones);
+			zmd->nr_seq--;
+		} else {
+			list_add_tail(&dzone->link, &zmd->unmap_seq_list);
+			atomic_inc(&zmd->unmap_nr_seq);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Set the data and buffer zone IDs of a chunk and dirty its map block.
+ */
+static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk,
+				  unsigned int dzone_id, unsigned int bzone_id)
+{
+	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
+	struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
+	int map_idx = chunk & DMZ_MAP_ENTRIES_MASK;
+
+	dmap[map_idx].dzone_id = cpu_to_le32(dzone_id);
+	dmap[map_idx].bzone_id = cpu_to_le32(bzone_id);
+	dmz_dirty_mblock(zmd, dmap_mblk);
+}
+
+/*
+ * Mapped zones are kept on their map list in LRU order: move a zone
+ * that is currently on a list to the tail of the map list of its type.
+ */
+static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	struct list_head *map_list;
+
+	/* A zone that is not on any list is left alone */
+	if (list_empty(&zone->link))
+		return;
+
+	/* Sequential and random zones each have their own map list */
+	map_list = dmz_is_seq(zone) ? &zmd->map_seq_list : &zmd->map_rnd_list;
+
+	list_del_init(&zone->link);
+	list_add_tail(&zone->link, map_list);
+}
+
+/*
+ * LRU rotate a zone and, if it has one, the zone paired with it through
+ * its bzone pointer (its buffer zone or the data zone it buffers).
+ */
+static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	__dmz_lru_zone(zmd, zone);
+	if (zone->bzone)
+		__dmz_lru_zone(zmd, zone->bzone);
+}
+
+/*
+ * Wait (at most 1s, with map and metadata locks dropped) for a freed zone.
+ */
+static void dmz_wait_for_free_zones(struct dmz_metadata *zmd)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE);
+	dmz_unlock_map(zmd);
+	dmz_unlock_metadata(zmd);
+
+	io_schedule_timeout(HZ);
+
+	dmz_lock_metadata(zmd);
+	dmz_lock_map(zmd);
+	finish_wait(&zmd->free_wq, &wait);
+}
+
+/*
+ * Lock a zone for reclaim (set the zone RECLAIM bit).
+ * Returns 1 if the lock was taken and 0 if the zone is active or is
+ * already locked for reclaim.
+ */
+int dmz_lock_zone_reclaim(struct dm_zone *zone)
+{
+	/* Reclaim must not touch a zone that is in the active state */
+	if (!dmz_is_active(zone) &&
+	    !test_and_set_bit(DMZ_RECLAIM, &zone->flags))
+		return 1;
+	return 0;
+}
+
+/*
+ * Clear a zone reclaim flag and wake up the waiters on that flag.
+ */
+void dmz_unlock_zone_reclaim(struct dm_zone *zone)
+{
+	WARN_ON(dmz_is_active(zone));
+	WARN_ON(!dmz_in_reclaim(zone));
+
+	clear_bit_unlock(DMZ_RECLAIM, &zone->flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&zone->flags, DMZ_RECLAIM);
+}
+
+/*
+ * Wait (at most 1s, with locks dropped) for a zone reclaim to complete.
+ */
+static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	dmz_unlock_map(zmd);
+	dmz_unlock_metadata(zmd);
+	wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
+	dmz_lock_metadata(zmd);
+	dmz_lock_map(zmd);
+}
+
+/*
+ * Select a random write zone for reclaim: walk the mapped random zones
+ * list and lock the first candidate that can be locked.
+ */
+static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
+{
+	struct dm_zone *entry, *candidate;
+
+	list_for_each_entry(entry, &zmd->map_rnd_list, link) {
+		/*
+		 * For a buffer zone, the zone to reclaim is the data
+		 * zone that it buffers.
+		 */
+		candidate = dmz_is_buf(entry) ? entry->bzone : entry;
+		if (!dmz_lock_zone_reclaim(candidate))
+			continue;
+		return candidate;
+	}
+
+	/* Empty list, or no zone could be locked */
+	return NULL;
+}
+
+/*
+ * Select for reclaim the first mapped sequential zone that has a
+ * buffer zone and that can be locked.
+ */
+static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
+{
+	struct dm_zone *seq;
+
+	list_for_each_entry(seq, &zmd->map_seq_list, link) {
+		/*
+		 * Only buffered zones qualify; others are never locked.
+		 */
+		if (seq->bzone && dmz_lock_zone_reclaim(seq))
+			return seq;
+	}
+
+	/* Empty list, or no buffered zone could be locked */
+	return NULL;
+}
+
+/*
+ * Select and lock a zone for reclaim. NULL is returned if none is found.
+ */
+struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
+{
+	struct dm_zone *zone;
+
+	/*
+	 * Search for a zone candidate to reclaim: 2 cases are possible.
+	 * (1) The reserved sequential zones list is empty. Then a random
+	 *     data zone cannot be reclaimed. So choose a sequential zone to
+	 *     reclaim so that afterward a random zone can be reclaimed.
+	 * (2) At least one reserved sequential zone is available, then choose
+	 *     the oldest random zone (data or buffer) that can be locked.
+	 */
+	dmz_lock_map(zmd);
+	if (list_empty(&zmd->reserved_seq_zones_list))
+		zone = dmz_get_seq_zone_for_reclaim(zmd);
+	else
+		zone = dmz_get_rnd_zone_for_reclaim(zmd);
+	dmz_unlock_map(zmd);
+
+	return zone;
+}
+
+/*
+ * Activate a zone: set its ACTIVE flag and increment its reference count.
+ */
+void dmz_activate_zone(struct dm_zone *zone)
+{
+	set_bit(DMZ_ACTIVE, &zone->flags);
+	atomic_inc(&zone->refcount);
+}
+
+/*
+ * Deactivate a zone. This decrements the zone reference counter
+ * and clears the active state of the zone once the count reaches 0,
+ * indicating that all BIOs to the zone have completed. Nothing is
+ * returned to the caller.
+ */
+void dmz_deactivate_zone(struct dm_zone *zone)
+{
+	if (atomic_dec_and_test(&zone->refcount)) {
+		WARN_ON(!test_bit(DMZ_ACTIVE, &zone->flags));
+		clear_bit_unlock(DMZ_ACTIVE, &zone->flags);
+		smp_mb__after_atomic();
+	}
+}
+
+/*
+ * Get the zone mapping a chunk, if the chunk is mapped already. If no
+ * mapping exists, a zone is allocated for WRITE, else NULL is returned.
+ * An ERR_PTR() is returned on error.
+ * The zone returned will be set to the active state.
+ */
+struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op)
+{
+	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
+	struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
+	int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK;
+	unsigned int dzone_id;
+	struct dm_zone *dzone = NULL;
+	int ret = 0;
+
+	dmz_lock_map(zmd);
+again:
+	/* Get the chunk mapping */
+	dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id);
+	if (dzone_id == DMZ_MAP_UNMAPPED) {
+		/*
+		 * Read or discard in unmapped chunks are fine. But for
+		 * writes, we need a mapping, so get one.
+		 */
+		if (op != REQ_OP_WRITE)
+			goto out;
+
+		/* Allocate a random zone */
+		dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
+		if (!dzone) {
+			dmz_wait_for_free_zones(zmd);
+			goto again;
+		}
+
+		dmz_map_zone(zmd, dzone, chunk);
+
+	} else {
+		/* The chunk is already mapped: get the mapping zone */
+		dzone = dmz_get(zmd, dzone_id);
+		if (dzone->chunk != chunk) {
+			dzone = ERR_PTR(-EIO);
+			goto out;
+		}
+
+		/* Repair write pointer if the sequential dzone has error */
+		if (dmz_seq_write_err(dzone)) {
+			ret = dmz_handle_seq_write_err(zmd, dzone);
+			if (ret) {
+				dzone = ERR_PTR(-EIO);
+				goto out;
+			}
+			clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags);
+		}
+	}
+
+	/*
+	 * If the zone is being reclaimed, the chunk mapping may change
+	 * to a different zone. So wait for reclaim and retry. Otherwise,
+	 * activate the zone (this will prevent reclaim from touching it).
+	 */
+	if (dmz_in_reclaim(dzone)) {
+		dmz_wait_for_reclaim(zmd, dzone);
+		goto again;
+	}
+	dmz_activate_zone(dzone);
+	dmz_lru_zone(zmd, dzone);
+out:
+	dmz_unlock_map(zmd);
+
+	return dzone;
+}
+
+/*
+ * Write and discard change the block validity of data zones and their buffer
+ * zones. Check here that valid blocks are still present. If all blocks are
+ * invalid, the zones can be unmapped on the fly without waiting for reclaim
+ * to do it. This also drops one reference on the data zone.
+ */
+void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone)
+{
+	struct dm_zone *bzone;
+
+	dmz_lock_map(zmd);
+
+	bzone = dzone->bzone;
+	if (bzone) {
+		if (dmz_weight(bzone))
+			dmz_lru_zone(zmd, bzone);
+		else {
+			/* Empty buffer zone: reclaim it */
+			dmz_unmap_zone(zmd, bzone);
+			dmz_free_zone(zmd, bzone);
+			bzone = NULL;
+		}
+	}
+
+	/* Deactivate the data zone */
+	dmz_deactivate_zone(dzone);
+	if (dmz_is_active(dzone) || bzone || dmz_weight(dzone))
+		dmz_lru_zone(zmd, dzone);
+	else {
+		/* Unbuffered inactive empty data zone: reclaim it */
+		dmz_unmap_zone(zmd, dzone);
+		dmz_free_zone(zmd, dzone);
+	}
+
+	dmz_unlock_map(zmd);
+}
+
+/*
+ * Get the buffer zone of a data zone, allocating and mapping a random
+ * zone as the chunk buffer if the data zone has none yet.
+ */
+struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
+				     struct dm_zone *dzone)
+{
+	struct dm_zone *bzone;
+
+	dmz_lock_map(zmd);
+again:
+	bzone = dzone->bzone;
+	if (bzone)
+		goto out;
+
+	/* Allocate a random zone */
+	bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
+	if (!bzone) {
+		dmz_wait_for_free_zones(zmd);
+		goto again;
+	}
+
+	/* Update the chunk mapping */
+	dmz_set_chunk_mapping(zmd, dzone->chunk, dmz_id(zmd, dzone),
+			      dmz_id(zmd, bzone));
+
+	set_bit(DMZ_BUF, &bzone->flags);
+	bzone->chunk = dzone->chunk;
+	bzone->bzone = dzone;
+	dzone->bzone = bzone;
+	list_add_tail(&bzone->link, &zmd->map_rnd_list);
+out:
+	dmz_unlock_map(zmd);
+
+	return bzone;
+}
+
+/*
+ * Get an unmapped (free) zone. NULL is returned if none is available.
+ * This must be called with the mapping lock held.
+ */
+struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags)
+{
+	struct list_head *list;
+	struct dm_zone *zone;
+
+	if (flags & DMZ_ALLOC_RND)
+		list = &zmd->unmap_rnd_list;
+	else
+		list = &zmd->unmap_seq_list;
+again:
+	if (list_empty(list)) {
+		/*
+		 * No free zone: if this is for reclaim, allow using the
+		 * reserved sequential zones.
+		 */
+		if (!(flags & DMZ_ALLOC_RECLAIM) ||
+		    list_empty(&zmd->reserved_seq_zones_list))
+			return NULL;
+
+		zone = list_first_entry(&zmd->reserved_seq_zones_list,
+					struct dm_zone, link);
+		list_del_init(&zone->link);
+		atomic_dec(&zmd->nr_reserved_seq_zones);
+		return zone;
+	}
+
+	zone = list_first_entry(list, struct dm_zone, link);
+	list_del_init(&zone->link);
+
+	if (dmz_is_rnd(zone))
+		atomic_dec(&zmd->unmap_nr_rnd);
+	else
+		atomic_dec(&zmd->unmap_nr_seq);
+
+	if (dmz_is_offline(zone)) {
+		dmz_dev_warn(zmd->dev, "Zone %u is offline", dmz_id(zmd, zone));
+		zone = NULL;
+		goto again;
+	}
+
+	return zone;
+}
+
+/*
+ * Free a zone: put it on a free list and wake up the free_wq waiters.
+ * This must be called with the mapping lock held.
+ */
+void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	/* If this is a sequential zone, reset it */
+	if (dmz_is_seq(zone))
+		dmz_reset_zone(zmd, zone);
+
+	/* Return the zone to its type unmap list or to the reserved list */
+	if (dmz_is_rnd(zone)) {
+		list_add_tail(&zone->link, &zmd->unmap_rnd_list);
+		atomic_inc(&zmd->unmap_nr_rnd);
+	} else if (atomic_read(&zmd->nr_reserved_seq_zones) <
+		   zmd->nr_reserved_seq) {
+		list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
+		atomic_inc(&zmd->nr_reserved_seq_zones);
+	} else {
+		list_add_tail(&zone->link, &zmd->unmap_seq_list);
+		atomic_inc(&zmd->unmap_nr_seq);
+	}
+
+	wake_up_all(&zmd->free_wq);
+}
+
+/*
+ * Map a chunk to a data zone, with no buffer zone, and list the zone.
+ * This must be called with the mapping lock held.
+ */
+void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
+		  unsigned int chunk)
+{
+	/* Set the chunk mapping */
+	dmz_set_chunk_mapping(zmd, chunk, dmz_id(zmd, dzone),
+			      DMZ_MAP_UNMAPPED);
+	dzone->chunk = chunk;
+	if (dmz_is_rnd(dzone))
+		list_add_tail(&dzone->link, &zmd->map_rnd_list);
+	else
+		list_add_tail(&dzone->link, &zmd->map_seq_list);
+}
+
+/*
+ * Unmap a data or buffer zone from its chunk and unlink it from its list.
+ * This must be called with the mapping lock held.
+ */
+void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	unsigned int chunk = zone->chunk;
+	unsigned int dzone_id;
+
+	if (chunk == DMZ_MAP_UNMAPPED) {
+		/* Already unmapped */
+		return;
+	}
+
+	if (test_and_clear_bit(DMZ_BUF, &zone->flags)) {
+		/*
+		 * Unmapping the chunk buffer zone: clear only
+		 * the chunk buffer mapping
+		 */
+		dzone_id = dmz_id(zmd, zone->bzone);
+		zone->bzone->bzone = NULL;
+		zone->bzone = NULL;
+
+	} else {
+		/*
+		 * Unmapping the chunk data zone: the zone must
+		 * not be buffered.
+		 */
+		if (WARN_ON(zone->bzone)) {
+			zone->bzone->bzone = NULL;
+			zone->bzone = NULL;
+		}
+		dzone_id = DMZ_MAP_UNMAPPED;
+	}
+
+	dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED);
+
+	zone->chunk = DMZ_MAP_UNMAPPED;
+	list_del_init(&zone->link);
+}
+
+/*
+ * Set the @nr_bits bits of @bitmap that start at position @bit.
+ * The value returned is the count of bits that went from 0 to 1.
+ */
+static unsigned int dmz_set_bits(unsigned long *bitmap,
+				 unsigned int bit, unsigned int nr_bits)
+{
+	const unsigned int last = bit + nr_bits;
+	unsigned int nr_set = 0;
+
+	while (bit < last) {
+		unsigned long *word = bitmap + BIT_WORD(bit);
+		bool whole_word = !(bit & (BITS_PER_LONG - 1)) &&
+				  last - bit >= BITS_PER_LONG;
+
+		if (whole_word && !*word) {
+			/* Aligned, fully covered, all-zero word: fill it */
+			*word = ULONG_MAX;
+			nr_set += BITS_PER_LONG;
+			bit += BITS_PER_LONG;
+			continue;
+		}
+
+		/* Otherwise go bit by bit, counting only 0 -> 1 changes */
+		if (!test_and_set_bit(bit, bitmap))
+			nr_set++;
+		bit++;
+	}
+
+	return nr_set;
+}
+
+/*
+ * Get the bitmap block storing the bit for chunk_block in zone. Callers
+ * check the result with IS_ERR() and drop it with dmz_release_mblock().
+ */
+static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd,
+					 struct dm_zone *zone,
+					 sector_t chunk_block)
+{
+	/* Widen the zone ID first so that the multiplication cannot wrap */
+	return dmz_get_mblock(zmd, 1 + zmd->nr_map_blocks +
+		(sector_t)dmz_id(zmd, zone) * zmd->zone_nr_bitmap_blocks +
+		(chunk_block >> DMZ_BLOCK_SHIFT_BITS));
+}
+
+/*
+ * Copy the valid blocks bitmap and the weight of from_zone to to_zone.
+ */
+int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
+			  struct dm_zone *to_zone)
+{
+	struct dmz_mblock *from_mblk, *to_mblk;
+	sector_t chunk_block = 0;
+
+	/* Copy the zone bitmap one bitmap block at a time */
+	while (chunk_block < zmd->dev->zone_nr_blocks) {
+		from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
+		if (IS_ERR(from_mblk))
+			return PTR_ERR(from_mblk);
+		to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block);
+		if (IS_ERR(to_mblk)) {
+			dmz_release_mblock(zmd, from_mblk);
+			return PTR_ERR(to_mblk);
+		}
+
+		memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE);
+		dmz_dirty_mblock(zmd, to_mblk);
+
+		dmz_release_mblock(zmd, to_mblk);
+		dmz_release_mblock(zmd, from_mblk);
+
+		chunk_block += DMZ_BLOCK_SIZE_BITS;
+	}
+
+	to_zone->weight = from_zone->weight;
+
+	return 0;
+}
+
+/*
+ * Mark as valid in to_zone all the blocks that are valid in from_zone,
+ * starting from chunk_block.
+ */
+int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
+			   struct dm_zone *to_zone, sector_t chunk_block)
+{
+	int nr_valid;
+	int err;
+
+	/* Process from_zone one region of valid blocks at a time */
+	while (chunk_block < zmd->dev->zone_nr_blocks) {
+		/* Next valid region: none left (0) or error (< 0) */
+		nr_valid = dmz_first_valid_block(zmd, from_zone, &chunk_block);
+		if (nr_valid <= 0)
+			return nr_valid;
+
+		/* Validate the same region in to_zone */
+		err = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_valid);
+		if (err)
+			return err;
+
+		chunk_block += nr_valid;
+	}
+
+	return 0;
+}
+
+/*
+ * Validate the blocks [chunk_block..chunk_block+nr_blocks-1] of zone.
+ */
+int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
+			sector_t chunk_block, unsigned int nr_blocks)
+{
+	unsigned int count, bit, nr_bits;
+	unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
+	struct dmz_mblock *mblk;
+	unsigned int n = 0;
+
+	dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks",
+		      dmz_id(zmd, zone), (unsigned long long)chunk_block,
+		      nr_blocks);
+
+	WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);
+
+	while (nr_blocks) {
+		/* Get bitmap block */
+		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
+		if (IS_ERR(mblk))
+			return PTR_ERR(mblk);
+
+		/* Set bits, limited to the end of this bitmap block */
+		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
+		nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+
+		count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
+		if (count) {
+			dmz_dirty_mblock(zmd, mblk);
+			n += count;
+		}
+		dmz_release_mblock(zmd, mblk);
+
+		nr_blocks -= nr_bits;
+		chunk_block += nr_bits;
+	}
+
+	if (likely(zone->weight + n <= zone_nr_blocks))
+		zone->weight += n;
+	else {
+		dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u",
+			     dmz_id(zmd, zone), zone->weight,
+			     zone_nr_blocks - n);
+		zone->weight = zone_nr_blocks;
+	}
+
+	return 0;
+}
+
+/*
+ * Clear nr_bits bits in bitmap starting from bit, whole words at once when
+ * possible. Return the number of bits changed from 1 to 0.
+ */
+static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits)
+{
+	unsigned long *addr;
+	int end = bit + nr_bits;
+	int n = 0;
+
+	while (bit < end) {
+		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
+		    ((end - bit) >= BITS_PER_LONG)) {
+			/* Word-aligned: clear a fully set word at once */
+			addr = bitmap + BIT_WORD(bit);
+			if (*addr == ULONG_MAX) {
+				*addr = 0;
+				n += BITS_PER_LONG;
+				bit += BITS_PER_LONG;
+				continue;
+			}
+		}
+
+		if (test_and_clear_bit(bit, bitmap))
+			n++;
+		bit++;
+	}
+
+	return n;
+}
+
+/*
+ * Invalidate the blocks [chunk_block..chunk_block+nr_blocks-1] of zone.
+ */
+int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
+			  sector_t chunk_block, unsigned int nr_blocks)
+{
+	unsigned int count, bit, nr_bits;
+	struct dmz_mblock *mblk;
+	unsigned int n = 0;
+
+	dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks",
+		      dmz_id(zmd, zone), (u64)chunk_block, nr_blocks);
+
+	WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
+
+	while (nr_blocks) {
+		/* Get bitmap block */
+		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
+		if (IS_ERR(mblk))
+			return PTR_ERR(mblk);
+
+		/* Clear bits, limited to the end of this bitmap block */
+		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
+		nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+
+		count = dmz_clear_bits((unsigned long *)mblk->data,
+				       bit, nr_bits);
+		if (count) {
+			dmz_dirty_mblock(zmd, mblk);
+			n += count;
+		}
+		dmz_release_mblock(zmd, mblk);
+
+		nr_blocks -= nr_bits;
+		chunk_block += nr_bits;
+	}
+
+	if (zone->weight >= n)
+		zone->weight -= n;
+	else {
+		dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u",
+			     dmz_id(zmd, zone), zone->weight, n);
+		zone->weight = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Get a block bit value: 1 if set, 0 if clear, < 0 on bitmap access error.
+ */
+static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
+			  sector_t chunk_block)
+{
+	struct dmz_mblock *mblk;
+	int ret;
+
+	WARN_ON(chunk_block >= zmd->dev->zone_nr_blocks);
+
+	/* Get bitmap block */
+	mblk = dmz_get_bitmap(zmd, zone, chunk_block);
+	if (IS_ERR(mblk))
+		return PTR_ERR(mblk);
+
+	/* Test the block bit within this bitmap block */
+	ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS,
+		       (unsigned long *) mblk->data) != 0;
+
+	dmz_release_mblock(zmd, mblk);
+
+	return ret;
+}
+
+/*
+ * Return the number of blocks from chunk_block to the first block with a bit
+ * value equal to set, searching at most nr_blocks blocks (< 0 on error).
+ */
+static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
+				 sector_t chunk_block, unsigned int nr_blocks,
+				 int set)
+{
+	struct dmz_mblock *mblk;
+	unsigned int bit, set_bit, nr_bits;
+	unsigned long *bitmap;
+	int n = 0;
+
+	WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
+
+	while (nr_blocks) {
+		/* Get bitmap block */
+		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
+		if (IS_ERR(mblk))
+			return PTR_ERR(mblk);
+
+		/* Search from the block bit up to the end of this bitmap block */
+		bitmap = (unsigned long *) mblk->data;
+		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
+		nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+		if (set)
+			set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
+		else
+			set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
+		dmz_release_mblock(zmd, mblk);
+
+		n += set_bit - bit;
+		if (set_bit < DMZ_BLOCK_SIZE_BITS)
+			break;
+
+		nr_blocks -= nr_bits;
+		chunk_block += nr_bits;
+	}
+
+	return n;
+}
+
+/*
+ * Return the number of consecutive valid blocks starting at chunk_block:
+ * 0 if chunk_block is not valid, or a negative error code.
+ */
+int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
+		    sector_t chunk_block)
+{
+	sector_t nr_blocks = zmd->dev->zone_nr_blocks - chunk_block;
+	int ret = dmz_test_block(zmd, zone, chunk_block);
+
+	/* Not a valid block (0) or bitmap access error (< 0) */
+	if (ret <= 0)
+		return ret;
+
+	/* Valid block: count the blocks up to the next invalid one */
+	return dmz_to_next_set_block(zmd, zone, chunk_block, nr_blocks, 0);
+}
+
+/*
+ * Find the first valid block from @chunk_block in @zone. If the search
+ * does not fail, @chunk_block is set to the position where it stopped
+ * and the number of consecutive valid blocks from that position is
+ * returned (0 if there is none). Errors are returned as negative codes.
+ */
+int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
+			  sector_t *chunk_block)
+{
+	sector_t nr_zblocks = zmd->dev->zone_nr_blocks;
+	sector_t blk = *chunk_block;
+	int nr_skip;
+
+	/* Skip the invalid blocks preceding the first valid block */
+	nr_skip = dmz_to_next_set_block(zmd, zone, blk, nr_zblocks - blk, 1);
+	if (nr_skip < 0)
+		return nr_skip;
+	blk += nr_skip;
+	*chunk_block = blk;
+
+	/* Count the valid blocks up to the next invalid block */
+	return dmz_to_next_set_block(zmd, zone, blk, nr_zblocks - blk, 0);
+}
+
+/*
+ * Count the bits set in bitmap in the range [bit..bit + nr_bits - 1].
+ */
+static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
+{
+	unsigned long *words = bitmap;
+	int end = bit + nr_bits;
+	int n = 0;
+
+	while (bit < end) {
+		/* Fast path: word-aligned run covering a fully set word */
+		if (!(bit & (BITS_PER_LONG - 1)) &&
+		    end - bit >= BITS_PER_LONG &&
+		    words[BIT_WORD(bit)] == ULONG_MAX) {
+			n += BITS_PER_LONG;
+			bit += BITS_PER_LONG;
+			continue;
+		}
+
+		/* Slow path: test the bits one at a time */
+		if (test_bit(bit, bitmap))
+			n++;
+		bit++;
+	}
+
+	return n;
+}
+
+/*
+ * Set a zone weight to the number of bits set in its bitmap (0 on error).
+ */
+static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
+{
+	struct dmz_mblock *mblk;
+	sector_t chunk_block = 0;
+	unsigned int bit, nr_bits;
+	unsigned int nr_blocks = zmd->dev->zone_nr_blocks;
+	void *bitmap;
+	int n = 0;
+
+	while (nr_blocks) {
+		/* Get bitmap block: on error, give up with a zero weight */
+		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
+		if (IS_ERR(mblk)) {
+			n = 0;
+			break;
+		}
+
+		/* Count bits in this block */
+		bitmap = mblk->data;
+		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
+		nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+		n += dmz_count_bits(bitmap, bit, nr_bits);
+
+		dmz_release_mblock(zmd, mblk);
+
+		nr_blocks -= nr_bits;
+		chunk_block += nr_bits;
+	}
+
+	zone->weight = n;
+}
+
+/*
+ * Cleanup the zoned metadata resources (zmd itself is freed by the caller).
+ */
+static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
+{
+	struct rb_root *root;
+	struct dmz_mblock *mblk, *next;
+	int i;
+
+	/* Release zone mapping resources */
+	if (zmd->map_mblk) {
+		for (i = 0; i < zmd->nr_map_blocks; i++)
+			dmz_release_mblock(zmd, zmd->map_mblk[i]);
+		kfree(zmd->map_mblk);
+		zmd->map_mblk = NULL;
+	}
+
+	/* Release super blocks */
+	for (i = 0; i < 2; i++) {
+		if (zmd->sb[i].mblk) {
+			dmz_free_mblock(zmd, zmd->sb[i].mblk);
+			zmd->sb[i].mblk = NULL;
+		}
+	}
+
+	/* Free cached blocks: blocks still dirty are reported with a warning */
+	while (!list_empty(&zmd->mblk_dirty_list)) {
+		mblk = list_first_entry(&zmd->mblk_dirty_list,
+					struct dmz_mblock, link);
+		dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)",
+			     (u64)mblk->no, atomic_read(&mblk->ref));
+		list_del_init(&mblk->link);
+		rb_erase(&mblk->node, &zmd->mblk_rbtree);
+		dmz_free_mblock(zmd, mblk);
+	}
+
+	while (!list_empty(&zmd->mblk_lru_list)) {
+		mblk = list_first_entry(&zmd->mblk_lru_list,
+					struct dmz_mblock, link);
+		list_del_init(&mblk->link);
+		rb_erase(&mblk->node, &zmd->mblk_rbtree);
+		dmz_free_mblock(zmd, mblk);
+	}
+
+	/* Sanity checks: the mblock rbtree should now be empty */
+	root = &zmd->mblk_rbtree;
+	rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
+		dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree",
+			     (u64)mblk->no, atomic_read(&mblk->ref));
+		atomic_set(&mblk->ref, 0);
+		dmz_free_mblock(zmd, mblk);
+	}
+
+	/* Free the zone descriptors */
+	dmz_drop_zones(zmd);
+}
+
+/*
+ * Initialize the zoned metadata. Return 0 and the new metadata in @metadata
+ * on success, or a negative error code if the initialization failed.
+ */
+int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
+{
+	struct dmz_metadata *zmd;
+	unsigned int i, zid;
+	struct dm_zone *zone;
+	int ret;
+
+	zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL);
+	if (!zmd)
+		return -ENOMEM;
+
+	zmd->dev = dev;
+	zmd->mblk_rbtree = RB_ROOT;
+	init_rwsem(&zmd->mblk_sem);
+	mutex_init(&zmd->mblk_flush_lock);
+	spin_lock_init(&zmd->mblk_lock);
+	INIT_LIST_HEAD(&zmd->mblk_lru_list);
+	INIT_LIST_HEAD(&zmd->mblk_dirty_list);
+
+	mutex_init(&zmd->map_lock);
+	atomic_set(&zmd->unmap_nr_rnd, 0);
+	INIT_LIST_HEAD(&zmd->unmap_rnd_list);
+	INIT_LIST_HEAD(&zmd->map_rnd_list);
+
+	atomic_set(&zmd->unmap_nr_seq, 0);
+	INIT_LIST_HEAD(&zmd->unmap_seq_list);
+	INIT_LIST_HEAD(&zmd->map_seq_list);
+
+	atomic_set(&zmd->nr_reserved_seq_zones, 0);
+	INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);
+
+	init_waitqueue_head(&zmd->free_wq);
+
+	/* Initialize zone descriptors */
+	ret = dmz_init_zones(zmd);
+	if (ret)
+		goto err;
+
+	/* Get super block */
+	ret = dmz_load_sb(zmd);
+	if (ret)
+		goto err;
+
+	/* Set metadata zones starting from sb_zone: all must be random zones */
+	zid = dmz_id(zmd, zmd->sb_zone);
+	for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
+		zone = dmz_get(zmd, zid + i);
+		if (!zone || !dmz_is_rnd(zone)) {
+			ret = -ENXIO;	/* ret is still 0 here */
+			goto err;
+		}
+		set_bit(DMZ_META, &zone->flags);
+	}
+
+	/* Load mapping table */
+	ret = dmz_load_mapping(zmd);
+	if (ret)
+		goto err;
+
+	/*
+	 * Cache size boundaries: allow at least 2 super blocks, the chunk map
+	 * blocks and enough blocks to be able to cache the bitmap blocks of
+	 * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow
+	 * the cache to add 512 more metadata blocks.
+	 */
+	zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
+	zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
+	zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
+	zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
+	zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;
+
+	/* Metadata cache shrinker */
+	ret = register_shrinker(&zmd->mblk_shrinker);
+	if (ret) {
+		dmz_dev_err(dev, "Register metadata cache shrinker failed");
+		goto err;
+	}
+
+	dmz_dev_info(dev, "Host-%s zoned block device",
+		     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
+		     "aware" : "managed");
+	dmz_dev_info(dev, "  %llu 512-byte logical sectors",
+		     (u64)dev->capacity);
+	dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
+		     dev->nr_zones, (u64)dev->zone_nr_sectors);
+	dmz_dev_info(dev, "  %u metadata zones", zmd->nr_meta_zones * 2);
+	dmz_dev_info(dev, "  %u data zones for %u chunks",
+		     zmd->nr_data_zones, zmd->nr_chunks);
+	dmz_dev_info(dev, "    %u random zones (%u unmapped)",
+		     zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd));
+	dmz_dev_info(dev, "    %u sequential zones (%u unmapped)",
+		     zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq));
+	dmz_dev_info(dev, "  %u reserved sequential data zones",
+		     zmd->nr_reserved_seq);
+
+	dmz_dev_debug(dev, "Format:");
+	dmz_dev_debug(dev, "%u metadata blocks per set (%u max cache)",
+		      zmd->nr_meta_blocks, zmd->max_nr_mblks);
+	dmz_dev_debug(dev, "  %u data zone mapping blocks", zmd->nr_map_blocks);
+	dmz_dev_debug(dev, "  %u bitmap blocks", zmd->nr_bitmap_blocks);
+
+	*metadata = zmd;
+
+	return 0;
+err:
+	dmz_cleanup_metadata(zmd);
+	kfree(zmd);
+	*metadata = NULL;
+
+	return ret;
+}
+
+/*
+ * Destroy the zoned metadata: unregister the shrinker and free resources.
+ */
+void dmz_dtr_metadata(struct dmz_metadata *zmd)
+{
+	unregister_shrinker(&zmd->mblk_shrinker);
+	dmz_cleanup_metadata(zmd);
+	kfree(zmd);
+}
+
+/*
+ * Check zone information on resume: zone state and write pointers.
+ */
+int dmz_resume_metadata(struct dmz_metadata *zmd)
+{
+	struct dmz_dev *dev = zmd->dev;
+	struct dm_zone *zone;
+	sector_t wp_block;
+	unsigned int i;
+	int ret;
+
+	/* Check zones */
+	for (i = 0; i < dev->nr_zones; i++) {
+		zone = dmz_get(zmd, i);
+		if (!zone) {
+			dmz_dev_err(dev, "Unable to get zone %u", i);
+			return -EIO;
+		}
+
+		wp_block = zone->wp_block;
+
+		ret = dmz_update_zone(zmd, zone);
+		if (ret) {
+			dmz_dev_err(dev, "Broken zone %u", i);
+			return ret;
+		}
+
+		if (dmz_is_offline(zone)) {
+			dmz_dev_warn(dev, "Zone %u is offline", i);
+			continue;
+		}
+
+		/* Check write pointer against the value saved before the update */
+		if (!dmz_is_seq(zone))
+			zone->wp_block = 0;
+		else if (zone->wp_block != wp_block) {
+			dmz_dev_err(dev, "Zone %u: Invalid wp (%llu / %llu)",
+				    i, (u64)zone->wp_block, (u64)wp_block);
+			zone->wp_block = wp_block;
+			dmz_invalidate_blocks(zmd, zone, zone->wp_block,
+					      dev->zone_nr_blocks - zone->wp_block);
+		}
+	}
+
+	return 0;
+}
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
new file mode 100644
index 000000000000..05c0a126f5c8
--- /dev/null
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -0,0 +1,570 @@
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-zoned.h"
+
+#include <linux/module.h>
+
+#define	DM_MSG_PREFIX		"zoned reclaim"
+
+struct dmz_reclaim {
+	struct dmz_metadata     *metadata;
+	struct dmz_dev		*dev;
+
+	struct delayed_work	work;	/* Runs dmz_reclaim_work() */
+	struct workqueue_struct *wq;
+
+	struct dm_kcopyd_client	*kc;
+	struct dm_kcopyd_throttle kc_throttle;
+	int			kc_err;	/* Result of the last kcopyd copy */
+
+	unsigned long		flags;	/* DMZ_RECLAIM_* state bits */
+
+	/* Last target access time (jiffies) */
+	unsigned long		atime;
+};
+
+/*
+ * Reclaim state flags (bit numbers in struct dmz_reclaim flags).
+ */
+enum {
+	DMZ_RECLAIM_KCOPY,	/* Set while a kcopyd copy is in flight */
+};
+
+/*
+ * Target BIO inactivity time (10 seconds, in jiffies) to consider it idle.
+ */
+#define DMZ_IDLE_PERIOD		(10UL * HZ)
+
+/*
+ * Percentage of unmapped (free) random zones at or below which reclaim
+ * starts even if the target is busy.
+ */
+#define DMZ_RECLAIM_LOW_UNMAP_RND	30
+
+/*
+ * Percentage of unmapped (free) random zones at or above which reclaim
+ * will not run if the target is busy.
+ */
+#define DMZ_RECLAIM_HIGH_UNMAP_RND	50
+
+/*
+ * Align a sequential zone write pointer to block by zeroing out the gap.
+ */
+static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
+				sector_t block)
+{
+	struct dmz_metadata *zmd = zrc->metadata;
+	sector_t wp_block = zone->wp_block;
+	unsigned int nr_blocks;
+	int ret;
+
+	if (wp_block == block)
+		return 0;
+
+	if (wp_block > block)
+		return -EIO;
+
+	/*
+	 * Zeroout the space between the write
+	 * pointer and the requested position.
+	 */
+	nr_blocks = block - wp_block;
+	ret = blkdev_issue_zeroout(zrc->dev->bdev,
+				   dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
+				   dmz_blk2sect(nr_blocks), GFP_NOFS, false);
+	if (ret) {
+		dmz_dev_err(zrc->dev,
+			    "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
+			    dmz_id(zmd, zone), (unsigned long long)wp_block,
+			    (unsigned long long)block, nr_blocks, ret);
+		return ret;
+	}
+
+	zone->wp_block = block;
+
+	return 0;
+}
+
+/*
+ * dm_kcopyd_copy end notification: record the copy result and wake up
+ * the waiter of the DMZ_RECLAIM_KCOPY bit.
+ */
+static void dmz_reclaim_kcopy_end(int read_err, unsigned long write_err,
+				  void *context)
+{
+	struct dmz_reclaim *zrc = context;
+
+	/* Any read or write error is reported as -EIO */
+	zrc->kc_err = (read_err || write_err) ? -EIO : 0;
+
+	/* Clear the copy in flight bit and wake up its waiter */
+	clear_bit_unlock(DMZ_RECLAIM_KCOPY, &zrc->flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&zrc->flags, DMZ_RECLAIM_KCOPY);
+}
+
+/*
+ * Copy valid blocks of src_zone into dst_zone, at the same block offsets.
+ */
+static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
+			    struct dm_zone *src_zone, struct dm_zone *dst_zone)
+{
+	struct dmz_metadata *zmd = zrc->metadata;
+	struct dmz_dev *dev = zrc->dev;
+	struct dm_io_region src, dst;
+	sector_t block = 0, end_block;
+	sector_t nr_blocks;
+	sector_t src_zone_block;
+	sector_t dst_zone_block;
+	unsigned long flags = 0;
+	int ret;
+
+	if (dmz_is_seq(src_zone))
+		end_block = src_zone->wp_block;
+	else
+		end_block = dev->zone_nr_blocks;
+	src_zone_block = dmz_start_block(zmd, src_zone);
+	dst_zone_block = dmz_start_block(zmd, dst_zone);
+
+	if (dmz_is_seq(dst_zone))
+		set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
+
+	while (block < end_block) {
+		/* Get a valid region from the source zone */
+		ret = dmz_first_valid_block(zmd, src_zone, &block);
+		if (ret <= 0)
+			return ret;
+		nr_blocks = ret;
+
+		/*
+		 * If we are writing in a sequential zone, we must make sure
+		 * that writes are sequential. So zero out any eventual hole
+		 * between writes.
+		 */
+		if (dmz_is_seq(dst_zone)) {
+			ret = dmz_reclaim_align_wp(zrc, dst_zone, block);
+			if (ret)
+				return ret;
+		}
+
+		src.bdev = dev->bdev;
+		src.sector = dmz_blk2sect(src_zone_block + block);
+		src.count = dmz_blk2sect(nr_blocks);
+
+		dst.bdev = dev->bdev;
+		dst.sector = dmz_blk2sect(dst_zone_block + block);
+		dst.count = src.count;
+
+		/* Copy the valid region */
+		set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags);
+		ret = dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags,
+				     dmz_reclaim_kcopy_end, zrc);
+		if (ret)
+			return ret;
+
+		/* Wait for dmz_reclaim_kcopy_end() to signal the completion */
+		wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY,
+			       TASK_UNINTERRUPTIBLE);
+		if (zrc->kc_err)
+			return zrc->kc_err;
+
+		block += nr_blocks;
+		if (dmz_is_seq(dst_zone))
+			dst_zone->wp_block = block;
+	}
+
+	return 0;
+}
+
+/*
+ * Move valid blocks of dzone buffer zone into dzone (after its write pointer)
+ * and free the buffer zone. On error (< 0), dzone stays locked for reclaim.
+ */
+static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
+{
+	struct dm_zone *bzone = dzone->bzone;
+	sector_t chunk_block = dzone->wp_block;
+	struct dmz_metadata *zmd = zrc->metadata;
+	int ret;
+
+	dmz_dev_debug(zrc->dev,
+		      "Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
+		      dzone->chunk, dmz_id(zmd, bzone), dmz_weight(bzone),
+		      dmz_id(zmd, dzone), dmz_weight(dzone));
+
+	/* Flush the buffer zone into the data zone */
+	ret = dmz_reclaim_copy(zrc, bzone, dzone);
+	if (ret < 0)
+		return ret;
+
+	dmz_lock_flush(zmd);
+
+	/* Validate copied blocks */
+	ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block);
+	if (ret == 0) {
+		/* Free the buffer zone */
+		dmz_invalidate_blocks(zmd, bzone, 0, zrc->dev->zone_nr_blocks);
+		dmz_lock_map(zmd);
+		dmz_unmap_zone(zmd, bzone);
+		dmz_unlock_zone_reclaim(dzone);
+		dmz_free_zone(zmd, bzone);
+		dmz_unlock_map(zmd);
+	}
+
+	dmz_unlock_flush(zmd);
+
+	return ret;
+}
+
+/*
+ * Merge valid blocks of dzone into its buffer zone and free dzone.
+ * On error (< 0), dzone stays mapped and locked for reclaim.
+ */
+static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
+{
+	unsigned int chunk = dzone->chunk;
+	struct dm_zone *bzone = dzone->bzone;
+	struct dmz_metadata *zmd = zrc->metadata;
+	int ret;
+
+	dmz_dev_debug(zrc->dev,
+		      "Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
+		      chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
+		      dmz_id(zmd, bzone), dmz_weight(bzone));
+
+	/* Flush data zone into the buffer zone */
+	ret = dmz_reclaim_copy(zrc, dzone, bzone);
+	if (ret < 0)
+		return ret;
+
+	dmz_lock_flush(zmd);
+
+	/* Validate copied blocks */
+	ret = dmz_merge_valid_blocks(zmd, dzone, bzone, 0);
+	if (ret == 0) {
+		/*
+		 * Free the data zone and remap the chunk to the buffer zone.
+		 */
+		dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
+		dmz_lock_map(zmd);
+		dmz_unmap_zone(zmd, bzone);
+		dmz_unmap_zone(zmd, dzone);
+		dmz_unlock_zone_reclaim(dzone);
+		dmz_free_zone(zmd, dzone);
+		dmz_map_zone(zmd, bzone, chunk);
+		dmz_unlock_map(zmd);
+	}
+
+	dmz_unlock_flush(zmd);
+
+	return ret;
+}
+
+/*
+ * Move valid blocks of the random data zone dzone into a free sequential zone.
+ * Once blocks are moved, remap the zone chunk to the sequential zone.
+ * On error (< 0), dzone stays mapped and locked for reclaim.
+ */
+static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
+{
+	unsigned int chunk = dzone->chunk;
+	struct dm_zone *szone = NULL;
+	struct dmz_metadata *zmd = zrc->metadata;
+	int ret;
+
+	/* Get a free sequential zone */
+	dmz_lock_map(zmd);
+	szone = dmz_alloc_zone(zmd, DMZ_ALLOC_RECLAIM);
+	dmz_unlock_map(zmd);
+	if (!szone)
+		return -ENOSPC;
+
+	dmz_dev_debug(zrc->dev,
+		      "Chunk %u, move rnd zone %u (weight %u) to seq zone %u",
+		      chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
+		      dmz_id(zmd, szone));
+
+	/* Flush the random data zone into the sequential zone */
+	ret = dmz_reclaim_copy(zrc, dzone, szone);
+
+	dmz_lock_flush(zmd);
+
+	/* Validate copied blocks */
+	if (ret == 0)
+		ret = dmz_copy_valid_blocks(zmd, dzone, szone);
+	if (ret) {
+		/* Free the sequential zone */
+		dmz_lock_map(zmd);
+		dmz_free_zone(zmd, szone);
+		dmz_unlock_map(zmd);
+	} else {
+		/* Free the data zone and remap the chunk */
+		dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
+		dmz_lock_map(zmd);
+		dmz_unmap_zone(zmd, dzone);
+		dmz_unlock_zone_reclaim(dzone);
+		dmz_free_zone(zmd, dzone);
+		dmz_map_zone(zmd, szone, chunk);
+		dmz_unlock_map(zmd);
+	}
+
+	dmz_unlock_flush(zmd);
+
+	return ret;
+}
+
+/*
+ * Reclaim an empty zone: unmap it, unlock it from reclaim and free it.
+ */
+static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
+{
+	struct dmz_metadata *zmd = zrc->metadata;
+
+	dmz_lock_flush(zmd);
+	dmz_lock_map(zmd);
+	dmz_unmap_zone(zmd, dzone);
+	dmz_unlock_zone_reclaim(dzone);
+	dmz_free_zone(zmd, dzone);
+	dmz_unlock_map(zmd);
+	dmz_unlock_flush(zmd);
+}
+
+/*
+ * Find a candidate zone for reclaim and process it.
+ */
+static void dmz_reclaim(struct dmz_reclaim *zrc)
+{
+	struct dmz_metadata *zmd = zrc->metadata;
+	struct dm_zone *dzone;
+	struct dm_zone *rzone;
+	unsigned long start;
+	int ret;
+
+	/* Get a data zone to reclaim */
+	dzone = dmz_get_zone_for_reclaim(zmd);
+	if (!dzone)
+		return;
+
+	start = jiffies;
+
+	if (dmz_is_rnd(dzone)) {
+		if (!dmz_weight(dzone)) {
+			/* Empty zone */
+			dmz_reclaim_empty(zrc, dzone);
+			ret = 0;
+		} else {
+			/*
+			 * Reclaim the random data zone by moving its
+			 * valid data blocks to a free sequential zone.
+			 */
+			ret = dmz_reclaim_rnd_data(zrc, dzone);
+		}
+		rzone = dzone;
+
+	} else {
+		struct dm_zone *bzone = dzone->bzone;
+		sector_t chunk_block = 0;
+
+		ret = dmz_first_valid_block(zmd, bzone, &chunk_block);
+		if (ret < 0)
+			goto out;
+
+		if (ret == 0 || chunk_block >= dzone->wp_block) {
+			/*
+			 * The buffer zone is empty or its valid blocks are
+			 * after the data zone write pointer.
+			 */
+			ret = dmz_reclaim_buf(zrc, dzone);
+			rzone = bzone;
+		} else {
+			/*
+			 * Reclaim the data zone by merging it into the
+			 * buffer zone so that the buffer zone itself can
+			 * be later reclaimed.
+			 */
+			ret = dmz_reclaim_seq_data(zrc, dzone);
+			rzone = dzone;
+		}
+	}
+out:
+	if (ret) {
+		dmz_unlock_zone_reclaim(dzone);	/* Failed: unlock the zone */
+		return;
+	}
+
+	(void) dmz_flush_metadata(zrc->metadata);
+
+	dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
+		      dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
+}
+
+/*
+ * Test if the target is idle: no BIO accounted for DMZ_IDLE_PERIOD.
+ */
+static inline int dmz_target_idle(struct dmz_reclaim *zrc)
+{
+	return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD);
+}
+
+/*
+ * Test if reclaim is necessary, based on the target activity and on the
+ * percentage of unmapped (free) random zones.
+ */
+static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
+{
+	struct dmz_metadata *zmd = zrc->metadata;
+	unsigned int nr_total = dmz_nr_rnd_zones(zmd);
+	unsigned int nr_free = dmz_nr_unmap_rnd_zones(zmd);
+	unsigned int pct_free = nr_free * 100 / nr_total;
+
+	/* An idle target reclaims as long as a random zone is mapped */
+	if (dmz_target_idle(zrc) && nr_free < nr_total)
+		return true;
+
+	/*
+	 * Busy target: reclaim only if the percentage of unmapped random
+	 * zones is low, and never if plenty of them are still available.
+	 */
+	if (pct_free >= DMZ_RECLAIM_HIGH_UNMAP_RND)
+		return false;
+
+	return pct_free <= DMZ_RECLAIM_LOW_UNMAP_RND;
+}
+
+/*
+ * Reclaim work function: reclaim one zone if needed, then reschedule.
+ */
+static void dmz_reclaim_work(struct work_struct *work)
+{
+	struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work);
+	struct dmz_metadata *zmd = zrc->metadata;
+	unsigned int nr_rnd, nr_unmap_rnd;
+	unsigned int p_unmap_rnd;
+
+	if (!dmz_should_reclaim(zrc)) {
+		mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
+		return;
+	}
+
+	/*
+	 * We need to start reclaiming random zones: set up zone copy
+	 * throttling to either go fast if we are very low on random zones
+	 * and slower if there are still some free random zones to avoid
+	 * as much as possible to negatively impact the user workload.
+	 */
+	nr_rnd = dmz_nr_rnd_zones(zmd);
+	nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
+	p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
+	if (dmz_target_idle(zrc) || p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_RND / 2) {
+		/* Idle or very low percentage: go fast */
+		zrc->kc_throttle.throttle = 100;
+	} else {
+		/* Busy but we still have some random zones: throttle */
+		zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2);
+	}
+
+	dmz_dev_debug(zrc->dev,
+		      "Reclaim (%u): %s, %u%% free rnd zones (%u/%u)",
+		      zrc->kc_throttle.throttle,
+		      (dmz_target_idle(zrc) ? "Idle" : "Busy"),
+		      p_unmap_rnd, nr_unmap_rnd, nr_rnd);
+
+	dmz_reclaim(zrc);
+
+	dmz_schedule_reclaim(zrc);
+}
+
+/*
+ * Initialize reclaim: create the kcopyd client and queue the reclaim work.
+ */
+int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
+		    struct dmz_reclaim **reclaim)
+{
+	struct dmz_reclaim *zrc;
+	int ret;
+
+	zrc = kzalloc(sizeof(struct dmz_reclaim), GFP_KERNEL);
+	if (!zrc)
+		return -ENOMEM;
+
+	zrc->dev = dev;
+	zrc->metadata = zmd;
+	zrc->atime = jiffies;
+
+	/* Reclaim kcopyd client */
+	zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle);
+	if (IS_ERR(zrc->kc)) {
+		ret = PTR_ERR(zrc->kc);
+		zrc->kc = NULL;
+		goto err;
+	}
+
+	/* Reclaim work */
+	INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work);
+	zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM,
+					  dev->name);
+	if (!zrc->wq) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	*reclaim = zrc;
+	queue_delayed_work(zrc->wq, &zrc->work, 0);
+
+	return 0;
+err:
+	if (zrc->kc)
+		dm_kcopyd_client_destroy(zrc->kc);
+	kfree(zrc);
+
+	return ret;
+}
+
+/*
+ * Terminate reclaim: cancel the reclaim work and free all resources.
+ */
+void dmz_dtr_reclaim(struct dmz_reclaim *zrc)
+{
+	cancel_delayed_work_sync(&zrc->work);
+	destroy_workqueue(zrc->wq);
+	dm_kcopyd_client_destroy(zrc->kc);
+	kfree(zrc);
+}
+
+/*
+ * Suspend reclaim: cancel the reclaim work.
+ */
+void dmz_suspend_reclaim(struct dmz_reclaim *zrc)
+{
+	cancel_delayed_work_sync(&zrc->work);
+}
+
+/*
+ * Resume reclaim: queue the reclaim work to run after DMZ_IDLE_PERIOD.
+ */
+void dmz_resume_reclaim(struct dmz_reclaim *zrc)
+{
+	queue_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
+}
+
+/*
+ * BIO accounting: record the time of the last target access.
+ */
+void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc)
+{
+	zrc->atime = jiffies;
+}
+
+/*
+ * Start reclaim if necessary: run the reclaim work without delay.
+ */
+void dmz_schedule_reclaim(struct dmz_reclaim *zrc)
+{
+	if (dmz_should_reclaim(zrc))
+		mod_delayed_work(zrc->wq, &zrc->work, 0);
+}
+
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
new file mode 100644
index 000000000000..2b538fa817f4
--- /dev/null
+++ b/drivers/md/dm-zoned-target.c
@@ -0,0 +1,967 @@
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-zoned.h"
+
+#include <linux/module.h>
+
+#define	DM_MSG_PREFIX		"zoned"
+
+#define DMZ_MIN_BIOS		8192
+
+/*
+ * Zone BIO context. One context lives in the per-BIO data area of each
+ * target BIO (see dm_per_bio_data()) and is initialized in dmz_map().
+ */
+struct dmz_bioctx {
+	/* Target that received the BIO */
+	struct dmz_target	*target;
+	/* Data zone activated for the BIO, NULL if none */
+	struct dm_zone		*zone;
+	/* The target BIO owning this context */
+	struct bio		*bio;
+	/* Set to 1 in dmz_map(), +1 per BIO submitted, -1 per dmz_end_io() */
+	atomic_t		ref;
+	/* First error recorded for the BIO, BLK_STS_OK if none */
+	blk_status_t		status;
+};
+
+/*
+ * Chunk work descriptor. BIOs targeting the same chunk are queued on
+ * the same descriptor, which is indexed by chunk number in the target
+ * chunk radix tree.
+ */
+struct dm_chunk_work {
+	/* Work item executing dmz_chunk_work() */
+	struct work_struct	work;
+	/* One reference per queued BIO plus one while the work is queued */
+	atomic_t		refcount;
+	/* Owning target */
+	struct dmz_target	*target;
+	/* Chunk number, used as the radix tree key */
+	unsigned int		chunk;
+	/* BIOs waiting to be processed for this chunk */
+	struct bio_list		bio_list;
+};
+
+/*
+ * Target descriptor. Allocated in dmz_ctr(), stored in ti->private and
+ * freed in dmz_dtr().
+ */
+struct dmz_target {
+	/* Device-mapper handle on the backing device (dm_get_device()) */
+	struct dm_dev		*ddev;
+
+	unsigned long		flags;
+
+	/* Zoned block device information */
+	struct dmz_dev		*dev;
+
+	/* For metadata handling */
+	struct dmz_metadata     *metadata;
+
+	/* For reclaim */
+	struct dmz_reclaim	*reclaim;
+
+	/*
+	 * For chunk work: chunk_lock protects the radix tree of active
+	 * chunk works and the BIO list of each chunk work.
+	 */
+	struct mutex		chunk_lock;
+	struct radix_tree_root	chunk_rxtree;
+	struct workqueue_struct *chunk_wq;
+
+	/* For cloned BIOs to zones */
+	struct bio_set		*bio_set;
+
+	/*
+	 * For flush: flush_lock protects flush_list, the list of BIOs
+	 * completed by the next execution of flush_work.
+	 */
+	spinlock_t		flush_lock;
+	struct bio_list		flush_list;
+	struct delayed_work	flush_work;
+	struct workqueue_struct *flush_wq;
+};
+
+/*
+ * Periodic metadata flush interval: 10 seconds, expressed in jiffies.
+ */
+#define DMZ_FLUSH_PERIOD	(10 * HZ)
+
+/*
+ * Target BIO completion: remember the first error reported for the
+ * BIO in its context, then signal the completion of the BIO.
+ */
+static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
+{
+	struct dmz_bioctx *ctx = dm_per_bio_data(bio, sizeof(*ctx));
+
+	/* Only the first failure is kept */
+	if (status != BLK_STS_OK && ctx->status == BLK_STS_OK)
+		ctx->status = status;
+
+	bio_endio(bio);
+}
+
+/*
+ * Partial clone read BIO completion callback. The clone is released and
+ * its completion status is forwarded to the target BIO it was cloned
+ * from, which is terminated once there are no more references to its
+ * context.
+ */
+static void dmz_read_bio_end_io(struct bio *bio)
+{
+	/* Fetch everything needed from the clone before dropping it */
+	struct dmz_bioctx *bioctx = bio->bi_private;
+	blk_status_t status = bio->bi_status;
+
+	bio_put(bio);
+	dmz_bio_endio(bioctx->bio, status);
+}
+
+/*
+ * Issue a read BIO to a zone. The BIO may only partially process the
+ * original target BIO.
+ *
+ * Read @nr_blocks blocks of @zone starting at @chunk_block (relative to
+ * the zone start) into the current position of the target BIO @bio.
+ * If the read covers all the blocks remaining in @bio, @bio itself is
+ * remapped and submitted. Otherwise, a clone limited to @nr_blocks is
+ * submitted and @bio is advanced past the blocks covered by the clone.
+ * Each submission takes one reference on the BIO context.
+ *
+ * Returns 0 on success or -ENOMEM if the clone could not be allocated.
+ */
+static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone,
+			       struct bio *bio, sector_t chunk_block,
+			       unsigned int nr_blocks)
+{
+	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+	sector_t sector;
+	struct bio *clone;
+
+	/* BIO remap sector */
+	sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
+
+	/* If the read is not partial, there is no need to clone the BIO */
+	if (nr_blocks == dmz_bio_blocks(bio)) {
+		/* Setup and submit the BIO */
+		bio->bi_iter.bi_sector = sector;
+		atomic_inc(&bioctx->ref);
+		generic_make_request(bio);
+		return 0;
+	}
+
+	/* Partial BIO: we need to clone the BIO */
+	clone = bio_clone_fast(bio, GFP_NOIO, dmz->bio_set);
+	if (!clone)
+		return -ENOMEM;
+
+	/* Setup the clone: restrict it to nr_blocks at the remapped sector */
+	clone->bi_iter.bi_sector = sector;
+	clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
+	clone->bi_end_io = dmz_read_bio_end_io;
+	clone->bi_private = bioctx;
+
+	/* Skip, in the target BIO, the range now handled by the clone */
+	bio_advance(bio, clone->bi_iter.bi_size);
+
+	/* Submit the clone */
+	atomic_inc(&bioctx->ref);
+	generic_make_request(clone);
+
+	return 0;
+}
+
+/*
+ * Zero out the pages of @nr_blocks discarded blocks accessed by a read
+ * BIO, starting at the current position of the BIO, and advance the BIO
+ * past the zeroed range.
+ */
+static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
+				 sector_t chunk_block, unsigned int nr_blocks)
+{
+	unsigned int zero_bytes = nr_blocks << DMZ_BLOCK_SHIFT;
+	unsigned int bio_bytes = bio->bi_iter.bi_size;
+
+	/* Temporarily shrink the BIO so that only nr_blocks get zeroed */
+	bio->bi_iter.bi_size = zero_bytes;
+	zero_fill_bio(bio);
+	bio->bi_iter.bi_size = bio_bytes;
+
+	bio_advance(bio, zero_bytes);
+}
+
+/*
+ * Process a read BIO.
+ *
+ * The BIO range is walked within the chunk. At each position,
+ * dmz_block_valid() is queried on the data zone (only below the write
+ * pointer for a sequential zone) and, if nothing valid is found there, on
+ * the buffer zone if one exists. A positive return value is used as the
+ * number of blocks to read from that zone, capped to the end of the BIO.
+ * When neither zone has a valid block, one block of the BIO is
+ * zero-filled instead.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
+			   struct bio *bio)
+{
+	sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
+	unsigned int nr_blocks = dmz_bio_blocks(bio);
+	sector_t end_block = chunk_block + nr_blocks;
+	struct dm_zone *rzone, *bzone;
+	int ret;
+
+	/* Read into unmapped chunks need only zeroing the BIO buffer */
+	if (!zone) {
+		zero_fill_bio(bio);
+		return 0;
+	}
+
+	dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks",
+		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
+		      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
+		      dmz_id(dmz->metadata, zone),
+		      (unsigned long long)chunk_block, nr_blocks);
+
+	/* Check block validity to determine the read location */
+	bzone = zone->bzone;
+	while (chunk_block < end_block) {
+		/* nr_blocks != 0 below means rzone has been set */
+		nr_blocks = 0;
+		if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) {
+			/* Test block validity in the data zone */
+			ret = dmz_block_valid(dmz->metadata, zone, chunk_block);
+			if (ret < 0)
+				return ret;
+			if (ret > 0) {
+				/* Read data zone blocks */
+				nr_blocks = ret;
+				rzone = zone;
+			}
+		}
+
+		/*
+		 * No valid blocks found in the data zone.
+		 * Check the buffer zone, if there is one.
+		 */
+		if (!nr_blocks && bzone) {
+			ret = dmz_block_valid(dmz->metadata, bzone, chunk_block);
+			if (ret < 0)
+				return ret;
+			if (ret > 0) {
+				/* Read buffer zone blocks */
+				nr_blocks = ret;
+				rzone = bzone;
+			}
+		}
+
+		if (nr_blocks) {
+			/* Valid blocks found: read them, up to the end of the BIO */
+			nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block);
+			ret = dmz_submit_read_bio(dmz, rzone, bio, chunk_block, nr_blocks);
+			if (ret)
+				return ret;
+			chunk_block += nr_blocks;
+		} else {
+			/* No valid block: zeroout the current BIO block */
+			dmz_handle_read_zero(dmz, bio, chunk_block, 1);
+			chunk_block++;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Issue a write BIO to a zone.
+ *
+ * The target BIO is remapped to block @chunk_block of @zone on the
+ * backing device and submitted, with one more reference taken on its
+ * context. For a sequential zone, the zone write pointer is advanced by
+ * @nr_blocks as soon as the BIO is submitted, without waiting for the
+ * write to complete.
+ */
+static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone,
+				 struct bio *bio, sector_t chunk_block,
+				 unsigned int nr_blocks)
+{
+	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+
+	/* Setup and submit the BIO */
+	bio->bi_bdev = dmz->dev->bdev;
+	bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
+	atomic_inc(&bioctx->ref);
+	generic_make_request(bio);
+
+	if (dmz_is_seq(zone))
+		zone->wp_block += nr_blocks;
+}
+
+/*
+ * Write blocks directly in a data zone, at the write pointer for a
+ * sequential zone. The written blocks are marked valid in the data
+ * zone and, if a buffer zone is assigned, invalid in the buffer zone.
+ *
+ * Returns 0 on success, -EROFS if the zone is read-only or the error
+ * reported by the block validation/invalidation.
+ */
+static int dmz_handle_direct_write(struct dmz_target *dmz,
+				   struct dm_zone *zone, struct bio *bio,
+				   sector_t chunk_block,
+				   unsigned int nr_blocks)
+{
+	struct dmz_metadata *zmd = dmz->metadata;
+	struct dm_zone *buf_zone = zone->bzone;
+	int err;
+
+	if (dmz_is_readonly(zone))
+		return -EROFS;
+
+	/* Send the write to the data zone */
+	dmz_submit_write_bio(dmz, zone, bio, chunk_block, nr_blocks);
+
+	/* The data zone now holds the valid copy of the blocks... */
+	err = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
+	if (err)
+		return err;
+
+	/* ...so any copy in the buffer zone becomes stale */
+	if (!buf_zone)
+		return 0;
+
+	return dmz_invalidate_blocks(zmd, buf_zone, chunk_block, nr_blocks);
+}
+
+/*
+ * Write blocks in the buffer zone of @zone, getting a buffer zone
+ * first if none is assigned yet. The written blocks are marked valid
+ * in the buffer zone and, when they are below the data zone write
+ * pointer, invalid in the data zone.
+ * Called with @zone write locked.
+ *
+ * Returns 0 on success, -ENOSPC if no buffer zone could be obtained,
+ * -EROFS if the buffer zone is read-only, or the error reported by the
+ * block validation/invalidation.
+ */
+static int dmz_handle_buffered_write(struct dmz_target *dmz,
+				     struct dm_zone *zone, struct bio *bio,
+				     sector_t chunk_block,
+				     unsigned int nr_blocks)
+{
+	struct dmz_metadata *zmd = dmz->metadata;
+	struct dm_zone *buf_zone;
+	int err;
+
+	/* One buffer zone is allocated if the chunk has none yet */
+	buf_zone = dmz_get_chunk_buffer(zmd, zone);
+	if (!buf_zone)
+		return -ENOSPC;
+
+	if (dmz_is_readonly(buf_zone))
+		return -EROFS;
+
+	/* Send the write to the buffer zone */
+	dmz_submit_write_bio(dmz, buf_zone, bio, chunk_block, nr_blocks);
+
+	/* The buffer zone now holds the valid copy of the blocks... */
+	err = dmz_validate_blocks(zmd, buf_zone, chunk_block, nr_blocks);
+	if (err)
+		return err;
+
+	/* ...and blocks at or above the write pointer were never written */
+	if (chunk_block >= zone->wp_block)
+		return 0;
+
+	return dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
+}
+
+/*
+ * Process a write BIO: choose between a direct write to the data zone
+ * and a buffered write depending on the zone type and on the alignment
+ * of the BIO with the zone write pointer.
+ *
+ * Returns 0 on success, -ENOSPC if the chunk has no zone, or the error
+ * of the selected write handler.
+ */
+static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
+			    struct bio *bio)
+{
+	sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
+	unsigned int nr_blocks = dmz_bio_blocks(bio);
+
+	if (!zone)
+		return -ENOSPC;
+
+	dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
+		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
+		      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
+		      dmz_id(dmz->metadata, zone),
+		      (unsigned long long)chunk_block, nr_blocks);
+
+	/*
+	 * Unaligned write in a sequential zone: the zone cannot take
+	 * it in place, go through the buffer zone.
+	 */
+	if (!dmz_is_rnd(zone) && chunk_block != zone->wp_block)
+		return dmz_handle_buffered_write(dmz, zone, bio,
+						 chunk_block, nr_blocks);
+
+	/*
+	 * Random zone, or sequential zone written exactly at its
+	 * write pointer: write the data zone directly.
+	 */
+	return dmz_handle_direct_write(dmz, zone, bio, chunk_block, nr_blocks);
+}
+
+/*
+ * Process a discard BIO: invalidate the discarded blocks in the data
+ * zone of the chunk and in its buffer zone, if it has one.
+ *
+ * Returns 0 on success (including for unmapped chunks), -EROFS if the
+ * data zone is read-only, or the error of the block invalidation.
+ */
+static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
+			      struct bio *bio)
+{
+	struct dmz_metadata *zmd = dmz->metadata;
+	unsigned int nr_blocks = dmz_bio_blocks(bio);
+	sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
+	int err;
+
+	/* Nothing is mapped: nothing to invalidate */
+	if (!zone)
+		return 0;
+
+	if (dmz_is_readonly(zone))
+		return -EROFS;
+
+	dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
+		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
+		      dmz_id(zmd, zone),
+		      (unsigned long long)chunk_block, nr_blocks);
+
+	/* Data zone: a sequential zone has nothing at or above its wp */
+	if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) {
+		err = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
+		if (err)
+			return err;
+	}
+
+	/* Buffer zone, if one is mapped */
+	if (!zone->bzone)
+		return 0;
+
+	return dmz_invalidate_blocks(zmd, zone->bzone, chunk_block, nr_blocks);
+}
+
+/*
+ * Process a BIO.
+ *
+ * With the metadata locked, get the data zone mapping the chunk of the
+ * BIO, execute the operation on that zone, release the mapping and
+ * complete this processing stage of the BIO with the resulting status.
+ * Called from the chunk work (dmz_chunk_work()).
+ */
+static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
+			   struct bio *bio)
+{
+	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+	struct dmz_metadata *zmd = dmz->metadata;
+	struct dm_zone *zone;
+	int ret;
+
+	/*
+	 * Write may trigger a zone allocation. So make sure the
+	 * allocation can succeed.
+	 */
+	if (bio_op(bio) == REQ_OP_WRITE)
+		dmz_schedule_reclaim(dmz->reclaim);
+
+	dmz_lock_metadata(zmd);
+
+	/*
+	 * Get the data zone mapping the chunk. There may be no
+	 * mapping for read and discard. If a mapping is obtained,
+	 * the zone returned will be set to active state.
+	 */
+	zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(dmz->dev, bio),
+				     bio_op(bio));
+	if (IS_ERR(zone)) {
+		ret = PTR_ERR(zone);
+		goto out;
+	}
+
+	/* Process the BIO */
+	if (zone) {
+		/* The zone is deactivated in dmz_end_io() */
+		dmz_activate_zone(zone);
+		bioctx->zone = zone;
+	}
+
+	switch (bio_op(bio)) {
+	case REQ_OP_READ:
+		ret = dmz_handle_read(dmz, zone, bio);
+		break;
+	case REQ_OP_WRITE:
+		ret = dmz_handle_write(dmz, zone, bio);
+		break;
+	case REQ_OP_DISCARD:
+	case REQ_OP_WRITE_ZEROES:
+		ret = dmz_handle_discard(dmz, zone, bio);
+		break;
+	default:
+		dmz_dev_err(dmz->dev, "Unsupported BIO operation 0x%x",
+			    bio_op(bio));
+		ret = -EIO;
+	}
+
+	/*
+	 * Release the chunk mapping. This will check that the mapping
+	 * is still valid, that is, that the zone used still has valid blocks.
+	 */
+	if (zone)
+		dmz_put_chunk_mapping(zmd, zone);
+out:
+	/* Drops the initial context reference taken in dmz_map() */
+	dmz_bio_endio(bio, errno_to_blk_status(ret));
+
+	dmz_unlock_metadata(zmd);
+}
+
+/*
+ * Increment a chunk work reference counter.
+ * One reference is taken for each BIO added to the chunk work and one
+ * when the work gets queued (see dmz_queue_chunk_work()).
+ */
+static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
+{
+	atomic_inc(&cw->refcount);
+}
+
+/*
+ * Decrement a chunk work reference count and free it if it becomes 0.
+ * On the last put, the chunk work is also removed from the target
+ * chunk radix tree and must not be used anymore by the caller.
+ * NOTE(review): the only caller visible here (dmz_chunk_work()) holds
+ * the target chunk_lock - presumably required for the radix tree update.
+ */
+static void dmz_put_chunk_work(struct dm_chunk_work *cw)
+{
+	if (atomic_dec_and_test(&cw->refcount)) {
+		/* No BIO should be left behind: each one holds a reference */
+		WARN_ON(!bio_list_empty(&cw->bio_list));
+		radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
+		kfree(cw);
+	}
+}
+
+/*
+ * Chunk BIO work function: process, one at a time and in queueing
+ * order, all the BIOs pending on the chunk work, then drop the
+ * reference held by the queued work.
+ */
+static void dmz_chunk_work(struct work_struct *work)
+{
+	struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
+	struct dmz_target *dmz = cw->target;
+	struct bio *bio;
+
+	mutex_lock(&dmz->chunk_lock);
+
+	/* Process the chunk BIOs */
+	while ((bio = bio_list_pop(&cw->bio_list))) {
+		/* The lock only covers the BIO list, not the BIO processing */
+		mutex_unlock(&dmz->chunk_lock);
+		dmz_handle_bio(dmz, cw, bio);
+		mutex_lock(&dmz->chunk_lock);
+		/* Drop the reference taken when the BIO was queued */
+		dmz_put_chunk_work(cw);
+	}
+
+	/* Queueing the work incremented the work refcount */
+	dmz_put_chunk_work(cw);
+
+	mutex_unlock(&dmz->chunk_lock);
+}
+
+/*
+ * Flush work: write out the dirty metadata, complete every flush BIO
+ * waiting on the flush list with the result of that flush, then re-arm
+ * the work for the next periodic flush.
+ */
+static void dmz_flush_work(struct work_struct *work)
+{
+	struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
+	struct bio *flush_bio;
+	int err;
+
+	/* Flush dirty metadata blocks */
+	err = dmz_flush_metadata(dmz->metadata);
+
+	/* Drain the queued flush requests, one lock round trip per BIO */
+	do {
+		spin_lock(&dmz->flush_lock);
+		flush_bio = bio_list_pop(&dmz->flush_list);
+		spin_unlock(&dmz->flush_lock);
+
+		if (flush_bio)
+			dmz_bio_endio(flush_bio, errno_to_blk_status(err));
+	} while (flush_bio);
+
+	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
+}
+
+/*
+ * Get a chunk work and start it to process a new BIO.
+ * If the BIO chunk has no work yet, create one.
+ *
+ * If the chunk work cannot be created or registered, the BIO is
+ * completed here with an error: the caller reports the BIO as
+ * submitted, so leaving it unqueued would make the I/O hang forever.
+ */
+static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
+{
+	unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
+	struct dm_chunk_work *cw;
+	int ret = 0;
+
+	mutex_lock(&dmz->chunk_lock);
+
+	/* Get the BIO chunk work. If one is not active yet, create one */
+	cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
+	if (!cw) {
+		/*
+		 * Create a new chunk work. This is the block I/O path:
+		 * the allocation must not recurse into I/O (GFP_NOIO).
+		 */
+		cw = kmalloc(sizeof(*cw), GFP_NOIO);
+		if (!cw) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		INIT_WORK(&cw->work, dmz_chunk_work);
+		atomic_set(&cw->refcount, 0);
+		cw->target = dmz;
+		cw->chunk = chunk;
+		bio_list_init(&cw->bio_list);
+
+		ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
+		if (unlikely(ret)) {
+			kfree(cw);
+			goto out;
+		}
+	}
+
+	/* Each queued BIO holds one reference on the chunk work */
+	bio_list_add(&cw->bio_list, bio);
+	dmz_get_chunk_work(cw);
+
+	/* A newly queued work holds one more reference */
+	if (queue_work(dmz->chunk_wq, &cw->work))
+		dmz_get_chunk_work(cw);
+out:
+	mutex_unlock(&dmz->chunk_lock);
+
+	if (ret) {
+		/*
+		 * Fail the BIO, outside of the chunk lock. The status is
+		 * also set in the BIO itself so that the error is seen by
+		 * the completion path whichever way it reads it.
+		 */
+		bio->bi_status = errno_to_blk_status(ret);
+		dmz_bio_endio(bio, bio->bi_status);
+	}
+}
+
+/*
+ * Process a new BIO (device-mapper map callback).
+ *
+ * - Empty BIOs that are neither flushes nor writes are passed through
+ *   to the backing device (DM_MAPIO_REMAPPED).
+ * - BIOs not aligned on the 4KB block size are rejected (DM_MAPIO_KILL).
+ * - Flushes and empty writes are added to the flush list and the flush
+ *   work is started immediately.
+ * - All other BIOs are limited to the zone they start in and handed to
+ *   the work of their chunk.
+ *
+ * In the last two cases, the BIO is completed later by the target
+ * (DM_MAPIO_SUBMITTED).
+ */
+static int dmz_map(struct dm_target *ti, struct bio *bio)
+{
+	struct dmz_target *dmz = ti->private;
+	struct dmz_dev *dev = dmz->dev;
+	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+	sector_t sector = bio->bi_iter.bi_sector;
+	unsigned int nr_sectors = bio_sectors(bio);
+	sector_t chunk_sector;
+
+	dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
+		      bio_op(bio), (unsigned long long)sector, nr_sectors,
+		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
+		      (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)),
+		      (unsigned int)dmz_bio_blocks(bio));
+
+	bio->bi_bdev = dev->bdev;
+
+	if (!nr_sectors && (bio_op(bio) != REQ_OP_FLUSH) && (bio_op(bio) != REQ_OP_WRITE))
+		return DM_MAPIO_REMAPPED;
+
+	/* The BIO should be block aligned */
+	if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
+		return DM_MAPIO_KILL;
+
+	/* Initialize the BIO context */
+	bioctx->target = dmz;
+	bioctx->zone = NULL;
+	bioctx->bio = bio;
+	/* Initial reference, dropped once the BIO has been processed */
+	atomic_set(&bioctx->ref, 1);
+	bioctx->status = BLK_STS_OK;
+
+	/* Set the BIO pending in the flush list */
+	if (bio_op(bio) == REQ_OP_FLUSH || (!nr_sectors && bio_op(bio) == REQ_OP_WRITE)) {
+		spin_lock(&dmz->flush_lock);
+		bio_list_add(&dmz->flush_list, bio);
+		spin_unlock(&dmz->flush_lock);
+		mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	/*
+	 * Split zone BIOs to fit entirely into a zone.
+	 * The mask below relies on the zone size being a power of 2.
+	 */
+	chunk_sector = sector & (dev->zone_nr_sectors - 1);
+	if (chunk_sector + nr_sectors > dev->zone_nr_sectors)
+		dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
+
+	/* Now ready to handle this BIO */
+	dmz_reclaim_bio_acc(dmz->reclaim);
+	dmz_queue_chunk_work(dmz, bio);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * Completed target BIO processing (device-mapper end_io callback).
+ *
+ * Called once for each completion signaled on the target BIO. The first
+ * error seen is recorded in the BIO context. As long as references
+ * remain on the context, DM_ENDIO_INCOMPLETE is returned and the BIO
+ * stays owned by the target. On the last completion, the zone used by
+ * the BIO is deactivated and the recorded status is reported back
+ * through @error, so that an error raised by an earlier completion or
+ * by the BIO processing itself is not lost when the final completion
+ * carries no error.
+ */
+static int dmz_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
+{
+	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+
+	if (bioctx->status == BLK_STS_OK && *error)
+		bioctx->status = *error;
+
+	if (!atomic_dec_and_test(&bioctx->ref))
+		return DM_ENDIO_INCOMPLETE;
+
+	/* Done */
+	bio->bi_status = bioctx->status;
+
+	if (bioctx->zone) {
+		struct dm_zone *zone = bioctx->zone;
+
+		/* Flag sequential zones hit by a failed write */
+		if (*error && bio_op(bio) == REQ_OP_WRITE) {
+			if (dmz_is_seq(zone))
+				set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
+		}
+		dmz_deactivate_zone(zone);
+	}
+
+	/* Hand the accumulated status back to the caller */
+	*error = bioctx->status;
+
+	return DM_ENDIO_DONE;
+}
+
+/*
+ * Get zoned device information.
+ *
+ * Open the backing device designated by @path and allocate and fill the
+ * device descriptor of the target (capacity, zone size, number of
+ * zones). The device must be zoned and the target must map the device
+ * entirely, starting from sector 0.
+ *
+ * Returns 0 on success, with dmz->ddev and dmz->dev set. On failure, a
+ * negative error code is returned and the device is released.
+ */
+static int dmz_get_zoned_device(struct dm_target *ti, char *path)
+{
+	struct dmz_target *dmz = ti->private;
+	struct request_queue *q;
+	struct dmz_dev *dev;
+	int ret;
+
+	/* Get the target device */
+	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev);
+	if (ret) {
+		ti->error = "Get target device failed";
+		dmz->ddev = NULL;
+		return ret;
+	}
+
+	dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL);
+	if (!dev) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	dev->bdev = dmz->ddev->bdev;
+	(void)bdevname(dev->bdev, dev->name);
+
+	if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) {
+		ti->error = "Not a zoned block device";
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* Device capacity in 512B sectors */
+	dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+	if (ti->begin || (ti->len != dev->capacity)) {
+		ti->error = "Partial mapping not supported";
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* The zone size is taken from the chunk_sectors limit of the device queue */
+	q = bdev_get_queue(dev->bdev);
+	dev->zone_nr_sectors = q->limits.chunk_sectors;
+	dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors);
+
+	dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
+	dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
+
+	/* Round up so that a smaller last zone is counted too */
+	dev->nr_zones = (dev->capacity + dev->zone_nr_sectors - 1)
+		>> dev->zone_nr_sectors_shift;
+
+	dmz->dev = dev;
+
+	return 0;
+err:
+	/* dev may be NULL here: kfree() handles it */
+	dm_put_device(ti, dmz->ddev);
+	kfree(dev);
+
+	return ret;
+}
+
+/*
+ * Cleanup zoned device information: release the backing device
+ * obtained in dmz_get_zoned_device() and free the device descriptor.
+ */
+static void dmz_put_zoned_device(struct dm_target *ti)
+{
+	struct dmz_target *dmz = ti->private;
+
+	dm_put_device(ti, dmz->ddev);
+	kfree(dmz->dev);
+	dmz->dev = NULL;
+}
+
+/*
+ * Setup target (device-mapper constructor).
+ *
+ * The only argument is the path of the zoned block device to use. The
+ * constructor gets the device, initializes the metadata, sets up the
+ * target parameters and creates the resources used for BIO processing
+ * (BIO set, chunk workqueue, flush work) and for zone reclaim.
+ *
+ * Returns 0 on success. On failure, a negative error code is returned
+ * and everything set up so far is released, in reverse order.
+ */
+static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct dmz_target *dmz;
+	struct dmz_dev *dev;
+	int ret;
+
+	/* Check arguments */
+	if (argc != 1) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	/* Allocate and initialize the target descriptor */
+	dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
+	if (!dmz) {
+		ti->error = "Unable to allocate the zoned target descriptor";
+		return -ENOMEM;
+	}
+	ti->private = dmz;
+
+	/* Get the target zoned block device */
+	ret = dmz_get_zoned_device(ti, argv[0]);
+	if (ret) {
+		dmz->ddev = NULL;
+		goto err;
+	}
+
+	/* Initialize metadata */
+	dev = dmz->dev;
+	ret = dmz_ctr_metadata(dev, &dmz->metadata);
+	if (ret) {
+		ti->error = "Metadata initialization failed";
+		goto err_dev;
+	}
+
+	/* Set target (no write same support) */
+	ti->max_io_len = dev->zone_nr_sectors << 9;
+	ti->num_flush_bios = 1;
+	ti->num_discard_bios = 1;
+	ti->num_write_zeroes_bios = 1;
+	ti->per_io_data_size = sizeof(struct dmz_bioctx);
+	ti->flush_supported = true;
+	ti->discards_supported = true;
+	ti->split_discard_bios = true;
+
+	/* The exposed capacity is the number of chunks that can be mapped */
+	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
+
+	/* Zone BIO */
+	dmz->bio_set = bioset_create(DMZ_MIN_BIOS, 0, 0);
+	if (!dmz->bio_set) {
+		ti->error = "Create BIO set failed";
+		ret = -ENOMEM;
+		goto err_meta;
+	}
+
+	/*
+	 * Chunk BIO work. The radix tree is updated from the block I/O
+	 * path: its node allocations must not recurse into I/O (GFP_NOIO).
+	 */
+	mutex_init(&dmz->chunk_lock);
+	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
+	dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
+					0, dev->name);
+	if (!dmz->chunk_wq) {
+		ti->error = "Create chunk workqueue failed";
+		ret = -ENOMEM;
+		goto err_bio;
+	}
+
+	/* Flush work */
+	spin_lock_init(&dmz->flush_lock);
+	bio_list_init(&dmz->flush_list);
+	INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
+	dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
+						dev->name);
+	if (!dmz->flush_wq) {
+		ti->error = "Create flush workqueue failed";
+		ret = -ENOMEM;
+		goto err_cwq;
+	}
+	mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
+
+	/* Initialize reclaim */
+	ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim);
+	if (ret) {
+		ti->error = "Zone reclaim initialization failed";
+		goto err_fwq;
+	}
+
+	dmz_dev_info(dev, "Target device: %llu 512-byte logical sectors (%llu blocks)",
+		     (unsigned long long)ti->len,
+		     (unsigned long long)dmz_sect2blk(ti->len));
+
+	return 0;
+err_fwq:
+	/*
+	 * The flush work has been armed above: cancel it before destroying
+	 * its workqueue, or its timer could still fire once the workqueue
+	 * and the target descriptor are gone.
+	 */
+	cancel_delayed_work_sync(&dmz->flush_work);
+	destroy_workqueue(dmz->flush_wq);
+err_cwq:
+	destroy_workqueue(dmz->chunk_wq);
+err_bio:
+	bioset_free(dmz->bio_set);
+err_meta:
+	dmz_dtr_metadata(dmz->metadata);
+err_dev:
+	dmz_put_zoned_device(ti);
+err:
+	kfree(dmz);
+
+	return ret;
+}
+
+/*
+ * Cleanup target (device-mapper destructor).
+ *
+ * The chunk works, reclaim and the periodic flush are stopped first.
+ * The metadata is then flushed one last time before everything
+ * allocated by dmz_ctr() is released.
+ */
+static void dmz_dtr(struct dm_target *ti)
+{
+	struct dmz_target *dmz = ti->private;
+
+	/* Wait for the queued chunk works to complete */
+	flush_workqueue(dmz->chunk_wq);
+	destroy_workqueue(dmz->chunk_wq);
+
+	dmz_dtr_reclaim(dmz->reclaim);
+
+	/* The flush work re-arms itself: cancel it before its workqueue goes */
+	cancel_delayed_work_sync(&dmz->flush_work);
+	destroy_workqueue(dmz->flush_wq);
+
+	/* Final metadata flush: its result is deliberately ignored */
+	(void) dmz_flush_metadata(dmz->metadata);
+
+	dmz_dtr_metadata(dmz->metadata);
+
+	bioset_free(dmz->bio_set);
+
+	dmz_put_zoned_device(ti);
+
+	kfree(dmz);
+}
+
+/*
+ * Setup target request queue limits (device-mapper io_hints callback).
+ *
+ * The target is exposed with 4KB logical and physical blocks, with
+ * discard and write zeroes requests limited to one zone and with
+ * no zone model, that is, as a regular block device.
+ */
+static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct dmz_target *dmz = ti->private;
+	unsigned int chunk_sectors = dmz->dev->zone_nr_sectors;
+
+	limits->logical_block_size = DMZ_BLOCK_SIZE;
+	limits->physical_block_size = DMZ_BLOCK_SIZE;
+
+	blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
+	blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);
+
+	/* Discards work on 4KB blocks and never span more than one zone */
+	limits->discard_alignment = DMZ_BLOCK_SIZE;
+	limits->discard_granularity = DMZ_BLOCK_SIZE;
+	limits->max_discard_sectors = chunk_sectors;
+	limits->max_hw_discard_sectors = chunk_sectors;
+	limits->max_write_zeroes_sectors = chunk_sectors;
+
+	/* FS hint to try to align to the device zone size */
+	limits->chunk_sectors = chunk_sectors;
+	limits->max_sectors = chunk_sectors;
+
+	/* We are exposing a drive-managed zoned block device */
+	limits->zoned = BLK_ZONED_NONE;
+}
+
+/*
+ * Pass on ioctl to the backend device: the backing zoned block device
+ * is returned in @bdev. @mode is left untouched.
+ * Always returns 0.
+ */
+static int dmz_prepare_ioctl(struct dm_target *ti,
+			     struct block_device **bdev, fmode_t *mode)
+{
+	struct dmz_target *dmz = ti->private;
+
+	*bdev = dmz->dev->bdev;
+
+	return 0;
+}
+
+/*
+ * Stop works on suspend (device-mapper postsuspend callback): wait for
+ * the queued chunk works to complete, then stop reclaim and the
+ * periodic flush work.
+ */
+static void dmz_suspend(struct dm_target *ti)
+{
+	struct dmz_target *dmz = ti->private;
+
+	flush_workqueue(dmz->chunk_wq);
+	dmz_suspend_reclaim(dmz->reclaim);
+	cancel_delayed_work_sync(&dmz->flush_work);
+}
+
+/*
+ * Restart works on resume or if suspend failed: re-arm the periodic
+ * flush work and the reclaim work stopped by dmz_suspend().
+ */
+static void dmz_resume(struct dm_target *ti)
+{
+	struct dmz_target *dmz = ti->private;
+
+	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
+	dmz_resume_reclaim(dmz->reclaim);
+}
+
+/*
+ * Call @fn on the single backing device of the target, for its whole
+ * capacity (sectors 0 to dev->capacity), and return the result of @fn.
+ */
+static int dmz_iterate_devices(struct dm_target *ti,
+			       iterate_devices_callout_fn fn, void *data)
+{
+	struct dmz_target *dmz = ti->private;
+
+	return fn(ti, dmz->ddev, 0, dmz->dev->capacity, data);
+}
+
+/*
+ * The "zoned" device-mapper target type, registered by dmz_init()
+ * and unregistered by dmz_exit().
+ */
+static struct target_type dmz_type = {
+	.name		 = "zoned",
+	.version	 = {1, 0, 0},
+	.features	 = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
+	.module		 = THIS_MODULE,
+	.ctr		 = dmz_ctr,
+	.dtr		 = dmz_dtr,
+	.map		 = dmz_map,
+	.end_io		 = dmz_end_io,
+	.io_hints	 = dmz_io_hints,
+	.prepare_ioctl	 = dmz_prepare_ioctl,
+	.postsuspend	 = dmz_suspend,
+	.resume		 = dmz_resume,
+	.iterate_devices = dmz_iterate_devices,
+};
+
+/*
+ * Module initialization: register the "zoned" target type.
+ * Returns the result of dm_register_target().
+ */
+static int __init dmz_init(void)
+{
+	return dm_register_target(&dmz_type);
+}
+
+/*
+ * Module exit: unregister the "zoned" target type.
+ */
+static void __exit dmz_exit(void)
+{
+	dm_unregister_target(&dmz_type);
+}
+
+module_init(dmz_init);
+module_exit(dmz_exit);
+
+MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
+MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
new file mode 100644
index 000000000000..12419f0bfe78
--- /dev/null
+++ b/drivers/md/dm-zoned.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_ZONED_H
+#define DM_ZONED_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/radix-tree.h>
+#include <linux/shrinker.h>
+
+/*
+ * dm-zoned creates block devices with 4KB blocks, always.
+ */
+#define DMZ_BLOCK_SHIFT		12
+#define DMZ_BLOCK_SIZE		(1 << DMZ_BLOCK_SHIFT)
+#define DMZ_BLOCK_MASK		(DMZ_BLOCK_SIZE - 1)
+
+/* Block size expressed in bits (8 bits per byte) */
+#define DMZ_BLOCK_SHIFT_BITS	(DMZ_BLOCK_SHIFT + 3)
+#define DMZ_BLOCK_SIZE_BITS	(1 << DMZ_BLOCK_SHIFT_BITS)
+#define DMZ_BLOCK_MASK_BITS	(DMZ_BLOCK_SIZE_BITS - 1)
+
+/* Block size expressed in 512B sectors */
+#define DMZ_BLOCK_SECTORS_SHIFT	(DMZ_BLOCK_SHIFT - SECTOR_SHIFT)
+#define DMZ_BLOCK_SECTORS	(DMZ_BLOCK_SIZE >> SECTOR_SHIFT)
+#define DMZ_BLOCK_SECTORS_MASK	(DMZ_BLOCK_SECTORS - 1)
+
+/*
+ * 4KB block <-> 512B sector conversion.
+ * dmz_sect2blk() rounds down to the block containing the sector.
+ */
+#define dmz_blk2sect(b)		((sector_t)(b) << DMZ_BLOCK_SECTORS_SHIFT)
+#define dmz_sect2blk(s)		((sector_t)(s) >> DMZ_BLOCK_SECTORS_SHIFT)
+
+/* First block of a BIO and number of blocks of a BIO */
+#define dmz_bio_block(bio)	dmz_sect2blk((bio)->bi_iter.bi_sector)
+#define dmz_bio_blocks(bio)	dmz_sect2blk(bio_sectors(bio))
+
+/*
+ * Zoned block device information.
+ * Filled when the target is created (see dmz_get_zoned_device()).
+ */
+struct dmz_dev {
+	/* The backing block device */
+	struct block_device	*bdev;
+
+	/* Name of the backing block device, used to prefix messages */
+	char			name[BDEVNAME_SIZE];
+
+	/* Device capacity in 512B sectors */
+	sector_t		capacity;
+
+	/* Number of zones of the device, a smaller last zone included */
+	unsigned int		nr_zones;
+
+	/* Zone size in 512B sectors and its log2 */
+	sector_t		zone_nr_sectors;
+	unsigned int		zone_nr_sectors_shift;
+
+	/* Zone size in 4KB blocks and its log2 */
+	sector_t		zone_nr_blocks;
+	sector_t		zone_nr_blocks_shift;
+};
+
+/*
+ * dmz_bio_chunk(): number of the chunk containing the first sector of a BIO.
+ * dmz_chunk_block(): offset of block @b within its chunk. The mask used
+ * relies on the number of blocks per zone being a power of 2.
+ */
+#define dmz_bio_chunk(dev, bio)	((bio)->bi_iter.bi_sector >> \
+				 (dev)->zone_nr_sectors_shift)
+#define dmz_chunk_block(dev, b)	((b) & ((dev)->zone_nr_blocks - 1))
+
+/*
+ * Zone descriptor: in-memory state of one zone of the backing device.
+ */
+struct dm_zone {
+	/* For listing the zone depending on its state */
+	struct list_head	link;
+
+	/* Zone type and state (bit numbers defined by the zone flags enum) */
+	unsigned long		flags;
+
+	/* Zone activation reference count */
+	atomic_t		refcount;
+
+	/* Zone write pointer block (relative to the zone start block) */
+	unsigned int		wp_block;
+
+	/* Zone weight (number of valid blocks in the zone) */
+	unsigned int		weight;
+
+	/* The chunk that the zone maps */
+	unsigned int		chunk;
+
+	/*
+	 * For a sequential data zone, pointer to the random zone
+	 * used as a buffer for processing unaligned writes.
+	 * For a buffer zone, this points back to the data zone.
+	 */
+	struct dm_zone		*bzone;
+};
+
+/*
+ * Zone flags.
+ * These are bit numbers within the flags field of a zone descriptor,
+ * for use with the bit operations (test_bit(), set_bit(), ...).
+ */
+enum {
+	/* Zone write type */
+	DMZ_RND,
+	DMZ_SEQ,
+
+	/* Zone critical condition */
+	DMZ_OFFLINE,
+	DMZ_READ_ONLY,
+
+	/* How the zone is being used */
+	DMZ_META,
+	DMZ_DATA,
+	DMZ_BUF,
+
+	/* Zone internal state */
+	DMZ_ACTIVE,
+	DMZ_RECLAIM,
+	DMZ_SEQ_WRITE_ERR,
+};
+
+/*
+ * Zone data accessors.
+ * Each dmz_is_xxx()/dmz_in_xxx() macro tests the corresponding zone
+ * flag of zone descriptor @z, except dmz_is_empty() which checks that
+ * the zone write pointer is at the start of the zone.
+ */
+#define dmz_is_rnd(z)		test_bit(DMZ_RND, &(z)->flags)
+#define dmz_is_seq(z)		test_bit(DMZ_SEQ, &(z)->flags)
+#define dmz_is_empty(z)		((z)->wp_block == 0)
+#define dmz_is_offline(z)	test_bit(DMZ_OFFLINE, &(z)->flags)
+#define dmz_is_readonly(z)	test_bit(DMZ_READ_ONLY, &(z)->flags)
+#define dmz_is_active(z)	test_bit(DMZ_ACTIVE, &(z)->flags)
+#define dmz_in_reclaim(z)	test_bit(DMZ_RECLAIM, &(z)->flags)
+#define dmz_seq_write_err(z)	test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags)
+
+#define dmz_is_meta(z)		test_bit(DMZ_META, &(z)->flags)
+#define dmz_is_buf(z)		test_bit(DMZ_BUF, &(z)->flags)
+#define dmz_is_data(z)		test_bit(DMZ_DATA, &(z)->flags)
+
+/* Number of valid blocks in the zone */
+#define dmz_weight(z)		((z)->weight)
+
+/*
+ * Message functions.
+ * Wrappers around the device-mapper message macros that prefix each
+ * message with the name of the backing device, as "(name): ".
+ */
+#define dmz_dev_info(dev, format, args...)	\
+	DMINFO("(%s): " format, (dev)->name, ## args)
+
+#define dmz_dev_err(dev, format, args...)	\
+	DMERR("(%s): " format, (dev)->name, ## args)
+
+#define dmz_dev_warn(dev, format, args...)	\
+	DMWARN("(%s): " format, (dev)->name, ## args)
+
+#define dmz_dev_debug(dev, format, args...)	\
+	DMDEBUG("(%s): " format, (dev)->name, ## args)
+
+struct dmz_metadata;
+struct dmz_reclaim;
+
+/*
+ * Functions defined in dm-zoned-metadata.c
+ */
+int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd);
+void dmz_dtr_metadata(struct dmz_metadata *zmd);
+int dmz_resume_metadata(struct dmz_metadata *zmd);
+
+void dmz_lock_map(struct dmz_metadata *zmd);
+void dmz_unlock_map(struct dmz_metadata *zmd);
+void dmz_lock_metadata(struct dmz_metadata *zmd);
+void dmz_unlock_metadata(struct dmz_metadata *zmd);
+void dmz_lock_flush(struct dmz_metadata *zmd);
+void dmz_unlock_flush(struct dmz_metadata *zmd);
+int dmz_flush_metadata(struct dmz_metadata *zmd);
+
+unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone);
+sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone);
+sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone);
+unsigned int dmz_nr_chunks(struct dmz_metadata *zmd);
+
+#define DMZ_ALLOC_RND		0x01
+#define DMZ_ALLOC_RECLAIM	0x02
+
+struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags);
+void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
+
+void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
+		  unsigned int chunk);
+void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
+unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd);
+unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd);
+
+void dmz_activate_zone(struct dm_zone *zone);
+void dmz_deactivate_zone(struct dm_zone *zone);
+
+int dmz_lock_zone_reclaim(struct dm_zone *zone);
+void dmz_unlock_zone_reclaim(struct dm_zone *zone);
+struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd);
+
+struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd,
+				      unsigned int chunk, int op);
+void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *zone);
+struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
+				     struct dm_zone *dzone);
+
+int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
+			sector_t chunk_block, unsigned int nr_blocks);
+int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
+			  sector_t chunk_block, unsigned int nr_blocks);
+int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
+		    sector_t chunk_block);
+int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
+			  sector_t *chunk_block);
+int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
+			  struct dm_zone *to_zone);
+int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
+			   struct dm_zone *to_zone, sector_t chunk_block);
+
+/*
+ * Functions defined in dm-zoned-reclaim.c
+ */
+int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
+		    struct dmz_reclaim **zrc);
+void dmz_dtr_reclaim(struct dmz_reclaim *zrc);
+void dmz_suspend_reclaim(struct dmz_reclaim *zrc);
+void dmz_resume_reclaim(struct dmz_reclaim *zrc);
+void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
+void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
+
+#endif /* DM_ZONED_H */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6ef9500226c0..10cabe961bdb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
 #include <linux/dax.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/uio.h>
 #include <linux/hdreg.h>
 #include <linux/delay.h>
 #include <linux/wait.h>
@@ -58,12 +59,15 @@ static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
 
 static struct workqueue_struct *deferred_remove_workqueue;
 
+atomic_t dm_global_event_nr = ATOMIC_INIT(0);
+DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
+
 /*
  * One of these is allocated per bio.
  */
 struct dm_io {
 	struct mapped_device *md;
-	int error;
+	blk_status_t status;
 	atomic_t io_count;
 	struct bio *bio;
 	unsigned long start_time;
@@ -768,23 +772,24 @@ static int __noflush_suspending(struct mapped_device *md)
  * Decrements the number of outstanding ios that a bio has been
  * cloned into, completing the original io if necc.
  */
-static void dec_pending(struct dm_io *io, int error)
+static void dec_pending(struct dm_io *io, blk_status_t error)
 {
 	unsigned long flags;
-	int io_error;
+	blk_status_t io_error;
 	struct bio *bio;
 	struct mapped_device *md = io->md;
 
 	/* Push-back supersedes any I/O errors */
 	if (unlikely(error)) {
 		spin_lock_irqsave(&io->endio_lock, flags);
-		if (!(io->error > 0 && __noflush_suspending(md)))
-			io->error = error;
+		if (!(io->status == BLK_STS_DM_REQUEUE &&
+				__noflush_suspending(md)))
+			io->status = error;
 		spin_unlock_irqrestore(&io->endio_lock, flags);
 	}
 
 	if (atomic_dec_and_test(&io->io_count)) {
-		if (io->error == DM_ENDIO_REQUEUE) {
+		if (io->status == BLK_STS_DM_REQUEUE) {
 			/*
 			 * Target requested pushing back the I/O.
 			 */
@@ -793,16 +798,16 @@ static void dec_pending(struct dm_io *io, int error)
 				bio_list_add_head(&md->deferred, io->bio);
 			else
 				/* noflush suspend was interrupted. */
-				io->error = -EIO;
+				io->status = BLK_STS_IOERR;
 			spin_unlock_irqrestore(&md->deferred_lock, flags);
 		}
 
-		io_error = io->error;
+		io_error = io->status;
 		bio = io->bio;
 		end_io_acct(io);
 		free_io(md, io);
 
-		if (io_error == DM_ENDIO_REQUEUE)
+		if (io_error == BLK_STS_DM_REQUEUE)
 			return;
 
 		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
@@ -814,7 +819,7 @@ static void dec_pending(struct dm_io *io, int error)
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
-			bio->bi_error = io_error;
+			bio->bi_status = io_error;
 			bio_endio(bio);
 		}
 	}
@@ -838,31 +843,13 @@ void disable_write_zeroes(struct mapped_device *md)
 
 static void clone_endio(struct bio *bio)
 {
-	int error = bio->bi_error;
-	int r = error;
+	blk_status_t error = bio->bi_status;
 	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 	struct dm_io *io = tio->io;
 	struct mapped_device *md = tio->io->md;
 	dm_endio_fn endio = tio->ti->type->end_io;
 
-	if (endio) {
-		r = endio(tio->ti, bio, error);
-		if (r < 0 || r == DM_ENDIO_REQUEUE)
-			/*
-			 * error and requeue request are handled
-			 * in dec_pending().
-			 */
-			error = r;
-		else if (r == DM_ENDIO_INCOMPLETE)
-			/* The target will handle the io */
-			return;
-		else if (r) {
-			DMWARN("unimplemented target endio return value: %d", r);
-			BUG();
-		}
-	}
-
-	if (unlikely(r == -EREMOTEIO)) {
+	if (unlikely(error == BLK_STS_TARGET)) {
 		if (bio_op(bio) == REQ_OP_WRITE_SAME &&
 		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
 			disable_write_same(md);
@@ -871,6 +858,23 @@ static void clone_endio(struct bio *bio)
 			disable_write_zeroes(md);
 	}
 
+	if (endio) {
+		int r = endio(tio->ti, bio, &error);
+		switch (r) {
+		case DM_ENDIO_REQUEUE:
+			error = BLK_STS_DM_REQUEUE;
+			/*FALLTHRU*/
+		case DM_ENDIO_DONE:
+			break;
+		case DM_ENDIO_INCOMPLETE:
+			/* The target will handle the io */
+			return;
+		default:
+			DMWARN("unimplemented target endio return value: %d", r);
+			BUG();
+		}
+	}
+
 	free_tio(tio);
 	dec_pending(io, error);
 }
@@ -969,6 +973,48 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 	return ret;
 }
 
+static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	struct mapped_device *md = dax_get_private(dax_dev);
+	sector_t sector = pgoff * PAGE_SECTORS;
+	struct dm_target *ti;
+	size_t ret = 0;	/* matches return type; stays 0 without a live target */
+	int srcu_idx;
+
+	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+	if (!ti)
+		goto out;
+	if (!ti->type->dax_copy_from_iter) {
+		ret = copy_from_iter(addr, bytes, i);	/* no target hook */
+		goto out;
+	}
+	ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
+ out:
+	dm_put_live_table(md, srcu_idx);	/* reached on every path */
+
+	return ret;
+}
+
+static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+		size_t size)
+{
+	struct mapped_device *md = dax_get_private(dax_dev);
+	sector_t sector = pgoff * PAGE_SECTORS;	/* page offset -> sector */
+	struct dm_target *ti;
+	int srcu_idx;
+
+	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+	if (!ti)
+		goto out;	/* no target found: nothing to flush */
+	if (ti->type->dax_flush)	/* the hook is optional */
+		ti->type->dax_flush(ti, pgoff, addr, size);
+ out:
+	dm_put_live_table(md, srcu_idx);	/* reached on every path */
+}
+
 /*
  * A target may call dm_accept_partial_bio only from the map routine.  It is
  * allowed for all bio types except REQ_PREFLUSH.
@@ -1010,6 +1056,85 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
 
 /*
+ * The zone descriptors obtained with a zone report indicate
+ * zone positions within the target device. The zone descriptors
+ * must be remapped to match their position within the dm device.
+ * A target may call dm_remap_zone_report after completion of a
+ * REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained
+ * from the target device mapping to the dm device.
+ */
+void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
+{
+#ifdef CONFIG_BLK_DEV_ZONED
+	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+	struct bio *report_bio = tio->io->bio;	/* bio held by the dm_io */
+	struct blk_zone_report_hdr *hdr = NULL;
+	struct blk_zone *zone;
+	unsigned int nr_rep = 0;	/* zone descriptors remapped so far */
+	unsigned int ofst;	/* byte offset into the mapped page */
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	void *addr;
+
+	if (bio->bi_status)
+		return;	/* clone carries an error status: leave data alone */
+
+	/*
+	 * Walk every segment of report_bio and remap the start sector of
+	 * each reported zone. For non-conventional zones, also remap the
+	 * write pointer position.
+	 */
+	bio_for_each_segment(bvec, report_bio, iter) {
+		addr = kmap_atomic(bvec.bv_page);
+
+		/* First page: keep it as the header, descriptors follow it */
+		if (!hdr) {
+			hdr = addr;
+			ofst = sizeof(struct blk_zone_report_hdr);
+		} else
+			ofst = 0;
+
+		/* Remap descriptors until one starts past the target end */
+		while (hdr->nr_zones && ofst < bvec.bv_len) {
+			zone = addr + ofst;
+			if (zone->start >= start + ti->len) {
+				hdr->nr_zones = 0;	/* ends both loops */
+				break;
+			}
+			zone->start = zone->start + ti->begin - start;
+			if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
+				if (zone->cond == BLK_ZONE_COND_FULL)
+					zone->wp = zone->start + zone->len;
+				else if (zone->cond == BLK_ZONE_COND_EMPTY)
+					zone->wp = zone->start;
+				else
+					zone->wp = zone->wp + ti->begin - start;
+			}
+			ofst += sizeof(struct blk_zone);
+			hdr->nr_zones--;
+			nr_rep++;
+		}
+
+		if (addr != hdr)	/* header page is unmapped last, below */
+			kunmap_atomic(addr);
+
+		if (!hdr->nr_zones)
+			break;
+	}
+
+	if (hdr) {
+		hdr->nr_zones = nr_rep;	/* count of descriptors remapped */
+		kunmap_atomic(hdr);
+	}
+
+	bio_advance(report_bio, report_bio->bi_iter.bi_size);
+
+#else /* !CONFIG_BLK_DEV_ZONED */
+	bio->bi_status = BLK_STS_NOTSUPP;
+#endif
+}
+EXPORT_SYMBOL_GPL(dm_remap_zone_report);
+
+/*
  * Flush current->bio_list when the target map method blocks.
  * This fixes deadlocks in snapshot and possibly in other targets.
  */
@@ -1036,7 +1161,8 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
 
 		while ((bio = bio_list_pop(&list))) {
 			struct bio_set *bs = bio->bi_pool;
-			if (unlikely(!bs) || bs == fs_bio_set) {
+			if (unlikely(!bs) || bs == fs_bio_set ||
+			    !bs->rescue_workqueue) {
 				bio_list_add(&current->bio_list[i], bio);
 				continue;
 			}
@@ -1084,18 +1210,24 @@ static void __map_bio(struct dm_target_io *tio)
 	r = ti->type->map(ti, clone);
 	dm_offload_end(&o);
 
-	if (r == DM_MAPIO_REMAPPED) {
+	switch (r) {
+	case DM_MAPIO_SUBMITTED:
+		break;
+	case DM_MAPIO_REMAPPED:
 		/* the bio has been remapped so dispatch it */
-
 		trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
 				      tio->io->bio->bi_bdev->bd_dev, sector);
-
 		generic_make_request(clone);
-	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
-		/* error the io and bail out, or requeue it if needed */
-		dec_pending(tio->io, r);
+		break;
+	case DM_MAPIO_KILL:
+		dec_pending(tio->io, BLK_STS_IOERR);
+		free_tio(tio);
+		break;
+	case DM_MAPIO_REQUEUE:
+		dec_pending(tio->io, BLK_STS_DM_REQUEUE);
 		free_tio(tio);
-	} else if (r != DM_MAPIO_SUBMITTED) {
+		break;
+	default:
 		DMWARN("unimplemented target map return value: %d", r);
 		BUG();
 	}
@@ -1142,7 +1274,8 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
 			return r;
 	}
 
-	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
+	if (bio_op(bio) != REQ_OP_ZONE_REPORT)
+		bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
 	clone->bi_iter.bi_size = to_bytes(len);
 
 	if (unlikely(bio_integrity(bio) != NULL))
@@ -1331,7 +1464,11 @@ static int __split_and_process_non_flush(struct clone_info *ci)
 	if (!dm_target_is_valid(ti))
 		return -EIO;
 
-	len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
+	if (bio_op(bio) == REQ_OP_ZONE_REPORT)
+		len = ci->sector_count;
+	else
+		len = min_t(sector_t, max_io_len(ci->sector, ti),
+			    ci->sector_count);
 
 	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
 	if (r < 0)
@@ -1360,7 +1497,7 @@ static void __split_and_process_bio(struct mapped_device *md,
 	ci.map = map;
 	ci.md = md;
 	ci.io = alloc_io(md);
-	ci.io->error = 0;
+	ci.io->status = 0;
 	atomic_set(&ci.io->io_count, 1);
 	ci.io->bio = bio;
 	ci.io->md = md;
@@ -1374,6 +1511,10 @@ static void __split_and_process_bio(struct mapped_device *md,
 		ci.sector_count = 0;
 		error = __send_empty_flush(&ci);
 		/* dec_pending submits any data associated with flush */
+	} else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
+		ci.bio = bio;
+		ci.sector_count = 0;
+		error = __split_and_process_non_flush(&ci);
 	} else {
 		ci.bio = bio;
 		ci.sector_count = bio_sectors(bio);
@@ -1527,7 +1668,6 @@ void dm_init_normal_md_queue(struct mapped_device *md)
 	 * Initialize aspects of queue that aren't relevant for blk-mq
 	 */
 	md->queue->backing_dev_info->congested_fn = dm_any_congested;
-	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 }
 
 static void cleanup_mapped_device(struct mapped_device *md)
@@ -1657,7 +1797,7 @@ static struct mapped_device *alloc_dev(int minor)
 
 	bio_init(&md->flush_bio, NULL, 0);
 	md->flush_bio.bi_bdev = md->bdev;
-	md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+	md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
 
 	dm_stats_init(&md->stats);
 
@@ -1753,7 +1893,9 @@ static void event_callback(void *context)
 	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
 
 	atomic_inc(&md->event_nr);
+	atomic_inc(&dm_global_event_nr);
 	wake_up(&md->eventq);
+	wake_up(&dm_global_eventq);
 }
 
 /*
@@ -2654,7 +2796,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
 		BUG();
 	}
 
-	pools->bs = bioset_create_nobvec(pool_size, front_pad);
+	pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
 	if (!pools->bs)
 		goto out;
 
@@ -2859,6 +3001,8 @@ static const struct block_device_operations dm_blk_dops = {
 
 static const struct dax_operations dm_dax_ops = {
 	.direct_access = dm_dax_direct_access,
+	.copy_from_iter = dm_dax_copy_from_iter,
+	.flush = dm_dax_flush,
 };
 
 /*
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 7299ce2f08a8..03082e17c65c 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1311,8 +1311,10 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
 	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
 	lock_comm(cinfo, 1);
 	ret = __sendmsg(cinfo, &cmsg);
-	if (ret)
+	if (ret) {
+		unlock_comm(cinfo);
 		return ret;
+	}
 	cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
 	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
 	cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 82f798be964f..31bcbfb09fef 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -185,7 +185,7 @@ static int start_readonly;
 static bool create_on_open = true;
 
 /* bio_clone_mddev
- * like bio_clone, but with a local bio set
+ * like bio_clone_bioset, but with a local bio set
  */
 
 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
@@ -265,7 +265,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	unsigned int sectors;
 	int cpu;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (mddev == NULL || mddev->pers == NULL) {
 		bio_io_error(bio);
@@ -273,7 +273,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	}
 	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
 		if (bio_sectors(bio) != 0)
-			bio->bi_error = -EROFS;
+			bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
 		return BLK_QC_T_NONE;
 	}
@@ -719,8 +719,8 @@ static void super_written(struct bio *bio)
 	struct md_rdev *rdev = bio->bi_private;
 	struct mddev *mddev = rdev->mddev;
 
-	if (bio->bi_error) {
-		pr_err("md: super_written gets error=%d\n", bio->bi_error);
+	if (bio->bi_status) {
+		pr_err("md: super_written gets error=%d\n", bio->bi_status);
 		md_error(mddev, rdev);
 		if (!test_bit(Faulty, &rdev->flags)
 		    && (bio->bi_opf & MD_FAILFAST)) {
@@ -765,7 +765,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	    test_bit(FailFast, &rdev->flags) &&
 	    !test_bit(LastDev, &rdev->flags))
 		ff = MD_FAILFAST;
-	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff;
+	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
 
 	atomic_inc(&mddev->pending_writes);
 	submit_bio(bio);
@@ -801,7 +801,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 
 	submit_bio_wait(bio);
 
-	ret = !bio->bi_error;
+	ret = !bio->bi_status;
 	bio_put(bio);
 	return ret;
 }
@@ -825,7 +825,7 @@ fail:
 	return -EINVAL;
 }
 
-static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 {
 	return	sb1->set_uuid0 == sb2->set_uuid0 &&
 		sb1->set_uuid1 == sb2->set_uuid1 &&
@@ -833,7 +833,7 @@ static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 		sb1->set_uuid3 == sb2->set_uuid3;
 }
 
-static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 {
 	int ret;
 	mdp_super_t *tmp1, *tmp2;
@@ -1025,12 +1025,12 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 	} else {
 		__u64 ev1, ev2;
 		mdp_super_t *refsb = page_address(refdev->sb_page);
-		if (!uuid_equal(refsb, sb)) {
+		if (!md_uuid_equal(refsb, sb)) {
 			pr_warn("md: %s has different UUID to %s\n",
 				b, bdevname(refdev->bdev,b2));
 			goto abort;
 		}
-		if (!sb_equal(refsb, sb)) {
+		if (!md_sb_equal(refsb, sb)) {
 			pr_warn("md: %s has same UUID but different superblock to %s\n",
 				b, bdevname(refdev->bdev, b2));
 			goto abort;
@@ -5174,6 +5174,18 @@ static void mddev_delayed_delete(struct work_struct *ws)
 
 static void no_op(struct percpu_ref *r) {}
 
+int mddev_init_writes_pending(struct mddev *mddev)
+{
+	if (mddev->writes_pending.percpu_count_ptr)
+		return 0;	/* pointer already set: do not init twice */
+	if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
+		return -ENOMEM;	/* any init failure is reported as ENOMEM */
+	/* We want to start with the refcount at zero, so drop one reference */
+	percpu_ref_put(&mddev->writes_pending);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
+
 static int md_alloc(dev_t dev, char *name)
 {
 	/*
@@ -5239,10 +5251,6 @@ static int md_alloc(dev_t dev, char *name)
 	blk_queue_make_request(mddev->queue, md_make_request);
 	blk_set_stacking_limits(&mddev->queue->limits);
 
-	if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
-		goto abort;
-	/* We want to start with the refcount at zero */
-	percpu_ref_put(&mddev->writes_pending);
 	disk = alloc_disk(1 << shift);
 	if (!disk) {
 		blk_cleanup_queue(mddev->queue);
@@ -5420,7 +5428,7 @@ int md_run(struct mddev *mddev)
 	}
 
 	if (mddev->bio_set == NULL) {
-		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
+		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 		if (!mddev->bio_set)
 			return -ENOMEM;
 	}
@@ -8022,18 +8030,15 @@ EXPORT_SYMBOL(md_write_end);
  * may proceed without blocking.  It is important to call this before
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
- *
- * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
- * is dropped, so return -EAGAIN after notifying userspace.
  */
-int md_allow_write(struct mddev *mddev)
+void md_allow_write(struct mddev *mddev)
 {
 	if (!mddev->pers)
-		return 0;
+		return;
 	if (mddev->ro)
-		return 0;
+		return;
 	if (!mddev->pers->sync_request)
-		return 0;
+		return;
 
 	spin_lock(&mddev->lock);
 	if (mddev->in_sync) {
@@ -8046,13 +8051,12 @@ int md_allow_write(struct mddev *mddev)
 		spin_unlock(&mddev->lock);
 		md_update_sb(mddev, 0);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
+		/* wait for the dirty state to be recorded in the metadata */
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
+			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 	} else
 		spin_unlock(&mddev->lock);
-
-	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
-		return -EAGAIN;
-	else
-		return 0;
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4e75d121bfcc..0fa1de42c42b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -648,6 +648,7 @@ extern void md_unregister_thread(struct md_thread **threadp);
 extern void md_wakeup_thread(struct md_thread *thread);
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);
+extern int mddev_init_writes_pending(struct mddev *mddev);
 extern void md_write_start(struct mddev *mddev, struct bio *bi);
 extern void md_write_inc(struct mddev *mddev, struct bio *bi);
 extern void md_write_end(struct mddev *mddev);
@@ -665,7 +666,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			bool metadata_op);
 extern void md_do_sync(struct md_thread *thread);
 extern void md_new_event(struct mddev *mddev);
-extern int md_allow_write(struct mddev *mddev);
+extern void md_allow_write(struct mddev *mddev);
 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(struct mddev *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e95d521d93e9..68d036e64041 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -73,12 +73,12 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
  * operation and are ready to return a success/failure code to the buffer
  * cache layer.
  */
-static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
+static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
 {
 	struct bio *bio = mp_bh->master_bio;
 	struct mpconf *conf = mp_bh->mddev->private;
 
-	bio->bi_error = err;
+	bio->bi_status = status;
 	bio_endio(bio);
 	mempool_free(mp_bh, conf->pool);
 }
@@ -89,7 +89,7 @@ static void multipath_end_request(struct bio *bio)
 	struct mpconf *conf = mp_bh->mddev->private;
 	struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
 
-	if (!bio->bi_error)
+	if (!bio->bi_status)
 		multipath_end_bh_io(mp_bh, 0);
 	else if (!(bio->bi_opf & REQ_RAHEAD)) {
 		/*
@@ -102,7 +102,7 @@ static void multipath_end_request(struct bio *bio)
 			(unsigned long long)bio->bi_iter.bi_sector);
 		multipath_reschedule_retry(mp_bh);
 	} else
-		multipath_end_bh_io(mp_bh, bio->bi_error);
+		multipath_end_bh_io(mp_bh, bio->bi_status);
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
@@ -347,7 +347,7 @@ static void multipathd(struct md_thread *thread)
 			pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
 			       bdevname(bio->bi_bdev,b),
 			       (unsigned long long)bio->bi_iter.bi_sector);
-			multipath_end_bh_io(mp_bh, -EIO);
+			multipath_end_bh_io(mp_bh, BLK_STS_IOERR);
 		} else {
 			pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
 			       bdevname(bio->bi_bdev,b),
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index ebb280a14325..32adf6b4a9c7 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -142,10 +142,23 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
 
 static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
 {
+	int r;
+	uint32_t old_count;
 	enum allocation_event ev;
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 
-	return sm_ll_dec(&smd->ll, b, &ev);
+	r = sm_ll_dec(&smd->ll, b, &ev);
+	if (!r && (ev == SM_FREE)) {
+		/*
+		 * It's only free if it's also free in the last
+		 * transaction.
+		 */
+		r = sm_ll_lookup(&smd->old_ll, b, &old_count);
+		if (!r && !old_count)
+			smd->nr_allocated_this_transaction--;
+	}
+
+	return r;
 }
 
 static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 84e58596594d..d6c0bc76e837 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
-		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
 
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
 		blk_queue_io_opt(mddev->queue,
@@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 	}
 }
 
+static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+	struct r0conf *conf = mddev->private;
+	struct strip_zone *zone;
+	sector_t start = bio->bi_iter.bi_sector;
+	sector_t end;
+	unsigned int stripe_size;	/* nb_dev * chunk_sectors, set below */
+	sector_t first_stripe_index, last_stripe_index;
+	sector_t start_disk_offset;
+	unsigned int start_disk_index;
+	sector_t end_disk_offset;
+	unsigned int end_disk_index;
+	unsigned int disk;
+
+	zone = find_zone(conf, &start);
+
+	if (bio_end_sector(bio) > zone->zone_end) {
+		struct bio *split = bio_split(bio,
+			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
+			mddev->bio_set);
+		bio_chain(split, bio);
+		generic_make_request(bio);	/* resubmit part past zone_end */
+		bio = split;	/* go on with the part up to zone_end */
+		end = zone->zone_end;
+	} else
+		end = bio_end_sector(bio);
+
+	if (zone != conf->strip_zone)	/* not the first zone */
+		end = end - zone[-1].zone_end;
+
+	/* start and end are now both offsets within the zone */
+	stripe_size = zone->nb_dev * mddev->chunk_sectors;
+
+	first_stripe_index = start;
+	sector_div(first_stripe_index, stripe_size);
+	last_stripe_index = end;
+	sector_div(last_stripe_index, stripe_size);
+
+	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		first_stripe_index * mddev->chunk_sectors;
+	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		last_stripe_index * mddev->chunk_sectors;
+
+	for (disk = 0; disk < zone->nb_dev; disk++) {
+		sector_t dev_start, dev_end;
+		struct bio *discard_bio = NULL;
+		struct md_rdev *rdev;
+
+		if (disk < start_disk_index)
+			dev_start = (first_stripe_index + 1) *
+				mddev->chunk_sectors;
+		else if (disk > start_disk_index)
+			dev_start = first_stripe_index * mddev->chunk_sectors;
+		else
+			dev_start = start_disk_offset;
+
+		if (disk < end_disk_index)
+			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > end_disk_index)
+			dev_end = last_stripe_index * mddev->chunk_sectors;
+		else
+			dev_end = end_disk_offset;
+
+		if (dev_end <= dev_start)
+			continue;	/* empty range on this disk */
+
+		rdev = conf->devlist[(zone - conf->strip_zone) *
+			conf->strip_zone[0].nb_dev + disk];
+		if (__blkdev_issue_discard(rdev->bdev,
+			dev_start + zone->dev_start + rdev->data_offset,
+			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
+		    !discard_bio)
+			continue;	/* error or no bio built: skip disk */
+		bio_chain(discard_bio, bio);
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
+				discard_bio, disk_devt(mddev->gendisk),
+				bio->bi_iter.bi_sector);
+		generic_make_request(discard_bio);
+	}
+	bio_endio(bio);
+}
+
 static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct strip_zone *zone;
@@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 		return;
 	}
 
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+		raid0_handle_discard(mddev, bio);
+		return;
+	}
+
 	bio_sector = bio->bi_iter.bi_sector;
 	sector = bio_sector;
 	chunk_sects = mddev->chunk_sectors;
@@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 	bio->bi_iter.bi_sector = sector + zone->dev_start +
 		tmp_dev->data_offset;
 
-	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
-		/* Just ignore it */
-		bio_endio(bio);
-	} else {
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
-					      bio, disk_devt(mddev->gendisk),
-					      bio_sector);
-		mddev_check_writesame(mddev, bio);
-		mddev_check_write_zeroes(mddev, bio);
-		generic_make_request(bio);
-	}
+	if (mddev->gendisk)
+		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+				      bio, disk_devt(mddev->gendisk),
+				      bio_sector);
+	mddev_check_writesame(mddev, bio);
+	mddev_check_write_zeroes(mddev, bio);
+	generic_make_request(bio);
 }
 
 static void raid0_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7ed59351fe97..98ca2c1d3226 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -277,7 +277,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct r1conf *conf = r1_bio->mddev->private;
 
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 
 	bio_endio(bio);
 	/*
@@ -335,7 +335,7 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
 
 static void raid1_end_read_request(struct bio *bio)
 {
-	int uptodate = !bio->bi_error;
+	int uptodate = !bio->bi_status;
 	struct r1bio *r1_bio = bio->bi_private;
 	struct r1conf *conf = r1_bio->mddev->private;
 	struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
@@ -426,12 +426,12 @@ static void raid1_end_write_request(struct bio *bio)
 	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
 	bool discard_error;
 
-	discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
+	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
 
 	/*
 	 * 'one mirror IO has finished' event handler:
 	 */
-	if (bio->bi_error && !discard_error) {
+	if (bio->bi_status && !discard_error) {
 		set_bit(WriteErrorSeen,	&rdev->flags);
 		if (!test_and_set_bit(WantReplacement, &rdev->flags))
 			set_bit(MD_RECOVERY_NEEDED, &
@@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 					break;
 			}
 			continue;
-		} else
+		} else {
+			if ((sectors > best_good_sectors) && (best_disk >= 0))
+				best_disk = -1;
 			best_good_sectors = sectors;
+		}
 
 		if (best_disk >= 0)
 			/* At least two disks to choose from so failfast is OK */
@@ -799,7 +802,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio)
 		bio->bi_next = NULL;
 		bio->bi_bdev = rdev->bdev;
 		if (test_bit(Faulty, &rdev->flags)) {
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
 		} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
 				    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
@@ -1529,17 +1532,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			plug = container_of(cb, struct raid1_plug_cb, cb);
 		else
 			plug = NULL;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		if (plug) {
 			bio_list_add(&plug->pending, mbio);
 			plug->pending_cnt++;
 		} else {
+			spin_lock_irqsave(&conf->device_lock, flags);
 			bio_list_add(&conf->pending_bio_list, mbio);
 			conf->pending_count++;
-		}
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!plug)
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			md_wakeup_thread(mddev->thread);
+		}
 	}
 
 	r1_bio_write_done(r1_bio);
@@ -1854,7 +1856,7 @@ static void end_sync_read(struct bio *bio)
 	 * or re-read if the read failed.
 	 * We don't do much here, just schedule handling by raid1d
 	 */
-	if (!bio->bi_error)
+	if (!bio->bi_status)
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 
 	if (atomic_dec_and_test(&r1_bio->remaining))
@@ -1863,7 +1865,7 @@ static void end_sync_read(struct bio *bio)
 
 static void end_sync_write(struct bio *bio)
 {
-	int uptodate = !bio->bi_error;
+	int uptodate = !bio->bi_status;
 	struct r1bio *r1_bio = get_resync_r1bio(bio);
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
@@ -2056,7 +2058,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 		idx ++;
 	}
 	set_bit(R1BIO_Uptodate, &r1_bio->state);
-	bio->bi_error = 0;
+	bio->bi_status = 0;
 	return 1;
 }
 
@@ -2080,16 +2082,16 @@ static void process_checks(struct r1bio *r1_bio)
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		int j;
 		int size;
-		int error;
+		blk_status_t status;
 		struct bio_vec *bi;
 		struct bio *b = r1_bio->bios[i];
 		struct resync_pages *rp = get_resync_pages(b);
 		if (b->bi_end_io != end_sync_read)
 			continue;
 		/* fixup the bio for reuse, but preserve errno */
-		error = b->bi_error;
+		status = b->bi_status;
 		bio_reset(b);
-		b->bi_error = error;
+		b->bi_status = status;
 		b->bi_vcnt = vcnt;
 		b->bi_iter.bi_size = r1_bio->sectors << 9;
 		b->bi_iter.bi_sector = r1_bio->sector +
@@ -2111,7 +2113,7 @@ static void process_checks(struct r1bio *r1_bio)
 	}
 	for (primary = 0; primary < conf->raid_disks * 2; primary++)
 		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
-		    !r1_bio->bios[primary]->bi_error) {
+		    !r1_bio->bios[primary]->bi_status) {
 			r1_bio->bios[primary]->bi_end_io = NULL;
 			rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
 			break;
@@ -2121,7 +2123,7 @@ static void process_checks(struct r1bio *r1_bio)
 		int j;
 		struct bio *pbio = r1_bio->bios[primary];
 		struct bio *sbio = r1_bio->bios[i];
-		int error = sbio->bi_error;
+		blk_status_t status = sbio->bi_status;
 		struct page **ppages = get_resync_pages(pbio)->pages;
 		struct page **spages = get_resync_pages(sbio)->pages;
 		struct bio_vec *bi;
@@ -2130,12 +2132,12 @@ static void process_checks(struct r1bio *r1_bio)
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
 		/* Now we can 'fixup' the error value */
-		sbio->bi_error = 0;
+		sbio->bi_status = 0;
 
 		bio_for_each_segment_all(bi, sbio, j)
 			page_len[j] = bi->bv_len;
 
-		if (!error) {
+		if (!status) {
 			for (j = vcnt; j-- ; ) {
 				if (memcmp(page_address(ppages[j]),
 					   page_address(spages[j]),
@@ -2147,7 +2149,7 @@ static void process_checks(struct r1bio *r1_bio)
 		if (j >= 0)
 			atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
 		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
-			      && !error)) {
+			      && !status)) {
 			/* No need to write to this device. */
 			sbio->bi_end_io = NULL;
 			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
@@ -2398,11 +2400,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 		struct bio *bio = r1_bio->bios[m];
 		if (bio->bi_end_io == NULL)
 			continue;
-		if (!bio->bi_error &&
+		if (!bio->bi_status &&
 		    test_bit(R1BIO_MadeGood, &r1_bio->state)) {
 			rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
 		}
-		if (bio->bi_error &&
+		if (bio->bi_status &&
 		    test_bit(R1BIO_WriteError, &r1_bio->state)) {
 			if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
 				md_error(conf->mddev, rdev);
@@ -2953,7 +2955,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf->r1bio_pool)
 		goto abort;
 
-	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
 	if (!conf->bio_split)
 		goto abort;
 
@@ -3061,6 +3063,8 @@ static int raid1_run(struct mddev *mddev)
 			mdname(mddev));
 		return -EIO;
 	}
+	if (mddev_init_writes_pending(mddev) < 0)
+		return -ENOMEM;
 	/*
 	 * copy the already verified devices into our private RAID1
 	 * bookkeeping area. [whatever we allocate in run(),
@@ -3197,7 +3201,7 @@ static int raid1_reshape(struct mddev *mddev)
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2, err;
+	int d, d2;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3209,11 +3213,8 @@ static int raid1_reshape(struct mddev *mddev)
 		return -EINVAL;
 	}
 
-	if (!mddev_is_clustered(mddev)) {
-		err = md_allow_write(mddev);
-		if (err)
-			return err;
-	}
+	if (!mddev_is_clustered(mddev))
+		md_allow_write(mddev);
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6b86a0032cf8..57a250fdbbcc 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -336,7 +336,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 	struct r10conf *conf = r10_bio->mddev->private;
 
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 
 	bio_endio(bio);
 	/*
@@ -389,7 +389,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 
 static void raid10_end_read_request(struct bio *bio)
 {
-	int uptodate = !bio->bi_error;
+	int uptodate = !bio->bi_status;
 	struct r10bio *r10_bio = bio->bi_private;
 	int slot, dev;
 	struct md_rdev *rdev;
@@ -477,7 +477,7 @@ static void raid10_end_write_request(struct bio *bio)
 	struct bio *to_put = NULL;
 	bool discard_error;
 
-	discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
+	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
 
 	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 
@@ -491,7 +491,7 @@ static void raid10_end_write_request(struct bio *bio)
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (bio->bi_error && !discard_error) {
+	if (bio->bi_status && !discard_error) {
 		if (repl)
 			/* Never record new bad blocks to replacement,
 			 * just fail it.
@@ -913,7 +913,7 @@ static void flush_pending_writes(struct r10conf *conf)
 			bio->bi_next = NULL;
 			bio->bi_bdev = rdev->bdev;
 			if (test_bit(Faulty, &rdev->flags)) {
-				bio->bi_error = -EIO;
+				bio->bi_status = BLK_STS_IOERR;
 				bio_endio(bio);
 			} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
 					    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
@@ -1098,7 +1098,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		bio->bi_next = NULL;
 		bio->bi_bdev = rdev->bdev;
 		if (test_bit(Faulty, &rdev->flags)) {
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
 		} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
 				    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
@@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 		plug = container_of(cb, struct raid10_plug_cb, cb);
 	else
 		plug = NULL;
-	spin_lock_irqsave(&conf->device_lock, flags);
 	if (plug) {
 		bio_list_add(&plug->pending, mbio);
 		plug->pending_cnt++;
 	} else {
+		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
-	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-	if (!plug)
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		md_wakeup_thread(mddev->thread);
+	}
 }
 
 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
@@ -1889,7 +1888,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
 {
 	struct r10conf *conf = r10_bio->mddev->private;
 
-	if (!bio->bi_error)
+	if (!bio->bi_status)
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 	else
 		/* The write handler will notice the lack of
@@ -1973,7 +1972,7 @@ static void end_sync_write(struct bio *bio)
 	else
 		rdev = conf->mirrors[d].rdev;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		if (repl)
 			md_error(mddev, rdev);
 		else {
@@ -2022,7 +2021,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 
 	/* find the first device with a block */
 	for (i=0; i<conf->copies; i++)
-		if (!r10_bio->devs[i].bio->bi_error)
+		if (!r10_bio->devs[i].bio->bi_status)
 			break;
 
 	if (i == conf->copies)
@@ -2051,7 +2050,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		tpages = get_resync_pages(tbio)->pages;
 		d = r10_bio->devs[i].devnum;
 		rdev = conf->mirrors[d].rdev;
-		if (!r10_bio->devs[i].bio->bi_error) {
+		if (!r10_bio->devs[i].bio->bi_status) {
 			/* We know that the bi_io_vec layout is the same for
 			 * both 'first' and 'i', so we just compare them.
 			 * All vec entries are PAGE_SIZE;
@@ -2634,7 +2633,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 			rdev = conf->mirrors[dev].rdev;
 			if (r10_bio->devs[m].bio == NULL)
 				continue;
-			if (!r10_bio->devs[m].bio->bi_error) {
+			if (!r10_bio->devs[m].bio->bi_status) {
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
@@ -2650,7 +2649,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 			if (r10_bio->devs[m].repl_bio == NULL)
 				continue;
 
-			if (!r10_bio->devs[m].repl_bio->bi_error) {
+			if (!r10_bio->devs[m].repl_bio->bi_status) {
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
@@ -2676,7 +2675,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 					r10_bio->devs[m].addr,
 					r10_bio->sectors, 0);
 				rdev_dec_pending(rdev, conf->mddev);
-			} else if (bio != NULL && bio->bi_error) {
+			} else if (bio != NULL && bio->bi_status) {
 				fail = true;
 				if (!narrow_write_error(r10_bio, m)) {
 					md_error(conf->mddev, rdev);
@@ -3268,7 +3267,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
 
 			bio = r10_bio->devs[i].bio;
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 			rcu_read_lock();
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
@@ -3310,7 +3309,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 			/* Need to set up for writing to the replacement */
 			bio = r10_bio->devs[i].repl_bio;
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 
 			sector = r10_bio->devs[i].addr;
 			bio->bi_next = biolist;
@@ -3376,7 +3375,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 		if (bio->bi_end_io == end_sync_read) {
 			md_sync_acct(bio->bi_bdev, nr_sectors);
-			bio->bi_error = 0;
+			bio->bi_status = 0;
 			generic_make_request(bio);
 		}
 	}
@@ -3553,7 +3552,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	if (!conf->r10bio_pool)
 		goto out;
 
-	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
 	if (!conf->bio_split)
 		goto out;
 
@@ -3612,6 +3611,9 @@ static int raid10_run(struct mddev *mddev)
 	int first = 1;
 	bool discard_supported = false;
 
+	if (mddev_init_writes_pending(mddev) < 0)
+		return -ENOMEM;
+
 	if (mddev->private == NULL) {
 		conf = setup_conf(mddev);
 		if (IS_ERR(conf))
@@ -4395,7 +4397,7 @@ read_more:
 	read_bio->bi_end_io = end_reshape_read;
 	bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
 	read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
-	read_bio->bi_error = 0;
+	read_bio->bi_status = 0;
 	read_bio->bi_vcnt = 0;
 	read_bio->bi_iter.bi_size = 0;
 	r10_bio->master_bio = read_bio;
@@ -4639,7 +4641,7 @@ static void end_reshape_write(struct bio *bio)
 		rdev = conf->mirrors[d].rdev;
 	}
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		/* FIXME should record badblock */
 		md_error(mddev, rdev);
 	}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 26ba09282e7c..bfa1e907c472 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -24,6 +24,7 @@
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
+#include "raid5-log.h"
 
 /*
  * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -571,7 +572,7 @@ static void r5l_log_endio(struct bio *bio)
 	struct r5l_log *log = io->log;
 	unsigned long flags;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		md_error(log->rdev->mddev, log->rdev);
 
 	bio_put(bio);
@@ -622,20 +623,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 
+	/*
+	 * In case of journal device failures, submit_bio will get an error
+	 * and call endio, then active stripes will continue the write
+	 * process. Therefore, it is not necessary to check Faulty bit
+	 * of journal device here.
+	 *
+	 * We can't check split_bio after current_bio is submitted. If
+	 * io->split_bio is null, after current_bio is submitted, current_bio
+	 * might already be completed and the io_unit is freed. We submit
+	 * split_bio first to avoid the issue.
+	 */
+	if (io->split_bio) {
+		if (io->has_flush)
+			io->split_bio->bi_opf |= REQ_PREFLUSH;
+		if (io->has_fua)
+			io->split_bio->bi_opf |= REQ_FUA;
+		submit_bio(io->split_bio);
+	}
+
 	if (io->has_flush)
 		io->current_bio->bi_opf |= REQ_PREFLUSH;
 	if (io->has_fua)
 		io->current_bio->bi_opf |= REQ_FUA;
 	submit_bio(io->current_bio);
-
-	if (!io->split_bio)
-		return;
-
-	if (io->has_flush)
-		io->split_bio->bi_opf |= REQ_PREFLUSH;
-	if (io->has_fua)
-		io->split_bio->bi_opf |= REQ_FUA;
-	submit_bio(io->split_bio);
 }
 
 /* deferred io_unit will be dispatched here */
@@ -670,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
 		return;
 	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
 		mdname(mddev));
+
+	/* wait superblock change before suspend */
+	wait_event(mddev->sb_wait,
+		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
 	mddev_suspend(mddev);
 	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
 	mddev_resume(mddev);
@@ -1231,7 +1247,7 @@ static void r5l_log_flush_endio(struct bio *bio)
 	unsigned long flags;
 	struct r5l_io_unit *io;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		md_error(log->rdev->mddev, log->rdev);
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
@@ -1766,7 +1782,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
 	mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
 					     mb, PAGE_SIZE));
 	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
-			  REQ_FUA, false)) {
+			  REQ_SYNC | REQ_FUA, false)) {
 		__free_page(page);
 		return -EIO;
 	}
@@ -2372,7 +2388,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 		mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
 						     mb, PAGE_SIZE));
 		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
-			     REQ_OP_WRITE, REQ_FUA, false);
+			     REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
 		sh->log_start = ctx->pos;
 		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
 		atomic_inc(&log->stripe_in_journal_count);
@@ -2621,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
 	 * When run in degraded mode, array is set to write-through mode.
 	 * This check helps drain pending write safely in the transition to
 	 * write-through mode.
+	 *
+	 * When a stripe is syncing, the write is also handled in write
+	 * through mode.
 	 */
-	if (s->failed) {
+	if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
 		r5c_make_stripe_write_out(sh);
 		return -EAGAIN;
 	}
@@ -2825,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 	}
 
 	r5l_append_flush_payload(log, sh->sector);
+	/* stripe is flushed to raid disks, we can do resync now */
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+		set_bit(STRIPE_HANDLE, &sh->state);
 }
 
 int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
@@ -2973,7 +2995,7 @@ ioerr:
 	return ret;
 }
 
-void r5c_update_on_rdev_error(struct mddev *mddev)
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r5conf *conf = mddev->private;
 	struct r5l_log *log = conf->log;
@@ -2981,7 +3003,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev)
 	if (!log)
 		return;
 
-	if (raid5_calc_degraded(conf) > 0 &&
+	if ((raid5_calc_degraded(conf) > 0 ||
+	     test_bit(Journal, &rdev->flags)) &&
 	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
 		schedule_work(&log->disable_writeback_work);
 }
@@ -3040,7 +3063,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->io_pool)
 		goto io_pool;
 
-	log->bs = bioset_create(R5L_POOL_SIZE, 0);
+	log->bs = bioset_create(R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 	if (!log->bs)
 		goto io_bs;
 
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 27097101ccca..328d67aedda4 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
 extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern void r5c_update_on_rdev_error(struct mddev *mddev,
+				     struct md_rdev *rdev);
 extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 
 extern struct dma_async_tx_descriptor *
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 5d25bebf3328..77cce3573aa8 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -397,7 +397,7 @@ static void ppl_log_endio(struct bio *bio)
 
 	pr_debug("%s: seq: %llu\n", __func__, io->seq);
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		md_error(ppl_conf->mddev, log->rdev);
 
 	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
@@ -907,8 +907,8 @@ static int ppl_write_empty_header(struct ppl_log *log)
 	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
 
 	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
-			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0,
-			  false)) {
+			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC |
+			  REQ_FUA, 0, false)) {
 		md_error(rdev->mddev, rdev);
 		ret = -EIO;
 	}
@@ -1150,7 +1150,7 @@ int ppl_init_log(struct r5conf *conf)
 		goto err;
 	}
 
-	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
+	ppl_conf->bs = bioset_create(conf->raid_disks, 0, 0);
 	if (!ppl_conf->bs) {
 		ret = -ENOMEM;
 		goto err;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2e38cfac5b1d..62c965be97e1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
 {
 	int i;
-	local_irq_disable();
-	spin_lock(conf->hash_locks);
+	spin_lock_irq(conf->hash_locks);
 	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 	spin_lock(&conf->device_lock);
@@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 {
 	int i;
 	spin_unlock(&conf->device_lock);
-	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
-		spin_unlock(conf->hash_locks + i - 1);
-	local_irq_enable();
+	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
+		spin_unlock(conf->hash_locks + i);
+	spin_unlock_irq(conf->hash_locks);
 }
 
 /* Find first data disk in a raid6 stripe */
@@ -234,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			if (test_bit(R5_InJournal, &sh->dev[i].flags))
 				injournal++;
 	/*
-	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
-	 * data in journal, so they are not released to cached lists
+	 * In the following cases, the stripe cannot be released to cached
+	 * lists. Therefore, we make the stripe write out and set
+	 * STRIPE_HANDLE:
+	 *   1. when quiesce in r5c write back;
+	 *   2. when resync is requested for the stripe.
 	 */
-	if (conf->quiesce && r5c_is_writeback(conf->log) &&
-	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
+	    (conf->quiesce && r5c_is_writeback(conf->log) &&
+	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
 		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 			r5c_make_stripe_write_out(sh);
 		set_bit(STRIPE_HANDLE, &sh->state);
@@ -714,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
 
 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 {
-	local_irq_disable();
 	if (sh1 > sh2) {
-		spin_lock(&sh2->stripe_lock);
+		spin_lock_irq(&sh2->stripe_lock);
 		spin_lock_nested(&sh1->stripe_lock, 1);
 	} else {
-		spin_lock(&sh1->stripe_lock);
+		spin_lock_irq(&sh1->stripe_lock);
 		spin_lock_nested(&sh2->stripe_lock, 1);
 	}
 }
@@ -727,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 {
 	spin_unlock(&sh1->stripe_lock);
-	spin_unlock(&sh2->stripe_lock);
-	local_irq_enable();
+	spin_unlock_irq(&sh2->stripe_lock);
 }
 
 /* Only freshly new full stripe normal write stripe can be added to a batch list */
@@ -2312,14 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
-	int err;
+	int err = 0;
 	struct kmem_cache *sc;
 	int i;
 	int hash, cnt;
 
-	err = md_allow_write(conf->mddev);
-	if (err)
-		return err;
+	md_allow_write(conf->mddev);
 
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -2477,7 +2476,7 @@ static void raid5_end_read_request(struct bio * bi)
 
 	pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		bi->bi_error);
+		bi->bi_status);
 	if (i == disks) {
 		bio_reset(bi);
 		BUG();
@@ -2497,7 +2496,7 @@ static void raid5_end_read_request(struct bio * bi)
 		s = sh->sector + rdev->new_data_offset;
 	else
 		s = sh->sector + rdev->data_offset;
-	if (!bi->bi_error) {
+	if (!bi->bi_status) {
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			/* Note that this cannot happen on a
@@ -2614,7 +2613,7 @@ static void raid5_end_write_request(struct bio *bi)
 	}
 	pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		bi->bi_error);
+		bi->bi_status);
 	if (i == disks) {
 		bio_reset(bi);
 		BUG();
@@ -2622,14 +2621,14 @@ static void raid5_end_write_request(struct bio *bi)
 	}
 
 	if (replacement) {
-		if (bi->bi_error)
+		if (bi->bi_status)
 			md_error(conf->mddev, rdev);
 		else if (is_badblock(rdev, sh->sector,
 				     STRIPE_SECTORS,
 				     &first_bad, &bad_sectors))
 			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
 	} else {
-		if (bi->bi_error) {
+		if (bi->bi_status) {
 			set_bit(STRIPE_DEGRADED, &sh->state);
 			set_bit(WriteErrorSeen, &rdev->flags);
 			set_bit(R5_WriteError, &sh->dev[i].flags);
@@ -2650,7 +2649,7 @@ static void raid5_end_write_request(struct bio *bi)
 	}
 	rdev_dec_pending(rdev, conf->mddev);
 
-	if (sh->batch_head && bi->bi_error && !replacement)
+	if (sh->batch_head && bi->bi_status && !replacement)
 		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
 
 	bio_reset(bi);
@@ -2694,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 		bdevname(rdev->bdev, b),
 		mdname(mddev),
 		conf->raid_disks - mddev->degraded);
-	r5c_update_on_rdev_error(mddev);
+	r5c_update_on_rdev_error(mddev, rdev);
 }
 
 /*
@@ -3055,6 +3054,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
  *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
  *      no_space_stripes list.
  *
+ *   3. during journal failure
+ *      In journal failure, we try to flush all cached data to raid disks
+ *      based on data in stripe cache. The array is read-only to upper
+ *      layers, so we would skip all pending writes.
+ *
  */
 static inline bool delay_towrite(struct r5conf *conf,
 				 struct r5dev *dev,
@@ -3068,6 +3072,9 @@ static inline bool delay_towrite(struct r5conf *conf,
 	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
 	    s->injournal > 0)
 		return true;
+	/* case 3 above */
+	if (s->log_failed && s->injournal)
+		return true;
 	return false;
 }
 
@@ -3374,7 +3381,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 
-			bi->bi_error = -EIO;
+			bi->bi_status = BLK_STS_IOERR;
 			md_write_end(conf->mddev);
 			bio_endio(bi);
 			bi = nextbi;
@@ -3396,7 +3403,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		       sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 
-			bi->bi_error = -EIO;
+			bi->bi_status = BLK_STS_IOERR;
 			md_write_end(conf->mddev);
 			bio_endio(bi);
 			bi = bi2;
@@ -3422,7 +3429,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct bio *nextbi =
 					r5_next_bio(bi, sh->dev[i].sector);
 
-				bi->bi_error = -EIO;
+				bi->bi_status = BLK_STS_IOERR;
 				bio_endio(bi);
 				bi = nextbi;
 			}
@@ -4078,10 +4085,15 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
 			set_bit(STRIPE_INSYNC, &sh->state);
 		else {
 			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
-			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
 				/* don't try to repair!! */
 				set_bit(STRIPE_INSYNC, &sh->state);
-			else {
+				pr_warn_ratelimited("%s: mismatch sector in range "
+						    "%llu-%llu\n", mdname(conf->mddev),
+						    (unsigned long long) sh->sector,
+						    (unsigned long long) sh->sector +
+						    STRIPE_SECTORS);
+			} else {
 				sh->check_state = check_state_compute_run;
 				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
 				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
@@ -4230,10 +4242,15 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
 			}
 		} else {
 			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
-			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
 				/* don't try to repair!! */
 				set_bit(STRIPE_INSYNC, &sh->state);
-			else {
+				pr_warn_ratelimited("%s: mismatch sector in range "
+						    "%llu-%llu\n", mdname(conf->mddev),
+						    (unsigned long long) sh->sector,
+						    (unsigned long long) sh->sector +
+						    STRIPE_SECTORS);
+			} else {
 				int *target = &sh->ops.target;
 
 				sh->ops.target = -1;
@@ -4653,8 +4670,13 @@ static void handle_stripe(struct stripe_head *sh)
 
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);
-		/* Cannot process 'sync' concurrently with 'discard' */
-		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+		/*
+		 * Cannot process 'sync' concurrently with 'discard'.
+		 * Flush data in r5cache before 'sync'.
+		 */
+		if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+		    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
+		    !test_bit(STRIPE_DISCARD, &sh->state) &&
 		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 			set_bit(STRIPE_SYNCING, &sh->state);
 			clear_bit(STRIPE_INSYNC, &sh->state);
@@ -4701,10 +4723,15 @@ static void handle_stripe(struct stripe_head *sh)
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
 	       s.failed_num[0], s.failed_num[1]);
-	/* check if the array has lost more than max_degraded devices and,
+	/*
+	 * check if the array has lost more than max_degraded devices and,
 	 * if so, some requests might need to be failed.
+	 *
+	 * When the journal device has failed (log_failed), we only process
+	 * the stripe if there is data that needs to be written to raid disks
 	 */
-	if (s.failed > conf->max_degraded || s.log_failed) {
+	if (s.failed > conf->max_degraded ||
+	    (s.log_failed && s.injournal == 0)) {
 		sh->check_state = 0;
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
@@ -5127,7 +5154,7 @@ static void raid5_align_endio(struct bio *bi)
 	struct mddev *mddev;
 	struct r5conf *conf;
 	struct md_rdev *rdev;
-	int error = bi->bi_error;
+	blk_status_t error = bi->bi_status;
 
 	bio_put(bi);
 
@@ -5277,8 +5304,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 	struct stripe_head *sh, *tmp;
 	struct list_head *handle_list = NULL;
 	struct r5worker_group *wg;
-	bool second_try = !r5c_is_writeback(conf->log);
-	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
+	bool second_try = !r5c_is_writeback(conf->log) &&
+		!r5l_log_disk_error(conf);
+	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
+		r5l_log_disk_error(conf);
 
 again:
 	wg = NULL;
@@ -5702,7 +5731,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 			release_stripe_plug(mddev, sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
-			bi->bi_error = -EIO;
+			bi->bi_status = BLK_STS_IOERR;
 			break;
 		}
 	}
@@ -6313,7 +6342,6 @@ int
 raid5_set_cache_size(struct mddev *mddev, int size)
 {
 	struct r5conf *conf = mddev->private;
-	int err;
 
 	if (size <= 16 || size > 32768)
 		return -EINVAL;
@@ -6325,10 +6353,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 		;
 	mutex_unlock(&conf->cache_size_mutex);
 
-
-	err = md_allow_write(mddev);
-	if (err)
-		return err;
+	md_allow_write(mddev);
 
 	mutex_lock(&conf->cache_size_mutex);
 	while (size > conf->max_nr_stripes)
@@ -6918,7 +6943,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 			goto abort;
 	}
 
-	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
 	if (!conf->bio_split)
 		goto abort;
 	conf->mddev = mddev;
@@ -7093,6 +7118,9 @@ static int raid5_run(struct mddev *mddev)
 	long long min_offset_diff = 0;
 	int first = 1;
 
+	if (mddev_init_writes_pending(mddev) < 0)
+		return -ENOMEM;
+
 	if (mddev->recovery_cp != MaxSector)
 		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
 			  mdname(mddev));
@@ -7530,7 +7558,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * neilb: there is no locking about new writes here,
 		 * so this cannot be safe.
 		 */
-		if (atomic_read(&conf->active_stripes)) {
+		if (atomic_read(&conf->active_stripes) ||
+		    atomic_read(&conf->r5c_cached_full_stripes) ||
+		    atomic_read(&conf->r5c_cached_partial_stripes)) {
 			return -EBUSY;
 		}
 		log_exit(conf);