summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/closure.c10
-rw-r--r--drivers/md/bcache/debug.c5
-rw-r--r--drivers/md/bcache/sysfs.c1
-rw-r--r--drivers/md/dm-bufio.c4
-rw-r--r--drivers/md/dm-dust.c11
-rw-r--r--drivers/md/dm-integrity.c15
-rw-r--r--drivers/md/dm-kcopyd.c5
-rw-r--r--drivers/md/dm-raid.c2
-rw-r--r--drivers/md/dm-rq.c3
-rw-r--r--drivers/md/dm-table.c5
-rw-r--r--drivers/md/dm-zoned-metadata.c68
-rw-r--r--drivers/md/dm-zoned-reclaim.c47
-rw-r--r--drivers/md/dm-zoned-target.c68
-rw-r--r--drivers/md/dm-zoned.h11
-rw-r--r--drivers/md/md-linear.c5
-rw-r--r--drivers/md/md.c96
-rw-r--r--drivers/md/md.h20
-rw-r--r--drivers/md/persistent-data/dm-btree.c31
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c2
-rw-r--r--drivers/md/raid0.c41
-rw-r--r--drivers/md/raid0.h14
-rw-r--r--drivers/md/raid1.c89
-rw-r--r--drivers/md/raid10.c32
-rw-r--r--drivers/md/raid5.c27
-rw-r--r--drivers/md/raid5.h5
25 files changed, 468 insertions, 149 deletions
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 73f5319295bc..c12cd809ab19 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -105,8 +105,14 @@ struct closure_syncer {
static void closure_sync_fn(struct closure *cl)
{
- cl->s->done = 1;
- wake_up_process(cl->s->task);
+ struct closure_syncer *s = cl->s;
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = READ_ONCE(s->task);
+ s->done = 1;
+ wake_up_process(p);
+ rcu_read_unlock();
}
void __sched __closure_sync(struct closure *cl)
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8b123be05254..336f43910383 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -178,10 +178,9 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
while (size) {
struct keybuf_key *w;
unsigned int bytes = min(i->bytes, size);
- int err = copy_to_user(buf, i->buf, bytes);
- if (err)
- return err;
+ if (copy_to_user(buf, i->buf, bytes))
+ return -EFAULT;
ret += bytes;
buf += bytes;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index e2059af90791..627dcea0f5b6 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -964,6 +964,7 @@ KTYPE(bch_cache_set_internal);
static int __bch_cache_cmp(const void *l, const void *r)
{
+ cond_resched();
return *((uint16_t *)r) - *((uint16_t *)l);
}
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index b6b5acc92ca2..2a48ea3f1b30 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1599,7 +1599,9 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
unsigned long freed;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (!dm_bufio_trylock(c))
+ if (sc->gfp_mask & __GFP_FS)
+ dm_bufio_lock(c);
+ else if (!dm_bufio_trylock(c))
return SHRINK_STOP;
freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
index 845f376a72d9..8288887b7f94 100644
--- a/drivers/md/dm-dust.c
+++ b/drivers/md/dm-dust.c
@@ -25,6 +25,7 @@ struct dust_device {
unsigned long long badblock_count;
spinlock_t dust_lock;
unsigned int blksz;
+ int sect_per_block_shift;
unsigned int sect_per_block;
sector_t start;
bool fail_read_on_bb:1;
@@ -79,7 +80,7 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long block)
unsigned long flags;
spin_lock_irqsave(&dd->dust_lock, flags);
- bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block);
+ bblock = dust_rb_search(&dd->badblocklist, block);
if (bblock == NULL) {
if (!dd->quiet_mode) {
@@ -113,7 +114,7 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block)
}
spin_lock_irqsave(&dd->dust_lock, flags);
- bblock->bb = block * dd->sect_per_block;
+ bblock->bb = block;
if (!dust_rb_insert(&dd->badblocklist, bblock)) {
if (!dd->quiet_mode) {
DMERR("%s: block %llu already in badblocklist",
@@ -138,7 +139,7 @@ static int dust_query_block(struct dust_device *dd, unsigned long long block)
unsigned long flags;
spin_lock_irqsave(&dd->dust_lock, flags);
- bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block);
+ bblock = dust_rb_search(&dd->badblocklist, block);
if (bblock != NULL)
DMINFO("%s: block %llu found in badblocklist", __func__, block);
else
@@ -165,6 +166,7 @@ static int dust_map_read(struct dust_device *dd, sector_t thisblock,
int ret = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) {
+ thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
ret = __dust_map_read(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
@@ -195,6 +197,7 @@ static int dust_map_write(struct dust_device *dd, sector_t thisblock,
unsigned long flags;
if (fail_read_on_bb) {
+ thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
__dust_map_write(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
@@ -331,6 +334,8 @@ static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv)
dd->blksz = blksz;
dd->start = tmp;
+ dd->sect_per_block_shift = __ffs(sect_per_block);
+
/*
* Whether to fail a read on a "bad" block.
* Defaults to false; enabled later by message.
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index b1b0de402dfc..9118ab85cb3a 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1943,7 +1943,22 @@ offload_to_thread:
queue_work(ic->wait_wq, &dio->work);
return;
}
+ if (journal_read_pos != NOT_FOUND)
+ dio->range.n_sectors = ic->sectors_per_block;
wait_and_add_new_range(ic, &dio->range);
+ /*
+ * wait_and_add_new_range drops the spinlock, so the journal
+ * may have been changed arbitrarily. We need to recheck.
+ * To simplify the code, we restrict I/O size to just one block.
+ */
+ if (journal_read_pos != NOT_FOUND) {
+ sector_t next_sector;
+ unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
+ if (unlikely(new_pos != journal_read_pos)) {
+ remove_range_unlocked(ic, &dio->range);
+ goto retry;
+ }
+ }
}
spin_unlock_irq(&ic->endio_wait.lock);
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index df2011de7be2..1bbe4a34ef4c 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -566,8 +566,10 @@ static int run_io_job(struct kcopyd_job *job)
* no point in continuing.
*/
if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
- job->master_job->write_err)
+ job->master_job->write_err) {
+ job->write_err = job->master_job->write_err;
return -EIO;
+ }
io_job_start(job->kc->throttle);
@@ -619,6 +621,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
else
job->read_err = 1;
push(&kc->complete_jobs, job);
+ wake(kc);
break;
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 8a60a4a070ac..1f933dd197cd 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3194,7 +3194,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
*/
r = rs_prepare_reshape(rs);
if (r)
- return r;
+ goto bad;
/* Reshaping ain't recovery, so disable recovery */
rs_setup_recovery(rs, MaxSector);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index c9e44ac1f9a6..3f8577e2c13b 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -408,6 +408,7 @@ static int map_request(struct dm_rq_target_io *tio)
ret = dm_dispatch_clone_request(clone, rq);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
blk_rq_unprep_clone(clone);
+ blk_mq_cleanup_rq(clone);
tio->ti->type->release_clone_rq(clone, &tio->info);
tio->clone = NULL;
return DM_MAPIO_REQUEUE;
@@ -562,7 +563,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
if (err)
goto out_kfree_tag_set;
- q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
+ q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true);
if (IS_ERR(q)) {
err = PTR_ERR(q);
goto out_tag_set;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 7b6c3ee9e755..8820931ec7d2 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1342,7 +1342,7 @@ void dm_table_event(struct dm_table *t)
}
EXPORT_SYMBOL(dm_table_event);
-sector_t dm_table_get_size(struct dm_table *t)
+inline sector_t dm_table_get_size(struct dm_table *t)
{
return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
}
@@ -1367,6 +1367,9 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
unsigned int l, n = 0, k = 0;
sector_t *node;
+ if (unlikely(sector >= dm_table_get_size(t)))
+ return &t->targets[t->num_targets];
+
for (l = 0; l < t->depth; l++) {
n = get_child(n, k);
node = get_node(t, l, n);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 8545dcee9fd0..595a73110e17 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -34,7 +35,7 @@
* (1) Super block (1 block)
* (2) Chunk mapping table (nr_map_blocks)
* (3) Bitmap blocks (nr_bitmap_blocks)
- * All metadata blocks are stored in conventional zones, starting from the
+ * All metadata blocks are stored in conventional zones, starting from
* the first conventional zone found on disk.
*/
struct dmz_super {
@@ -233,7 +234,7 @@ void dmz_unlock_map(struct dmz_metadata *zmd)
* Lock/unlock metadata access. This is a "read" lock on a semaphore
* that prevents metadata flush from running while metadata are being
* modified. The actual metadata write mutual exclusion is achieved with
- * the map lock and zone styate management (active and reclaim state are
+ * the map lock and zone state management (active and reclaim state are
* mutually exclusive).
*/
void dmz_lock_metadata(struct dmz_metadata *zmd)
@@ -402,15 +403,18 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
struct bio *bio;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return ERR_PTR(-EIO);
+
/* Get a new block and a BIO to read it */
mblk = dmz_alloc_mblock(zmd, mblk_no);
if (!mblk)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
dmz_free_mblock(zmd, mblk);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
spin_lock(&zmd->mblk_lock);
@@ -541,8 +545,8 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
if (!mblk) {
/* Cache miss: read the block from disk */
mblk = dmz_get_mblock_slow(zmd, mblk_no);
- if (!mblk)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(mblk))
+ return mblk;
}
/* Wait for on-going read I/O and check for error */
@@ -570,16 +574,19 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
/*
* Issue a metadata block write BIO.
*/
-static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
- unsigned int set)
+static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
+ unsigned int set)
{
sector_t block = zmd->sb[set].block + mblk->no;
struct bio *bio;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return -EIO;
+
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
set_bit(DMZ_META_ERROR, &mblk->state);
- return;
+ return -ENOMEM;
}
set_bit(DMZ_META_WRITING, &mblk->state);
@@ -591,6 +598,8 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
submit_bio(bio);
+
+ return 0;
}
/*
@@ -602,6 +611,9 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
struct bio *bio;
int ret;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return -EIO;
+
bio = bio_alloc(GFP_NOIO, 1);
if (!bio)
return -ENOMEM;
@@ -659,22 +671,29 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
{
struct dmz_mblock *mblk;
struct blk_plug plug;
- int ret = 0;
+ int ret = 0, nr_mblks_submitted = 0;
/* Issue writes */
blk_start_plug(&plug);
- list_for_each_entry(mblk, write_list, link)
- dmz_write_mblock(zmd, mblk, set);
+ list_for_each_entry(mblk, write_list, link) {
+ ret = dmz_write_mblock(zmd, mblk, set);
+ if (ret)
+ break;
+ nr_mblks_submitted++;
+ }
blk_finish_plug(&plug);
/* Wait for completion */
list_for_each_entry(mblk, write_list, link) {
+ if (!nr_mblks_submitted)
+ break;
wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
clear_bit(DMZ_META_ERROR, &mblk->state);
ret = -EIO;
}
+ nr_mblks_submitted--;
}
/* Flush drive cache (this will also sync data) */
@@ -736,6 +755,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
dmz_lock_flush(zmd);
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ ret = -EIO;
+ goto out;
+ }
+
/* Get dirty blocks */
spin_lock(&zmd->mblk_lock);
list_splice_init(&zmd->mblk_dirty_list, &write_list);
@@ -1542,7 +1566,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
struct dm_zone *zone;
if (list_empty(&zmd->map_rnd_list))
- return NULL;
+ return ERR_PTR(-EBUSY);
list_for_each_entry(zone, &zmd->map_rnd_list, link) {
if (dmz_is_buf(zone))
@@ -1553,7 +1577,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
return dzone;
}
- return NULL;
+ return ERR_PTR(-EBUSY);
}
/*
@@ -1564,7 +1588,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
struct dm_zone *zone;
if (list_empty(&zmd->map_seq_list))
- return NULL;
+ return ERR_PTR(-EBUSY);
list_for_each_entry(zone, &zmd->map_seq_list, link) {
if (!zone->bzone)
@@ -1573,7 +1597,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
return zone;
}
- return NULL;
+ return ERR_PTR(-EBUSY);
}
/*
@@ -1628,9 +1652,13 @@ again:
if (op != REQ_OP_WRITE)
goto out;
- /* Alloate a random zone */
+ /* Allocate a random zone */
dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
if (!dzone) {
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ dzone = ERR_PTR(-EIO);
+ goto out;
+ }
dmz_wait_for_free_zones(zmd);
goto again;
}
@@ -1725,9 +1753,13 @@ again:
if (bzone)
goto out;
- /* Alloate a random zone */
+ /* Allocate a random zone */
bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
if (!bzone) {
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ bzone = ERR_PTR(-EIO);
+ goto out;
+ }
dmz_wait_for_free_zones(zmd);
goto again;
}
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index edf4b95eb075..d240d7ca8a8a 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -37,7 +38,7 @@ enum {
/*
* Number of seconds of target BIO inactivity to consider the target idle.
*/
-#define DMZ_IDLE_PERIOD (10UL * HZ)
+#define DMZ_IDLE_PERIOD (10UL * HZ)
/*
* Percentage of unmapped (free) random zones below which reclaim starts
@@ -134,6 +135,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
while (block < end_block) {
+ if (dev->flags & DMZ_BDEV_DYING)
+ return -EIO;
+
/* Get a valid region from the source zone */
ret = dmz_first_valid_block(zmd, src_zone, &block);
if (ret <= 0)
@@ -215,7 +219,7 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -259,7 +263,7 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -312,7 +316,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -334,7 +338,7 @@ static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
/*
* Find a candidate zone for reclaim and process it.
*/
-static void dmz_reclaim(struct dmz_reclaim *zrc)
+static int dmz_do_reclaim(struct dmz_reclaim *zrc)
{
struct dmz_metadata *zmd = zrc->metadata;
struct dm_zone *dzone;
@@ -344,8 +348,8 @@ static void dmz_reclaim(struct dmz_reclaim *zrc)
/* Get a data zone */
dzone = dmz_get_zone_for_reclaim(zmd);
- if (!dzone)
- return;
+ if (IS_ERR(dzone))
+ return PTR_ERR(dzone);
start = jiffies;
@@ -391,13 +395,20 @@ static void dmz_reclaim(struct dmz_reclaim *zrc)
out:
if (ret) {
dmz_unlock_zone_reclaim(dzone);
- return;
+ return ret;
}
- (void) dmz_flush_metadata(zrc->metadata);
+ ret = dmz_flush_metadata(zrc->metadata);
+ if (ret) {
+ dmz_dev_debug(zrc->dev,
+ "Metadata flush for zone %u failed, err %d\n",
+ dmz_id(zmd, rzone), ret);
+ return ret;
+ }
dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
+ return 0;
}
/*
@@ -427,7 +438,7 @@ static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
return false;
/*
- * If the percentage of unmappped random zones is low,
+ * If the percentage of unmapped random zones is low,
* reclaim even if the target is busy.
*/
return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND;
@@ -442,6 +453,10 @@ static void dmz_reclaim_work(struct work_struct *work)
struct dmz_metadata *zmd = zrc->metadata;
unsigned int nr_rnd, nr_unmap_rnd;
unsigned int p_unmap_rnd;
+ int ret;
+
+ if (dmz_bdev_is_dying(zrc->dev))
+ return;
if (!dmz_should_reclaim(zrc)) {
mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
@@ -471,7 +486,17 @@ static void dmz_reclaim_work(struct work_struct *work)
(dmz_target_idle(zrc) ? "Idle" : "Busy"),
p_unmap_rnd, nr_unmap_rnd, nr_rnd);
- dmz_reclaim(zrc);
+ ret = dmz_do_reclaim(zrc);
+ if (ret) {
+ dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
+ if (ret == -EIO)
+ /*
+ * LLD might be performing some error handling sequence
+ * at the underlying device. To not interfere, do not
+ * attempt to schedule the next reclaim run immediately.
+ */
+ return;
+ }
dmz_schedule_reclaim(zrc);
}
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 51d029bbb740..31478fef6032 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -133,6 +134,8 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
refcount_inc(&bioctx->ref);
generic_make_request(clone);
+ if (clone->bi_status == BLK_STS_IOERR)
+ return -EIO;
if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
zone->wp_block += nr_blocks;
@@ -277,8 +280,8 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
/* Get the buffer zone. One will be allocated if needed */
bzone = dmz_get_chunk_buffer(zmd, zone);
- if (!bzone)
- return -ENOSPC;
+ if (IS_ERR(bzone))
+ return PTR_ERR(bzone);
if (dmz_is_readonly(bzone))
return -EROFS;
@@ -389,6 +392,11 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
dmz_lock_metadata(zmd);
+ if (dmz->dev->flags & DMZ_BDEV_DYING) {
+ ret = -EIO;
+ goto out;
+ }
+
/*
* Get the data zone mapping the chunk. There may be no
* mapping for read and discard. If a mapping is obtained,
@@ -493,6 +501,8 @@ static void dmz_flush_work(struct work_struct *work)
/* Flush dirty metadata blocks */
ret = dmz_flush_metadata(dmz->metadata);
+ if (ret)
+ dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
/* Process queued flush requests */
while (1) {
@@ -513,22 +523,24 @@ static void dmz_flush_work(struct work_struct *work)
* Get a chunk work and start it to process a new BIO.
* If the BIO chunk has no work yet, create one.
*/
-static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
+static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
struct dm_chunk_work *cw;
+ int ret = 0;
mutex_lock(&dmz->chunk_lock);
/* Get the BIO chunk work. If one is not active yet, create one */
cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
if (!cw) {
- int ret;
/* Create a new chunk work */
cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
- if (!cw)
+ if (unlikely(!cw)) {
+ ret = -ENOMEM;
goto out;
+ }
INIT_WORK(&cw->work, dmz_chunk_work);
refcount_set(&cw->refcount, 0);
@@ -539,7 +551,6 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
if (unlikely(ret)) {
kfree(cw);
- cw = NULL;
goto out;
}
}
@@ -547,10 +558,38 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
bio_list_add(&cw->bio_list, bio);
dmz_get_chunk_work(cw);
+ dmz_reclaim_bio_acc(dmz->reclaim);
if (queue_work(dmz->chunk_wq, &cw->work))
dmz_get_chunk_work(cw);
out:
mutex_unlock(&dmz->chunk_lock);
+ return ret;
+}
+
+/*
+ * Check the backing device availability. If it's on the way out,
+ * start failing I/O. Reclaim and metadata components also call this
+ * function to cleanly abort operation in the event of such failure.
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
+{
+ struct gendisk *disk;
+
+ if (!(dmz_dev->flags & DMZ_BDEV_DYING)) {
+ disk = dmz_dev->bdev->bd_disk;
+ if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
+ dmz_dev_warn(dmz_dev, "Backing device queue dying");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ } else if (disk->fops->check_events) {
+ if (disk->fops->check_events(disk, 0) &
+ DISK_EVENT_MEDIA_CHANGE) {
+ dmz_dev_warn(dmz_dev, "Backing device offline");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ }
+ }
+ }
+
+ return dmz_dev->flags & DMZ_BDEV_DYING;
}
/*
@@ -564,6 +603,10 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
sector_t sector = bio->bi_iter.bi_sector;
unsigned int nr_sectors = bio_sectors(bio);
sector_t chunk_sector;
+ int ret;
+
+ if (dmz_bdev_is_dying(dmz->dev))
+ return DM_MAPIO_KILL;
dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
bio_op(bio), (unsigned long long)sector, nr_sectors,
@@ -601,8 +644,14 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
/* Now ready to handle this BIO */
- dmz_reclaim_bio_acc(dmz->reclaim);
- dmz_queue_chunk_work(dmz, bio);
+ ret = dmz_queue_chunk_work(dmz, bio);
+ if (ret) {
+ dmz_dev_debug(dmz->dev,
+ "BIO op %d, can't process chunk %llu, err %i\n",
+ bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio),
+ ret);
+ return DM_MAPIO_REQUEUE;
+ }
return DM_MAPIO_SUBMITTED;
}
@@ -855,6 +904,9 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
struct dmz_target *dmz = ti->private;
+ if (dmz_bdev_is_dying(dmz->dev))
+ return -ENODEV;
+
*bdev = dmz->dev->bdev;
return 0;
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index ed8de49c9a08..d8e70b0ade35 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -56,6 +57,8 @@ struct dmz_dev {
unsigned int nr_zones;
+ unsigned int flags;
+
sector_t zone_nr_sectors;
unsigned int zone_nr_sectors_shift;
@@ -67,6 +70,9 @@ struct dmz_dev {
(dev)->zone_nr_sectors_shift)
#define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1))
+/* Device flags. */
+#define DMZ_BDEV_DYING (1 << 0)
+
/*
* Zone descriptor.
*/
@@ -245,4 +251,9 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc);
void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
+/*
+ * Functions defined in dm-zoned-target.c
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
+
#endif /* DM_ZONED_H */
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 7354466ddc90..c766c559d36d 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -258,6 +258,11 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
bio_sector < start_sector))
goto out_of_bounds;
+ if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) {
+ bio_io_error(bio);
+ return true;
+ }
+
if (unlikely(bio_end_sector(bio) > end_sector)) {
/* This bio crosses a device boundary, so we have to split it */
struct bio *split = bio_split(bio, end_sector - bio_sector,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 24638ccedce4..1be7abeb24fd 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -376,6 +376,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
struct mddev *mddev = q->queuedata;
unsigned int sectors;
+ if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
+
blk_queue_split(q, &bio);
if (mddev == NULL || mddev->pers == NULL) {
@@ -1232,6 +1237,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_layout = mddev->layout;
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
+ if (mddev->level == 0)
+ mddev->layout = -1;
if (sb->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector;
@@ -1647,6 +1654,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
}
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
+ sb->level != 0)
+ return -EINVAL;
+
if (!refdev) {
ret = 1;
} else {
@@ -1757,6 +1768,10 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
+ if (mddev->level == 0 &&
+ !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
+ mddev->layout = -1;
+
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
@@ -1826,8 +1841,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
- } else
- set_bit(In_sync, &rdev->flags);
+ } else {
+ /*
+ * If the array is FROZEN, then the device can't
+ * be in_sync with rest of array.
+ */
+ if (!test_bit(MD_RECOVERY_FROZEN,
+ &mddev->recovery))
+ set_bit(In_sync, &rdev->flags);
+ }
rdev->raid_disk = role;
break;
}
@@ -3664,11 +3686,7 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
return -EINVAL;
if (decimals < 0)
decimals = 0;
- while (decimals < scale) {
- result *= 10;
- decimals ++;
- }
- *res = result;
+ *res = result * int_pow(10, scale - decimals);
return 0;
}
@@ -4155,12 +4173,17 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
* active-idle
* like active, but no writes have been seen for a while (100msec).
*
+ * broken
+ * RAID0/LINEAR-only: same as clean, but array is missing a member.
+ * It's useful because RAID0/LINEAR mounted-arrays aren't stopped
+ * when a member is gone, so this state will at least alert the
+ * user that something is wrong.
*/
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
- write_pending, active_idle, bad_word};
+ write_pending, active_idle, broken, bad_word};
static char *array_states[] = {
"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
- "write-pending", "active-idle", NULL };
+ "write-pending", "active-idle", "broken", NULL };
static int match_word(const char *word, char **list)
{
@@ -4176,7 +4199,7 @@ array_state_show(struct mddev *mddev, char *page)
{
enum array_state st = inactive;
- if (mddev->pers)
+ if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
switch(mddev->ro) {
case 1:
st = readonly;
@@ -4196,7 +4219,10 @@ array_state_show(struct mddev *mddev, char *page)
st = active;
spin_unlock(&mddev->lock);
}
- else {
+
+ if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
+ st = broken;
+ } else {
if (list_empty(&mddev->disks) &&
mddev->raid_disks == 0 &&
mddev->dev_sectors == 0)
@@ -4310,6 +4336,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case write_pending:
case active_idle:
+ case broken:
/* these cannot be set */
break;
}
@@ -5182,6 +5209,34 @@ static struct md_sysfs_entry md_consistency_policy =
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
consistency_policy_store);
+static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->fail_last_dev);
+}
+
+/*
+ * Setting fail_last_dev to true to allow last device to be forcibly removed
+ * from RAID1/RAID10.
+ */
+static ssize_t
+fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int ret;
+ bool value;
+
+ ret = kstrtobool(buf, &value);
+ if (ret)
+ return ret;
+
+ if (value != mddev->fail_last_dev)
+ mddev->fail_last_dev = value;
+
+ return len;
+}
+static struct md_sysfs_entry md_fail_last_dev =
+__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
+ fail_last_dev_store);
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
@@ -5198,6 +5253,7 @@ static struct attribute *md_default_attrs[] = {
&md_array_size.attr,
&max_corr_read_errors.attr,
&md_consistency_policy.attr,
+ &md_fail_last_dev.attr,
NULL,
};
@@ -5744,9 +5800,6 @@ int md_run(struct mddev *mddev)
md_update_sb(mddev, 0);
md_new_event(mddev);
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
bitmap_abort:
@@ -5767,6 +5820,7 @@ static int do_md_run(struct mddev *mddev)
{
int err;
+ set_bit(MD_NOT_READY, &mddev->flags);
err = md_run(mddev);
if (err)
goto out;
@@ -5787,9 +5841,14 @@ static int do_md_run(struct mddev *mddev)
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
+ clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
out:
+ clear_bit(MD_NOT_READY, &mddev->flags);
return err;
}
@@ -6849,6 +6908,9 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->external = 0;
mddev->layout = info->layout;
+ if (mddev->level == 0)
+ /* Cannot trust RAID0 layout info here */
+ mddev->layout = -1;
mddev->chunk_sectors = info->chunk_size >> 9;
if (mddev->persistent) {
@@ -8900,6 +8962,7 @@ void md_check_recovery(struct mddev *mddev)
if (mddev_trylock(mddev)) {
int spares = 0;
+ bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;
@@ -8945,7 +9008,7 @@ void md_check_recovery(struct mddev *mddev)
}
}
- if (!mddev->external && !mddev->in_sync) {
+ if (try_set_sync && !mddev->external && !mddev->in_sync) {
spin_lock(&mddev->lock);
set_in_sync(mddev);
spin_unlock(&mddev->lock);
@@ -9043,7 +9106,8 @@ void md_reap_sync_thread(struct mddev *mddev)
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ mddev->degraded != mddev->raid_disks) {
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 10f98200e2f8..c5e3ff398b59 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -248,6 +248,12 @@ enum mddev_flags {
MD_UPDATING_SB, /* md_check_recovery is updating the metadata
* without explicitly holding reconfig_mutex.
*/
+ MD_NOT_READY, /* do_md_run() is active, so 'array_state'
+ * must not report that array is ready yet
+ */
+ MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop
+ * I/O in case an array member is gone/failed.
+ */
};
enum mddev_sb_flags {
@@ -487,6 +493,7 @@ struct mddev {
unsigned int good_device_nr; /* good device num within cluster raid */
bool has_superblocks:1;
+ bool fail_last_dev:1;
};
enum recovery_flags {
@@ -735,6 +742,19 @@ extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
+static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type)
+{
+ int flags = rdev->bdev->bd_disk->flags;
+
+ if (!(flags & GENHD_FL_UP)) {
+ if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags))
+ pr_warn("md: %s: %s array has a missing/failed member\n",
+ mdname(rdev->mddev), md_type);
+ return true;
+ }
+ return false;
+}
+
static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
{
int faulty = test_bit(Faulty, &rdev->flags);
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 58b319757b1e..8aae0624a297 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -628,39 +628,40 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
new_parent = shadow_current(s);
+ pn = dm_block_data(new_parent);
+ size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
+ sizeof(__le64) : s->info->value_type.size;
+
+ /* create & init the left block */
r = new_block(s->info, &left);
if (r < 0)
return r;
+ ln = dm_block_data(left);
+ nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
+
+ ln->header.flags = pn->header.flags;
+ ln->header.nr_entries = cpu_to_le32(nr_left);
+ ln->header.max_entries = pn->header.max_entries;
+ ln->header.value_size = pn->header.value_size;
+ memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
+ memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+
+ /* create & init the right block */
r = new_block(s->info, &right);
if (r < 0) {
unlock_block(s->info, left);
return r;
}
- pn = dm_block_data(new_parent);
- ln = dm_block_data(left);
rn = dm_block_data(right);
-
- nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
- ln->header.flags = pn->header.flags;
- ln->header.nr_entries = cpu_to_le32(nr_left);
- ln->header.max_entries = pn->header.max_entries;
- ln->header.value_size = pn->header.value_size;
-
rn->header.flags = pn->header.flags;
rn->header.nr_entries = cpu_to_le32(nr_right);
rn->header.max_entries = pn->header.max_entries;
rn->header.value_size = pn->header.value_size;
-
- memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
-
- size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
- sizeof(__le64) : s->info->value_type.size;
- memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
nr_right * size);
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index aec449243966..25328582cc48 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -249,7 +249,7 @@ static int out(struct sm_metadata *smm)
}
if (smm->recursion_count == 1)
- apply_bops(smm);
+ r = apply_bops(smm);
smm->recursion_count--;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index bf5cf184a260..f61693e59684 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -19,6 +19,9 @@
#include "raid0.h"
#include "raid5.h"
+static int default_layout = 0;
+module_param(default_layout, int, 0644);
+
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
@@ -139,6 +142,22 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
+
+ if (conf->nr_strip_zones == 1) {
+ conf->layout = RAID0_ORIG_LAYOUT;
+ } else if (mddev->layout == RAID0_ORIG_LAYOUT ||
+ mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = mddev->layout;
+ } else if (default_layout == RAID0_ORIG_LAYOUT ||
+ default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = default_layout;
+ } else {
+ pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
+ mdname(mddev));
+ pr_err("md/raid0: please set raid.default_layout to 1 or 2\n");
+ err = -ENOTSUPP;
+ goto abort;
+ }
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
@@ -547,10 +566,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
{
+ struct r0conf *conf = mddev->private;
struct strip_zone *zone;
struct md_rdev *tmp_dev;
sector_t bio_sector;
sector_t sector;
+ sector_t orig_sector;
unsigned chunk_sects;
unsigned sectors;
@@ -584,8 +605,26 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
bio = split;
}
+ orig_sector = sector;
zone = find_zone(mddev->private, &sector);
- tmp_dev = map_sector(mddev, zone, sector, &sector);
+ switch (conf->layout) {
+ case RAID0_ORIG_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
+ break;
+ case RAID0_ALT_MULTIZONE_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, sector, &sector);
+ break;
+ default:
+ WARN("md/raid0:%s: Invalid layout\n", mdname(mddev));
+ bio_io_error(bio);
+ return true;
+ }
+
+ if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) {
+ bio_io_error(bio);
+ return true;
+ }
+
bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 540e65d92642..3816e5477db1 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -8,11 +8,25 @@ struct strip_zone {
int nb_dev; /* # of devices attached to the zone */
};
+/* Linux 3.14 (20d0189b101) made an unintended change to
+ * the RAID0 layout for multi-zone arrays (where devices aren't all
+ * the same size.
+ * RAID0_ORIG_LAYOUT restores the original layout
+ * RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout
+ * The layouts are identical when there is only one zone (all
+ * devices the same size).
+ */
+
+enum r0layout {
+ RAID0_ORIG_LAYOUT = 1,
+ RAID0_ALT_MULTIZONE_LAYOUT = 2,
+};
struct r0conf {
struct strip_zone *strip_zone;
struct md_rdev **devlist; /* lists of rdevs, pointed to
* by strip_zone->dev */
int nr_strip_zones;
+ enum r0layout layout;
};
#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 34e26834ad28..0466ee2453b4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -447,19 +447,21 @@ static void raid1_end_write_request(struct bio *bio)
/* We never try FailFast to WriteMostly devices */
!test_bit(WriteMostly, &rdev->flags)) {
md_error(r1_bio->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R1BIO_WriteError, &r1_bio->state);
- else {
- /* Finished with this branch */
- r1_bio->bios[mirror] = NULL;
- to_put = bio;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, it is not necessary to
+ * handle write error.
+ * For failfast, this is the only remaining device,
+ * We need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
+ else {
+ /* Finished with this branch */
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
+ }
} else {
/*
* Set R1BIO_Uptodate in our master bio, so that we
@@ -872,8 +874,11 @@ static void flush_pending_writes(struct r1conf *conf)
* backgroup IO calls must call raise_barrier. Once that returns
* there is no normal IO happeing. It must arrange to call
* lower_barrier when the particular background IO completes.
+ *
+ * If resync/recovery is interrupted, returns -EINTR;
+ * Otherwise, returns 0.
*/
-static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr)
+static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
{
int idx = sector_to_idx(sector_nr);
@@ -1612,12 +1617,12 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
/*
* If it is not operational, then we have already marked it as dead
- * else if it is the last working disks, ignore the error, let the
- * next level up know.
+ * else if it is the last working disks with "fail_last_dev == false",
+ * ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags)
+ if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& (conf->raid_disks - mddev->degraded) == 1) {
/*
* Don't fail the drive, act as though we were just a
@@ -1901,6 +1906,22 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
} while (sectors_to_go > 0);
}
+static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
+{
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ struct mddev *mddev = r1_bio->mddev;
+ int s = r1_bio->sectors;
+
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, uptodate);
+ }
+ }
+}
+
static void end_sync_write(struct bio *bio)
{
int uptodate = !bio->bi_status;
@@ -1927,16 +1948,7 @@ static void end_sync_write(struct bio *bio)
)
set_bit(R1BIO_MadeGood, &r1_bio->state);
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- int s = r1_bio->sectors;
- if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
- test_bit(R1BIO_WriteError, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- put_buf(r1_bio);
- md_done_sync(mddev, s, uptodate);
- }
- }
+ put_sync_write_buf(r1_bio, uptodate);
}
static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -2219,17 +2231,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
generic_make_request(wbio);
}
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- /* if we're here, all write(s) have completed, so clean up */
- int s = r1_bio->sectors;
- if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
- test_bit(R1BIO_WriteError, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- put_buf(r1_bio);
- md_done_sync(mddev, s, 1);
- }
- }
+ put_sync_write_buf(r1_bio, 1);
}
/*
@@ -3127,6 +3129,13 @@ static int raid1_run(struct mddev *mddev)
!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
test_bit(Faulty, &conf->mirrors[i].rdev->flags))
mddev->degraded++;
+ /*
+ * RAID1 needs at least one disk in active
+ */
+ if (conf->raid_disks - mddev->degraded < 1) {
+ ret = -EINVAL;
+ goto abort;
+ }
if (conf->raid_disks - mddev->degraded == 1)
mddev->recovery_cp = MaxSector;
@@ -3160,8 +3169,12 @@ static int raid1_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret) {
md_unregister_thread(&mddev->thread);
- raid1_free(mddev, conf);
+ goto abort;
}
+ return 0;
+
+abort:
+ raid1_free(mddev, conf);
return ret;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8a1354a08a1a..299c7b1c9718 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -465,19 +465,21 @@ static void raid10_end_write_request(struct bio *bio)
if (test_bit(FailFast, &rdev->flags) &&
(bio->bi_opf & MD_FAILFAST)) {
md_error(rdev->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R10BIO_WriteError, &r10_bio->state);
- else {
- r10_bio->devs[slot].bio = NULL;
- to_put = bio;
- dec_rdev = 1;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, it is not necessary to
+ * handle write error.
+ * For failfast, this is the only remaining device,
+ * We need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_WriteError, &r10_bio->state);
+ else {
+ r10_bio->devs[slot].bio = NULL;
+ to_put = bio;
+ dec_rdev = 1;
+ }
}
} else {
/*
@@ -1638,12 +1640,12 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
/*
* If it is not operational, then we have already marked it as dead
- * else if it is the last working disks, ignore the error, let the
- * next level up know.
+ * else if it is the last working disks with "fail_last_dev == false",
+ * ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags)
+ if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& !enough(conf, rdev->raid_disk)) {
/*
* Don't fail the drive, just return an IO error.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3de4e13bde98..223e97ab27e6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2526,7 +2526,8 @@ static void raid5_end_read_request(struct bio * bi)
int set_bad = 0;
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
- atomic_inc(&rdev->read_errors);
+ if (!(bi->bi_status == BLK_STS_PROTECTION))
+ atomic_inc(&rdev->read_errors);
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
pr_warn_ratelimited(
"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
@@ -2549,16 +2550,24 @@ static void raid5_end_read_request(struct bio * bi)
(unsigned long long)s,
bdn);
} else if (atomic_read(&rdev->read_errors)
- > conf->max_nr_stripes)
- pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
- mdname(conf->mddev), bdn);
- else
+ > conf->max_nr_stripes) {
+ if (!test_bit(Faulty, &rdev->flags)) {
+ pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
+ mdname(conf->mddev),
+ atomic_read(&rdev->read_errors),
+ conf->max_nr_stripes);
+ pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
+ mdname(conf->mddev), bdn);
+ }
+ } else
retry = 1;
if (set_bad && test_bit(In_sync, &rdev->flags)
&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
retry = 1;
if (retry)
- if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+ if (sh->qd_idx >= 0 && sh->pd_idx == i)
+ set_bit(R5_ReadError, &sh->dev[i].flags);
+ else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
set_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
} else
@@ -4612,7 +4621,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
(1 << STRIPE_FULL_WRITE) |
(1 << STRIPE_BIOFILL_RUN) |
(1 << STRIPE_COMPUTE_RUN) |
- (1 << STRIPE_OPS_REQ_PENDING) |
(1 << STRIPE_DISCARD) |
(1 << STRIPE_BATCH_READY) |
(1 << STRIPE_BATCH_ERR) |
@@ -5491,7 +5499,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
return;
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
+ last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
@@ -5718,7 +5726,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = false;
}
- set_bit(STRIPE_HANDLE, &sh->state);
+ if (!sh->batch_head)
+ set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index cf991f13403e..f90e0704bed9 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -357,7 +357,6 @@ enum {
STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
STRIPE_BIOFILL_RUN,
STRIPE_COMPUTE_RUN,
- STRIPE_OPS_REQ_PENDING,
STRIPE_ON_UNPLUG_LIST,
STRIPE_DISCARD,
STRIPE_ON_RELEASE_LIST,
@@ -493,9 +492,7 @@ struct disk_info {
*/
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
- int sectors = bio_sectors(bio);
-
- if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
+ if (bio_end_sector(bio) < sector + STRIPE_SECTORS)
return bio->bi_next;
else
return NULL;