diff options
Diffstat (limited to 'drivers/md')
40 files changed, 909 insertions, 464 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8c371d5eef8e..097577ae3c47 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -482,8 +482,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) - __bch_bucket_free(PTR_CACHE(c, k, i), - PTR_BUCKET(c, k, i)); + __bch_bucket_free(c->cache, PTR_BUCKET(c, k, i)); } int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, @@ -674,7 +673,7 @@ bool bch_alloc_sectors(struct cache_set *c, SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); atomic_long_add(sectors, - &PTR_CACHE(c, &b->key, i)->sectors_written); + &c->cache->sectors_written); } if (b->sectors_free < c->cache->sb.block_size) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 848dd4db1659..0a4551e165ab 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -804,13 +804,6 @@ static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) return s & (c->cache->sb.bucket_size - 1); } -static inline struct cache *PTR_CACHE(struct cache_set *c, - const struct bkey *k, - unsigned int ptr) -{ - return c->cache; -} - static inline size_t PTR_BUCKET_NR(struct cache_set *c, const struct bkey *k, unsigned int ptr) @@ -822,7 +815,7 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, const struct bkey *k, unsigned int ptr) { - return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); + return c->cache->buckets + PTR_BUCKET_NR(c, k, ptr); } static inline uint8_t gen_after(uint8_t a, uint8_t b) @@ -841,7 +834,7 @@ static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, static inline bool ptr_available(struct cache_set *c, const struct bkey *k, unsigned int i) { - return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && c->cache; } /* Btree key macros */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fe6dce125aba..183a58c89377 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -426,7 +426,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent) do_btree_node_write(b); atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, - &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); + &b->c->cache->btree_sectors_written); b->written += set_blocks(i, block_bytes(b->c->cache)); } @@ -1161,7 +1161,7 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) SET_PTR_GEN(k, i, - bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + bch_inc_gen(b->c->cache, PTR_BUCKET(b->c, &b->key, i))); mutex_unlock(&b->c->bucket_lock); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 63e809f38e3f..116edda845c3 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -50,7 +50,7 @@ void bch_btree_verify(struct btree *b) v->keys.ops = b->keys.ops; bio = bch_bbio_alloc(b->c); - bio_set_dev(bio, PTR_CACHE(b->c, &b->key, 0)->bdev); + bio_set_dev(bio, b->c->cache->bdev); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; bio->bi_opf = REQ_OP_READ | REQ_META; diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index f4658a1f37b8..d626ffcbecb9 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -50,7 +50,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); + struct cache *ca = c->cache; size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); @@ -71,7 +71,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); + struct cache *ca = c->cache; size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c index d636b7b2d070..6d2b7b84a7b7 100644 --- a/drivers/md/bcache/features.c +++ b/drivers/md/bcache/features.c @@ -19,7 +19,7 @@ struct feature { static struct feature feature_list[] = { {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE, "large_bucket"}, - {0, 0, 0 }, + {0, 0, NULL }, }; #define compose_feature_string(type) \ diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index dad71a6b7889..e4388fe3ab7e 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -36,7 +36,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) struct bbio *b = container_of(bio, struct bbio, bio); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); - bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); + bio_set_dev(bio, c->cache->bdev); b->submit_time_us = local_clock_us(); closure_bio_submit(c, bio, bio->bi_private); @@ -137,7 +137,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, blk_status_t error, const char *m) { struct bbio *b = container_of(bio, struct bbio, bio); - struct cache *ca = PTR_CACHE(c, &b->key, 0); + struct cache *ca = c->cache; int is_read = (bio_data_dir(bio) == READ ? 1 : 0); unsigned int threshold = op_is_write(bio_op(bio)) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index c6613e817333..61bd79babf7a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -111,7 +111,7 @@ reread: left = ca->sb.bucket_size - offset; * Check from the oldest jset for last_seq. If * i->j.seq < j->last_seq, it means the oldest jset * in list is expired and useless, remove it from - * this list. Otherwise, j is a condidate jset for + * this list. Otherwise, j is a candidate jset for * further following checks. */ while (!list_empty(list)) { @@ -498,7 +498,7 @@ static void btree_flush_write(struct cache_set *c) * - If there are matched nodes recorded in btree_nodes[], * they are clean now (this is why and how the oldest * journal entry can be reclaimed). These selected nodes - * will be ignored and skipped in the folowing for-loop. + * will be ignored and skipped in the following for-loop. */ if (((btree_current_write(b)->journal - fifo_front_p) & mask) != 0) { @@ -768,7 +768,7 @@ static void journal_write_unlocked(struct closure *cl) w->data->csum = csum_set(w->data); for (i = 0; i < KEY_PTRS(k); i++) { - ca = PTR_CACHE(c, k, i); + ca = c->cache; bio = &ca->journal.bio; atomic_long_add(sectors, &ca->meta_sectors_written); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 03e1fe4de53d..bea8c4429ae8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -16,6 +16,7 @@ #include "features.h" #include <linux/blkdev.h> +#include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/genhd.h> #include <linux/idr.h> @@ -1052,6 +1053,7 @@ static int cached_dev_status_update(void *arg) int bch_cached_dev_run(struct cached_dev *dc) { + int ret = 0; struct bcache_device *d = &dc->disk; char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); char *env[] = { @@ -1064,19 +1066,15 @@ int bch_cached_dev_run(struct cached_dev *dc) if (dc->io_disable) { pr_err("I/O disabled on cached dev %s\n", dc->backing_dev_name); - kfree(env[1]); - kfree(env[2]); - kfree(buf); - return -EIO; + ret = -EIO; + goto out; } if (atomic_xchg(&dc->running, 1)) { - kfree(env[1]); - kfree(env[2]); - kfree(buf); pr_info("cached dev %s is running already\n", dc->backing_dev_name); - return -EBUSY; + ret = -EBUSY; + goto out; } if (!d->c && @@ -1097,15 +1095,13 @@ int bch_cached_dev_run(struct cached_dev *dc) * only class / kset properties are persistent */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); - kfree(env[1]); - kfree(env[2]); - kfree(buf); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } dc->status_update_thread = kthread_run(cached_dev_status_update, @@ -1114,7 +1110,11 @@ int bch_cached_dev_run(struct cached_dev *dc) pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n"); } - return 0; +out: + kfree(env[1]); + kfree(env[2]); + kfree(buf); + return ret; } /* diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index c029f7443190..bca4a7c97da7 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -27,7 +27,7 @@ struct closure; #else /* DEBUG */ -#define EBUG_ON(cond) do { if (cond); } while (0) +#define EBUG_ON(cond) do { if (cond) do {} while (0); } while (0) #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 82d4e0880a99..8120da278161 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -110,13 +110,13 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t fps; if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) { - fp_term = dc->writeback_rate_fp_term_low * + fp_term = (int64_t)dc->writeback_rate_fp_term_low * (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW); } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) { - fp_term = dc->writeback_rate_fp_term_mid * + fp_term = (int64_t)dc->writeback_rate_fp_term_mid * (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID); } else { - fp_term = dc->writeback_rate_fp_term_high * + fp_term = (int64_t)dc->writeback_rate_fp_term_high * (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH); } fps = div_s64(dirty, dirty_buckets) * fp_term; @@ -416,7 +416,7 @@ static void read_dirty_endio(struct bio *bio) struct dirty_io *io = w->private; /* is_read = 1 */ - bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), + bch_count_io_errors(io->dc->disk.c->cache, bio->bi_status, 1, "reading dirty data from cache"); @@ -510,8 +510,7 @@ static void read_dirty(struct cached_dev *dc) dirty_init(w); bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); - bio_set_dev(&io->bio, - PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); + bio_set_dev(&io->bio, dc->disk.c->cache->bdev); io->bio.bi_end_io = read_dirty_endio; if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 541c45027cc8..6ab01ff25747 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3387,7 +3387,7 @@ static bool origin_dev_supports_discard(struct block_device *origin_bdev) { struct request_queue *q = bdev_get_queue(origin_bdev); - return q && blk_queue_discard(q); + return blk_queue_discard(q); } /* diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c index 17712456fa63..c43d55672bce 100644 --- a/drivers/md/dm-clone-metadata.c +++ b/drivers/md/dm-clone-metadata.c @@ -276,12 +276,6 @@ static inline int superblock_read_lock(struct dm_clone_metadata *cmd, return dm_bm_read_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock); } -static inline int superblock_write_lock(struct dm_clone_metadata *cmd, - struct dm_block **sblock) -{ - return dm_bm_write_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock); -} - static inline int superblock_write_lock_zero(struct dm_clone_metadata *cmd, struct dm_block **sblock) { diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 55bcfb74f51f..71475a2410be 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -28,7 +28,7 @@ struct ebs_c { spinlock_t lock; /* Guard bios input list above. */ sector_t start; /* <start> table line argument, see ebs_ctr below. */ unsigned int e_bs; /* Emulated block size in sectors exposed to upper layer. */ - unsigned int u_bs; /* Underlying block size in sectors retrievd from/set on lower layer device. */ + unsigned int u_bs; /* Underlying block size in sectors retrieved from/set on lower layer device. */ unsigned char block_shift; /* bitshift sectors -> blocks used in dm-bufio API. */ bool u_bs_set:1; /* Flag to indicate underlying block size is set on table line. */ }; @@ -43,7 +43,7 @@ static inline sector_t __block_mod(sector_t sector, unsigned int bs) return sector & (bs - 1); } -/* Return number of blocks for a bio, accounting for misalignement of start and end sectors. */ +/* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */ static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio) { sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio); @@ -171,7 +171,7 @@ static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks); } -/* Worker funtion to process incoming bios. */ +/* Worker function to process incoming bios. */ static void __ebs_process_bios(struct work_struct *ws) { int r; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 46b5d542b8fe..781942aeddd1 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -35,7 +35,7 @@ #define MIN_LOG2_INTERLEAVE_SECTORS 3 #define MAX_LOG2_INTERLEAVE_SECTORS 31 #define METADATA_WORKQUEUE_MAX_ACTIVE 16 -#define RECALC_SECTORS 8192 +#define RECALC_SECTORS 32768 #define RECALC_WRITE_SUPER 16 #define BITMAP_BLOCK_SIZE 4096 /* don't change it */ #define BITMAP_FLUSH_INTERVAL (10 * HZ) @@ -262,6 +262,7 @@ struct dm_integrity_c { bool journal_uptodate; bool just_formatted; bool recalculate_flag; + bool reset_recalculate_flag; bool discard; bool fix_padding; bool fix_hmac; @@ -1428,8 +1429,10 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se if (op == TAG_READ) { memcpy(tag, dp, to_copy); } else if (op == TAG_WRITE) { - memcpy(dp, tag, to_copy); - dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); + if (memcmp(dp, tag, to_copy)) { + memcpy(dp, tag, to_copy); + dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); + } } else { /* e.g.: op == TAG_CMP */ @@ -2686,26 +2689,30 @@ next_chunk: if (unlikely(dm_integrity_failed(ic))) goto err; - io_req.bi_op = REQ_OP_READ; - io_req.bi_op_flags = 0; - io_req.mem.type = DM_IO_VMA; - io_req.mem.ptr.addr = ic->recalc_buffer; - io_req.notify.fn = NULL; - io_req.client = ic->io; - io_loc.bdev = ic->dev->bdev; - io_loc.sector = get_data_sector(ic, area, offset); - io_loc.count = n_sectors; + if (!ic->discard) { + io_req.bi_op = REQ_OP_READ; + io_req.bi_op_flags = 0; + io_req.mem.type = DM_IO_VMA; + io_req.mem.ptr.addr = ic->recalc_buffer; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = get_data_sector(ic, area, offset); + io_loc.count = n_sectors; - r = dm_io(&io_req, 1, &io_loc, NULL); - if (unlikely(r)) { - dm_integrity_io_error(ic, "reading data", r); - goto err; - } + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dm_integrity_io_error(ic, "reading data", r); + goto err; + } - t = ic->recalc_tags; - for (i = 0; i < n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); - t += ic->tag_size; + t = ic->recalc_tags; + for (i = 0; i < n_sectors; i += ic->sectors_per_block) { + integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); + t += ic->tag_size; + } + } else { + t = ic->recalc_tags + (n_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; } metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); @@ -3134,7 +3141,8 @@ static void dm_integrity_resume(struct dm_target *ti) rw_journal_sectors(ic, REQ_OP_READ, 0, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); if (ic->mode == 'B') { - if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { + if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit && + !ic->reset_recalculate_flag) { block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal); block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal); if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, @@ -3156,7 +3164,8 @@ static void dm_integrity_resume(struct dm_target *ti) } } else { if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit && - block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) { + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR)) || + ic->reset_recalculate_flag) { ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); ic->sb->recalc_sector = cpu_to_le64(0); } @@ -3169,6 +3178,10 @@ static void dm_integrity_resume(struct dm_target *ti) dm_integrity_io_error(ic, "writing superblock", r); } else { replay_journal(ic); + if (ic->reset_recalculate_flag) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } if (ic->mode == 'B') { ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP); ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; @@ -3242,6 +3255,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); + arg_count += ic->reset_recalculate_flag; arg_count += ic->discard; arg_count += ic->mode == 'J'; arg_count += ic->mode == 'J'; @@ -3261,6 +3275,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) DMEMIT(" recalculate"); + if (ic->reset_recalculate_flag) + DMEMIT(" reset_recalculate"); if (ic->discard) DMEMIT(" allow_discards"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); @@ -3914,7 +3930,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) unsigned extra_args; struct dm_arg_set as; static const struct dm_arg _args[] = { - {0, 17, "Invalid number of feature args"}, + {0, 18, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; bool should_write_sb; @@ -4039,6 +4055,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (val >= (uint64_t)UINT_MAX * 1000 / HZ) { r = -EINVAL; ti->error = "Invalid bitmap_flush_interval argument"; + goto bad; } ic->bitmap_flush_interval = msecs_to_jiffies(val); } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { @@ -4058,6 +4075,9 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } else if (!strcmp(opt_string, "recalculate")) { ic->recalculate_flag = true; + } else if (!strcmp(opt_string, "reset_recalculate")) { + ic->recalculate_flag = true; + ic->reset_recalculate_flag = true; } else if (!strcmp(opt_string, "allow_discards")) { ic->discard = true; } else if (!strcmp(opt_string, "fix_padding")) { @@ -4348,11 +4368,13 @@ try_smaller_buffer: goto bad; } INIT_WORK(&ic->recalc_work, integrity_recalc); - ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT); - if (!ic->recalc_buffer) { - ti->error = "Cannot allocate buffer for recalculating"; - r = -ENOMEM; - goto bad; + if (!ic->discard) { + ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT); + if (!ic->recalc_buffer) { + ti->error = "Cannot allocate buffer for recalculating"; + r = -ENOMEM; + goto bad; + } } ic->recalc_tags = kvmalloc_array(RECALC_SECTORS >> ic->sb->log2_sectors_per_block, ic->tag_size, GFP_KERNEL); @@ -4361,6 +4383,9 @@ try_smaller_buffer: r = -ENOMEM; goto bad; } + if (ic->discard) + memset(ic->recalc_tags, DISCARD_FILLER, + (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size); } else { if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { ti->error = "Recalculate can only be specified with internal_hash"; @@ -4554,7 +4579,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 7, 0}, + .version = {1, 9, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 1ca65b434f1f..2209cbcd84db 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/rbtree.h> #include <linux/dm-ioctl.h> #include <linux/hdreg.h> #include <linux/compat.h> @@ -36,8 +37,10 @@ struct dm_file { * name or uuid. *---------------------------------------------------------------*/ struct hash_cell { - struct list_head name_list; - struct list_head uuid_list; + struct rb_node name_node; + struct rb_node uuid_node; + bool name_set; + bool uuid_set; char *name; char *uuid; @@ -53,10 +56,8 @@ struct vers_iter { }; -#define NUM_BUCKETS 64 -#define MASK_BUCKETS (NUM_BUCKETS - 1) -static struct list_head _name_buckets[NUM_BUCKETS]; -static struct list_head _uuid_buckets[NUM_BUCKETS]; +static struct rb_root name_rb_tree = RB_ROOT; +static struct rb_root uuid_rb_tree = RB_ROOT; static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred); @@ -70,73 +71,110 @@ static DECLARE_RWSEM(_hash_lock); */ static DEFINE_MUTEX(dm_hash_cells_mutex); -static void init_buckets(struct list_head *buckets) -{ - unsigned int i; - - for (i = 0; i < NUM_BUCKETS; i++) - INIT_LIST_HEAD(buckets + i); -} - -static int dm_hash_init(void) -{ - init_buckets(_name_buckets); - init_buckets(_uuid_buckets); - return 0; -} - static void dm_hash_exit(void) { dm_hash_remove_all(false, false, false); } /*----------------------------------------------------------------- - * Hash function: - * We're not really concerned with the str hash function being - * fast since it's only used by the ioctl interface. - *---------------------------------------------------------------*/ -static unsigned int hash_str(const char *str) -{ - const unsigned int hash_mult = 2654435387U; - unsigned int h = 0; - - while (*str) - h = (h + (unsigned int) *str++) * hash_mult; - - return h & MASK_BUCKETS; -} - -/*----------------------------------------------------------------- * Code for looking up a device by name *---------------------------------------------------------------*/ static struct hash_cell *__get_name_cell(const char *str) { - struct hash_cell *hc; - unsigned int h = hash_str(str); + struct rb_node *n = name_rb_tree.rb_node; - list_for_each_entry (hc, _name_buckets + h, name_list) - if (!strcmp(hc->name, str)) { + while (n) { + struct hash_cell *hc = container_of(n, struct hash_cell, name_node); + int c = strcmp(hc->name, str); + if (!c) { dm_get(hc->md); return hc; } + n = c >= 0 ? n->rb_left : n->rb_right; + } return NULL; } static struct hash_cell *__get_uuid_cell(const char *str) { - struct hash_cell *hc; - unsigned int h = hash_str(str); + struct rb_node *n = uuid_rb_tree.rb_node; - list_for_each_entry (hc, _uuid_buckets + h, uuid_list) - if (!strcmp(hc->uuid, str)) { + while (n) { + struct hash_cell *hc = container_of(n, struct hash_cell, uuid_node); + int c = strcmp(hc->uuid, str); + if (!c) { dm_get(hc->md); return hc; } + n = c >= 0 ? n->rb_left : n->rb_right; + } return NULL; } +static void __unlink_name(struct hash_cell *hc) +{ + if (hc->name_set) { + hc->name_set = false; + rb_erase(&hc->name_node, &name_rb_tree); + } +} + +static void __unlink_uuid(struct hash_cell *hc) +{ + if (hc->uuid_set) { + hc->uuid_set = false; + rb_erase(&hc->uuid_node, &uuid_rb_tree); + } +} + +static void __link_name(struct hash_cell *new_hc) +{ + struct rb_node **n, *parent; + + __unlink_name(new_hc); + + new_hc->name_set = true; + + n = &name_rb_tree.rb_node; + parent = NULL; + + while (*n) { + struct hash_cell *hc = container_of(*n, struct hash_cell, name_node); + int c = strcmp(hc->name, new_hc->name); + BUG_ON(!c); + parent = *n; + n = c >= 0 ? &hc->name_node.rb_left : &hc->name_node.rb_right; + } + + rb_link_node(&new_hc->name_node, parent, n); + rb_insert_color(&new_hc->name_node, &name_rb_tree); +} + +static void __link_uuid(struct hash_cell *new_hc) +{ + struct rb_node **n, *parent; + + __unlink_uuid(new_hc); + + new_hc->uuid_set = true; + + n = &uuid_rb_tree.rb_node; + parent = NULL; + + while (*n) { + struct hash_cell *hc = container_of(*n, struct hash_cell, uuid_node); + int c = strcmp(hc->uuid, new_hc->uuid); + BUG_ON(!c); + parent = *n; + n = c > 0 ? &hc->uuid_node.rb_left : &hc->uuid_node.rb_right; + } + + rb_link_node(&new_hc->uuid_node, parent, n); + rb_insert_color(&new_hc->uuid_node, &uuid_rb_tree); +} + static struct hash_cell *__get_dev_cell(uint64_t dev) { struct mapped_device *md; @@ -185,8 +223,7 @@ static struct hash_cell *alloc_cell(const char *name, const char *uuid, } } - INIT_LIST_HEAD(&hc->name_list); - INIT_LIST_HEAD(&hc->uuid_list); + hc->name_set = hc->uuid_set = false; hc->md = md; hc->new_map = NULL; return hc; @@ -226,16 +263,16 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi goto bad; } - list_add(&cell->name_list, _name_buckets + hash_str(name)); + __link_name(cell); if (uuid) { hc = __get_uuid_cell(uuid); if (hc) { - list_del(&cell->name_list); + __unlink_name(cell); dm_put(hc->md); goto bad; } - list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); + __link_uuid(cell); } dm_get(md); mutex_lock(&dm_hash_cells_mutex); @@ -256,9 +293,9 @@ static struct dm_table *__hash_remove(struct hash_cell *hc) struct dm_table *table; int srcu_idx; - /* remove from the dev hash */ - list_del(&hc->uuid_list); - list_del(&hc->name_list); + /* remove from the dev trees */ + __unlink_name(hc); + __unlink_uuid(hc); mutex_lock(&dm_hash_cells_mutex); dm_set_mdptr(hc->md, NULL); mutex_unlock(&dm_hash_cells_mutex); @@ -279,7 +316,8 @@ static struct dm_table *__hash_remove(struct hash_cell *hc) static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred) { - int i, dev_skipped; + int dev_skipped; + struct rb_node *n; struct hash_cell *hc; struct mapped_device *md; struct dm_table *t; @@ -289,40 +327,39 @@ retry: down_write(&_hash_lock); - for (i = 0; i < NUM_BUCKETS; i++) { - list_for_each_entry(hc, _name_buckets + i, name_list) { - md = hc->md; - dm_get(md); + for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) { + hc = container_of(n, struct hash_cell, name_node); + md = hc->md; + dm_get(md); - if (keep_open_devices && - dm_lock_for_deletion(md, mark_deferred, only_deferred)) { - dm_put(md); - dev_skipped++; - continue; - } + if (keep_open_devices && + dm_lock_for_deletion(md, mark_deferred, only_deferred)) { + dm_put(md); + dev_skipped++; + continue; + } - t = __hash_remove(hc); + t = __hash_remove(hc); - up_write(&_hash_lock); + up_write(&_hash_lock); - if (t) { - dm_sync_table(md); - dm_table_destroy(t); - } - dm_put(md); - if (likely(keep_open_devices)) - dm_destroy(md); - else - dm_destroy_immediate(md); - - /* - * Some mapped devices may be using other mapped - * devices, so repeat until we make no further - * progress. If a new mapped device is created - * here it will also get removed. - */ - goto retry; + if (t) { + dm_sync_table(md); + dm_table_destroy(t); } + dm_put(md); + if (likely(keep_open_devices)) + dm_destroy(md); + else + dm_destroy_immediate(md); + + /* + * Some mapped devices may be using other mapped + * devices, so repeat until we make no further + * progress. If a new mapped device is created + * here it will also get removed. + */ + goto retry; } up_write(&_hash_lock); @@ -340,7 +377,7 @@ static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) hc->uuid = new_uuid; mutex_unlock(&dm_hash_cells_mutex); - list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); + __link_uuid(hc); } /* @@ -354,14 +391,14 @@ static char *__change_cell_name(struct hash_cell *hc, char *new_name) /* * Rename and move the name cell. */ - list_del(&hc->name_list); + __unlink_name(hc); old_name = hc->name; mutex_lock(&dm_hash_cells_mutex); hc->name = new_name; mutex_unlock(&dm_hash_cells_mutex); - list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + __link_name(hc); return old_name; } @@ -503,9 +540,33 @@ static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, return ((void *) param) + param->data_start; } +static bool filter_device(struct hash_cell *hc, const char *pfx_name, const char *pfx_uuid) +{ + const char *val; + size_t val_len, pfx_len; + + val = hc->name; + val_len = strlen(val); + pfx_len = strnlen(pfx_name, DM_NAME_LEN); + if (pfx_len > val_len) + return false; + if (memcmp(val, pfx_name, pfx_len)) + return false; + + val = hc->uuid ? hc->uuid : ""; + val_len = strlen(val); + pfx_len = strnlen(pfx_uuid, DM_UUID_LEN); + if (pfx_len > val_len) + return false; + if (memcmp(val, pfx_uuid, pfx_len)) + return false; + + return true; +} + static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_size) { - unsigned int i; + struct rb_node *n; struct hash_cell *hc; size_t len, needed = 0; struct gendisk *disk; @@ -518,11 +579,14 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ * Loop through all the devices working out how much * space we need. */ - for (i = 0; i < NUM_BUCKETS; i++) { - list_for_each_entry (hc, _name_buckets + i, name_list) { - needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1); - needed += align_val(sizeof(uint32_t)); - } + for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) { + hc = container_of(n, struct hash_cell, name_node); + if (!filter_device(hc, param->name, param->uuid)) + continue; + needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1); + needed += align_val(sizeof(uint32_t) * 2); + if (param->flags & DM_UUID_FLAG && hc->uuid) + needed += align_val(strlen(hc->uuid) + 1); } /* @@ -540,21 +604,34 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ /* * Now loop through filling out the names. */ - for (i = 0; i < NUM_BUCKETS; i++) { - list_for_each_entry (hc, _name_buckets + i, name_list) { - if (old_nl) - old_nl->next = (uint32_t) ((void *) nl - - (void *) old_nl); - disk = dm_disk(hc->md); - nl->dev = huge_encode_dev(disk_devt(disk)); - nl->next = 0; - strcpy(nl->name, hc->name); - - old_nl = nl; - event_nr = align_ptr(nl->name + strlen(hc->name) + 1); - *event_nr = dm_get_event_nr(hc->md); - nl = align_ptr(event_nr + 1); + for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) { + void *uuid_ptr; + hc = container_of(n, struct hash_cell, name_node); + if (!filter_device(hc, param->name, param->uuid)) + continue; + if (old_nl) + old_nl->next = (uint32_t) ((void *) nl - + (void *) old_nl); + disk = dm_disk(hc->md); + nl->dev = huge_encode_dev(disk_devt(disk)); + nl->next = 0; + strcpy(nl->name, hc->name); + + old_nl = nl; + event_nr = align_ptr(nl->name + strlen(hc->name) + 1); + event_nr[0] = dm_get_event_nr(hc->md); + event_nr[1] = 0; + uuid_ptr = align_ptr(event_nr + 2); + if (param->flags & DM_UUID_FLAG) { + if (hc->uuid) { + event_nr[1] |= DM_NAME_LIST_FLAG_HAS_UUID; + strcpy(uuid_ptr, hc->uuid); + uuid_ptr = align_ptr(uuid_ptr + strlen(hc->uuid) + 1); + } else { + event_nr[1] |= DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID; + } } + nl = uuid_ptr; } /* * If mismatch happens, security may be compromised due to buffer @@ -1991,14 +2068,9 @@ int __init dm_interface_init(void) { int r; - r = dm_hash_init(); - if (r) - return r; - r = misc_register(&_dm_misc); if (r) { DMERR("misc_register failed for control device"); - dm_hash_exit(); return r; } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index cab12b2251ba..bf4a467fc73a 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1853,6 +1853,7 @@ static int rs_check_takeover(struct raid_set *rs) ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) || __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC))) return 0; + break; default: break; @@ -1868,6 +1869,14 @@ static bool rs_takeover_requested(struct raid_set *rs) return rs->md.new_level != rs->md.level; } +/* True if layout is set to reshape. */ +static bool rs_is_layout_change(struct raid_set *rs, bool use_mddev) +{ + return (use_mddev ? rs->md.delta_disks : rs->delta_disks) || + rs->md.new_layout != rs->md.layout || + rs->md.new_chunk_sectors != rs->md.chunk_sectors; +} + /* True if @rs is requested to reshape by ctr */ static bool rs_reshape_requested(struct raid_set *rs) { @@ -1880,9 +1889,7 @@ static bool rs_reshape_requested(struct raid_set *rs) if (rs_is_raid0(rs)) return false; - change = mddev->new_layout != mddev->layout || - mddev->new_chunk_sectors != mddev->chunk_sectors || - rs->delta_disks; + change = rs_is_layout_change(rs, false); /* Historical case to support raid1 reshape without delta disks */ if (rs_is_raid1(rs)) { @@ -2817,7 +2824,7 @@ static sector_t _get_reshape_sectors(struct raid_set *rs) } /* - * + * Reshape: * - change raid layout * - change chunk size * - add disks @@ -2927,6 +2934,20 @@ static int rs_setup_reshape(struct raid_set *rs) } /* + * If the md resync thread has updated superblock with max reshape position + * at the end of a reshape but not (yet) reset the layout configuration + * changes -> reset the latter. + */ +static void rs_reset_inconclusive_reshape(struct raid_set *rs) +{ + if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) { + rs_set_cur(rs); + rs->md.delta_disks = 0; + rs->md.reshape_backwards = 0; + } +} + +/* * Enable/disable discard support on RAID set depending on * RAID level and discard properties of underlying RAID members. */ @@ -3212,11 +3233,14 @@ size_check: if (r) goto bad; + /* Catch any inconclusive reshape superblock content. */ + rs_reset_inconclusive_reshape(rs); + /* Start raid set read-only and assumed clean to change in raid_resume() */ rs->md.ro = 1; rs->md.in_sync = 1; - /* Keep array frozen */ + /* Keep array frozen until resume. */ set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); /* Has to be held on running the array */ @@ -3230,7 +3254,6 @@ size_check: } r = md_start(&rs->md); - if (r) { ti->error = "Failed to start raid array"; mddev_unlock(&rs->md); @@ -3727,15 +3750,6 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, chunk_size_bytes); blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); - - /* - * RAID0 and RAID10 personalities require bio splitting, - * RAID1/4/5/6 don't and process large discard bios properly. - */ - if (rs_is_raid0(rs) || rs_is_raid10(rs)) { - limits->discard_granularity = chunk_size_bytes; - limits->max_discard_sectors = rs->md.chunk_sectors; - } } static void raid_postsuspend(struct dm_target *ti) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 13b4385f4d5a..9c3bc3711b33 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -569,6 +569,7 @@ out_tag_set: blk_mq_free_tag_set(md->tag_set); out_kfree_tag_set: kfree(md->tag_set); + md->tag_set = NULL; return err; } @@ -578,6 +579,7 @@ void dm_mq_cleanup_mapped_device(struct mapped_device *md) if (md->tag_set) { blk_mq_free_tag_set(md->tag_set); kfree(md->tag_set); + md->tag_set = NULL; } } diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 8e329c3f3a78..9ab4bf651ca9 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -596,7 +596,7 @@ static void persistent_dtr(struct dm_exception_store *store) free_area(ps); /* Allocated in persistent_read_metadata */ - vfree(ps->callbacks); + kvfree(ps->callbacks); kfree(ps); } @@ -621,8 +621,8 @@ static int persistent_read_metadata(struct dm_exception_store *store, */ ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / sizeof(struct disk_exception); - ps->callbacks = dm_vcalloc(ps->exceptions_per_area, - sizeof(*ps->callbacks)); + ps->callbacks = kvcalloc(ps->exceptions_per_area, + sizeof(*ps->callbacks), GFP_KERNEL); if (!ps->callbacks) return -ENOMEM; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 11890db71f3f..a2acb014c13a 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -663,7 +663,8 @@ static int dm_exception_table_init(struct dm_exception_table *et, et->hash_shift = hash_shift; et->hash_mask = size - 1; - et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head)); + et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head), + GFP_KERNEL); if (!et->table) return -ENOMEM; @@ -689,7 +690,7 @@ static void dm_exception_table_exit(struct dm_exception_table *et, kmem_cache_free(mem, ex); } - vfree(et->table); + kvfree(et->table); } static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e5f0f1703c5d..ee47a332b462 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -94,24 +94,6 @@ static int setup_btree_index(unsigned int l, struct dm_table *t) return 0; } -void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) -{ - unsigned long size; - void *addr; - - /* - * Check that we're not going to overflow. - */ - if (nmemb > (ULONG_MAX / elem_size)) - return NULL; - - size = nmemb * elem_size; - addr = vzalloc(size); - - return addr; -} -EXPORT_SYMBOL(dm_vcalloc); - /* * highs, and targets are managed as dynamic arrays during a * table load. @@ -124,15 +106,15 @@ static int alloc_targets(struct dm_table *t, unsigned int num) /* * Allocate both the target array and offset array at once. */ - n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) + - sizeof(sector_t)); + n_highs = kvcalloc(num, sizeof(struct dm_target) + sizeof(sector_t), + GFP_KERNEL); if (!n_highs) return -ENOMEM; n_targets = (struct dm_target *) (n_highs + num); memset(n_highs, -1, sizeof(*n_highs) * num); - vfree(t->highs); + kvfree(t->highs); t->num_allocated = num; t->highs = n_highs; @@ -198,7 +180,7 @@ void dm_table_destroy(struct dm_table *t) /* free the indexes */ if (t->depth >= 2) - vfree(t->index[t->depth - 2]); + kvfree(t->index[t->depth - 2]); /* free the targets */ for (i = 0; i < t->num_targets; i++) { @@ -210,7 +192,7 @@ void dm_table_destroy(struct dm_table *t) dm_put_target_type(tgt->type); } - vfree(t->highs); + kvfree(t->highs); /* free the device list */ free_devices(&t->devices, t->md); @@ -1077,7 +1059,7 @@ static int setup_indexes(struct dm_table *t) total += t->counts[i]; } - indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE); + indexes = kvcalloc(total, NODE_SIZE, GFP_KERNEL); if (!indexes) return -ENOMEM; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fff4c50df74d..985baee3a678 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2816,7 +2816,7 @@ static bool data_dev_supports_discard(struct pool_c *pt) { struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); - return q && blk_queue_discard(q); + return blk_queue_discard(q); } static bool is_factor(sector_t block_size, uint32_t n) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 66f4c6398f67..cea2b3789736 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -65,7 +65,7 @@ static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, u8 *res; position = (index + rsb) * v->fec->roots; - block = div64_u64_rem(position, v->fec->roots << SECTOR_SHIFT, &rem); + block = div64_u64_rem(position, v->fec->io_size, &rem); *offset = (unsigned)rem; res = dm_bufio_read(v->fec->bufio, block, buf); @@ -154,7 +154,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, /* read the next block when we run out of parity bytes */ offset += v->fec->roots; - if (offset >= v->fec->roots << SECTOR_SHIFT) { + if (offset >= v->fec->io_size) { dm_bufio_release(buf); par = fec_read_parity(v, rsb, block_offset, &offset, &buf); @@ -742,8 +742,13 @@ int verity_fec_ctr(struct dm_verity *v) return -E2BIG; } + if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1)) + f->io_size = 1 << v->data_dev_block_bits; + else + f->io_size = v->fec->roots << SECTOR_SHIFT; + f->bufio = dm_bufio_client_create(f->dev->bdev, - f->roots << SECTOR_SHIFT, + f->io_size, 1, 0, NULL, NULL); if (IS_ERR(f->bufio)) { ti->error = "Cannot initialize FEC bufio client"; diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 42fbd3a7fc9f..3c46c8d61883 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -36,6 +36,7 @@ struct dm_verity_fec { struct dm_dev *dev; /* parity data device */ struct dm_bufio_client *data_bufio; /* for data dev access */ struct dm_bufio_client *bufio; /* for parity data access */ + size_t io_size; /* IO size for roots */ sector_t start; /* parity data start in blocks */ sector_t blocks; /* number of blocks covered */ sector_t rounds; /* number of interleaving rounds */ diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 808a98ef624c..d3e76aefc1a6 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -893,6 +893,28 @@ out: return r; } +static inline bool verity_is_verity_mode(const char *arg_name) +{ + return (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING) || + !strcasecmp(arg_name, DM_VERITY_OPT_RESTART) || + !strcasecmp(arg_name, DM_VERITY_OPT_PANIC)); +} + +static int verity_parse_verity_mode(struct dm_verity *v, const char *arg_name) +{ + if (v->mode) + return -EINVAL; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) + v->mode = DM_VERITY_MODE_LOGGING; + else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) + v->mode = DM_VERITY_MODE_RESTART; + else if (!strcasecmp(arg_name, DM_VERITY_OPT_PANIC)) + v->mode = DM_VERITY_MODE_PANIC; + + return 0; +} + static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, struct dm_verity_sig_opts *verify_args) { @@ -916,16 +938,12 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, arg_name = dm_shift_arg(as); argc--; - if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) { - v->mode = DM_VERITY_MODE_LOGGING; - continue; - - } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) { - v->mode = DM_VERITY_MODE_RESTART; - continue; - - } else if (!strcasecmp(arg_name, DM_VERITY_OPT_PANIC)) { - v->mode = DM_VERITY_MODE_PANIC; + if (verity_is_verity_mode(arg_name)) { + r = verity_parse_verity_mode(v, arg_name); + if (r) { + ti->error = "Conflicting error handling parameters"; + return r; + } continue; } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { @@ -1242,7 +1260,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 4f72b6f66c3a..aecc246ade26 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -73,7 +73,7 @@ struct wc_memory_superblock { }; __le64 padding[8]; }; - struct wc_memory_entry entries[0]; + struct wc_memory_entry entries[]; }; struct wc_entry { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3f3be9408afa..ca2aedd8ee7d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -840,7 +840,6 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, *result = &td->dm_dev; return 0; } -EXPORT_SYMBOL_GPL(dm_get_table_device); void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) { @@ -854,7 +853,6 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) } mutex_unlock(&md->table_devices_lock); } -EXPORT_SYMBOL(dm_put_table_device); static void free_table_devices(struct list_head *devices) { @@ -1641,38 +1639,35 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, } else { ci.bio = bio; ci.sector_count = bio_sectors(bio); - while (ci.sector_count && !error) { - error = __split_and_process_non_flush(&ci); - if (ci.sector_count && !error) { - /* - * Remainder must be passed to submit_bio_noacct() - * so that it gets handled *after* bios already submitted - * have been completely processed. - * We take a clone of the original to store in - * ci.io->orig_bio to be used by end_io_acct() and - * for dec_pending to use for completion handling. - */ - struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count, - GFP_NOIO, &md->queue->bio_split); - ci.io->orig_bio = b; - - /* - * Adjust IO stats for each split, otherwise upon queue - * reentry there will be redundant IO accounting. - * NOTE: this is a stop-gap fix, a proper fix involves - * significant refactoring of DM core's bio splitting - * (by eliminating DM's splitting and just using bio_split) - */ - part_stat_lock(); - __dm_part_stat_sub(dm_disk(md)->part0, - sectors[op_stat_group(bio_op(bio))], ci.sector_count); - part_stat_unlock(); - - bio_chain(b, bio); - trace_block_split(b, bio->bi_iter.bi_sector); - ret = submit_bio_noacct(bio); - break; - } + error = __split_and_process_non_flush(&ci); + if (ci.sector_count && !error) { + /* + * Remainder must be passed to submit_bio_noacct() + * so that it gets handled *after* bios already submitted + * have been completely processed. + * We take a clone of the original to store in + * ci.io->orig_bio to be used by end_io_acct() and + * for dec_pending to use for completion handling. + */ + struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count, + GFP_NOIO, &md->queue->bio_split); + ci.io->orig_bio = b; + + /* + * Adjust IO stats for each split, otherwise upon queue + * reentry there will be redundant IO accounting. + * NOTE: this is a stop-gap fix, a proper fix involves + * significant refactoring of DM core's bio splitting + * (by eliminating DM's splitting and just using bio_split) + */ + part_stat_lock(); + __dm_part_stat_sub(dm_disk(md)->part0, + sectors[op_stat_group(bio_op(bio))], ci.sector_count); + part_stat_unlock(); + + bio_chain(b, bio); + trace_block_split(b, bio->bi_iter.bi_sector); + ret = submit_bio_noacct(bio); } } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 200c5d0f08bf..ea3130e11680 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1722,6 +1722,8 @@ void md_bitmap_flush(struct mddev *mddev) md_bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; md_bitmap_daemon_work(mddev); + if (mddev->bitmap_info.external) + md_super_wait(mddev); md_bitmap_update_sb(bitmap); } diff --git a/drivers/md/md.c b/drivers/md/md.c index 21da0c48f6c2..49f897fbb89b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -734,78 +734,94 @@ void mddev_init(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_init); +static struct mddev *mddev_find_locked(dev_t unit) +{ + struct mddev *mddev; + + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) + return mddev; + + return NULL; +} + +/* find an unused unit number */ +static dev_t mddev_alloc_unit(void) +{ + static int next_minor = 512; + int start = next_minor; + bool is_free = 0; + dev_t dev = 0; + + while (!is_free) { + dev = MKDEV(MD_MAJOR, next_minor); + next_minor++; + if (next_minor > MINORMASK) + next_minor = 0; + if (next_minor == start) + return 0; /* Oh dear, all in use. */ + is_free = !mddev_find_locked(dev); + } + + return dev; +} + static struct mddev *mddev_find(dev_t unit) { - struct mddev *mddev, *new = NULL; + struct mddev *mddev; - if (unit && MAJOR(unit) != MD_MAJOR) - unit &= ~((1<<MdpMinorShift)-1); + if (MAJOR(unit) != MD_MAJOR) + unit &= ~((1 << MdpMinorShift) - 1); - retry: spin_lock(&all_mddevs_lock); + mddev = mddev_find_locked(unit); + if (mddev) + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); - if (unit) { - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == unit) { - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - kfree(new); - return mddev; - } + return mddev; +} - if (new) { - list_add(&new->all_mddevs, &all_mddevs); - spin_unlock(&all_mddevs_lock); - new->hold_active = UNTIL_IOCTL; - return new; - } - } else if (new) { - /* find an unused unit number */ - static int next_minor = 512; - int start = next_minor; - int is_free = 0; - int dev = 0; - while (!is_free) { - dev = MKDEV(MD_MAJOR, next_minor); - next_minor++; - if (next_minor > MINORMASK) - next_minor = 0; - if (next_minor == start) { - /* Oh dear, all in use. */ - spin_unlock(&all_mddevs_lock); - kfree(new); - return NULL; - } +static struct mddev *mddev_alloc(dev_t unit) +{ + struct mddev *new; + int error; - is_free = 1; - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == dev) { - is_free = 0; - break; - } - } - new->unit = dev; - new->md_minor = MINOR(dev); - new->hold_active = UNTIL_STOP; - list_add(&new->all_mddevs, &all_mddevs); - spin_unlock(&all_mddevs_lock); - return new; - } - spin_unlock(&all_mddevs_lock); + if (unit && MAJOR(unit) != MD_MAJOR) + unit &= ~((1 << MdpMinorShift) - 1); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) - return NULL; - - new->unit = unit; - if (MAJOR(unit) == MD_MAJOR) - new->md_minor = MINOR(unit); - else - new->md_minor = MINOR(unit) >> MdpMinorShift; - + return ERR_PTR(-ENOMEM); mddev_init(new); - goto retry; + spin_lock(&all_mddevs_lock); + if (unit) { + error = -EEXIST; + if (mddev_find_locked(unit)) + goto out_free_new; + new->unit = unit; + if (MAJOR(unit) == MD_MAJOR) + new->md_minor = MINOR(unit); + else + new->md_minor = MINOR(unit) >> MdpMinorShift; + new->hold_active = UNTIL_IOCTL; + } else { + error = -ENODEV; + new->unit = mddev_alloc_unit(); + if (!new->unit) + goto out_free_new; + new->md_minor = MINOR(new->unit); + new->hold_active = UNTIL_STOP; + } + + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + return new; +out_free_new: + spin_unlock(&all_mddevs_lock); + kfree(new); + return ERR_PTR(error); } static struct attribute_group md_redundancy_group; @@ -5644,29 +5660,29 @@ static int md_alloc(dev_t dev, char *name) * writing to /sys/module/md_mod/parameters/new_array. */ static DEFINE_MUTEX(disks_mutex); - struct mddev *mddev = mddev_find(dev); + struct mddev *mddev; struct gendisk *disk; int partitioned; int shift; int unit; - int error; + int error ; - if (!mddev) - return -ENODEV; - - partitioned = (MAJOR(mddev->unit) != MD_MAJOR); - shift = partitioned ? MdpMinorShift : 0; - unit = MINOR(mddev->unit) >> shift; - - /* wait for any previous instance of this device to be - * completely removed (mddev_delayed_delete). + /* + * Wait for any previous instance of this device to be completely + * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); - error = -EEXIST; - if (mddev->gendisk) - goto abort; + mddev = mddev_alloc(dev); + if (IS_ERR(mddev)) { + mutex_unlock(&disks_mutex); + return PTR_ERR(mddev); + } + + partitioned = (MAJOR(mddev->unit) != MD_MAJOR); + shift = partitioned ? MdpMinorShift : 0; + unit = MINOR(mddev->unit) >> shift; if (name && !dev) { /* Need to ensure that 'name' is not a duplicate. @@ -5678,6 +5694,7 @@ static int md_alloc(dev_t dev, char *name) if (mddev2->gendisk && strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); + error = -EEXIST; goto abort; } spin_unlock(&all_mddevs_lock); @@ -6524,11 +6541,9 @@ static void autorun_devices(int part) md_probe(dev); mddev = mddev_find(dev); - if (!mddev || !mddev->gendisk) { - if (mddev) - mddev_put(mddev); + if (!mddev) break; - } + if (mddev_lock(mddev)) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version @@ -7821,8 +7836,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) /* Wait until bdev->bd_disk is definitely gone */ if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); - /* Then retry the open from the top */ - return -ERESTARTSYS; + return -EBUSY; } BUG_ON(mddev != bdev->bd_disk->private_data); @@ -8153,7 +8167,11 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) loff_t l = *pos; struct mddev *mddev; - if (l >= 0x10000) + if (l == 0x10000) { + ++*pos; + return (void *)2; + } + if (l > 0x10000) return NULL; if (!l--) /* header */ @@ -8575,6 +8593,26 @@ void md_write_end(struct mddev *mddev) EXPORT_SYMBOL(md_write_end); +/* This is used by raid0 and raid10 */ +void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size) +{ + struct bio *discard_bio = NULL; + + if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 0, + &discard_bio) || !discard_bio) + return; + + bio_chain(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); + if (mddev->gendisk) + trace_block_bio_remap(discard_bio, + disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); + submit_bio_noacct(discard_bio); +} +EXPORT_SYMBOL_GPL(md_submit_discard_bio); + /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before @@ -9251,11 +9289,11 @@ void md_check_recovery(struct mddev *mddev) } if (mddev_is_clustered(mddev)) { - struct md_rdev *rdev; + struct md_rdev *rdev, *tmp; /* kick the device if another node issued a * remove disk. */ - rdev_for_each(rdev, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (test_and_clear_bit(ClusterRemove, &rdev->flags) && rdev->raid_disk < 0) md_kick_rdev_from_array(rdev); @@ -9569,7 +9607,7 @@ err_wq: static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - struct md_rdev *rdev2; + struct md_rdev *rdev2, *tmp; int role, ret; char b[BDEVNAME_SIZE]; @@ -9586,7 +9624,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } /* Check for change of roles in the active devices */ - rdev_for_each(rdev2, mddev) { + rdev_for_each_safe(rdev2, tmp, mddev) { if (test_bit(Faulty, &rdev2->flags)) continue; diff --git a/drivers/md/md.h b/drivers/md/md.h index bcbba1b5ec4a..fb7eab58cfd5 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -713,6 +713,8 @@ extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); +void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index fe073d92f01e..b1788853a355 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -34,12 +34,12 @@ struct node_header { __le32 max_entries; __le32 value_size; __le32 padding; -} __packed; +} __attribute__((packed, aligned(8))); struct btree_node { struct node_header header; __le64 keys[]; -} __packed; +} __attribute__((packed, aligned(8))); /* @@ -83,7 +83,7 @@ struct shadow_spine { }; void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info); -int exit_shadow_spine(struct shadow_spine *s); +void exit_shadow_spine(struct shadow_spine *s); int shadow_step(struct shadow_spine *s, dm_block_t b, struct dm_btree_value_type *vt); diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index 8a2bfbfb218b..2061ab865567 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -30,8 +30,6 @@ static void node_prepare_for_write(struct dm_block_validator *v, h->csum = cpu_to_le32(dm_bm_checksum(&h->flags, block_size - sizeof(__le32), BTREE_CSUM_XOR)); - - BUG_ON(node_check(v, b, 4096)); } static int node_check(struct dm_block_validator *v, @@ -183,15 +181,13 @@ void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info) s->count = 0; } -int exit_shadow_spine(struct shadow_spine *s) +void exit_shadow_spine(struct shadow_spine *s) { - int r = 0, i; + int i; for (i = 0; i < s->count; i++) { unlock_block(s->info, s->nodes[i]); } - - return r; } int shadow_step(struct shadow_spine *s, dm_block_t b, diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index d8b4125e338c..a213bf11738f 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -339,6 +339,8 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, */ begin = do_div(index_begin, ll->entries_per_block); end = do_div(end, ll->entries_per_block); + if (end == 0) + end = ll->entries_per_block; for (i = index_begin; i < index_end; i++, begin = 0) { struct dm_block *blk; diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index 8de63ce39bdd..87e17909ef52 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -33,7 +33,7 @@ struct disk_index_entry { __le64 blocknr; __le32 nr_free; __le32 none_free_before; -} __packed; +} __attribute__ ((packed, aligned(8))); #define MAX_METADATA_BITMAPS 255 @@ -43,7 +43,7 @@ struct disk_metadata_index { __le64 blocknr; struct disk_index_entry index[MAX_METADATA_BITMAPS]; -} __packed; +} __attribute__ ((packed, aligned(8))); struct ll_disk; @@ -86,7 +86,7 @@ struct disk_sm_root { __le64 nr_allocated; __le64 bitmap_root; __le64 ref_count_root; -} __packed; +} __attribute__ ((packed, aligned(8))); #define ENTRIES_PER_BYTE 4 @@ -94,7 +94,7 @@ struct disk_bitmap_header { __le32 csum; __le32 not_used; __le64 blocknr; -} __packed; +} __attribute__ ((packed, aligned(8))); enum allocation_event { SM_NONE, diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index bf4c5e2ccb6f..61f56909e00b 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -187,13 +187,8 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) static int sm_disk_commit(struct dm_space_map *sm) { int r; - dm_block_t nr_free; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_disk_get_nr_free(sm, &nr_free); - if (r) - return r; - r = sm_ll_commit(&smd->ll); if (r) return r; @@ -202,10 +197,6 @@ static int sm_disk_commit(struct dm_space_map *sm) smd->begin = 0; smd->nr_allocated_this_transaction = 0; - r = sm_disk_get_nr_free(sm, &nr_free); - if (r) - return r; - return 0; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 67f157f2525d..e5d7411cba9b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -477,7 +477,6 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) for (disk = 0; disk < zone->nb_dev; disk++) { sector_t dev_start, dev_end; - struct bio *discard_bio = NULL; struct md_rdev *rdev; if (disk < start_disk_index) @@ -500,18 +499,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) rdev = conf->devlist[(zone - conf->strip_zone) * conf->strip_zone[0].nb_dev + disk]; - if (__blkdev_issue_discard(rdev->bdev, + md_submit_discard_bio(mddev, rdev, bio, dev_start + zone->dev_start + rdev->data_offset, - dev_end - dev_start, GFP_NOIO, 0, &discard_bio) || - !discard_bio) - continue; - bio_chain(discard_bio, bio); - bio_clone_blkg_association(discard_bio, bio); - if (mddev->gendisk) - trace_block_bio_remap(discard_bio, - disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); - submit_bio_noacct(discard_bio); + dev_end - dev_start); } bio_endio(bio); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d2378765dc15..ced076ba560e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -478,6 +478,8 @@ static void raid1_end_write_request(struct bio *bio) if (!test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_WriteError, &r1_bio->state); else { + /* Fail the request */ + set_bit(R1BIO_Degraded, &r1_bio->state); /* Finished with this branch */ r1_bio->bios[mirror] = NULL; to_put = bio; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a9ae7d113492..13f5e6b2a73d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio) static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; - int size = offsetof(struct r10bio, devs[conf->copies]); + int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]); /* allocate a r10bio with room for raid_disks entries in the * bios array */ @@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) { int i; - for (i = 0; i < conf->copies; i++) { + for (i = 0; i < conf->geo.raid_disks; i++) { struct bio **bio = & r10_bio->devs[i].bio; if (!BIO_SPECIAL(*bio)) bio_put(*bio); @@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, int slot; int repl = 0; - for (slot = 0; slot < conf->copies; slot++) { + for (slot = 0; slot < conf->geo.raid_disks; slot++) { if (r10_bio->devs[slot].bio == bio) break; if (r10_bio->devs[slot].repl_bio == bio) { @@ -336,7 +336,6 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, } } - BUG_ON(slot == conf->copies); update_head_pos(slot, r10_bio); if (slotp) @@ -1274,12 +1273,77 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } +static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) +{ + int i; + struct r10conf *conf = mddev->private; + struct md_rdev *blocked_rdev; + +retry_wait: + blocked_rdev = NULL; + rcu_read_lock(); + for (i = 0; i < conf->copies; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[i].replacement); + if (rdev == rrdev) + rrdev = NULL; + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } + + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; + int is_bad; + + /* + * Discard request doesn't care the write result + * so it doesn't need to wait blocked disk here. + */ + if (!r10_bio->sectors) + continue; + + is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* + * Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + } + } + rcu_read_unlock(); + + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + allow_barrier(conf); + raid10_log(conf->mddev, "%s wait rdev %d blocked", + __func__, blocked_rdev->raid_disk); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_wait; + } +} + static void raid10_write_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; int i; - struct md_rdev *blocked_rdev; sector_t sectors; int max_sectors; @@ -1337,8 +1401,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); -retry_write: - blocked_rdev = NULL; + + wait_blocked_dev(mddev, r10_bio); + rcu_read_lock(); max_sectors = r10_bio->sectors; @@ -1349,16 +1414,6 @@ retry_write: conf->mirrors[d].replacement); if (rdev == rrdev) rrdev = NULL; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } - if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { - atomic_inc(&rrdev->nr_pending); - blocked_rdev = rrdev; - break; - } if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) @@ -1379,15 +1434,6 @@ retry_write: is_bad = is_badblock(rdev, dev_sector, max_sectors, &first_bad, &bad_sectors); - if (is_bad < 0) { - /* Mustn't write here until the bad block - * is acknowledged - */ - atomic_inc(&rdev->nr_pending); - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } if (is_bad && first_bad <= dev_sector) { /* Cannot write here at all */ bad_sectors -= (dev_sector - first_bad); @@ -1423,35 +1469,6 @@ retry_write: } rcu_read_unlock(); - if (unlikely(blocked_rdev)) { - /* Have to wait for this device to get unblocked, then retry */ - int j; - int d; - - for (j = 0; j < i; j++) { - if (r10_bio->devs[j].bio) { - d = r10_bio->devs[j].devnum; - rdev_dec_pending(conf->mirrors[d].rdev, mddev); - } - if (r10_bio->devs[j].repl_bio) { - struct md_rdev *rdev; - d = r10_bio->devs[j].devnum; - rdev = conf->mirrors[d].replacement; - if (!rdev) { - /* Race with remove_disk */ - smp_mb(); - rdev = conf->mirrors[d].rdev; - } - rdev_dec_pending(rdev, mddev); - } - } - allow_barrier(conf); - raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); - goto retry_write; - } - if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; @@ -1492,7 +1509,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; r10_bio->read_slot = -1; - memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * + conf->geo.raid_disks); if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); @@ -1500,6 +1518,304 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) raid10_write_request(mddev, bio, r10_bio); } +static void raid_end_discard_bio(struct r10bio *r10bio) +{ + struct r10conf *conf = r10bio->mddev->private; + struct r10bio *first_r10bio; + + while (atomic_dec_and_test(&r10bio->remaining)) { + + allow_barrier(conf); + + if (!test_bit(R10BIO_Discard, &r10bio->state)) { + first_r10bio = (struct r10bio *)r10bio->master_bio; + free_r10bio(r10bio); + r10bio = first_r10bio; + } else { + md_write_end(r10bio->mddev); + bio_endio(r10bio->master_bio); + free_r10bio(r10bio); + break; + } + } +} + +static void raid10_end_discard_request(struct bio *bio) +{ + struct r10bio *r10_bio = bio->bi_private; + struct r10conf *conf = r10_bio->mddev->private; + struct md_rdev *rdev = NULL; + int dev; + int slot, repl; + + /* + * We don't care the return value of discard bio + */ + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + set_bit(R10BIO_Uptodate, &r10_bio->state); + + dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[dev].replacement; + if (!rdev) { + /* + * raid10_remove_disk uses smp_mb to make sure rdev is set to + * replacement before setting replacement to NULL. It can read + * rdev first without barrier protect even replacment is NULL + */ + smp_rmb(); + rdev = conf->mirrors[dev].rdev; + } + + raid_end_discard_bio(r10_bio); + rdev_dec_pending(rdev, conf->mddev); +} + +/* + * There are some limitations to handle discard bio + * 1st, the discard size is bigger than stripe_size*2. + * 2st, if the discard bio spans reshape progress, we use the old way to + * handle discard bio + */ +static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + struct geom *geo = &conf->geo; + int far_copies = geo->far_copies; + bool first_copy = true; + struct r10bio *r10_bio, *first_r10bio; + struct bio *split; + int disk; + sector_t chunk; + unsigned int stripe_size; + unsigned int stripe_data_disks; + sector_t split_size; + sector_t bio_start, bio_end; + sector_t first_stripe_index, last_stripe_index; + sector_t start_disk_offset; + unsigned int start_disk_index; + sector_t end_disk_offset; + unsigned int end_disk_index; + unsigned int remainder; + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return -EAGAIN; + + wait_barrier(conf); + + /* + * Check reshape again to avoid reshape happens after checking + * MD_RECOVERY_RESHAPE and before wait_barrier + */ + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + goto out; + + if (geo->near_copies) + stripe_data_disks = geo->raid_disks / geo->near_copies + + geo->raid_disks % geo->near_copies; + else + stripe_data_disks = geo->raid_disks; + + stripe_size = stripe_data_disks << geo->chunk_shift; + + bio_start = bio->bi_iter.bi_sector; + bio_end = bio_end_sector(bio); + + /* + * Maybe one discard bio is smaller than strip size or across one + * stripe and discard region is larger than one stripe size. For far + * offset layout, if the discard region is not aligned with stripe + * size, there is hole when we submit discard bio to member disk. + * For simplicity, we only handle discard bio which discard region + * is bigger than stripe_size * 2 + */ + if (bio_sectors(bio) < stripe_size*2) + goto out; + + /* + * Keep bio aligned with strip size. + */ + div_u64_rem(bio_start, stripe_size, &remainder); + if (remainder) { + split_size = stripe_size - remainder; + split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); + bio_chain(split, bio); + allow_barrier(conf); + /* Resend the fist split part */ + submit_bio_noacct(split); + wait_barrier(conf); + } + div_u64_rem(bio_end, stripe_size, &remainder); + if (remainder) { + split_size = bio_sectors(bio) - remainder; + split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); + bio_chain(split, bio); + allow_barrier(conf); + /* Resend the second split part */ + submit_bio_noacct(bio); + bio = split; + wait_barrier(conf); + } + + bio_start = bio->bi_iter.bi_sector; + bio_end = bio_end_sector(bio); + + /* + * Raid10 uses chunk as the unit to store data. It's similar like raid0. + * One stripe contains the chunks from all member disk (one chunk from + * one disk at the same HBA address). For layout detail, see 'man md 4' + */ + chunk = bio_start >> geo->chunk_shift; + chunk *= geo->near_copies; + first_stripe_index = chunk; + start_disk_index = sector_div(first_stripe_index, geo->raid_disks); + if (geo->far_offset) + first_stripe_index *= geo->far_copies; + start_disk_offset = (bio_start & geo->chunk_mask) + + (first_stripe_index << geo->chunk_shift); + + chunk = bio_end >> geo->chunk_shift; + chunk *= geo->near_copies; + last_stripe_index = chunk; + end_disk_index = sector_div(last_stripe_index, geo->raid_disks); + if (geo->far_offset) + last_stripe_index *= geo->far_copies; + end_disk_offset = (bio_end & geo->chunk_mask) + + (last_stripe_index << geo->chunk_shift); + +retry_discard: + r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); + r10_bio->mddev = mddev; + r10_bio->state = 0; + r10_bio->sectors = 0; + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); + wait_blocked_dev(mddev, r10_bio); + + /* + * For far layout it needs more than one r10bio to cover all regions. + * Inspired by raid10_sync_request, we can use the first r10bio->master_bio + * to record the discard bio. Other r10bio->master_bio record the first + * r10bio. The first r10bio only release after all other r10bios finish. + * The discard bio returns only first r10bio finishes + */ + if (first_copy) { + r10_bio->master_bio = bio; + set_bit(R10BIO_Discard, &r10_bio->state); + first_copy = false; + first_r10bio = r10_bio; + } else + r10_bio->master_bio = (struct bio *)first_r10bio; + + rcu_read_lock(); + for (disk = 0; disk < geo->raid_disks; disk++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[disk].replacement); + + r10_bio->devs[disk].bio = NULL; + r10_bio->devs[disk].repl_bio = NULL; + + if (rdev && (test_bit(Faulty, &rdev->flags))) + rdev = NULL; + if (rrdev && (test_bit(Faulty, &rrdev->flags))) + rrdev = NULL; + if (!rdev && !rrdev) + continue; + + if (rdev) { + r10_bio->devs[disk].bio = bio; + atomic_inc(&rdev->nr_pending); + } + if (rrdev) { + r10_bio->devs[disk].repl_bio = bio; + atomic_inc(&rrdev->nr_pending); + } + } + rcu_read_unlock(); + + atomic_set(&r10_bio->remaining, 1); + for (disk = 0; disk < geo->raid_disks; disk++) { + sector_t dev_start, dev_end; + struct bio *mbio, *rbio = NULL; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[disk].replacement); + + /* + * Now start to calculate the start and end address for each disk. + * The space between dev_start and dev_end is the discard region. + * + * For dev_start, it needs to consider three conditions: + * 1st, the disk is before start_disk, you can imagine the disk in + * the next stripe. So the dev_start is the start address of next + * stripe. + * 2st, the disk is after start_disk, it means the disk is at the + * same stripe of first disk + * 3st, the first disk itself, we can use start_disk_offset directly + */ + if (disk < start_disk_index) + dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > start_disk_index) + dev_start = first_stripe_index * mddev->chunk_sectors; + else + dev_start = start_disk_offset; + + if (disk < end_disk_index) + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > end_disk_index) + dev_end = last_stripe_index * mddev->chunk_sectors; + else + dev_end = end_disk_offset; + + /* + * It only handles discard bio which size is >= stripe size, so + * dev_end > dev_start all the time + */ + if (r10_bio->devs[disk].bio) { + mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + mbio->bi_end_io = raid10_end_discard_request; + mbio->bi_private = r10_bio; + r10_bio->devs[disk].bio = mbio; + r10_bio->devs[disk].devnum = disk; + atomic_inc(&r10_bio->remaining); + md_submit_discard_bio(mddev, rdev, mbio, + dev_start + choose_data_offset(r10_bio, rdev), + dev_end - dev_start); + bio_endio(mbio); + } + if (r10_bio->devs[disk].repl_bio) { + rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + rbio->bi_end_io = raid10_end_discard_request; + rbio->bi_private = r10_bio; + r10_bio->devs[disk].repl_bio = rbio; + r10_bio->devs[disk].devnum = disk; + atomic_inc(&r10_bio->remaining); + md_submit_discard_bio(mddev, rrdev, rbio, + dev_start + choose_data_offset(r10_bio, rrdev), + dev_end - dev_start); + bio_endio(rbio); + } + } + + if (!geo->far_offset && --far_copies) { + first_stripe_index += geo->stride >> geo->chunk_shift; + start_disk_offset += geo->stride; + last_stripe_index += geo->stride >> geo->chunk_shift; + end_disk_offset += geo->stride; + atomic_inc(&first_r10bio->remaining); + raid_end_discard_bio(r10_bio); + wait_barrier(conf); + goto retry_discard; + } + + raid_end_discard_bio(r10_bio); + + return 0; +out: + allow_barrier(conf); + return -EAGAIN; +} + static bool raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; @@ -1514,6 +1830,10 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) if (!md_write_start(mddev, bio)) return false; + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) + if (!raid10_handle_discard(mddev, bio)) + return true; + /* * If this request crosses a chunk boundary, we need to split * it. @@ -3753,7 +4073,7 @@ static int raid10_run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, - mddev->chunk_sectors); + UINT_MAX); blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 79cd2b7d3128..1461fd55311b 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -179,5 +179,6 @@ enum r10bio_state { R10BIO_Previous, /* failfast devices did receive failfast requests. */ R10BIO_FailFast, + R10BIO_Discard, }; #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5d57a5bd171f..841e1c1aa5e6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -953,7 +953,8 @@ static void dispatch_bio_list(struct bio_list *tmp) submit_bio_noacct(bio); } -static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) +static int cmp_stripe(void *priv, const struct list_head *a, + const struct list_head *b) { const struct r5pending_data *da = list_entry(a, struct r5pending_data, sibling); |