summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig4
-rw-r--r--drivers/md/bitmap.c28
-rw-r--r--drivers/md/dm-cache-policy-mq.c2
-rw-r--r--drivers/md/dm-cache-policy-smq.c4
-rw-r--r--drivers/md/dm-cache-target.c37
-rw-r--r--drivers/md/dm-ioctl.c4
-rw-r--r--drivers/md/dm-table.c13
-rw-r--r--drivers/md/dm-thin-metadata.c4
-rw-r--r--drivers/md/dm-thin.c53
-rw-r--r--drivers/md/dm.c12
-rw-r--r--drivers/md/md-cluster.c12
-rw-r--r--drivers/md/md-cluster.h2
-rw-r--r--drivers/md/md.c6
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h6
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c23
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c37
-rw-r--r--drivers/md/persistent-data/dm-btree.c9
-rw-r--r--drivers/md/raid1.c19
-rw-r--r--drivers/md/raid10.c5
-rw-r--r--drivers/md/raid5.c38
-rw-r--r--drivers/md/raid5.h3
21 files changed, 214 insertions, 107 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b59727309072..d5415eedba86 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -259,7 +259,7 @@ config DM_CRYPT
the ciphers you're going to use in the cryptoapi configuration.
For further information on dm-crypt and userspace tools see:
- <http://code.google.com/p/cryptsetup/wiki/DMCrypt>
+ <https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt>
To compile this code as a module, choose M here: the module will
be called dm-crypt.
@@ -478,7 +478,7 @@ config DM_LOG_WRITES
This device-mapper target takes two devices, one device to use
normally, one to log all write operations done to the first device.
This is for use by file system developers wishing to verify that
- their fs is writing a consitent file system at all times by allowing
+ their fs is writing a consistent file system at all times by allowing
them to replay the log in a variety of ways and to check the
contents.
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ed2346ddf4c9..e51de52eeb94 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -494,7 +494,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
bitmap_super_t *sb;
unsigned long chunksize, daemon_sleep, write_behind;
- bitmap->storage.sb_page = alloc_page(GFP_KERNEL);
+ bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (bitmap->storage.sb_page == NULL)
return -ENOMEM;
bitmap->storage.sb_page->index = 0;
@@ -541,6 +541,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
sb->state = cpu_to_le32(bitmap->flags);
bitmap->events_cleared = bitmap->mddev->events;
sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
+ bitmap->mddev->bitmap_info.nodes = 0;
kunmap_atomic(sb);
@@ -558,6 +559,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
unsigned long sectors_reserved = 0;
int err = -EINVAL;
struct page *sb_page;
+ loff_t offset = bitmap->mddev->bitmap_info.offset;
if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
chunksize = 128 * 1024 * 1024;
@@ -584,9 +586,9 @@ re_read:
bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
/* to 4k blocks */
bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
- bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
+ offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
- bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
+ bitmap->cluster_slot, offset);
}
if (bitmap->storage.file) {
@@ -597,7 +599,7 @@ re_read:
bitmap, bytes, sb_page);
} else {
err = read_sb_page(bitmap->mddev,
- bitmap->mddev->bitmap_info.offset,
+ offset,
sb_page,
0, sizeof(bitmap_super_t));
}
@@ -611,8 +613,16 @@ re_read:
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved);
- nodes = le32_to_cpu(sb->nodes);
- strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
+ /* XXX: This is a hack to ensure that we don't use clustering
+ * in case:
+ * - dm-raid is in use and
+ * - the nodes written in bitmap_sb is erroneous.
+ */
+ if (!bitmap->mddev->sync_super) {
+ nodes = le32_to_cpu(sb->nodes);
+ strlcpy(bitmap->mddev->bitmap_info.cluster_name,
+ sb->cluster_name, 64);
+ }
/* verify that the bitmap-specific fields are valid */
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -671,7 +681,7 @@ out:
kunmap_atomic(sb);
/* Assiging chunksize is required for "re_read" */
bitmap->mddev->bitmap_info.chunksize = chunksize;
- if (nodes && (bitmap->cluster_slot < 0)) {
+ if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
err = md_setup_cluster(bitmap->mddev, nodes);
if (err) {
pr_err("%s: Could not setup cluster service (%d)\n",
@@ -1866,10 +1876,6 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
if (IS_ERR(bitmap))
return PTR_ERR(bitmap);
- rv = bitmap_read_sb(bitmap);
- if (rv)
- goto err;
-
rv = bitmap_init_from_disk(bitmap, 0);
if (rv)
goto err;
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 32814371b8d3..aa1b41ca40f7 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1471,5 +1471,3 @@ module_exit(mq_exit);
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("mq cache policy");
-
-MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index b6f22651dd35..200366c62231 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1686,7 +1686,7 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
if (from_cblock(cache_size)) {
mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
- if (!mq->cache_hit_bits && mq->cache_hit_bits) {
+ if (!mq->cache_hit_bits) {
DMERR("couldn't allocate cache hit bitset");
goto bad_cache_hit_bits;
}
@@ -1789,3 +1789,5 @@ module_exit(smq_exit);
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("smq cache policy");
+
+MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index d2b5dfbb30cf..7245071778db 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -1947,6 +1947,7 @@ static int commit_if_needed(struct cache *cache)
static void process_deferred_bios(struct cache *cache)
{
+ bool prealloc_used = false;
unsigned long flags;
struct bio_list bios;
struct bio *bio;
@@ -1966,6 +1967,7 @@ static void process_deferred_bios(struct cache *cache)
* this bio might require one, we pause until there are some
* prepared mappings to process.
*/
+ prealloc_used = true;
if (prealloc_data_structs(cache, &structs)) {
spin_lock_irqsave(&cache->lock, flags);
bio_list_merge(&cache->deferred_bios, &bios);
@@ -1983,11 +1985,13 @@ static void process_deferred_bios(struct cache *cache)
process_bio(cache, &structs, bio);
}
- prealloc_free_structs(cache, &structs);
+ if (prealloc_used)
+ prealloc_free_structs(cache, &structs);
}
static void process_deferred_cells(struct cache *cache)
{
+ bool prealloc_used = false;
unsigned long flags;
struct dm_bio_prison_cell *cell, *tmp;
struct list_head cells;
@@ -2007,6 +2011,7 @@ static void process_deferred_cells(struct cache *cache)
* this bio might require one, we pause until there are some
* prepared mappings to process.
*/
+ prealloc_used = true;
if (prealloc_data_structs(cache, &structs)) {
spin_lock_irqsave(&cache->lock, flags);
list_splice(&cells, &cache->deferred_cells);
@@ -2017,7 +2022,8 @@ static void process_deferred_cells(struct cache *cache)
process_cell(cache, &structs, cell);
}
- prealloc_free_structs(cache, &structs);
+ if (prealloc_used)
+ prealloc_free_structs(cache, &structs);
}
static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
@@ -2062,7 +2068,7 @@ static void process_deferred_writethrough_bios(struct cache *cache)
static void writeback_some_dirty_blocks(struct cache *cache)
{
- int r = 0;
+ bool prealloc_used = false;
dm_oblock_t oblock;
dm_cblock_t cblock;
struct prealloc structs;
@@ -2072,15 +2078,12 @@ static void writeback_some_dirty_blocks(struct cache *cache)
memset(&structs, 0, sizeof(structs));
while (spare_migration_bandwidth(cache)) {
- if (prealloc_data_structs(cache, &structs))
- break;
-
- r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
- if (r)
- break;
+ if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
+ break; /* no work to do */
- r = get_cell(cache, oblock, &structs, &old_ocell);
- if (r) {
+ prealloc_used = true;
+ if (prealloc_data_structs(cache, &structs) ||
+ get_cell(cache, oblock, &structs, &old_ocell)) {
policy_set_dirty(cache->policy, oblock);
break;
}
@@ -2088,7 +2091,8 @@ static void writeback_some_dirty_blocks(struct cache *cache)
writeback(cache, &structs, oblock, cblock, old_ocell);
}
- prealloc_free_structs(cache, &structs);
+ if (prealloc_used)
+ prealloc_free_structs(cache, &structs);
}
/*----------------------------------------------------------------
@@ -3498,7 +3502,7 @@ static void cache_resume(struct dm_target *ti)
* <#demotions> <#promotions> <#dirty>
* <#features> <features>*
* <#core args> <core args>
- * <policy name> <#policy args> <policy args>* <cache metadata mode>
+ * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
*/
static void cache_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@@ -3584,6 +3588,11 @@ static void cache_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("rw ");
+ if (dm_cache_metadata_needs_check(cache->cmd))
+ DMEMIT("needs_check ");
+ else
+ DMEMIT("- ");
+
break;
case STATUSTYPE_TABLE:
@@ -3802,7 +3811,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 7, 0},
+ .version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 720ceeb7fa9b..80a439543259 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1919,9 +1919,7 @@ int __init dm_interface_init(void)
void dm_interface_exit(void)
{
- if (misc_deregister(&_dm_misc) < 0)
- DMERR("misc_deregister failed for control device");
-
+ misc_deregister(&_dm_misc);
dm_hash_exit();
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index afb4ad3dfeb3..e76ed003769e 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1380,14 +1380,6 @@ static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
}
-static int queue_supports_sg_gaps(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
-{
- struct request_queue *q = bdev_get_queue(dev->bdev);
-
- return q && !test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags);
-}
-
static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
@@ -1508,11 +1500,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
- if (dm_table_all_devices_attribute(t, queue_supports_sg_gaps))
- queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, q);
- else
- queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q);
-
dm_table_set_integrity(t);
/*
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 48dfe3c4d6aa..6ba47cfb1443 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1293,8 +1293,8 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
return r;
disk_super = dm_block_data(copy);
- dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
- dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
+ dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
+ dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
dm_sm_dec_block(pmd->metadata_sm, held_root);
return dm_tm_unlock(pmd->tm, copy);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index f352e4990998..271a66249363 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -18,6 +18,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <linux/sort.h>
#include <linux/rbtree.h>
@@ -268,7 +269,7 @@ struct pool {
process_mapping_fn process_prepared_mapping;
process_mapping_fn process_prepared_discard;
- struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE];
+ struct dm_bio_prison_cell **cell_sort_array;
};
static enum pool_mode get_pool_mode(struct pool *pool);
@@ -667,16 +668,21 @@ static void requeue_io(struct thin_c *tc)
requeue_deferred_cells(tc);
}
-static void error_retry_list(struct pool *pool)
+static void error_retry_list_with_code(struct pool *pool, int error)
{
struct thin_c *tc;
rcu_read_lock();
list_for_each_entry_rcu(tc, &pool->active_thins, list)
- error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
+ error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
rcu_read_unlock();
}
+static void error_retry_list(struct pool *pool)
+{
+ return error_retry_list_with_code(pool, -EIO);
+}
+
/*
* This section of code contains the logic for processing a thin device's IO.
* Much of the code depends on pool object resources (lists, workqueues, etc)
@@ -2285,18 +2291,23 @@ static void do_waker(struct work_struct *ws)
queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}
+static void notify_of_pool_mode_change_to_oods(struct pool *pool);
+
/*
* We're holding onto IO to allow userland time to react. After the
* timeout either the pool will have been resized (and thus back in
- * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
*/
static void do_no_space_timeout(struct work_struct *ws)
{
struct pool *pool = container_of(to_delayed_work(ws), struct pool,
no_space_timeout);
- if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
- set_pool_mode(pool, PM_READ_ONLY);
+ if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
+ pool->pf.error_if_no_space = true;
+ notify_of_pool_mode_change_to_oods(pool);
+ error_retry_list_with_code(pool, -ENOSPC);
+ }
}
/*----------------------------------------------------------------*/
@@ -2374,6 +2385,14 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
dm_device_name(pool->pool_md), new_mode);
}
+static void notify_of_pool_mode_change_to_oods(struct pool *pool)
+{
+ if (!pool->pf.error_if_no_space)
+ notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)");
+ else
+ notify_of_pool_mode_change(pool, "out-of-data-space (error IO)");
+}
+
static bool passdown_enabled(struct pool_c *pt)
{
return pt->adjusted_pf.discard_passdown;
@@ -2458,7 +2477,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
* frequently seeing this mode.
*/
if (old_mode != new_mode)
- notify_of_pool_mode_change(pool, "out-of-data-space");
+ notify_of_pool_mode_change_to_oods(pool);
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell_read_only;
@@ -2782,6 +2801,7 @@ static void __pool_destroy(struct pool *pool)
{
__pool_table_remove(pool);
+ vfree(pool->cell_sort_array);
if (dm_pool_metadata_close(pool->pmd) < 0)
DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
@@ -2894,6 +2914,13 @@ static struct pool *pool_create(struct mapped_device *pool_md,
goto bad_mapping_pool;
}
+ pool->cell_sort_array = vmalloc(sizeof(*pool->cell_sort_array) * CELL_SORT_ARRAY_SIZE);
+ if (!pool->cell_sort_array) {
+ *error = "Error allocating cell sort array";
+ err_p = ERR_PTR(-ENOMEM);
+ goto bad_sort_array;
+ }
+
pool->ref_count = 1;
pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
@@ -2902,6 +2929,8 @@ static struct pool *pool_create(struct mapped_device *pool_md,
return pool;
+bad_sort_array:
+ mempool_destroy(pool->mapping_pool);
bad_mapping_pool:
dm_deferred_set_destroy(pool->all_io_ds);
bad_all_io_ds:
@@ -3719,6 +3748,7 @@ static void emit_flags(struct pool_features *pf, char *result,
* Status line is:
* <transaction id> <used metadata sectors>/<total metadata sectors>
* <used data sectors>/<total data sectors> <held metadata root>
+ * <pool mode> <discard config> <no space config> <needs_check>
*/
static void pool_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@@ -3820,6 +3850,11 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("queue_if_no_space ");
+ if (dm_pool_metadata_needs_check(pool->pmd))
+ DMEMIT("needs_check ");
+ else
+ DMEMIT("- ");
+
break;
case STATUSTYPE_TABLE:
@@ -3909,7 +3944,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 15, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -4280,7 +4315,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 15, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8bb1ebb6ca7b..6ffc01bb85f2 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1065,13 +1065,10 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
*/
static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
- int nr_requests_pending;
-
atomic_dec(&md->pending[rw]);
/* nudge anyone waiting on suspend queue */
- nr_requests_pending = md_in_flight(md);
- if (!nr_requests_pending)
+ if (!md_in_flight(md))
wake_up(&md->wait);
/*
@@ -1083,8 +1080,7 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
if (run_queue) {
if (md->queue->mq_ops)
blk_mq_run_hw_queues(md->queue, true);
- else if (!nr_requests_pending ||
- (nr_requests_pending >= md->queue->nr_congestion_on))
+ else
blk_run_queue_async(md->queue);
}
@@ -2220,8 +2216,6 @@ static void dm_init_old_md_queue(struct mapped_device *md)
static void cleanup_mapped_device(struct mapped_device *md)
{
- cleanup_srcu_struct(&md->io_barrier);
-
if (md->wq)
destroy_workqueue(md->wq);
if (md->kworker_task)
@@ -2233,6 +2227,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
if (md->bs)
bioset_free(md->bs);
+ cleanup_srcu_struct(&md->io_barrier);
+
if (md->disk) {
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index fcfc4b9b2672..0072190515e0 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -44,6 +44,7 @@ struct resync_info {
/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
+#define MD_CLUSTER_SUSPEND_READ_BALANCING 2
struct md_cluster_info {
@@ -275,6 +276,9 @@ clear_bit:
static void recover_prep(void *arg)
{
+ struct mddev *mddev = arg;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
static void recover_slot(void *arg, struct dlm_slot *slot)
@@ -307,6 +311,7 @@ static void recover_done(void *arg, struct dlm_slot *slots,
cinfo->slot_number = our_slot;
complete(&cinfo->completion);
+ clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
static const struct dlm_lockspace_ops md_ls_ops = {
@@ -816,12 +821,17 @@ static void resync_finish(struct mddev *mddev)
resync_send(mddev, RESYNCING, 0, 0);
}
-static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
+static int area_resyncing(struct mddev *mddev, int direction,
+ sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
int ret = 0;
struct suspend_info *s;
+ if ((direction == READ) &&
+ test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
+ return 1;
+
spin_lock_irq(&cinfo->suspend_lock);
if (list_empty(&cinfo->suspend_list))
goto out;
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 6817ee00e053..00defe2badbc 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -18,7 +18,7 @@ struct md_cluster_operations {
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
int (*metadata_update_cancel)(struct mddev *mddev);
- int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
+ int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
int (*add_new_disk_finish)(struct mddev *mddev);
int (*new_disk_ack)(struct mddev *mddev, bool ack);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d28bf5cea224..40332625713b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5359,6 +5359,8 @@ static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
mddev_detach(mddev);
+ /* Ensure ->event_work is done */
+ flush_workqueue(md_misc_wq);
spin_lock(&mddev->lock);
mddev->ready = 0;
mddev->pers = NULL;
@@ -5733,7 +5735,7 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
char *ptr;
int err;
- file = kmalloc(sizeof(*file), GFP_NOIO);
+ file = kzalloc(sizeof(*file), GFP_NOIO);
if (!file)
return -ENOMEM;
@@ -7413,7 +7415,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
err = request_module("md-cluster");
if (err) {
pr_err("md-cluster module not found.\n");
- return err;
+ return -ENOENT;
}
spin_lock(&pers_lock);
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index bf2b80d5c470..8731b6ea026b 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -138,4 +138,10 @@ int lower_bound(struct btree_node *n, uint64_t key);
extern struct dm_block_validator btree_node_validator;
+/*
+ * Value type for upper levels of multi-level btrees.
+ */
+extern void init_le64_type(struct dm_transaction_manager *tm,
+ struct dm_btree_value_type *vt);
+
#endif /* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index e04cfd2d60ef..4222f774cf36 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -309,8 +309,8 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
if (s < 0 && nr_center < -s) {
/* not enough in central node */
- shift(left, center, nr_center);
- s = nr_center - target;
+ shift(left, center, -nr_center);
+ s += nr_center;
shift(left, right, s);
nr_right += s;
} else
@@ -323,7 +323,7 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
if (s > 0 && nr_center < s) {
/* not enough in central node */
shift(center, right, nr_center);
- s = target - nr_center;
+ s -= nr_center;
shift(left, right, s);
nr_left -= s;
} else
@@ -544,14 +544,6 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
return r;
}
-static struct dm_btree_value_type le64_type = {
- .context = NULL,
- .size = sizeof(__le64),
- .inc = NULL,
- .dec = NULL,
- .equal = NULL
-};
-
int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, dm_block_t *new_root)
{
@@ -559,12 +551,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
int index = 0, r = 0;
struct shadow_spine spine;
struct btree_node *n;
+ struct dm_btree_value_type le64_vt;
+ init_le64_type(info->tm, &le64_vt);
init_shadow_spine(&spine, info);
for (level = 0; level < info->levels; level++) {
r = remove_raw(&spine, info,
(level == last_level ?
- &info->value_type : &le64_type),
+ &info->value_type : &le64_vt),
root, keys[level], (unsigned *)&index);
if (r < 0)
break;
@@ -654,11 +648,13 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root,
int index = 0, r = 0;
struct shadow_spine spine;
struct btree_node *n;
+ struct dm_btree_value_type le64_vt;
uint64_t k;
+ init_le64_type(info->tm, &le64_vt);
init_shadow_spine(&spine, info);
for (level = 0; level < last_level; level++) {
- r = remove_raw(&spine, info, &le64_type,
+ r = remove_raw(&spine, info, &le64_vt,
root, keys[level], (unsigned *) &index);
if (r < 0)
goto out;
@@ -689,6 +685,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root,
value_ptr(n, index));
delete_at(n, index);
+ keys[last_level] = k + 1ull;
} else
r = -ENODATA;
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index 1b5e13ec7f96..0dee514ba4c5 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -249,3 +249,40 @@ int shadow_root(struct shadow_spine *s)
{
return s->root;
}
+
+static void le64_inc(void *context, const void *value_le)
+{
+ struct dm_transaction_manager *tm = context;
+ __le64 v_le;
+
+ memcpy(&v_le, value_le, sizeof(v_le));
+ dm_tm_inc(tm, le64_to_cpu(v_le));
+}
+
+static void le64_dec(void *context, const void *value_le)
+{
+ struct dm_transaction_manager *tm = context;
+ __le64 v_le;
+
+ memcpy(&v_le, value_le, sizeof(v_le));
+ dm_tm_dec(tm, le64_to_cpu(v_le));
+}
+
+static int le64_equal(void *context, const void *value1_le, const void *value2_le)
+{
+ __le64 v1_le, v2_le;
+
+ memcpy(&v1_le, value1_le, sizeof(v1_le));
+ memcpy(&v2_le, value2_le, sizeof(v2_le));
+ return v1_le == v2_le;
+}
+
+void init_le64_type(struct dm_transaction_manager *tm,
+ struct dm_btree_value_type *vt)
+{
+ vt->context = tm;
+ vt->size = sizeof(__le64);
+ vt->inc = le64_inc;
+ vt->dec = le64_dec;
+ vt->equal = le64_equal;
+}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 200ac12a1d40..c7726cebc495 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -255,7 +255,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
int r;
struct del_stack *s;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kmalloc(sizeof(*s), GFP_NOIO);
if (!s)
return -ENOMEM;
s->info = info;
@@ -667,12 +667,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
struct btree_node *n;
struct dm_btree_value_type le64_type;
- le64_type.context = NULL;
- le64_type.size = sizeof(__le64);
- le64_type.inc = NULL;
- le64_type.dec = NULL;
- le64_type.equal = NULL;
-
+ init_le64_type(info->tm, &le64_type);
init_shadow_spine(&spine, info);
for (level = 0; level < (info->levels - 1); level++) {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0ff06fdb83a9..f39d69f884de 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -337,7 +337,7 @@ static void raid1_end_read_request(struct bio *bio)
spin_lock_irqsave(&conf->device_lock, flags);
if (r1_bio->mddev->degraded == conf->raid_disks ||
(r1_bio->mddev->degraded == conf->raid_disks-1 &&
- !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
+ test_bit(In_sync, &conf->mirrors[mirror].rdev->flags)))
uptodate = 1;
spin_unlock_irqrestore(&conf->device_lock, flags);
}
@@ -541,7 +541,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
if ((conf->mddev->recovery_cp < this_sector + sectors) ||
(mddev_is_clustered(conf->mddev) &&
- md_cluster_ops->area_resyncing(conf->mddev, this_sector,
+ md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
this_sector + sectors)))
choose_first = 1;
else
@@ -1078,7 +1078,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
((bio_end_sector(bio) > mddev->suspend_lo &&
bio->bi_iter.bi_sector < mddev->suspend_hi) ||
(mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
+ md_cluster_ops->area_resyncing(mddev, WRITE,
+ bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
@@ -1091,7 +1092,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
if (bio_end_sector(bio) <= mddev->suspend_lo ||
bio->bi_iter.bi_sector >= mddev->suspend_hi ||
(mddev_is_clustered(mddev) &&
- !md_cluster_ops->area_resyncing(mddev,
+ !md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))))
break;
schedule();
@@ -1441,6 +1442,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
{
char b[BDEVNAME_SIZE];
struct r1conf *conf = mddev->private;
+ unsigned long flags;
/*
* If it is not operational, then we have already marked it as dead
@@ -1460,14 +1462,13 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
return;
}
set_bit(Blocked, &rdev->flags);
+ spin_lock_irqsave(&conf->device_lock, flags);
if (test_and_clear_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
set_bit(Faulty, &rdev->flags);
- spin_unlock_irqrestore(&conf->device_lock, flags);
} else
set_bit(Faulty, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
*/
@@ -1533,7 +1534,10 @@ static int raid1_spare_active(struct mddev *mddev)
* Find all failed disks within the RAID1 configuration
* and mark them readable.
* Called under mddev lock, so rcu protection not needed.
+ * device_lock used to avoid races with raid1_end_read_request
+ * which expects 'In_sync' flags and ->degraded to be consistent.
*/
+ spin_lock_irqsave(&conf->device_lock, flags);
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->mirrors[i].rdev;
struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
@@ -1564,7 +1568,6 @@ static int raid1_spare_active(struct mddev *mddev)
sysfs_notify_dirent_safe(rdev->sysfs_state);
}
}
- spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded -= count;
spin_unlock_irqrestore(&conf->device_lock, flags);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d92098f3e65b..b0fce2ebf7ad 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3440,6 +3440,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
/* far_copies must be 1 */
conf->prev.stride = conf->dev_sectors;
}
+ conf->reshape_safe = conf->reshape_progress;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
@@ -3642,7 +3643,6 @@ static int run(struct mddev *mddev)
}
conf->offset_diff = min_offset_diff;
- conf->reshape_safe = conf->reshape_progress;
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -3985,6 +3985,7 @@ static int raid10_start_reshape(struct mddev *mddev)
conf->reshape_progress = size;
} else
conf->reshape_progress = 0;
+ conf->reshape_safe = conf->reshape_progress;
spin_unlock_irq(&conf->device_lock);
if (mddev->delta_disks && mddev->bitmap) {
@@ -4052,6 +4053,7 @@ abort:
rdev->new_data_offset = rdev->data_offset;
smp_wmb();
conf->reshape_progress = MaxSector;
+ conf->reshape_safe = MaxSector;
mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
return ret;
@@ -4406,6 +4408,7 @@ static void end_reshape(struct r10conf *conf)
md_finish_reshape(conf->mddev);
smp_wmb();
conf->reshape_progress = MaxSector;
+ conf->reshape_safe = MaxSector;
spin_unlock_irq(&conf->device_lock);
/* read-ahead size must cover two whole stripes, which is
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6d20692952d2..b29e89cb815b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2162,6 +2162,9 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (!sc)
return -ENOMEM;
+ /* Need to ensure auto-resizing doesn't interfere */
+ mutex_lock(&conf->cache_size_mutex);
+
for (i = conf->max_nr_stripes; i; i--) {
nsh = alloc_stripe(sc, GFP_KERNEL);
if (!nsh)
@@ -2178,6 +2181,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
kmem_cache_free(sc, nsh);
}
kmem_cache_destroy(sc);
+ mutex_unlock(&conf->cache_size_mutex);
return -ENOMEM;
}
/* Step 2 - Must use GFP_NOIO now.
@@ -2224,6 +2228,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
} else
err = -ENOMEM;
+ mutex_unlock(&conf->cache_size_mutex);
/* Step 4, return new stripes to service */
while(!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2251,7 +2256,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
static int drop_one_stripe(struct r5conf *conf)
{
struct stripe_head *sh;
- int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
+ int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
spin_lock_irq(conf->hash_locks + hash);
sh = get_free_stripe(conf, hash);
@@ -4062,8 +4067,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
&first_bad, &bad_sectors))
set_bit(R5_ReadRepl, &dev->flags);
else {
- if (rdev)
+ if (rdev && !test_bit(Faulty, &rdev->flags))
set_bit(R5_NeedReplace, &dev->flags);
+ else
+ clear_bit(R5_NeedReplace, &dev->flags);
rdev = rcu_dereference(conf->disks[i].rdev);
clear_bit(R5_ReadRepl, &dev->flags);
}
@@ -5835,12 +5842,14 @@ static void raid5d(struct md_thread *thread)
pr_debug("%d stripes handled\n", handled);
spin_unlock_irq(&conf->device_lock);
- if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+ if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
+ mutex_trylock(&conf->cache_size_mutex)) {
grow_one_stripe(conf, __GFP_NOWARN);
/* Set flag even if allocation failed. This helps
* slow down allocation requests when mem is short
*/
set_bit(R5_DID_ALLOC, &conf->cache_state);
+ mutex_unlock(&conf->cache_size_mutex);
}
async_tx_issue_pending_all();
@@ -5872,18 +5881,22 @@ raid5_set_cache_size(struct mddev *mddev, int size)
return -EINVAL;
conf->min_nr_stripes = size;
+ mutex_lock(&conf->cache_size_mutex);
while (size < conf->max_nr_stripes &&
drop_one_stripe(conf))
;
+ mutex_unlock(&conf->cache_size_mutex);
err = md_allow_write(mddev);
if (err)
return err;
+ mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
if (!grow_one_stripe(conf, GFP_KERNEL))
break;
+ mutex_unlock(&conf->cache_size_mutex);
return 0;
}
@@ -6349,11 +6362,19 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
- int ret = 0;
- while (ret < sc->nr_to_scan) {
- if (drop_one_stripe(conf) == 0)
- return SHRINK_STOP;
- ret++;
+ unsigned long ret = SHRINK_STOP;
+
+ if (mutex_trylock(&conf->cache_size_mutex)) {
+ ret= 0;
+ while (ret < sc->nr_to_scan &&
+ conf->max_nr_stripes > conf->min_nr_stripes) {
+ if (drop_one_stripe(conf) == 0) {
+ ret = SHRINK_STOP;
+ break;
+ }
+ ret++;
+ }
+ mutex_unlock(&conf->cache_size_mutex);
}
return ret;
}
@@ -6422,6 +6443,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
+ mutex_init(&conf->cache_size_mutex);
init_waitqueue_head(&conf->wait_for_quiescent);
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
init_waitqueue_head(&conf->wait_for_stripe[i]);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 02c3bf8fbfe7..d05144278690 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -482,7 +482,8 @@ struct r5conf {
*/
int active_name;
char cache_name[2][32];
- struct kmem_cache *slab_cache; /* for allocating stripes */
+ struct kmem_cache *slab_cache; /* for allocating stripes */
+ struct mutex cache_size_mutex; /* Protect changes to cache size */
int seq_flush, seq_write;
int quiesce;