summaryrefslogtreecommitdiffstats
path: root/drivers/md/dm-raid.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-05-03 19:31:20 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2017-05-03 19:31:20 +0200
commitd35a878ae1c50977b55e352fd46e36e35add72a0 (patch)
tree7cd4e0ec418c6f3be365e56ee3c49bab218cd608 /drivers/md/dm-raid.c
parentMerge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shl... (diff)
parentdm bufio: check new buffer allocation watermark every 30 seconds (diff)
downloadlinux-d35a878ae1c50977b55e352fd46e36e35add72a0.tar.xz
linux-d35a878ae1c50977b55e352fd46e36e35add72a0.zip
Merge tag 'for-4.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - A major update for DM cache that reduces the latency for deciding whether blocks should migrate to/from the cache. The bio-prison-v2 interface supports this improvement by enabling direct dispatch of work to workqueues rather than having to delay the actual work dispatch to the DM cache core. So the dm-cache policies are much more nimble by being able to drive IO as they see fit. One immediate benefit from the improved latency is a cache that should be much more adaptive to changing workloads. - Add a new DM integrity target that emulates a block device that has additional per-sector tags that can be used for storing integrity information. - Add a new authenticated encryption feature to the DM crypt target that builds on the capabilities provided by the DM integrity target. - Add MD interface for switching the raid4/5/6 journal mode and update the DM raid target to use it to enable aid4/5/6 journal write-back support. - Switch the DM verity target over to using the asynchronous hash crypto API (this helps work better with architectures that have access to off-CPU algorithm providers, which should reduce CPU utilization). - Various request-based DM and DM multipath fixes and improvements from Bart and Christoph. - A DM thinp target fix for a bio structure leak that occurs for each discard IFF discard passdown is enabled. - A fix for a possible deadlock in DM bufio and a fix to re-check the new buffer allocation watermark in the face of competing admin changes to the 'max_cache_size_bytes' tunable. - A couple DM core cleanups. * tag 'for-4.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (50 commits) dm bufio: check new buffer allocation watermark every 30 seconds dm bufio: avoid a possible ABBA deadlock dm mpath: make it easier to detect unintended I/O request flushes dm mpath: cleanup QUEUE_IF_NO_PATH bit manipulation by introducing assign_bit() dm mpath: micro-optimize the hot path relative to MPATHF_QUEUE_IF_NO_PATH dm: introduce enum dm_queue_mode to cleanup related code dm mpath: verify __pg_init_all_paths locking assumptions at runtime dm: verify suspend_locking assumptions at runtime dm block manager: remove an unused argument from dm_block_manager_create() dm rq: check blk_mq_register_dev() return value in dm_mq_init_request_queue() dm mpath: delay requeuing while path initialization is in progress dm mpath: avoid that path removal can trigger an infinite loop dm mpath: split and rename activate_path() to prepare for its expanded use dm ioctl: prevent stack leak in dm ioctl call dm integrity: use previously calculated log2 of sectors_per_block dm integrity: use hex2bin instead of open-coded variant dm crypt: replace custom implementation of hex2bin() dm crypt: remove obsolete references to per-CPU state dm verity: switch to using asynchronous hash crypto API dm crypt: use WQ_HIGHPRI for the IO and crypt workqueues ...
Diffstat (limited to 'drivers/md/dm-raid.c')
-rw-r--r--drivers/md/dm-raid.c164
1 files changed, 122 insertions, 42 deletions
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 2dae3e5b851c..7d893228c40f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@@ -79,7 +79,10 @@ struct raid_dev {
#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
/* New for v1.10.0 */
-#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */
+#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */
+
+/* New for v1.11.1 */
+#define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */
/*
* Flags for rs->ctr_flags field.
@@ -100,6 +103,7 @@ struct raid_dev {
#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET)
#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
+#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE)
#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
@@ -175,7 +179,8 @@ struct raid_dev {
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
CTR_FLAG_DATA_OFFSET | \
- CTR_FLAG_JOURNAL_DEV)
+ CTR_FLAG_JOURNAL_DEV | \
+ CTR_FLAG_JOURNAL_MODE)
#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \
CTR_FLAG_REBUILD | \
@@ -186,7 +191,8 @@ struct raid_dev {
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
CTR_FLAG_DATA_OFFSET | \
- CTR_FLAG_JOURNAL_DEV)
+ CTR_FLAG_JOURNAL_DEV | \
+ CTR_FLAG_JOURNAL_MODE)
/* ...valid options definitions per raid level */
/*
@@ -239,6 +245,7 @@ struct raid_set {
struct journal_dev {
struct dm_dev *dev;
struct md_rdev rdev;
+ int mode;
} journal_dev;
struct raid_dev dev[0];
@@ -326,6 +333,7 @@ static struct arg_name_flag {
{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
{ CTR_FLAG_JOURNAL_DEV, "journal_dev" },
+ { CTR_FLAG_JOURNAL_MODE, "journal_mode" },
};
/* Return argument name string for given @flag */
@@ -344,6 +352,39 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
return NULL;
}
+/* Define correlation of raid456 journal cache modes and dm-raid target line parameters */
+static struct {
+ const int mode;
+ const char *param;
+} _raid456_journal_mode[] = {
+ { R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" },
+ { R5C_JOURNAL_MODE_WRITE_BACK , "writeback" }
+};
+
+/* Return MD raid4/5/6 journal mode for dm @journal_mode one */
+static int dm_raid_journal_mode_to_md(const char *mode)
+{
+ int m = ARRAY_SIZE(_raid456_journal_mode);
+
+ while (m--)
+ if (!strcasecmp(mode, _raid456_journal_mode[m].param))
+ return _raid456_journal_mode[m].mode;
+
+ return -EINVAL;
+}
+
+/* Return dm-raid raid4/5/6 journal mode string for @mode */
+static const char *md_journal_mode_to_dm_raid(const int mode)
+{
+ int m = ARRAY_SIZE(_raid456_journal_mode);
+
+ while (m--)
+ if (mode == _raid456_journal_mode[m].mode)
+ return _raid456_journal_mode[m].param;
+
+ return "unknown";
+}
+
/*
* Bool helpers to test for various raid levels of a raid set.
* It's level as reported by the superblock rather than
@@ -1183,7 +1224,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
continue;
}
- /* "journal_dev dev" */
+ /* "journal_dev <dev>" */
if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
int r;
struct md_rdev *jdev;
@@ -1211,10 +1252,32 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
rs->ti->error = "No space for raid4/5/6 journal";
return -ENOSPC;
}
+ rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
set_bit(Journal, &jdev->flags);
continue;
}
+ /* "journal_mode <mode>" ("journal_dev" mandatory!) */
+ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) {
+ int r;
+
+ if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'";
+ return -EINVAL;
+ }
+ if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
+ rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed";
+ return -EINVAL;
+ }
+ r = dm_raid_journal_mode_to_md(arg);
+ if (r < 0) {
+ rs->ti->error = "Invalid 'journal_mode' argument";
+ return r;
+ }
+ rs->journal_dev.mode = r;
+ continue;
+ }
+
/*
* Parameters with number values from here on.
*/
@@ -3076,6 +3139,16 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
rs->callbacks.congested_fn = raid_is_congested;
dm_table_add_target_callbacks(ti->table, &rs->callbacks);
+ /* If raid4/5/6 journal mode explictely requested (only possible with journal dev) -> set it */
+ if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
+ r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
+ if (r) {
+ ti->error = "Failed to set raid4/5/6 journal mode";
+ mddev_unlock(&rs->md);
+ goto bad_journal_mode_set;
+ }
+ }
+
mddev_suspend(&rs->md);
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
@@ -3109,6 +3182,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
mddev_unlock(&rs->md);
return 0;
+bad_journal_mode_set:
bad_stripe_cache:
bad_check_reshape:
md_stop(&rs->md);
@@ -3180,18 +3254,18 @@ static const char *decipher_sync_action(struct mddev *mddev)
* Status characters:
*
* 'D' = Dead/Failed raid set component or raid4/5/6 journal device
- * 'a' = Alive but not in-sync
- * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
+ * 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device
+ * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
* '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
*/
-static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
+static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync)
{
if (!rdev->bdev)
return "-";
else if (test_bit(Faulty, &rdev->flags))
return "D";
else if (test_bit(Journal, &rdev->flags))
- return "A";
+ return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
return "a";
else
@@ -3315,7 +3389,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
for (i = 0; i < rs->raid_disks; i++)
- DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
+ DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync));
/*
* In-sync/Reshape ratio:
@@ -3366,7 +3440,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* v1.10.0+:
*/
DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
- __raid_dev_status(&rs->journal_dev.rdev, 0) : "-");
+ __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-");
break;
case STATUSTYPE_TABLE:
@@ -3381,39 +3455,30 @@ static void raid_status(struct dm_target *ti, status_type_t type,
write_mostly_params +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
- (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0);
+ (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
+ (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
+
/* Emit table line */
+ /* This has to be in the documented order for userspace! */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
- if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
- DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
- raid10_md_layout_to_format(mddev->layout));
- if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
- raid10_md_layout_to_copies(mddev->layout));
- if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
- DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
- if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
- DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
- (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
- if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
- DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
- (unsigned long long) rs->data_offset);
- if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
- DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
- mddev->bitmap_info.daemon_sleep);
- if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
- max(rs->delta_disks, mddev->delta_disks));
- if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
- max_nr_stripes);
+ if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
+ DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (rebuild_disks)
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
rs->dev[i].rdev.raid_disk);
+ if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
+ DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
+ mddev->bitmap_info.daemon_sleep);
+ if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
+ mddev->sync_speed_min);
+ if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
+ mddev->sync_speed_max);
if (write_mostly_params)
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
@@ -3422,15 +3487,30 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
mddev->bitmap_info.max_write_behind);
- if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
- mddev->sync_speed_max);
- if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
- DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
- mddev->sync_speed_min);
+ if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
+ max_nr_stripes);
+ if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
+ DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
+ (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
+ if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
+ raid10_md_layout_to_copies(mddev->layout));
+ if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
+ DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
+ raid10_md_layout_to_format(mddev->layout));
+ if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
+ DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
+ max(rs->delta_disks, mddev->delta_disks));
+ if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
+ DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
+ (unsigned long long) rs->data_offset);
if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
__get_dev_name(rs->journal_dev.dev));
+ if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags))
+ DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE),
+ md_journal_mode_to_dm_raid(rs->journal_dev.mode));
DMEMIT(" %d", rs->raid_disks);
for (i = 0; i < rs->raid_disks; i++)
DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@@ -3791,7 +3871,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 10, 1},
+ .version = {1, 11, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,