diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-03 19:31:20 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-03 19:31:20 +0200 |
commit | d35a878ae1c50977b55e352fd46e36e35add72a0 (patch) | |
tree | 7cd4e0ec418c6f3be365e56ee3c49bab218cd608 /drivers/md/dm-raid.c | |
parent | Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shl... (diff) | |
parent | dm bufio: check new buffer allocation watermark every 30 seconds (diff) | |
download | linux-d35a878ae1c50977b55e352fd46e36e35add72a0.tar.xz linux-d35a878ae1c50977b55e352fd46e36e35add72a0.zip |
Merge tag 'for-4.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- A major update for DM cache that reduces the latency for deciding
whether blocks should migrate to/from the cache. The bio-prison-v2
interface supports this improvement by enabling direct dispatch of
work to workqueues rather than having to delay the actual work
dispatch to the DM cache core. So the dm-cache policies are much more
nimble by being able to drive IO as they see fit. One immediate
benefit from the improved latency is a cache that should be much more
adaptive to changing workloads.
- Add a new DM integrity target that emulates a block device that has
additional per-sector tags that can be used for storing integrity
information.
- Add a new authenticated encryption feature to the DM crypt target
that builds on the capabilities provided by the DM integrity target.
- Add MD interface for switching the raid4/5/6 journal mode and update
the DM raid target to use it to enable aid4/5/6 journal write-back
support.
- Switch the DM verity target over to using the asynchronous hash
crypto API (this helps work better with architectures that have
access to off-CPU algorithm providers, which should reduce CPU
utilization).
- Various request-based DM and DM multipath fixes and improvements from
Bart and Christoph.
- A DM thinp target fix for a bio structure leak that occurs for each
discard IFF discard passdown is enabled.
- A fix for a possible deadlock in DM bufio and a fix to re-check the
new buffer allocation watermark in the face of competing admin
changes to the 'max_cache_size_bytes' tunable.
- A couple DM core cleanups.
* tag 'for-4.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (50 commits)
dm bufio: check new buffer allocation watermark every 30 seconds
dm bufio: avoid a possible ABBA deadlock
dm mpath: make it easier to detect unintended I/O request flushes
dm mpath: cleanup QUEUE_IF_NO_PATH bit manipulation by introducing assign_bit()
dm mpath: micro-optimize the hot path relative to MPATHF_QUEUE_IF_NO_PATH
dm: introduce enum dm_queue_mode to cleanup related code
dm mpath: verify __pg_init_all_paths locking assumptions at runtime
dm: verify suspend_locking assumptions at runtime
dm block manager: remove an unused argument from dm_block_manager_create()
dm rq: check blk_mq_register_dev() return value in dm_mq_init_request_queue()
dm mpath: delay requeuing while path initialization is in progress
dm mpath: avoid that path removal can trigger an infinite loop
dm mpath: split and rename activate_path() to prepare for its expanded use
dm ioctl: prevent stack leak in dm ioctl call
dm integrity: use previously calculated log2 of sectors_per_block
dm integrity: use hex2bin instead of open-coded variant
dm crypt: replace custom implementation of hex2bin()
dm crypt: remove obsolete references to per-CPU state
dm verity: switch to using asynchronous hash crypto API
dm crypt: use WQ_HIGHPRI for the IO and crypt workqueues
...
Diffstat (limited to 'drivers/md/dm-raid.c')
-rw-r--r-- | drivers/md/dm-raid.c | 164 |
1 files changed, 122 insertions, 42 deletions
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2dae3e5b851c..7d893228c40f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2010-2011 Neil Brown - * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved. + * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -79,7 +79,10 @@ struct raid_dev { #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ /* New for v1.10.0 */ -#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */ +#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */ + +/* New for v1.11.1 */ +#define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */ /* * Flags for rs->ctr_flags field. @@ -100,6 +103,7 @@ struct raid_dev { #define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) #define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) #define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) +#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE) #define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) @@ -175,7 +179,8 @@ struct raid_dev { CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ CTR_FLAG_DATA_OFFSET | \ - CTR_FLAG_JOURNAL_DEV) + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) #define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ CTR_FLAG_REBUILD | \ @@ -186,7 +191,8 @@ struct raid_dev { CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ CTR_FLAG_DATA_OFFSET | \ - CTR_FLAG_JOURNAL_DEV) + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) /* ...valid options definitions per raid level */ /* @@ -239,6 +245,7 @@ struct raid_set { struct journal_dev { struct dm_dev *dev; struct md_rdev rdev; + int mode; } journal_dev; struct raid_dev dev[0]; @@ -326,6 +333,7 @@ static struct arg_name_flag { { CTR_FLAG_DELTA_DISKS, "delta_disks"}, { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, + { CTR_FLAG_JOURNAL_MODE, "journal_mode" }, }; /* Return argument name string for given @flag */ @@ -344,6 +352,39 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag) return NULL; } +/* Define correlation of raid456 journal cache modes and dm-raid target line parameters */ +static struct { + const int mode; + const char *param; +} _raid456_journal_mode[] = { + { R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" }, + { R5C_JOURNAL_MODE_WRITE_BACK , "writeback" } +}; + +/* Return MD raid4/5/6 journal mode for dm @journal_mode one */ +static int dm_raid_journal_mode_to_md(const char *mode) +{ + int m = ARRAY_SIZE(_raid456_journal_mode); + + while (m--) + if (!strcasecmp(mode, _raid456_journal_mode[m].param)) + return _raid456_journal_mode[m].mode; + + return -EINVAL; +} + +/* Return dm-raid raid4/5/6 journal mode string for @mode */ +static const char *md_journal_mode_to_dm_raid(const int mode) +{ + int m = ARRAY_SIZE(_raid456_journal_mode); + + while (m--) + if (mode == _raid456_journal_mode[m].mode) + return _raid456_journal_mode[m].param; + + return "unknown"; +} + /* * Bool helpers to test for various raid levels of a raid set. * It's level as reported by the superblock rather than @@ -1183,7 +1224,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, continue; } - /* "journal_dev dev" */ + /* "journal_dev <dev>" */ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { int r; struct md_rdev *jdev; @@ -1211,10 +1252,32 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, rs->ti->error = "No space for raid4/5/6 journal"; return -ENOSPC; } + rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH; set_bit(Journal, &jdev->flags); continue; } + /* "journal_mode <mode>" ("journal_dev" mandatory!) */ + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) { + int r; + + if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'"; + return -EINVAL; + } + if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { + rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed"; + return -EINVAL; + } + r = dm_raid_journal_mode_to_md(arg); + if (r < 0) { + rs->ti->error = "Invalid 'journal_mode' argument"; + return r; + } + rs->journal_dev.mode = r; + continue; + } + /* * Parameters with number values from here on. */ @@ -3076,6 +3139,16 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs->callbacks.congested_fn = raid_is_congested; dm_table_add_target_callbacks(ti->table, &rs->callbacks); + /* If raid4/5/6 journal mode explictely requested (only possible with journal dev) -> set it */ + if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { + r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode); + if (r) { + ti->error = "Failed to set raid4/5/6 journal mode"; + mddev_unlock(&rs->md); + goto bad_journal_mode_set; + } + } + mddev_suspend(&rs->md); /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ @@ -3109,6 +3182,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) mddev_unlock(&rs->md); return 0; +bad_journal_mode_set: bad_stripe_cache: bad_check_reshape: md_stop(&rs->md); @@ -3180,18 +3254,18 @@ static const char *decipher_sync_action(struct mddev *mddev) * Status characters: * * 'D' = Dead/Failed raid set component or raid4/5/6 journal device - * 'a' = Alive but not in-sync - * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device + * 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device + * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) */ -static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) +static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync) { if (!rdev->bdev) return "-"; else if (test_bit(Faulty, &rdev->flags)) return "D"; else if (test_bit(Journal, &rdev->flags)) - return "A"; + return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a"; else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) return "a"; else @@ -3315,7 +3389,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) - DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync)); + DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync)); /* * In-sync/Reshape ratio: @@ -3366,7 +3440,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * v1.10.0+: */ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? - __raid_dev_status(&rs->journal_dev.rdev, 0) : "-"); + __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-"); break; case STATUSTYPE_TABLE: @@ -3381,39 +3455,30 @@ static void raid_status(struct dm_target *ti, status_type_t type, write_mostly_params + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + - (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0); + (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) + + (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0); + /* Emit table line */ + /* This has to be in the documented order for userspace! */ DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); - if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) - DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT), - raid10_md_layout_to_format(mddev->layout)); - if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES), - raid10_md_layout_to_copies(mddev->layout)); - if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) - DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); - if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) - DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), - (unsigned long long) to_sector(mddev->bitmap_info.chunksize)); - if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) - DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET), - (unsigned long long) rs->data_offset); - if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) - DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), - mddev->bitmap_info.daemon_sleep); - if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS), - max(rs->delta_disks, mddev->delta_disks)); - if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE), - max_nr_stripes); + if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) + DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); if (rebuild_disks) for (i = 0; i < rs->raid_disks; i++) if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks)) DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), rs->dev[i].rdev.raid_disk); + if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) + DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), + mddev->bitmap_info.daemon_sleep); + if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), + mddev->sync_speed_min); + if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), + mddev->sync_speed_max); if (write_mostly_params) for (i = 0; i < rs->raid_disks; i++) if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) @@ -3422,15 +3487,30 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND), mddev->bitmap_info.max_write_behind); - if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), - mddev->sync_speed_max); - if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), - mddev->sync_speed_min); + if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE), + max_nr_stripes); + if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) + DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), + (unsigned long long) to_sector(mddev->bitmap_info.chunksize)); + if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES), + raid10_md_layout_to_copies(mddev->layout)); + if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT), + raid10_md_layout_to_format(mddev->layout)); + if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS), + max(rs->delta_disks, mddev->delta_disks)); + if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) + DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET), + (unsigned long long) rs->data_offset); if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), __get_dev_name(rs->journal_dev.dev)); + if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE), + md_journal_mode_to_dm_raid(rs->journal_dev.mode)); DMEMIT(" %d", rs->raid_disks); for (i = 0; i < rs->raid_disks; i++) DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), @@ -3791,7 +3871,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 10, 1}, + .version = {1, 11, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, |