summaryrefslogtreecommitdiffstats
path: root/drivers/md/dm-writecache.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-07-01 03:19:39 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2021-07-01 03:19:39 +0200
commit2cfa582be80081fb8db02d4d9b44bff34b82ac54 (patch)
tree2faf8db8426b389ca8c9ed76065c688431bb7eb9 /drivers/md/dm-writecache.c
parentMerge tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/ne... (diff)
parentdm writecache: make writeback pause configurable (diff)
downloadlinux-2cfa582be80081fb8db02d4d9b44bff34b82ac54.tar.xz
linux-2cfa582be80081fb8db02d4d9b44bff34b82ac54.zip
Merge tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - Various DM persistent-data library improvements and fixes that benefit both the DM thinp and cache targets. - A few small DM kcopyd efficiency improvements. - Significant zoned related block core, DM core and DM zoned target changes that culminate with adding zoned append emulation (which is required to properly fix DM crypt's zoned support). - Various DM writecache target changes that improve efficiency. Adds an optional "metadata_only" feature that only promotes bios flagged with REQ_META. But the most significant improvement is writecache's ability to pause writeback, for a confiurable time, if/when the working set is larger than the cache (and the cache is full) -- this ensures performance is no worse than the slower origin device. * tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (35 commits) dm writecache: make writeback pause configurable dm writecache: pause writeback if cache full and origin being written directly dm io tracker: factor out IO tracker dm btree remove: assign new_root only when removal succeeds dm zone: fix dm_revalidate_zones() memory allocation dm ps io affinity: remove redundant continue statement dm writecache: add optional "metadata_only" parameter dm writecache: add "cleaner" and "max_age" to Documentation dm writecache: write at least 4k when committing dm writecache: flush origin device when writing and cache is full dm writecache: have ssd writeback wait if the kcopyd workqueue is busy dm writecache: use list_move instead of list_del/list_add in writecache_writeback() dm writecache: commit just one block, not a full page dm writecache: remove unused gfp_t argument from wc_add_block() dm crypt: Fix zoned block device support dm: introduce zone append emulation dm: rearrange core declarations for extended use from dm-zone.c block: introduce BIO_ZONE_WRITE_LOCKED bio flag block: introduce bio zone helpers block: improve handling of all zones reset operation ...
Diffstat (limited to 'drivers/md/dm-writecache.c')
-rw-r--r--drivers/md/dm-writecache.c140
1 files changed, 114 insertions, 26 deletions
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index aecc246ade26..e21e29e81bbf 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -15,6 +15,8 @@
#include <linux/dax.h>
#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>
+#include <linux/delay.h>
+#include "dm-io-tracker.h"
#define DM_MSG_PREFIX "writecache"
@@ -28,6 +30,7 @@
#define AUTOCOMMIT_MSEC 1000
#define MAX_AGE_DIV 16
#define MAX_AGE_UNSPECIFIED -1UL
+#define PAUSE_WRITEBACK (HZ * 3)
#define BITMAP_GRANULARITY 65536
#if BITMAP_GRANULARITY < PAGE_SIZE
@@ -123,6 +126,7 @@ struct dm_writecache {
size_t freelist_high_watermark;
size_t freelist_low_watermark;
unsigned long max_age;
+ unsigned long pause;
unsigned uncommitted_blocks;
unsigned autocommit_blocks;
@@ -171,17 +175,22 @@ struct dm_writecache {
bool flush_on_suspend:1;
bool cleaner:1;
bool cleaner_set:1;
+ bool metadata_only:1;
+ bool pause_set:1;
unsigned high_wm_percent_value;
unsigned low_wm_percent_value;
unsigned autocommit_time_value;
unsigned max_age_value;
+ unsigned pause_value;
unsigned writeback_all;
struct workqueue_struct *writeback_wq;
struct work_struct writeback_work;
struct work_struct flush_work;
+ struct dm_io_tracker iot;
+
struct dm_io_client *dm_io;
raw_spinlock_t endio_list_lock;
@@ -532,7 +541,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
region.bdev = wc->ssd_dev->bdev;
region.sector = 0;
- region.count = PAGE_SIZE >> SECTOR_SHIFT;
+ region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
if (unlikely(region.sector + region.count > wc->metadata_sectors))
region.count = wc->metadata_sectors - region.sector;
@@ -1301,8 +1310,12 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
writecache_flush(wc);
if (writecache_has_error(wc))
goto unlock_error;
+ if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
+ goto unlock_remap_origin;
goto unlock_submit;
} else {
+ if (dm_bio_get_target_bio_nr(bio))
+ goto unlock_remap_origin;
writecache_offload_bio(wc, bio);
goto unlock_return;
}
@@ -1360,24 +1373,29 @@ read_next_block:
} else {
do {
bool found_entry = false;
+ bool search_used = false;
if (writecache_has_error(wc))
goto unlock_error;
e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
if (e) {
- if (!writecache_entry_is_committed(wc, e))
+ if (!writecache_entry_is_committed(wc, e)) {
+ search_used = true;
goto bio_copy;
+ }
if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
wc->overwrote_committed = true;
+ search_used = true;
goto bio_copy;
}
found_entry = true;
} else {
- if (unlikely(wc->cleaner))
+ if (unlikely(wc->cleaner) ||
+ (wc->metadata_only && !(bio->bi_opf & REQ_META)))
goto direct_write;
}
e = writecache_pop_from_freelist(wc, (sector_t)-1);
if (unlikely(!e)) {
- if (!found_entry) {
+ if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
if (e) {
@@ -1404,13 +1422,31 @@ bio_copy:
sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
while (bio_size < bio->bi_iter.bi_size) {
- struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
- if (!f)
- break;
- write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
- (bio_size >> SECTOR_SHIFT), wc->seq_count);
- writecache_insert_entry(wc, f);
- wc->uncommitted_blocks++;
+ if (!search_used) {
+ struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+ if (!f)
+ break;
+ write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
+ (bio_size >> SECTOR_SHIFT), wc->seq_count);
+ writecache_insert_entry(wc, f);
+ wc->uncommitted_blocks++;
+ } else {
+ struct wc_entry *f;
+ struct rb_node *next = rb_next(&e->rb_node);
+ if (!next)
+ break;
+ f = container_of(next, struct wc_entry, rb_node);
+ if (f != e + 1)
+ break;
+ if (read_original_sector(wc, f) !=
+ read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
+ break;
+ if (unlikely(f->write_in_progress))
+ break;
+ if (writecache_entry_is_committed(wc, f))
+ wc->overwrote_committed = true;
+ e = f;
+ }
bio_size += wc->block_size;
current_cache_sec += wc->block_size >> SECTOR_SHIFT;
}
@@ -1438,6 +1474,12 @@ bio_copy:
}
unlock_remap_origin:
+ if (likely(wc->pause != 0)) {
+ if (bio_op(bio) == REQ_OP_WRITE) {
+ dm_iot_io_begin(&wc->iot, 1);
+ bio->bi_private = (void *)2;
+ }
+ }
bio_set_dev(bio, wc->dev->bdev);
wc_unlock(wc);
return DM_MAPIO_REMAPPED;
@@ -1468,11 +1510,13 @@ static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t
{
struct dm_writecache *wc = ti->private;
- if (bio->bi_private != NULL) {
+ if (bio->bi_private == (void *)1) {
int dir = bio_data_dir(bio);
if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
wake_up(&wc->bio_in_progress_wait[dir]);
+ } else if (bio->bi_private == (void *)2) {
+ dm_iot_io_end(&wc->iot, 1);
}
return 0;
}
@@ -1642,7 +1686,7 @@ pop_from_list:
return 0;
}
-static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
+static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
{
struct dm_writecache *wc = wb->wc;
unsigned block_size = wc->block_size;
@@ -1703,7 +1747,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
max_pages = WB_LIST_INLINE;
}
- BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
+ BUG_ON(!wc_add_block(wb, e));
wb->wc_list[0] = e;
wb->wc_list_n = 1;
@@ -1713,7 +1757,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
if (read_original_sector(wc, f) !=
read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
break;
- if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
+ if (!wc_add_block(wb, f))
break;
wbl->size--;
list_del(&f->lru);
@@ -1794,6 +1838,27 @@ static void writecache_writeback(struct work_struct *work)
struct writeback_list wbl;
unsigned long n_walked;
+ if (!WC_MODE_PMEM(wc)) {
+ /* Wait for any active kcopyd work on behalf of ssd writeback */
+ dm_kcopyd_client_flush(wc->dm_kcopyd);
+ }
+
+ if (likely(wc->pause != 0)) {
+ while (1) {
+ unsigned long idle;
+ if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
+ unlikely(dm_suspended(wc->ti)))
+ break;
+ idle = dm_iot_idle_time(&wc->iot);
+ if (idle >= wc->pause)
+ break;
+ idle = wc->pause - idle;
+ if (idle > HZ)
+ idle = HZ;
+ schedule_timeout_idle(idle);
+ }
+ }
+
wc_lock(wc);
restart:
if (writecache_has_error(wc)) {
@@ -1822,8 +1887,9 @@ restart:
n_walked++;
if (unlikely(n_walked > WRITEBACK_LATENCY) &&
- likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
- queue_work(wc->writeback_wq, &wc->writeback_work);
+ likely(!wc->writeback_all)) {
+ if (likely(!dm_suspended(wc->ti)))
+ queue_work(wc->writeback_wq, &wc->writeback_work);
break;
}
@@ -1845,15 +1911,13 @@ restart:
if (unlikely(read_original_sector(wc, f) ==
read_original_sector(wc, e))) {
BUG_ON(!f->write_in_progress);
- list_del(&e->lru);
- list_add(&e->lru, &skipped);
+ list_move(&e->lru, &skipped);
cond_resched();
continue;
}
}
wc->writeback_size++;
- list_del(&e->lru);
- list_add(&e->lru, &wbl.list);
+ list_move(&e->lru, &wbl.list);
wbl.size++;
e->write_in_progress = true;
e->wc_list_contiguous = 1;
@@ -1888,8 +1952,7 @@ restart:
// break;
wc->writeback_size++;
- list_del(&g->lru);
- list_add(&g->lru, &wbl.list);
+ list_move(&g->lru, &wbl.list);
wbl.size++;
g->write_in_progress = true;
g->wc_list_contiguous = BIO_MAX_VECS;
@@ -2065,7 +2128,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct wc_memory_superblock s;
static struct dm_arg _args[] = {
- {0, 16, "Invalid number of feature args"},
+ {0, 18, "Invalid number of feature args"},
};
as.argc = argc;
@@ -2109,6 +2172,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
INIT_WORK(&wc->writeback_work, writecache_writeback);
INIT_WORK(&wc->flush_work, writecache_flush_work);
+ dm_iot_init(&wc->iot);
+
raw_spin_lock_init(&wc->endio_list_lock);
INIT_LIST_HEAD(&wc->endio_list);
wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
@@ -2156,6 +2221,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
} else {
+ wc->pause = PAUSE_WRITEBACK;
r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
if (r) {
ti->error = "Could not allocate mempool";
@@ -2292,6 +2358,20 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
wc->writeback_fua = false;
wc->writeback_fua_set = true;
} else goto invalid_optional;
+ } else if (!strcasecmp(string, "metadata_only")) {
+ wc->metadata_only = true;
+ } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
+ unsigned pause_msecs;
+ if (WC_MODE_PMEM(wc))
+ goto invalid_optional;
+ string = dm_shift_arg(&as), opt_params--;
+ if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
+ goto invalid_optional;
+ if (pause_msecs > 60000)
+ goto invalid_optional;
+ wc->pause = msecs_to_jiffies(pause_msecs);
+ wc->pause_set = true;
+ wc->pause_value = pause_msecs;
} else {
invalid_optional:
r = -EINVAL;
@@ -2463,7 +2543,7 @@ overflow:
goto bad;
}
- ti->num_flush_bios = 1;
+ ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
ti->flush_supported = true;
ti->num_discard_bios = 1;
@@ -2515,6 +2595,10 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
extra_args++;
if (wc->writeback_fua_set)
extra_args++;
+ if (wc->metadata_only)
+ extra_args++;
+ if (wc->pause_set)
+ extra_args += 2;
DMEMIT("%u", extra_args);
if (wc->start_sector_set)
@@ -2535,13 +2619,17 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
DMEMIT(" cleaner");
if (wc->writeback_fua_set)
DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
+ if (wc->metadata_only)
+ DMEMIT(" metadata_only");
+ if (wc->pause_set)
+ DMEMIT(" pause_writeback %u", wc->pause_value);
break;
}
}
static struct target_type writecache_target = {
.name = "writecache",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = writecache_ctr,
.dtr = writecache_dtr,