diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-01 03:19:39 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-01 03:19:39 +0200 |
commit | 2cfa582be80081fb8db02d4d9b44bff34b82ac54 (patch) | |
tree | 2faf8db8426b389ca8c9ed76065c688431bb7eb9 /drivers/md/dm-writecache.c | |
parent | Merge tag 'net-next-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/ne... (diff) | |
parent | dm writecache: make writeback pause configurable (diff) | |
download | linux-2cfa582be80081fb8db02d4d9b44bff34b82ac54.tar.xz linux-2cfa582be80081fb8db02d4d9b44bff34b82ac54.zip |
Merge tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- Various DM persistent-data library improvements and fixes that
benefit both the DM thinp and cache targets.
- A few small DM kcopyd efficiency improvements.
- Significant zoned related block core, DM core and DM zoned target
changes that culminate with adding zoned append emulation (which is
required to properly fix DM crypt's zoned support).
- Various DM writecache target changes that improve efficiency. Adds an
optional "metadata_only" feature that only promotes bios flagged with
REQ_META. But the most significant improvement is writecache's
ability to pause writeback, for a confiurable time, if/when the
working set is larger than the cache (and the cache is full) -- this
ensures performance is no worse than the slower origin device.
* tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (35 commits)
dm writecache: make writeback pause configurable
dm writecache: pause writeback if cache full and origin being written directly
dm io tracker: factor out IO tracker
dm btree remove: assign new_root only when removal succeeds
dm zone: fix dm_revalidate_zones() memory allocation
dm ps io affinity: remove redundant continue statement
dm writecache: add optional "metadata_only" parameter
dm writecache: add "cleaner" and "max_age" to Documentation
dm writecache: write at least 4k when committing
dm writecache: flush origin device when writing and cache is full
dm writecache: have ssd writeback wait if the kcopyd workqueue is busy
dm writecache: use list_move instead of list_del/list_add in writecache_writeback()
dm writecache: commit just one block, not a full page
dm writecache: remove unused gfp_t argument from wc_add_block()
dm crypt: Fix zoned block device support
dm: introduce zone append emulation
dm: rearrange core declarations for extended use from dm-zone.c
block: introduce BIO_ZONE_WRITE_LOCKED bio flag
block: introduce bio zone helpers
block: improve handling of all zones reset operation
...
Diffstat (limited to 'drivers/md/dm-writecache.c')
-rw-r--r-- | drivers/md/dm-writecache.c | 140 |
1 files changed, 114 insertions, 26 deletions
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index aecc246ade26..e21e29e81bbf 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -15,6 +15,8 @@ #include <linux/dax.h> #include <linux/pfn_t.h> #include <linux/libnvdimm.h> +#include <linux/delay.h> +#include "dm-io-tracker.h" #define DM_MSG_PREFIX "writecache" @@ -28,6 +30,7 @@ #define AUTOCOMMIT_MSEC 1000 #define MAX_AGE_DIV 16 #define MAX_AGE_UNSPECIFIED -1UL +#define PAUSE_WRITEBACK (HZ * 3) #define BITMAP_GRANULARITY 65536 #if BITMAP_GRANULARITY < PAGE_SIZE @@ -123,6 +126,7 @@ struct dm_writecache { size_t freelist_high_watermark; size_t freelist_low_watermark; unsigned long max_age; + unsigned long pause; unsigned uncommitted_blocks; unsigned autocommit_blocks; @@ -171,17 +175,22 @@ struct dm_writecache { bool flush_on_suspend:1; bool cleaner:1; bool cleaner_set:1; + bool metadata_only:1; + bool pause_set:1; unsigned high_wm_percent_value; unsigned low_wm_percent_value; unsigned autocommit_time_value; unsigned max_age_value; + unsigned pause_value; unsigned writeback_all; struct workqueue_struct *writeback_wq; struct work_struct writeback_work; struct work_struct flush_work; + struct dm_io_tracker iot; + struct dm_io_client *dm_io; raw_spinlock_t endio_list_lock; @@ -532,7 +541,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc) region.bdev = wc->ssd_dev->bdev; region.sector = 0; - region.count = PAGE_SIZE >> SECTOR_SHIFT; + region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT; if (unlikely(region.sector + region.count > wc->metadata_sectors)) region.count = wc->metadata_sectors - region.sector; @@ -1301,8 +1310,12 @@ static int writecache_map(struct dm_target *ti, struct bio *bio) writecache_flush(wc); if (writecache_has_error(wc)) goto unlock_error; + if (unlikely(wc->cleaner) || unlikely(wc->metadata_only)) + goto unlock_remap_origin; goto unlock_submit; } else { + if (dm_bio_get_target_bio_nr(bio)) + goto unlock_remap_origin; writecache_offload_bio(wc, bio); goto unlock_return; } @@ -1360,24 +1373,29 @@ read_next_block: } else { do { bool found_entry = false; + bool search_used = false; if (writecache_has_error(wc)) goto unlock_error; e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); if (e) { - if (!writecache_entry_is_committed(wc, e)) + if (!writecache_entry_is_committed(wc, e)) { + search_used = true; goto bio_copy; + } if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { wc->overwrote_committed = true; + search_used = true; goto bio_copy; } found_entry = true; } else { - if (unlikely(wc->cleaner)) + if (unlikely(wc->cleaner) || + (wc->metadata_only && !(bio->bi_opf & REQ_META))) goto direct_write; } e = writecache_pop_from_freelist(wc, (sector_t)-1); if (unlikely(!e)) { - if (!found_entry) { + if (!WC_MODE_PMEM(wc) && !found_entry) { direct_write: e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); if (e) { @@ -1404,13 +1422,31 @@ bio_copy: sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); while (bio_size < bio->bi_iter.bi_size) { - struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); - if (!f) - break; - write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + - (bio_size >> SECTOR_SHIFT), wc->seq_count); - writecache_insert_entry(wc, f); - wc->uncommitted_blocks++; + if (!search_used) { + struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); + if (!f) + break; + write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + + (bio_size >> SECTOR_SHIFT), wc->seq_count); + writecache_insert_entry(wc, f); + wc->uncommitted_blocks++; + } else { + struct wc_entry *f; + struct rb_node *next = rb_next(&e->rb_node); + if (!next) + break; + f = container_of(next, struct wc_entry, rb_node); + if (f != e + 1) + break; + if (read_original_sector(wc, f) != + read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) + break; + if (unlikely(f->write_in_progress)) + break; + if (writecache_entry_is_committed(wc, f)) + wc->overwrote_committed = true; + e = f; + } bio_size += wc->block_size; current_cache_sec += wc->block_size >> SECTOR_SHIFT; } @@ -1438,6 +1474,12 @@ bio_copy: } unlock_remap_origin: + if (likely(wc->pause != 0)) { + if (bio_op(bio) == REQ_OP_WRITE) { + dm_iot_io_begin(&wc->iot, 1); + bio->bi_private = (void *)2; + } + } bio_set_dev(bio, wc->dev->bdev); wc_unlock(wc); return DM_MAPIO_REMAPPED; @@ -1468,11 +1510,13 @@ static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t { struct dm_writecache *wc = ti->private; - if (bio->bi_private != NULL) { + if (bio->bi_private == (void *)1) { int dir = bio_data_dir(bio); if (atomic_dec_and_test(&wc->bio_in_progress[dir])) if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) wake_up(&wc->bio_in_progress_wait[dir]); + } else if (bio->bi_private == (void *)2) { + dm_iot_io_end(&wc->iot, 1); } return 0; } @@ -1642,7 +1686,7 @@ pop_from_list: return 0; } -static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp) +static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e) { struct dm_writecache *wc = wb->wc; unsigned block_size = wc->block_size; @@ -1703,7 +1747,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba max_pages = WB_LIST_INLINE; } - BUG_ON(!wc_add_block(wb, e, GFP_NOIO)); + BUG_ON(!wc_add_block(wb, e)); wb->wc_list[0] = e; wb->wc_list_n = 1; @@ -1713,7 +1757,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba if (read_original_sector(wc, f) != read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) break; - if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN)) + if (!wc_add_block(wb, f)) break; wbl->size--; list_del(&f->lru); @@ -1794,6 +1838,27 @@ static void writecache_writeback(struct work_struct *work) struct writeback_list wbl; unsigned long n_walked; + if (!WC_MODE_PMEM(wc)) { + /* Wait for any active kcopyd work on behalf of ssd writeback */ + dm_kcopyd_client_flush(wc->dm_kcopyd); + } + + if (likely(wc->pause != 0)) { + while (1) { + unsigned long idle; + if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) || + unlikely(dm_suspended(wc->ti))) + break; + idle = dm_iot_idle_time(&wc->iot); + if (idle >= wc->pause) + break; + idle = wc->pause - idle; + if (idle > HZ) + idle = HZ; + schedule_timeout_idle(idle); + } + } + wc_lock(wc); restart: if (writecache_has_error(wc)) { @@ -1822,8 +1887,9 @@ restart: n_walked++; if (unlikely(n_walked > WRITEBACK_LATENCY) && - likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) { - queue_work(wc->writeback_wq, &wc->writeback_work); + likely(!wc->writeback_all)) { + if (likely(!dm_suspended(wc->ti))) + queue_work(wc->writeback_wq, &wc->writeback_work); break; } @@ -1845,15 +1911,13 @@ restart: if (unlikely(read_original_sector(wc, f) == read_original_sector(wc, e))) { BUG_ON(!f->write_in_progress); - list_del(&e->lru); - list_add(&e->lru, &skipped); + list_move(&e->lru, &skipped); cond_resched(); continue; } } wc->writeback_size++; - list_del(&e->lru); - list_add(&e->lru, &wbl.list); + list_move(&e->lru, &wbl.list); wbl.size++; e->write_in_progress = true; e->wc_list_contiguous = 1; @@ -1888,8 +1952,7 @@ restart: // break; wc->writeback_size++; - list_del(&g->lru); - list_add(&g->lru, &wbl.list); + list_move(&g->lru, &wbl.list); wbl.size++; g->write_in_progress = true; g->wc_list_contiguous = BIO_MAX_VECS; @@ -2065,7 +2128,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) struct wc_memory_superblock s; static struct dm_arg _args[] = { - {0, 16, "Invalid number of feature args"}, + {0, 18, "Invalid number of feature args"}, }; as.argc = argc; @@ -2109,6 +2172,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) INIT_WORK(&wc->writeback_work, writecache_writeback); INIT_WORK(&wc->flush_work, writecache_flush_work); + dm_iot_init(&wc->iot); + raw_spin_lock_init(&wc->endio_list_lock); INIT_LIST_HEAD(&wc->endio_list); wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio"); @@ -2156,6 +2221,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } } else { + wc->pause = PAUSE_WRITEBACK; r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct)); if (r) { ti->error = "Could not allocate mempool"; @@ -2292,6 +2358,20 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) wc->writeback_fua = false; wc->writeback_fua_set = true; } else goto invalid_optional; + } else if (!strcasecmp(string, "metadata_only")) { + wc->metadata_only = true; + } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) { + unsigned pause_msecs; + if (WC_MODE_PMEM(wc)) + goto invalid_optional; + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1) + goto invalid_optional; + if (pause_msecs > 60000) + goto invalid_optional; + wc->pause = msecs_to_jiffies(pause_msecs); + wc->pause_set = true; + wc->pause_value = pause_msecs; } else { invalid_optional: r = -EINVAL; @@ -2463,7 +2543,7 @@ overflow: goto bad; } - ti->num_flush_bios = 1; + ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2; ti->flush_supported = true; ti->num_discard_bios = 1; @@ -2515,6 +2595,10 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args++; if (wc->writeback_fua_set) extra_args++; + if (wc->metadata_only) + extra_args++; + if (wc->pause_set) + extra_args += 2; DMEMIT("%u", extra_args); if (wc->start_sector_set) @@ -2535,13 +2619,17 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT(" cleaner"); if (wc->writeback_fua_set) DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); + if (wc->metadata_only) + DMEMIT(" metadata_only"); + if (wc->pause_set) + DMEMIT(" pause_writeback %u", wc->pause_value); break; } } static struct target_type writecache_target = { .name = "writecache", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, |