Merge tag 'for-6.9/dm-vdo' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper VDO target from Mike Snitzer: "Introduce the DM vdo target which provides block-level deduplication, compression, and thin provisioning. Please see: Documentation/admin-guide/device-mapper/vdo.rst Documentation/admin-guide/device-mapper/vdo-design.rst The DM vdo target handles its concurrency by pinning an IO, and subsequent stages of handling that IO, to a particular VDO thread. This aspect of VDO is "unique" but its overall implementation is very tightly coupled to its mostly lockless threading model. As such, VDO is not easily changed to use more traditional finer-grained locking and Linux workqueues. Please see the "Zones and Threading" section of vdo-design.rst The DM vdo target has been used in production for many years but has seen significant changes over the past ~6 years to prepare it for upstream inclusion. The codebase is still large but it is isolated to drivers/md/dm-vdo/ and has been made considerably more approachable and maintainable. Matt Sakai has been added to the MAINTAINERS file to reflect that he will send VDO changes upstream through the DM subsystem maintainers" * tag 'for-6.9/dm-vdo' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (142 commits) dm vdo: document minimum metadata size requirements dm vdo: remove meaningless version number constant dm vdo: remove vdo_perform_once dm vdo block-map: Remove stray semicolon dm vdo string-utils: change from uds_ to vdo_ namespace dm vdo logger: change from uds_ to vdo_ namespace dm vdo funnel-queue: change from uds_ to vdo_ namespace dm vdo indexer: fix use after free dm vdo logger: remove log level to string conversion code dm vdo: document log_level parameter dm vdo: add 'log_level' module parameter dm vdo: remove all sysfs interfaces dm vdo target: eliminate inappropriate uses of UDS_SUCCESS dm vdo indexer: update ASSERT and ASSERT_LOG_ONLY usage dm vdo encodings: update some stale comments dm vdo permassert: audit all of ASSERT to test for VDO_SUCCESS dm-vdo funnel-workqueue: return VDO_SUCCESS from make_simple_work_queue dm vdo thread-utils: return VDO_SUCCESS on vdo_create_thread success dm vdo int-map: return VDO_SUCCESS on success dm vdo: check for VDO_SUCCESS return value from memory-alloc functions ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2024-03-13 17:57:23 +0100
committer: Linus Torvalds <torvalds@linux-foundation.org> 2024-03-13 17:57:23 +0100
commit: 61387b8dcf1dc0f30cf690956a48768a3fce1810 (patch)
tree: 78f176823ac74dc82f26462ce347735cf73b79f0 /drivers/md/dm-vdo/block-map.h
parent: Merge tag 'for-6.9/dm-bh-wq' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parent: dm vdo: document minimum metadata size requirements (diff)
download: linux-61387b8dcf1dc0f30cf690956a48768a3fce1810.tar.xz
linux-61387b8dcf1dc0f30cf690956a48768a3fce1810.zip
1 files changed, 394 insertions, 0 deletions
diff --git a/drivers/md/dm-vdo/block-map.h b/drivers/md/dm-vdo/block-map.h
new file mode 100644
index 000000000000..39a13039e4a3
--- /dev/null
+++ b/drivers/md/dm-vdo/block-map.h
@@ -0,0 +1,394 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_BLOCK_MAP_H
+#define VDO_BLOCK_MAP_H
+
+#include <linux/list.h>
+
+#include "numeric.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "encodings.h"
+#include "int-map.h"
+#include "statistics.h"
+#include "types.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/*
+ * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
+ * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
+ * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
+ * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
+ *
+ * Each logical zone has a single dedicated queue and thread for performing all updates to the
+ * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
+ * allow the code to omit more fine-grained locking for the block map structures.
+ *
+ * Load operations must be performed on the admin thread. Normal operations, such as reading and
+ * updating mappings, must be performed on the appropriate logical zone thread. Save operations
+ * must be launched from the same admin thread as the original load operation.
+ */
+
+enum {
+	BLOCK_MAP_VIO_POOL_SIZE = 64,
+};
+
+/*
+ * Generation counter for page references.
+ */
+typedef u32 vdo_page_generation;
+
+extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;
+
+/* The VDO Page Cache abstraction. */
+struct vdo_page_cache {
+	/* the VDO which owns this cache */
+	struct vdo *vdo;
+	/* number of pages in cache */
+	page_count_t page_count;
+	/* number of pages to write in the current batch */
+	page_count_t pages_in_batch;
+	/* Whether the VDO is doing a read-only rebuild */
+	bool rebuilding;
+
+	/* array of page information entries */
+	struct page_info *infos;
+	/* raw memory for pages */
+	char *pages;
+	/* cache last found page info */
+	struct page_info *last_found;
+	/* map of page number to info */
+	struct int_map *page_map;
+	/* main LRU list (all infos) */
+	struct list_head lru_list;
+	/* free page list (oldest first) */
+	struct list_head free_list;
+	/* outgoing page list */
+	struct list_head outgoing_list;
+	/* number of read I/O operations pending */
+	page_count_t outstanding_reads;
+	/* number of write I/O operations pending */
+	page_count_t outstanding_writes;
+	/* number of pages covered by the current flush */
+	page_count_t pages_in_flush;
+	/* number of pages waiting to be included in the next flush */
+	page_count_t pages_to_flush;
+	/* number of discards in progress */
+	unsigned int discard_count;
+	/* how many VPCs waiting for free page */
+	unsigned int waiter_count;
+	/* queue of waiters who want a free page */
+	struct vdo_wait_queue free_waiters;
+	/*
+	 * Statistics are only updated on the logical zone thread, but are accessed from other
+	 * threads.
+	 */
+	struct block_map_statistics stats;
+	/* counter for pressure reports */
+	u32 pressure_report;
+	/* the block map zone to which this cache belongs */
+	struct block_map_zone *zone;
+};
+
+/*
+ * The state of a page buffer. If the page buffer is free no particular page is bound to it,
+ * otherwise the page buffer is bound to particular page whose absolute pbn is in the pbn field. If
+ * the page is resident or dirty the page data is stable and may be accessed. Otherwise the page is
+ * in flight (incoming or outgoing) and its data should not be accessed.
+ *
+ * @note Update the static data in get_page_state_name() if you change this enumeration.
+ */
+enum vdo_page_buffer_state {
+	/* this page buffer is not being used */
+	PS_FREE,
+	/* this page is being read from store */
+	PS_INCOMING,
+	/* attempt to load this page failed */
+	PS_FAILED,
+	/* this page is valid and un-modified */
+	PS_RESIDENT,
+	/* this page is valid and modified */
+	PS_DIRTY,
+	/* this page is being written and should not be used */
+	PS_OUTGOING,
+	/* not a state */
+	PAGE_STATE_COUNT,
+} __packed;
+
+/*
+ * The write status of page
+ */
+enum vdo_page_write_status {
+	WRITE_STATUS_NORMAL,
+	WRITE_STATUS_DISCARD,
+	WRITE_STATUS_DEFERRED,
+} __packed;
+
+/* Per-page-slot information. */
+struct page_info {
+	/* Preallocated page struct vio */
+	struct vio *vio;
+	/* back-link for references */
+	struct vdo_page_cache *cache;
+	/* the pbn of the page */
+	physical_block_number_t pbn;
+	/* page is busy (temporarily locked) */
+	u16 busy;
+	/* the write status the page */
+	enum vdo_page_write_status write_status;
+	/* page state */
+	enum vdo_page_buffer_state state;
+	/* queue of completions awaiting this item */
+	struct vdo_wait_queue waiting;
+	/* state linked list entry */
+	struct list_head state_entry;
+	/* LRU entry */
+	struct list_head lru_entry;
+	/*
+	 * The earliest recovery journal block containing uncommitted updates to the block map page
+	 * associated with this page_info. A reference (lock) is held on that block to prevent it
+	 * from being reaped. When this value changes, the reference on the old value must be
+	 * released and a reference on the new value must be acquired.
+	 */
+	sequence_number_t recovery_lock;
+};
+
+/*
+ * A completion awaiting a specific page. Also a live reference into the page once completed, until
+ * freed.
+ */
+struct vdo_page_completion {
+	/* The generic completion */
+	struct vdo_completion completion;
+	/* The cache involved */
+	struct vdo_page_cache *cache;
+	/* The waiter for the pending list */
+	struct vdo_waiter waiter;
+	/* The absolute physical block number of the page on disk */
+	physical_block_number_t pbn;
+	/* Whether the page may be modified */
+	bool writable;
+	/* Whether the page is available */
+	bool ready;
+	/* The info structure for the page, only valid when ready */
+	struct page_info *info;
+};
+
+struct forest;
+
+struct tree_page {
+	struct vdo_waiter waiter;
+
+	/* Dirty list entry */
+	struct list_head entry;
+
+	/* If dirty, the tree zone flush generation in which it was last dirtied. */
+	u8 generation;
+
+	/* Whether this page is an interior tree page being written out. */
+	bool writing;
+
+	/* If writing, the tree zone flush generation of the copy being written. */
+	u8 writing_generation;
+
+	/*
+	 * Sequence number of the earliest recovery journal block containing uncommitted updates to
+	 * this page
+	 */
+	sequence_number_t recovery_lock;
+
+	/* The value of recovery_lock when the this page last started writing */
+	sequence_number_t writing_recovery_lock;
+
+	char page_buffer[VDO_BLOCK_SIZE];
+};
+
+enum block_map_page_type {
+	VDO_TREE_PAGE,
+	VDO_CACHE_PAGE,
+};
+
+typedef struct list_head dirty_era_t[2];
+
+struct dirty_lists {
+	/* The number of periods after which an element will be expired */
+	block_count_t maximum_age;
+	/* The oldest period which has unexpired elements */
+	sequence_number_t oldest_period;
+	/* One more than the current period */
+	sequence_number_t next_period;
+	/* The offset in the array of lists of the oldest period */
+	block_count_t offset;
+	/* Expired pages */
+	dirty_era_t expired;
+	/* The lists of dirty pages */
+	dirty_era_t eras[];
+};
+
+struct block_map_zone {
+	zone_count_t zone_number;
+	thread_id_t thread_id;
+	struct admin_state state;
+	struct block_map *block_map;
+	/* Dirty pages, by era*/
+	struct dirty_lists *dirty_lists;
+	struct vdo_page_cache page_cache;
+	data_vio_count_t active_lookups;
+	struct int_map *loading_pages;
+	struct vio_pool *vio_pool;
+	/* The tree page which has issued or will be issuing a flush */
+	struct tree_page *flusher;
+	struct vdo_wait_queue flush_waiters;
+	/* The generation after the most recent flush */
+	u8 generation;
+	u8 oldest_generation;
+	/* The counts of dirty pages in each generation */
+	u32 dirty_page_counts[256];
+};
+
+struct block_map {
+	struct vdo *vdo;
+	struct action_manager *action_manager;
+	/* The absolute PBN of the first root of the tree part of the block map */
+	physical_block_number_t root_origin;
+	block_count_t root_count;
+
+	/* The era point we are currently distributing to the zones */
+	sequence_number_t current_era_point;
+	/* The next era point */
+	sequence_number_t pending_era_point;
+
+	/* The number of entries in block map */
+	block_count_t entry_count;
+	nonce_t nonce;
+	struct recovery_journal *journal;
+
+	/* The trees for finding block map pages */
+	struct forest *forest;
+	/* The expanded trees awaiting growth */
+	struct forest *next_forest;
+	/* The number of entries after growth */
+	block_count_t next_entry_count;
+
+	zone_count_t zone_count;
+	struct block_map_zone zones[];
+};
+
+/**
+ * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
+ *                                 the forest.
+ * @pbn: A PBN of a tree node.
+ * @completion: The parent completion of the traversal.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
+				     struct vdo_completion *completion);
+
+static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
+	return container_of(completion, struct vdo_page_completion, completion);
+}
+
+void vdo_release_page_completion(struct vdo_completion *completion);
+
+void vdo_get_page(struct vdo_page_completion *page_completion,
+		  struct block_map_zone *zone, physical_block_number_t pbn,
+		  bool writable, void *parent, vdo_action_fn callback,
+		  vdo_action_fn error_handler, bool requeue);
+
+void vdo_request_page_write(struct vdo_completion *completion);
+
+int __must_check vdo_get_cached_page(struct vdo_completion *completion,
+				     struct block_map_page **page_ptr);
+
+int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);
+
+static inline struct block_map_page * __must_check
+vdo_as_block_map_page(struct tree_page *tree_page)
+{
+	return (struct block_map_page *) tree_page->page_buffer;
+}
+
+bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
+			 physical_block_number_t pbn,
+			 struct block_map_page *page);
+
+void vdo_find_block_map_slot(struct data_vio *data_vio);
+
+physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
+						    page_number_t page_number);
+
+void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);
+
+void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
+			 struct vdo_completion *completion);
+
+int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
+				      block_count_t logical_blocks, struct vdo *vdo,
+				      struct recovery_journal *journal, nonce_t nonce,
+				      page_count_t cache_size, block_count_t maximum_age,
+				      struct block_map **map_ptr);
+
+void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
+			 struct vdo_completion *parent);
+
+void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);
+
+int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
+					       block_count_t new_logical_blocks);
+
+void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);
+
+void vdo_abandon_block_map_growth(struct block_map *map);
+
+void vdo_free_block_map(struct block_map *map);
+
+struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);
+
+void vdo_initialize_block_map_from_journal(struct block_map *map,
+					   struct recovery_journal *journal);
+
+zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);
+
+void vdo_advance_block_map_era(struct block_map *map,
+			       sequence_number_t recovery_block_number);
+
+void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
+			       physical_block_number_t pbn,
+			       enum block_mapping_state mapping_state,
+			       sequence_number_t *recovery_lock);
+
+void vdo_get_mapped_block(struct data_vio *data_vio);
+
+void vdo_put_mapped_block(struct data_vio *data_vio);
+
+struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);
+
+/**
+ * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format
+ * @age: The configured maximum age
+ *
+ * Return: The converted age
+ *
+ * In the old recovery journal format, each journal block held 311 entries, and every write bio
+ * made two entries. The old maximum age was half the usable journal length. In the new format,
+ * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
+ * age so that the number of writes in a block map era is the same in the old and new formats. This
+ * keeps the bound on the amount of work required to recover the block map from the recovery
+ * journal the same across the format change. It also keeps the amortization of block map page
+ * writes to write bios the same.
+ */
+static inline block_count_t vdo_convert_maximum_age(block_count_t age)
+{
+	return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
+			    2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
+}
+
+#endif /* VDO_BLOCK_MAP_H */
author	Linus Torvalds <torvalds@linux-foundation.org>	2024-03-13 17:57:23 +0100
committer	Linus Torvalds <torvalds@linux-foundation.org>	2024-03-13 17:57:23 +0100
commit	61387b8dcf1dc0f30cf690956a48768a3fce1810 (patch)
tree	78f176823ac74dc82f26462ce347735cf73b79f0 /drivers/md/dm-vdo/block-map.h
parent	Merge tag 'for-6.9/dm-bh-wq' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parent	dm vdo: document minimum metadata size requirements (diff)
download	linux-61387b8dcf1dc0f30cf690956a48768a3fce1810.tar.xz linux-61387b8dcf1dc0f30cf690956a48768a3fce1810.zip