summaryrefslogtreecommitdiffstats
path: root/fs/bcachefs/snapshot.c
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2023-08-18 04:10:02 +0200
committerKent Overstreet <kent.overstreet@linux.dev>2023-10-22 23:10:11 +0200
commitf55d6e07bc6c9b90f58586daf9c432adb5f5ce25 (patch)
tree9c1f8caabf80bac3feb4bf693c2a622709058727 /fs/bcachefs/snapshot.c
parentbcachefs: Fix btree write buffer with snapshots btrees (diff)
downloadlinux-f55d6e07bc6c9b90f58586daf9c432adb5f5ce25.tar.xz
linux-f55d6e07bc6c9b90f58586daf9c432adb5f5ce25.zip
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where some nodes only have one child, and we have a linear chain. Interior snapshot nodes are never used directly (i.e. they never have subvolumes that point to them), they are only referered to by child snapshot nodes - hence, they are redundant. The existing code talks about redundant snapshot nodes as forming and equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a given equivalence class, we only ever need a single key at a given position - i.e. multiple versions with different snapshot fields are redundant. The existing snapshot cleanup code deletes these redundant keys, but not redundant nodes. It turns out this is buggy, because we assume that after snapshot deletion finishes we should only have a single key per equivalence class, but the btree update path doesn't preserve this - overwriting keys in old snapshots doesn't check for the equivalence class being equal, and thus we can end up with duplicate keys in the same equivalence class and fsck complaining about snapshot deletion not having run correctly. The equivalence class notion has been leaking out of the core snapshots code and into too much other code, i.e. fsck, so this patch takes a different approach: snapshot deletion now moves keys to the node in an equivalence class being kept (the leafiest node) and then deletes the redundant nodes in the equivalance class. Some work has to be done to correctly delete interior snapshot nodes; snapshot node depth and skiplist fields for descendent nodes have to be fixed. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to '')
-rw-r--r--fs/bcachefs/snapshot.c250
1 files changed, 222 insertions, 28 deletions
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 56961c074674..25c888051ca4 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
@@ -273,10 +274,9 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
id = le32_to_cpu(s.v->skip[i]);
- if (!id != !s.v->parent ||
- (s.v->parent &&
- id <= k.k->p.offset)) {
- prt_printf(err, "bad skiplist node %u)", id);
+ if ((id && !s.v->parent) ||
+ (id && id <= k.k->p.offset)) {
+ prt_printf(err, "bad skiplist node %u", id);
return -BCH_ERR_invalid_bkey;
}
}
@@ -933,13 +933,20 @@ err:
return ret;
}
-static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
+{
+ if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
+ swap(s->children[0], s->children[1]);
+}
+
+int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
struct bch_fs *c = trans->c;
struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+ struct btree_iter c_iter = (struct btree_iter) { NULL };
struct btree_iter tree_iter = (struct btree_iter) { NULL };
struct bkey_s_c_snapshot s;
- u32 parent_id;
+ u32 parent_id, child_id;
unsigned i;
int ret = 0;
@@ -952,8 +959,10 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
if (ret)
goto err;
- BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
+ BUG_ON(s.v->children[1]);
+
parent_id = le32_to_cpu(s.v->parent);
+ child_id = le32_to_cpu(s.v->children[0]);
if (parent_id) {
struct bkey_i_snapshot *parent;
@@ -962,27 +971,48 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
BTREE_ID_snapshots, POS(0, parent_id),
0, snapshot);
ret = PTR_ERR_OR_ZERO(parent);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", parent_id);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", parent_id);
+ if (unlikely(ret))
goto err;
- }
+ /* find entry in parent->children for node being deleted */
for (i = 0; i < 2; i++)
if (le32_to_cpu(parent->v.children[i]) == id)
break;
- if (i == 2)
- bch_err(c, "snapshot %u missing child pointer to %u",
- parent_id, id);
- else
- parent->v.children[i] = 0;
+ if (bch2_fs_inconsistent_on(i == 2, c,
+ "snapshot %u missing child pointer to %u",
+ parent_id, id))
+ goto err;
+
+ parent->v.children[i] = le32_to_cpu(child_id);
- if (le32_to_cpu(parent->v.children[0]) <
- le32_to_cpu(parent->v.children[1]))
- swap(parent->v.children[0],
- parent->v.children[1]);
- } else {
+ normalize_snapshot_child_pointers(&parent->v);
+ }
+
+ if (child_id) {
+ struct bkey_i_snapshot *child;
+
+ child = bch2_bkey_get_mut_typed(trans, &c_iter,
+ BTREE_ID_snapshots, POS(0, child_id),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(child);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", child_id);
+ if (unlikely(ret))
+ goto err;
+
+ child->v.parent = cpu_to_le32(parent_id);
+
+ if (!child->v.parent) {
+ child->v.skip[0] = 0;
+ child->v.skip[1] = 0;
+ child->v.skip[2] = 0;
+ }
+ }
+
+ if (!parent_id) {
/*
* We're deleting the root of a snapshot tree: update the
* snapshot_tree entry to point to the new root, or delete it if
@@ -1011,6 +1041,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
err:
bch2_trans_iter_exit(trans, &tree_iter);
bch2_trans_iter_exit(trans, &p_iter);
+ bch2_trans_iter_exit(trans, &c_iter);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1166,6 +1197,12 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
*
* also: unlinked inode in internal snapshot appears to not be getting deleted
* correctly if inode doesn't exist in leaf snapshots
+ *
+ * solution:
+ *
+ * for a key in an interior snapshot node that needs work to be done that
+ * requires it to be mutated: iterate over all descendent leaf nodes and copy
+ * that key to snapshot leaf nodes, where we can mutate it
*/
static int snapshot_delete_key(struct btree_trans *trans,
@@ -1191,6 +1228,54 @@ static int snapshot_delete_key(struct btree_trans *trans,
}
}
+static int move_key_to_correct_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ /*
+ * When we have a linear chain of snapshot nodes, we consider
+ * those to form an equivalence class: we're going to collapse
+ * them all down to a single node, and keep the leaf-most node -
+ * which has the same id as the equivalence class id.
+ *
+ * If there are multiple keys in different snapshots at the same
+ * position, we're only going to keep the one in the newest
+ * snapshot - the rest have been overwritten and are redundant,
+ * and for the key we're going to keep we need to move it to the
+ * equivalance class ID if it's not there already.
+ */
+ if (equiv != k.k->p.snapshot) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ struct btree_iter new_iter;
+ int ret;
+
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ new->k.p.snapshot = equiv;
+
+ bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&new_iter) ?:
+ bch2_trans_update(trans, &new_iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &new_iter);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
/*
* For a given snapshot, if it doesn't have a subvolume that points to it, and
* it doesn't have child snapshot nodes - it's now redundant and we can mark it
@@ -1224,6 +1309,77 @@ static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btre
return 0;
}
+static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
+ snapshot_id_list *skip)
+{
+ rcu_read_lock();
+ while (n--) {
+ do {
+ id = __bch2_snapshot_parent(c, id);
+ } while (snapshot_list_has_id(skip, id));
+ }
+ rcu_read_unlock();
+
+ return id;
+}
+
+static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c k,
+ snapshot_id_list *deleted)
+{
+ struct bch_fs *c = trans->c;
+ u32 nr_deleted_ancestors = 0;
+ struct bkey_i_snapshot *s;
+ u32 *i;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ if (snapshot_list_has_id(deleted, k.k->p.offset))
+ return 0;
+
+ s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ darray_for_each(*deleted, i)
+ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
+
+ if (!nr_deleted_ancestors)
+ return 0;
+
+ le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
+
+ if (!s->v.depth) {
+ s->v.skip[0] = 0;
+ s->v.skip[1] = 0;
+ s->v.skip[2] = 0;
+ } else {
+ u32 depth = le32_to_cpu(s->v.depth);
+ u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
+
+ for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
+ u32 id = le32_to_cpu(s->v.skip[j]);
+
+ if (snapshot_list_has_id(deleted, id)) {
+ id = depth > 1
+ ? bch2_snapshot_nth_parent_skip(c,
+ parent,
+ get_random_u32_below(depth - 1),
+ deleted)
+ : parent;
+ s->v.skip[j] = cpu_to_le32(id);
+ }
+ }
+
+ bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
+ }
+
+ return bch2_trans_update(trans, iter, &s->k_i, 0);
+}
+
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
struct btree_trans trans;
@@ -1231,7 +1387,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
struct bkey_s_c k;
struct bkey_s_c_snapshot snap;
snapshot_id_list deleted = { 0 };
- u32 i, id;
+ snapshot_id_list deleted_interior = { 0 };
+ u32 *i, id;
int ret = 0;
if (!test_bit(BCH_FS_STARTED, &c->flags)) {
@@ -1287,6 +1444,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
for (id = 0; id < BTREE_ID_NR; id++) {
struct bpos last_pos = POS_MIN;
snapshot_id_list equiv_seen = { 0 };
+ struct disk_reservation res = { 0 };
if (!btree_type_has_snapshots(id))
continue;
@@ -1294,9 +1452,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(&trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
+ &res, NULL, BTREE_INSERT_NOFAIL,
+ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
+ for_each_btree_key_commit(&trans, iter,
+ id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ &res, NULL, BTREE_INSERT_NOFAIL,
+ move_key_to_correct_snapshot(&trans, &iter, k));
+ bch2_disk_reservation_put(c, &res);
darray_exit(&equiv_seen);
if (ret) {
@@ -1305,19 +1469,49 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
}
}
- for (i = 0; i < deleted.nr; i++) {
- u32 node_to_delete = deleted.data[i];
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ u32 snapshot = k.k->p.offset;
+ u32 equiv = bch2_snapshot_equiv(c, snapshot);
+
+ if (equiv != snapshot)
+ snapshot_list_add(c, &deleted_interior, snapshot);
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ /*
+ * Fixing children of deleted snapshots can't be done completely
+ * atomically, if we crash between here and when we delete the interior
+ * nodes some depth fields will be off:
+ */
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN,
+ BTREE_ITER_INTENT, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior));
+ if (ret)
+ goto err;
+
+ darray_for_each(deleted, i) {
+ ret = commit_do(&trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(&trans, *i));
+ if (ret) {
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ goto err;
+ }
+ }
+ darray_for_each(deleted_interior, i) {
ret = commit_do(&trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(&trans, node_to_delete));
+ bch2_snapshot_node_delete(&trans, *i));
if (ret) {
- bch_err_msg(c, ret, "deleting snapshot %u", node_to_delete);
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
goto err;
}
}
clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
err:
+ darray_exit(&deleted_interior);
darray_exit(&deleted);
bch2_trans_exit(&trans);
if (ret)