summaryrefslogtreecommitdiffstats
path: root/fs/bcachefs/btree_io.c
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2024-05-08 06:29:24 +0200
committerKent Overstreet <kent.overstreet@linux.dev>2024-05-08 20:56:09 +0200
commit5dfd3746b6c486db18bc75de89c7abce41c7826c (patch)
treec43f0029ada7870f3f8be29579c382a481464174 /fs/bcachefs/btree_io.c
parentbcachefs: Fix sb_clean_validate endianness conversion (diff)
downloadlinux-5dfd3746b6c486db18bc75de89c7abce41c7826c.tar.xz
linux-5dfd3746b6c486db18bc75de89c7abce41c7826c.zip
bcachefs: Fix needs_whiteout BUG_ON() in bkey_sort()
Btree nodes are log structured; thus, we need to emit whiteouts when we're deleting a key that's been written out to disk. k->needs_whiteout tracks whether a key will need a whiteout when it's deleted, and this requires some careful handling; e.g. the key we're deleting may not have been written out to disk, but it may have overwritten a key that was - thus we need to carry this flag around on overwrites. Invariants: There may be multiple key for the same position in a given node (because of overwrites), but only one of them will be a live (non deleted) key, and only one key for a given position will have the needs_whiteout flag set. Additionally, we don't want to carry around whiteouts that need to be written in the main searchable part of a btree node - btree_iter_peek() will have to skip past them, and this can lead to an O(n^2) issues when doing sequential deletions (e.g. inode rm/truncate). So there's a separate region in the btree node buffer for unwritten whiteouts; these are merge sorted with the rest of the keys we're writing in the btree node write path. The unwritten whiteouts was a later optimization that bch2_sort_keys() didn't take into account; the unwritten whiteouts area means that we never have deleted keys with needs_whiteout set in the main searchable part of a btree node. That means we can simplify and optimize some sort paths, and eliminate an assertion that syzbot found: - Unless we're in the btree node write path, it's always ok to drop whiteouts when sorting - When sorting for a btree node write, we drop the whiteout if it's not from the unwritten whiteouts area, or if it's overwritten by a real key at the same position. This completely eliminates some tricky logic for propagating the needs_whiteout flag: syzbot was able to hit the assertion that checked that there shouldn't be more than one key at the same pos with needs_whiteout set, likely due to a combination of flipping on needs_whiteout on all written keys (they need whiteouts if overwritten), combined with not always dropping unneeded whiteouts, and the tricky logic in the sort path for preserving needs_whiteout that wasn't really needed. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs/btree_io.c')
-rw-r--r--fs/bcachefs/btree_io.c18
1 files changed, 8 insertions, 10 deletions
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index debb0edc3455..94753c5b0e89 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -288,8 +288,7 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
static void btree_node_sort(struct bch_fs *c, struct btree *b,
unsigned start_idx,
- unsigned end_idx,
- bool filter_whiteouts)
+ unsigned end_idx)
{
struct btree_node *out;
struct sort_iter_stack sort_iter;
@@ -320,7 +319,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
start_time = local_clock();
- u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
+ u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
out->keys.u64s = cpu_to_le16(u64s);
@@ -426,13 +425,12 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b)
break;
if (b->nsets - unwritten_idx > 1) {
- btree_node_sort(c, b, unwritten_idx,
- b->nsets, false);
+ btree_node_sort(c, b, unwritten_idx, b->nsets);
ret = true;
}
if (unwritten_idx > 1) {
- btree_node_sort(c, b, 0, unwritten_idx, false);
+ btree_node_sort(c, b, 0, unwritten_idx);
ret = true;
}
@@ -2095,11 +2093,11 @@ do_write:
unwritten_whiteouts_end(b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
- b->whiteout_u64s = 0;
-
- u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
+ u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
le16_add_cpu(&i->u64s, u64s);
+ b->whiteout_u64s = 0;
+
BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
set_needs_whiteout(i, false);
@@ -2249,7 +2247,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
* single bset:
*/
if (b->nsets > 1) {
- btree_node_sort(c, b, 0, b->nsets, true);
+ btree_node_sort(c, b, 0, b->nsets);
invalidated_iter = true;
} else {
invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);