summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c685
1 files changed, 423 insertions, 262 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3774c191e36d..d77498e7671c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -104,10 +104,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct extent_buffer *leaf;
struct btrfs_key key;
- u32 item_size;
u64 num_refs;
u64 extent_flags;
u64 owner = 0;
@@ -126,11 +123,6 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- if (!trans) {
- path->skip_locking = 1;
- path->search_commit_root = 1;
- }
-
search_again:
key.objectid = bytenr;
key.offset = offset;
@@ -144,7 +136,7 @@ search_again:
if (ret < 0)
goto out_free;
- if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
if (path->slots[0]) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -157,38 +149,37 @@ search_again:
}
if (ret == 0) {
- leaf = path->nodes[0];
- item_size = btrfs_item_size(leaf, path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- num_refs = btrfs_extent_refs(leaf, ei);
- extent_flags = btrfs_extent_flags(leaf, ei);
- owner = btrfs_get_extent_owner_root(fs_info, leaf,
- path->slots[0]);
- } else {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_extent_item *ei;
+ const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (unlikely(item_size < sizeof(*ei))) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- if (trans)
- btrfs_abort_transaction(trans, ret);
- else
- btrfs_handle_fs_error(fs_info, ret, NULL);
-
+ btrfs_abort_transaction(trans, ret);
goto out_free;
}
- BUG_ON(num_refs == 0);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ if (unlikely(num_refs == 0)) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "unexpected zero reference count for extent item (%llu %u %llu)",
+ key.objectid, key.type, key.offset);
+ btrfs_abort_transaction(trans, ret);
+ goto out_free;
+ }
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
} else {
num_refs = 0;
extent_flags = 0;
ret = 0;
}
- if (!trans)
- goto out;
-
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
@@ -211,15 +202,13 @@ search_again:
spin_lock(&head->lock);
if (head->extent_op && head->extent_op->update_flags)
extent_flags |= head->extent_op->flags_to_set;
- else
- BUG_ON(num_refs == 0);
num_refs += head->ref_mod;
spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
}
spin_unlock(&delayed_refs->lock);
-out:
+
WARN_ON(num_refs == 0);
if (refs)
*refs = num_refs;
@@ -1632,7 +1621,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
if (metadata) {
key.type = BTRFS_METADATA_ITEM_KEY;
- key.offset = extent_op->level;
+ key.offset = head->level;
} else {
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = head->num_bytes;
@@ -1667,7 +1656,7 @@ again:
ret = -EUCLEAN;
btrfs_err(fs_info,
"missing extent item for extent %llu num_bytes %llu level %d",
- head->bytenr, head->num_bytes, extent_op->level);
+ head->bytenr, head->num_bytes, head->level);
goto out;
}
}
@@ -1726,7 +1715,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
.generation = trans->transid,
};
- BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, node, extent_op);
if (!ret)
btrfs_record_squota_delta(fs_info, &delta);
@@ -2233,7 +2221,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags)
{
struct btrfs_delayed_extent_op *extent_op;
- int level = btrfs_header_level(eb);
int ret;
extent_op = btrfs_alloc_delayed_extent_op();
@@ -2243,9 +2230,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_flags = true;
extent_op->update_key = false;
- extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len,
+ btrfs_header_level(eb), extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
@@ -3419,10 +3406,10 @@ out_delayed_unlock:
return 0;
}
-void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- u64 root_id,
- struct extent_buffer *buf,
- u64 parent, int last_ref)
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *bg;
@@ -3449,11 +3436,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret < 0)
+ return ret;
}
if (!last_ref)
- return;
+ return 0;
if (btrfs_header_generation(buf) != trans->transid)
goto out;
@@ -3510,6 +3498,7 @@ out:
* matter anymore.
*/
clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+ return 0;
}
/* Can return -ENOMEM */
@@ -4864,7 +4853,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
u32 size = sizeof(*extent_item) + sizeof(*iref);
- u64 flags = extent_op->flags_to_set;
+ const u64 flags = (extent_op ? extent_op->flags_to_set : 0);
/* The owner of a tree block is the level. */
int level = btrfs_delayed_ref_owner(node);
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -5121,7 +5110,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
- struct btrfs_delayed_extent_op *extent_op;
u64 flags = 0;
int ret;
u32 blocksize = fs_info->nodesize;
@@ -5164,6 +5152,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(parent > 0);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_delayed_extent_op *extent_op;
struct btrfs_ref generic_ref = {
.action = BTRFS_ADD_DELAYED_EXTENT,
.bytenr = ins.objectid,
@@ -5172,30 +5161,34 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
.owning_root = owning_root,
.ref_root = root_objectid,
};
- extent_op = btrfs_alloc_delayed_extent_op();
- if (!extent_op) {
- ret = -ENOMEM;
- goto out_free_buf;
+
+ if (!skinny_metadata || flags != 0) {
+ extent_op = btrfs_alloc_delayed_extent_op();
+ if (!extent_op) {
+ ret = -ENOMEM;
+ goto out_free_buf;
+ }
+ if (key)
+ memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ else
+ memset(&extent_op->key, 0, sizeof(extent_op->key));
+ extent_op->flags_to_set = flags;
+ extent_op->update_key = (skinny_metadata ? false : true);
+ extent_op->update_flags = (flags != 0);
+ } else {
+ extent_op = NULL;
}
- if (key)
- memcpy(&extent_op->key, key, sizeof(extent_op->key));
- else
- memset(&extent_op->key, 0, sizeof(extent_op->key));
- extent_op->flags_to_set = flags;
- extent_op->update_key = skinny_metadata ? false : true;
- extent_op->update_flags = true;
- extent_op->level = level;
btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
- if (ret)
- goto out_free_delayed;
+ if (ret) {
+ btrfs_free_delayed_extent_op(extent_op);
+ goto out_free_buf;
+ }
}
return buf;
-out_free_delayed:
- btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
btrfs_tree_unlock(buf);
free_extent_buffer(buf);
@@ -5220,11 +5213,99 @@ struct walk_control {
int reada_slot;
int reada_count;
int restarted;
+ /* Indicate that extent info needs to be looked up when walking the tree. */
+ int lookup_info;
};
+/*
+ * This is our normal stage. We are traversing blocks the current snapshot owns
+ * and we are dropping any of our references to any children we are able to, and
+ * then freeing the block once we've processed all of the children.
+ */
#define DROP_REFERENCE 1
+
+/*
+ * We enter this stage when we have to walk into a child block (meaning we can't
+ * simply drop our reference to it from our current parent node) and there are
+ * more than one reference on it. If we are the owner of any of the children
+ * blocks from the current parent node then we have to do the FULL_BACKREF dance
+ * on them in order to drop our normal ref and add the shared ref.
+ */
#define UPDATE_BACKREF 2
+/*
+ * Decide if we need to walk down into this node to adjust the references.
+ *
+ * @root: the root we are currently deleting
+ * @wc: the walk control for this deletion
+ * @eb: the parent eb that we're currently visiting
+ * @refs: the number of refs for wc->level - 1
+ * @flags: the flags for wc->level - 1
+ * @slot: the slot in the eb that we're currently checking
+ *
+ * This is meant to be called when we're evaluating if a node we point to at
+ * wc->level should be read and walked into, or if we can simply delete our
+ * reference to it. We return true if we should walk into the node, false if we
+ * can skip it.
+ *
+ * We have assertions in here to make sure this is called correctly. We assume
+ * that sanity checking on the blocks read to this point has been done, so any
+ * corrupted file systems must have been caught before calling this function.
+ */
+static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
+ struct extent_buffer *eb, u64 refs, u64 flags, int slot)
+{
+ struct btrfs_key key;
+ u64 generation;
+ int level = wc->level;
+
+ ASSERT(level > 0);
+ ASSERT(wc->refs[level - 1] > 0);
+
+ /*
+ * The update backref stage we only want to skip if we already have
+ * FULL_BACKREF set, otherwise we need to read.
+ */
+ if (wc->stage == UPDATE_BACKREF) {
+ if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ return false;
+ return true;
+ }
+
+ /*
+ * We're the last ref on this block, we must walk into it and process
+ * any refs it's pointing at.
+ */
+ if (wc->refs[level - 1] == 1)
+ return true;
+
+ /*
+ * If we're already FULL_BACKREF then we know we can just drop our
+ * current reference.
+ */
+ if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ return false;
+
+ /*
+ * This block is older than our creation generation, we can drop our
+ * reference to it.
+ */
+ generation = btrfs_node_ptr_generation(eb, slot);
+ if (!wc->update_ref || generation <= root->root_key.offset)
+ return false;
+
+ /*
+ * This block was processed from a previous snapshot deletion run, we
+ * can skip it.
+ */
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0)
+ return false;
+
+ /* All other cases we need to wander into the node. */
+ return true;
+}
+
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct walk_control *wc,
@@ -5236,7 +5317,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
u64 refs;
u64 flags;
u32 nritems;
- struct btrfs_key key;
struct extent_buffer *eb;
int ret;
int slot;
@@ -5276,28 +5356,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
- BUG_ON(refs == 0);
- if (wc->stage == DROP_REFERENCE) {
- if (refs == 1)
- goto reada;
+ /*
+ * This could be racey, it's conceivable that we raced and end
+ * up with a bogus refs count, if that's the case just skip, if
+ * we are actually corrupt we will notice when we look up
+ * everything again with our locks.
+ */
+ if (refs == 0)
+ continue;
- if (wc->level == 1 &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- continue;
- if (!wc->update_ref ||
- generation <= root->root_key.offset)
- continue;
- btrfs_node_key_to_cpu(eb, &key, slot);
- ret = btrfs_comp_cpu_keys(&key,
- &wc->update_progress);
- if (ret < 0)
- continue;
- } else {
- if (wc->level == 1 &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- continue;
- }
+ /* If we don't need to visit this node don't reada. */
+ if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
+ continue;
reada:
btrfs_readahead_node_child(eb, slot);
nread++;
@@ -5316,7 +5387,7 @@ reada:
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc, int lookup_info)
+ struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int level = wc->level;
@@ -5331,19 +5402,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
* when reference count of tree block is 1, it won't increase
* again. once full backref flag is set, we never clear it.
*/
- if (lookup_info &&
+ if (wc->lookup_info &&
((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
- BUG_ON(!path->locks[level]);
+ ASSERT(path->locks[level]);
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
&wc->flags[level],
NULL);
- BUG_ON(ret == -ENOMEM);
if (ret)
return ret;
- BUG_ON(wc->refs[level] == 0);
+ if (unlikely(wc->refs[level] == 0)) {
+ btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
+ eb->start);
+ return -EUCLEAN;
+ }
}
if (wc->stage == DROP_REFERENCE) {
@@ -5359,13 +5433,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
- BUG_ON(!path->locks[level]);
+ ASSERT(path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = btrfs_set_disk_extent_flags(trans, eb, flag);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
wc->flags[level] |= flag;
}
@@ -5408,6 +5491,132 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
}
/*
+ * We may not have an uptodate block, so if we are going to walk down into this
+ * block we need to drop the lock, read it off of the disk, re-lock it and
+ * return to continue dropping the snapshot.
+ */
+static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc,
+ struct extent_buffer *next)
+{
+ struct btrfs_tree_parent_check check = { 0 };
+ u64 generation;
+ int level = wc->level;
+ int ret;
+
+ btrfs_assert_tree_write_locked(next);
+
+ generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
+
+ if (btrfs_buffer_uptodate(next, generation, 0))
+ return 0;
+
+ check.level = level - 1;
+ check.transid = generation;
+ check.owner_root = btrfs_root_id(root);
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
+
+ btrfs_tree_unlock(next);
+ if (level == 1)
+ reada_walk_down(trans, root, wc, path);
+ ret = btrfs_read_extent_buffer(next, &check);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+ btrfs_tree_lock(next);
+ wc->lookup_info = 1;
+ return 0;
+}
+
+/*
+ * If we determine that we don't have to visit wc->level - 1 then we need to
+ * determine if we can drop our reference.
+ *
+ * If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
+ *
+ * If we are DROP_REFERENCE this will figure out if we need to drop our current
+ * reference, skipping it if we dropped it from a previous incompleted drop, or
+ * dropping it if we still have a reference to it.
+ */
+static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, struct walk_control *wc,
+ struct extent_buffer *next, u64 owner_root)
+{
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = next->start,
+ .num_bytes = root->fs_info->nodesize,
+ .owning_root = owner_root,
+ .ref_root = btrfs_root_id(root),
+ };
+ int level = wc->level;
+ int ret;
+
+ /* We are UPDATE_BACKREF, we're not dropping anything. */
+ if (wc->stage == UPDATE_BACKREF)
+ return 0;
+
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ ref.parent = path->nodes[level]->start;
+ } else {
+ ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
+ if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
+ btrfs_err(root->fs_info, "mismatched block owner");
+ return -EIO;
+ }
+ }
+
+ /*
+ * If we had a drop_progress we need to verify the refs are set as
+ * expected. If we find our ref then we know that from here on out
+ * everything should be correct, and we can clear the
+ * ->restarted flag.
+ */
+ if (wc->restarted) {
+ ret = check_ref_exists(trans, root, next->start, ref.parent,
+ level - 1);
+ if (ret <= 0)
+ return ret;
+ ret = 0;
+ wc->restarted = 0;
+ }
+
+ /*
+ * Reloc tree doesn't contribute to qgroup numbers, and we have already
+ * accounted them at merge time (replace_path), thus we could skip
+ * expensive subtree trace here.
+ */
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ wc->refs[level - 1] > 1) {
+ u64 generation = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
+
+ ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1);
+ if (ret) {
+ btrfs_err_rl(root->fs_info,
+"error %d accounting shared subtree, quota is out of sync, rescan required",
+ ret);
+ }
+ }
+
+ /*
+ * We need to update the next key in our walk control so we can update
+ * the drop_progress key accordingly. We don't care if find_next_key
+ * doesn't find a key because that means we're at the end and are going
+ * to clean up now.
+ */
+ wc->drop_level = level;
+ find_next_key(path, level, &wc->drop_progress);
+
+ btrfs_init_tree_ref(&ref, level - 1, 0, false);
+ return btrfs_free_extent(trans, &ref);
+}
+
+/*
* helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
@@ -5423,19 +5632,15 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc, int *lookup_info)
+ struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 generation;
u64 owner_root = 0;
- struct btrfs_tree_parent_check check = { 0 };
- struct btrfs_key key;
struct extent_buffer *next;
int level = wc->level;
- int reada = 0;
int ret = 0;
- bool need_account = false;
generation = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
@@ -5446,27 +5651,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
*/
if (wc->stage == UPDATE_BACKREF &&
generation <= root->root_key.offset) {
- *lookup_info = 1;
+ wc->lookup_info = 1;
return 1;
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- check.level = level - 1;
- check.transid = generation;
- check.owner_root = btrfs_root_id(root);
- check.has_first_key = true;
- btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
- path->slots[level]);
+ next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
+ level - 1);
+ if (IS_ERR(next))
+ return PTR_ERR(next);
- next = find_extent_buffer(fs_info, bytenr);
- if (!next) {
- next = btrfs_find_create_tree_block(fs_info, bytenr,
- btrfs_root_id(root), level - 1);
- if (IS_ERR(next))
- return PTR_ERR(next);
- reada = 1;
- }
btrfs_tree_lock(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
@@ -5477,57 +5672,32 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
goto out_unlock;
if (unlikely(wc->refs[level - 1] == 0)) {
- btrfs_err(fs_info, "Missing references.");
- ret = -EIO;
+ btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
+ bytenr);
+ ret = -EUCLEAN;
goto out_unlock;
}
- *lookup_info = 0;
+ wc->lookup_info = 0;
- if (wc->stage == DROP_REFERENCE) {
- if (wc->refs[level - 1] > 1) {
- need_account = true;
- if (level == 1 &&
- (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- goto skip;
-
- if (!wc->update_ref ||
- generation <= root->root_key.offset)
- goto skip;
-
- btrfs_node_key_to_cpu(path->nodes[level], &key,
- path->slots[level]);
- ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
- if (ret < 0)
- goto skip;
+ /* If we don't have to walk into this node skip it. */
+ if (!visit_node_for_delete(root, wc, path->nodes[level],
+ wc->refs[level - 1], wc->flags[level - 1],
+ path->slots[level]))
+ goto skip;
- wc->stage = UPDATE_BACKREF;
- wc->shared_level = level - 1;
- }
- } else {
- if (level == 1 &&
- (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- goto skip;
+ /*
+ * We have to walk down into this node, and if we're currently at the
+ * DROP_REFERNCE stage and this block is shared then we need to switch
+ * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
+ */
+ if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
+ wc->stage = UPDATE_BACKREF;
+ wc->shared_level = level - 1;
}
- if (!btrfs_buffer_uptodate(next, generation, 0)) {
- btrfs_tree_unlock(next);
- free_extent_buffer(next);
- next = NULL;
- *lookup_info = 1;
- }
-
- if (!next) {
- if (reada && level == 1)
- reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, &check);
- if (IS_ERR(next)) {
- return PTR_ERR(next);
- } else if (!extent_buffer_uptodate(next)) {
- free_extent_buffer(next);
- return -EIO;
- }
- btrfs_tree_lock(next);
- }
+ ret = check_next_block_uptodate(trans, root, path, wc, next);
+ if (ret)
+ return ret;
level--;
ASSERT(level == btrfs_header_level(next));
@@ -5544,78 +5714,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
wc->reada_slot = 0;
return 0;
skip:
+ ret = maybe_drop_reference(trans, root, path, wc, next, owner_root);
+ if (ret)
+ goto out_unlock;
wc->refs[level - 1] = 0;
wc->flags[level - 1] = 0;
- if (wc->stage == DROP_REFERENCE) {
- struct btrfs_ref ref = {
- .action = BTRFS_DROP_DELAYED_REF,
- .bytenr = bytenr,
- .num_bytes = fs_info->nodesize,
- .owning_root = owner_root,
- .ref_root = btrfs_root_id(root),
- };
- if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- ref.parent = path->nodes[level]->start;
- } else {
- ASSERT(btrfs_root_id(root) ==
- btrfs_header_owner(path->nodes[level]));
- if (btrfs_root_id(root) !=
- btrfs_header_owner(path->nodes[level])) {
- btrfs_err(root->fs_info,
- "mismatched block owner");
- ret = -EIO;
- goto out_unlock;
- }
- }
-
- /*
- * If we had a drop_progress we need to verify the refs are set
- * as expected. If we find our ref then we know that from here
- * on out everything should be correct, and we can clear the
- * ->restarted flag.
- */
- if (wc->restarted) {
- ret = check_ref_exists(trans, root, bytenr, ref.parent,
- level - 1);
- if (ret < 0)
- goto out_unlock;
- if (ret == 0)
- goto no_delete;
- ret = 0;
- wc->restarted = 0;
- }
-
- /*
- * Reloc tree doesn't contribute to qgroup numbers, and we have
- * already accounted them at merge time (replace_path),
- * thus we could skip expensive subtree trace here.
- */
- if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && need_account) {
- ret = btrfs_qgroup_trace_subtree(trans, next,
- generation, level - 1);
- if (ret) {
- btrfs_err_rl(fs_info,
- "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
- ret);
- }
- }
-
- /*
- * We need to update the next key in our walk control so we can
- * update the drop_progress key accordingly. We don't care if
- * find_next_key doesn't find a key because that means we're at
- * the end and are going to clean up now.
- */
- wc->drop_level = level;
- find_next_key(path, level, &wc->drop_progress);
-
- btrfs_init_tree_ref(&ref, level - 1, 0, false);
- ret = btrfs_free_extent(trans, &ref);
- if (ret)
- goto out_unlock;
- }
-no_delete:
- *lookup_info = 1;
+ wc->lookup_info = 1;
ret = 1;
out_unlock:
@@ -5643,13 +5747,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ int ret = 0;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
u64 parent = 0;
if (wc->stage == UPDATE_BACKREF) {
- BUG_ON(wc->shared_level < level);
+ ASSERT(wc->shared_level >= level);
if (level < wc->shared_level)
goto out;
@@ -5667,7 +5771,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
* count is one.
*/
if (!path->locks[level]) {
- BUG_ON(level == 0);
+ ASSERT(level > 0);
btrfs_tree_lock(eb);
path->locks[level] = BTRFS_WRITE_LOCK;
@@ -5681,7 +5785,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
return ret;
}
- BUG_ON(wc->refs[level] == 0);
+ if (unlikely(wc->refs[level] == 0)) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
+ eb->start);
+ return -EUCLEAN;
+ }
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
@@ -5691,7 +5800,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
}
/* wc->stage == DROP_REFERENCE */
- BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
+ ASSERT(path->locks[level] || wc->refs[level] == 1);
if (wc->refs[level] == 1) {
if (level == 0) {
@@ -5699,7 +5808,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 1);
else
ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
if (is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
@@ -5730,12 +5842,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
goto owner_mismatch;
}
- btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
- wc->refs[level] == 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
+ wc->refs[level] == 1);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
- return 0;
+ return ret;
owner_mismatch:
btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
@@ -5743,17 +5857,38 @@ owner_mismatch:
return -EUCLEAN;
}
+/*
+ * walk_down_tree consists of two steps.
+ *
+ * walk_down_proc(). Look up the reference count and reference of our current
+ * wc->level. At this point path->nodes[wc->level] should be populated and
+ * uptodate, and in most cases should already be locked. If we are in
+ * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and
+ * we can walk back up the tree. If we are UPDATE_BACKREF we have to set
+ * FULL_BACKREF on this node if it's not already set, and then do the
+ * FULL_BACKREF conversion dance, which is to drop the root reference and add
+ * the shared reference to all of this nodes children.
+ *
+ * do_walk_down(). This is where we actually start iterating on the children of
+ * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping
+ * our reference to the children that return false from visit_node_for_delete(),
+ * which has various conditions where we know we can just drop our reference
+ * without visiting the node. For UPDATE_BACKREF we will skip any children that
+ * visit_node_for_delete() returns false for, only walking down when necessary.
+ * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of
+ * snapshot deletion.
+ */
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct walk_control *wc)
{
int level = wc->level;
- int lookup_info = 1;
int ret = 0;
+ wc->lookup_info = 1;
while (level >= 0) {
- ret = walk_down_proc(trans, root, path, wc, lookup_info);
+ ret = walk_down_proc(trans, root, path, wc);
if (ret)
break;
@@ -5764,7 +5899,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
btrfs_header_nritems(path->nodes[level]))
break;
- ret = do_walk_down(trans, root, path, wc, &lookup_info);
+ ret = do_walk_down(trans, root, path, wc);
if (ret > 0) {
path->slots[level]++;
continue;
@@ -5775,6 +5910,23 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
return (ret == 1) ? 0 : ret;
}
+/*
+ * walk_up_tree() is responsible for making sure we visit every slot on our
+ * current node, and if we're at the end of that node then we call
+ * walk_up_proc() on our current node which will do one of a few things based on
+ * our stage.
+ *
+ * UPDATE_BACKREF. If we wc->level is currently less than our wc->shared_level
+ * then we need to walk back up the tree, and then going back down into the
+ * other slots via walk_down_tree to update any other children from our original
+ * wc->shared_level. Once we're at or above our wc->shared_level we can switch
+ * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on.
+ *
+ * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block.
+ * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents
+ * in our current leaf. After that we call btrfs_free_tree_block() on the
+ * current node and walk up to the next node to walk down the next slot.
+ */
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -5833,8 +5985,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
struct btrfs_root_item *root_item = &root->root_item;
struct walk_control *wc;
struct btrfs_key key;
- int err = 0;
- int ret;
+ const u64 rootid = btrfs_root_id(root);
+ int ret = 0;
int level;
bool root_dropped = false;
bool unfinished_drop = false;
@@ -5843,14 +5995,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
wc = kzalloc(sizeof(*wc), GFP_NOFS);
if (!wc) {
btrfs_free_path(path);
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -5863,12 +6015,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_free;
}
- err = btrfs_run_delayed_items(trans);
- if (err)
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
goto out_end_trans;
/*
@@ -5899,11 +6051,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->lowest_level = 0;
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_end_trans;
- }
+
WARN_ON(ret > 0);
+ ret = 0;
/*
* unlock our path, this is safe because only this
@@ -5916,14 +6068,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_tree_lock(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK;
+ /*
+ * btrfs_lookup_extent_info() returns 0 for success,
+ * or < 0 for error.
+ */
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
&wc->flags[level], NULL);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_end_trans;
- }
+
BUG_ON(wc->refs[level] == 0);
if (level == btrfs_root_drop_level(root_item))
@@ -5949,19 +6104,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
break;
}
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
break;
}
if (ret > 0) {
BUG_ON(wc->stage != DROP_REFERENCE);
+ ret = 0;
break;
}
@@ -5983,7 +6137,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
root_item);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
@@ -5994,7 +6147,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
"drop snapshot early exit");
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out_free;
}
@@ -6008,19 +6161,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_free;
}
}
}
btrfs_release_path(path);
- if (err)
+ if (ret)
goto out_end_trans;
ret = btrfs_del_root(trans, &root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
@@ -6029,10 +6181,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
NULL, NULL);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
} else if (ret > 0) {
- /* if we fail to delete the orphan item this time
+ ret = 0;
+ /*
+ * If we fail to delete the orphan item this time
* around, it'll get picked up the next time.
*
* The most common failure here is just -ENOENT.
@@ -6063,11 +6216,19 @@ out_free:
kfree(wc);
btrfs_free_path(path);
out:
+ if (!ret && root_dropped) {
+ ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid);
+ if (ret < 0)
+ btrfs_warn_rl(fs_info,
+ "failed to cleanup qgroup 0/%llu: %d",
+ rootid, ret);
+ ret = 0;
+ }
/*
* We were an unfinished drop root, check to see if there are any
* pending, and if not clear and wake up any waiters.
*/
- if (!err && unfinished_drop)
+ if (!ret && unfinished_drop)
btrfs_maybe_wake_unfinished_drop(fs_info);
/*
@@ -6079,7 +6240,7 @@ out:
*/
if (!for_reloc && !root_dropped)
btrfs_add_dead_root(root);
- return err;
+ return ret;
}
/*