summaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/ctree.c234
-rw-r--r--fs/btrfs/ctree.h4
-rw-r--r--fs/btrfs/disk-io.c10
-rw-r--r--fs/btrfs/extent-tree.c5
-rw-r--r--fs/btrfs/extent_io.c18
-rw-r--r--fs/btrfs/extent_io.h16
-rw-r--r--fs/btrfs/inode.c3
-rw-r--r--fs/btrfs/locking.c208
-rw-r--r--fs/btrfs/locking.h6
-rw-r--r--fs/btrfs/tree-defrag.c1
-rw-r--r--fs/btrfs/tree-log.c4
11 files changed, 470 insertions, 39 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3b6e35aafc9e..3af777357acb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,6 +54,31 @@ struct btrfs_path *btrfs_alloc_path(void)
return path;
}
+/*
+ * set all locked nodes in the path to blocking locks. This should
+ * be done before scheduling
+ */
+noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+ int i;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (p->nodes[i] && p->locks[i])
+ btrfs_set_lock_blocking(p->nodes[i]);
+ }
+}
+
+/*
+ * reset all the locked nodes in the patch to spinning locks.
+ */
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+{
+ int i;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (p->nodes[i] && p->locks[i])
+ btrfs_clear_lock_blocking(p->nodes[i]);
+ }
+}
+
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
@@ -272,6 +297,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (IS_ERR(cow))
return PTR_ERR(cow);
+ /* cow is set to blocking by btrfs_init_new_buffer */
+
copy_extent_buffer(cow, buf, 0, 0, cow->len);
btrfs_set_header_bytenr(cow, cow->start);
btrfs_set_header_generation(cow, trans->transid);
@@ -397,6 +424,11 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
}
search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+
+ if (parent)
+ btrfs_set_lock_blocking(parent);
+ btrfs_set_lock_blocking(buf);
+
ret = __btrfs_cow_block(trans, root, buf, parent,
parent_slot, cow_ret, search_start, 0,
prealloc_dest);
@@ -502,6 +534,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (parent_nritems == 1)
return 0;
+ btrfs_set_lock_blocking(parent);
+
for (i = start_slot; i < end_slot; i++) {
int close = 1;
@@ -562,6 +596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
search_start = last_block;
btrfs_tree_lock(cur);
+ btrfs_set_lock_blocking(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
@@ -860,6 +895,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
return 0;
mid = path->nodes[level];
+
WARN_ON(!path->locks[level]);
WARN_ON(btrfs_header_generation(mid) != trans->transid);
@@ -882,6 +918,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* promote the child to a root */
child = read_node_slot(root, mid, 0);
btrfs_tree_lock(child);
+ btrfs_set_lock_blocking(child);
BUG_ON(!child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
BUG_ON(ret);
@@ -898,6 +935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
btrfs_tree_unlock(child);
+
path->locks[level] = 0;
path->nodes[level] = NULL;
clean_tree_block(trans, root, mid);
@@ -922,6 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
left = read_node_slot(root, parent, pslot - 1);
if (left) {
btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left, 0);
if (wret) {
@@ -932,6 +971,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = read_node_slot(root, parent, pslot + 1);
if (right) {
btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right, 0);
if (wret) {
@@ -1107,6 +1147,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 left_nr;
btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
wret = 1;
@@ -1153,7 +1195,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
*/
if (right) {
u32 right_nr;
+
btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
wret = 1;
@@ -1265,6 +1310,68 @@ static noinline void reada_for_search(struct btrfs_root *root,
}
/*
+ * returns -EAGAIN if it had to drop the path, or zero if everything was in
+ * cache
+ */
+static noinline int reada_for_balance(struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ int slot;
+ int nritems;
+ struct extent_buffer *parent;
+ struct extent_buffer *eb;
+ u64 gen;
+ u64 block1 = 0;
+ u64 block2 = 0;
+ int ret = 0;
+ int blocksize;
+
+ parent = path->nodes[level - 1];
+ if (!parent)
+ return 0;
+
+ nritems = btrfs_header_nritems(parent);
+ slot = path->slots[level];
+ blocksize = btrfs_level_size(root, level);
+
+ if (slot > 0) {
+ block1 = btrfs_node_blockptr(parent, slot - 1);
+ gen = btrfs_node_ptr_generation(parent, slot - 1);
+ eb = btrfs_find_tree_block(root, block1, blocksize);
+ if (eb && btrfs_buffer_uptodate(eb, gen))
+ block1 = 0;
+ free_extent_buffer(eb);
+ }
+ if (slot < nritems) {
+ block2 = btrfs_node_blockptr(parent, slot + 1);
+ gen = btrfs_node_ptr_generation(parent, slot + 1);
+ eb = btrfs_find_tree_block(root, block2, blocksize);
+ if (eb && btrfs_buffer_uptodate(eb, gen))
+ block2 = 0;
+ free_extent_buffer(eb);
+ }
+ if (block1 || block2) {
+ ret = -EAGAIN;
+ btrfs_release_path(root, path);
+ if (block1)
+ readahead_tree_block(root, block1, blocksize, 0);
+ if (block2)
+ readahead_tree_block(root, block2, blocksize, 0);
+
+ if (block1) {
+ eb = read_tree_block(root, block1, blocksize, 0);
+ free_extent_buffer(eb);
+ }
+ if (block1) {
+ eb = read_tree_block(root, block2, blocksize, 0);
+ free_extent_buffer(eb);
+ }
+ }
+ return ret;
+}
+
+
+/*
* when we walk down the tree, it is usually safe to unlock the higher layers
* in the tree. The exceptions are when our path goes through slot 0, because
* operations on the tree might require changing key pointers higher up in the
@@ -1315,6 +1422,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
/*
+ * This releases any locks held in the path starting at level and
+ * going all the way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few
+ * corner cases, such as COW of the block at slot zero in the node. This
+ * ignores those rules, and it should only be called when there are no
+ * more updates to be done higher up in the tree.
+ */
+noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+ int i;
+
+ if (path->keep_locks || path->lowest_level)
+ return;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ break;
+ if (!path->locks[i])
+ break;
+ btrfs_tree_unlock(path->nodes[i]);
+ path->locks[i] = 0;
+ }
+}
+
+/*
* look for key in the tree. path is filled in with nodes along the way
* if key is found, we return zero and you can find the item in the leaf
* level of the path (level 0)
@@ -1385,6 +1518,7 @@ again:
*/
if (prealloc_block.objectid &&
prealloc_block.offset != b->len) {
+ btrfs_set_path_blocking(p);
btrfs_free_reserved_extent(root,
prealloc_block.objectid,
prealloc_block.offset);
@@ -1409,6 +1543,8 @@ again:
goto again;
}
+ btrfs_set_path_blocking(p);
+
wret = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
p->slots[level + 1],
@@ -1430,6 +1566,22 @@ cow_done:
if (!p->skip_locking)
p->locks[level] = 1;
+ btrfs_clear_path_blocking(p);
+
+ /*
+ * we have a lock on b and as long as we aren't changing
+ * the tree, there is no way to for the items in b to change.
+ * It is safe to drop the lock on our parent before we
+ * go through the expensive btree search on b.
+ *
+ * If cow is true, then we might be changing slot zero,
+ * which may require changing the parent. So, we can't
+ * drop the lock until after we know which slot we're
+ * operating on.
+ */
+ if (!cow)
+ btrfs_unlock_up_safe(p, level + 1);
+
ret = check_block(root, p, level);
if (ret) {
ret = -1;
@@ -1437,6 +1589,7 @@ cow_done:
}
ret = bin_search(b, key, level, &slot);
+
if (level != 0) {
if (ret && slot > 0)
slot -= 1;
@@ -1444,7 +1597,16 @@ cow_done:
if ((p->search_for_split || ins_len > 0) &&
btrfs_header_nritems(b) >=
BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
- int sret = split_node(trans, root, p, level);
+ int sret;
+
+ sret = reada_for_balance(root, p, level);
+ if (sret)
+ goto again;
+
+ btrfs_set_path_blocking(p);
+ sret = split_node(trans, root, p, level);
+ btrfs_clear_path_blocking(p);
+
BUG_ON(sret > 0);
if (sret) {
ret = sret;
@@ -1453,8 +1615,16 @@ cow_done:
b = p->nodes[level];
slot = p->slots[level];
} else if (ins_len < 0) {
- int sret = balance_level(trans, root, p,
- level);
+ int sret;
+
+ sret = reada_for_balance(root, p, level);
+ if (sret)
+ goto again;
+
+ btrfs_set_path_blocking(p);
+ sret = balance_level(trans, root, p, level);
+ btrfs_clear_path_blocking(p);
+
if (sret) {
ret = sret;
goto done;
@@ -1488,7 +1658,7 @@ cow_done:
* of the btree by dropping locks before
* we read.
*/
- if (level > 1) {
+ if (level > 0) {
btrfs_release_path(NULL, p);
if (tmp)
free_extent_buffer(tmp);
@@ -1503,6 +1673,7 @@ cow_done:
free_extent_buffer(tmp);
goto again;
} else {
+ btrfs_set_path_blocking(p);
if (tmp)
free_extent_buffer(tmp);
if (should_reada)
@@ -1512,14 +1683,29 @@ cow_done:
b = read_node_slot(root, b, slot);
}
}
- if (!p->skip_locking)
- btrfs_tree_lock(b);
+ if (!p->skip_locking) {
+ int lret;
+
+ btrfs_clear_path_blocking(p);
+ lret = btrfs_try_spin_lock(b);
+
+ if (!lret) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_lock(b);
+ btrfs_clear_path_blocking(p);
+ }
+ }
} else {
p->slots[level] = slot;
if (ins_len > 0 &&
btrfs_leaf_free_space(root, b) < ins_len) {
- int sret = split_leaf(trans, root, key,
+ int sret;
+
+ btrfs_set_path_blocking(p);
+ sret = split_leaf(trans, root, key,
p, ins_len, ret == 0);
+ btrfs_clear_path_blocking(p);
+
BUG_ON(sret > 0);
if (sret) {
ret = sret;
@@ -1533,12 +1719,16 @@ cow_done:
}
ret = 1;
done:
+ /*
+ * we don't really know what they plan on doing with the path
+ * from here on, so for now just mark it as blocking
+ */
+ btrfs_set_path_blocking(p);
if (prealloc_block.objectid) {
btrfs_free_reserved_extent(root,
prealloc_block.objectid,
prealloc_block.offset);
}
-
return ret;
}
@@ -1562,6 +1752,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
BUG_ON(ret);
+ btrfs_set_lock_blocking(eb);
+
parent = eb;
while (1) {
level = btrfs_header_level(parent);
@@ -1586,6 +1778,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
eb = read_tree_block(root, bytenr, blocksize,
generation);
btrfs_tree_lock(eb);
+ btrfs_set_lock_blocking(eb);
}
/*
@@ -1610,6 +1803,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
eb = read_tree_block(root, bytenr, blocksize,
generation);
btrfs_tree_lock(eb);
+ btrfs_set_lock_blocking(eb);
}
ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2156,6 +2350,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
right = read_node_slot(root, upper, slot + 1);
btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+
free_space = btrfs_leaf_free_space(root, right);
if (free_space < data_size)
goto out_unlock;
@@ -2351,6 +2547,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
left = read_node_slot(root, path->nodes[1], slot - 1);
btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+
free_space = btrfs_leaf_free_space(root, left);
if (free_space < data_size) {
ret = 1;
@@ -2809,6 +3007,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
path->keep_locks = 0;
BUG_ON(ret);
+ /*
+ * make sure any changes to the path from split_leaf leave it
+ * in a blocking state
+ */
+ btrfs_set_path_blocking(path);
+
leaf = path->nodes[0];
BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
@@ -3338,6 +3542,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
BUG();
}
out:
+ btrfs_unlock_up_safe(path, 1);
return ret;
}
@@ -3705,12 +3910,14 @@ find_next_key:
*/
if (slot >= nritems) {
path->slots[level] = slot;
+ btrfs_set_path_blocking(path);
sret = btrfs_find_next_key(root, path, min_key, level,
cache_only, min_trans);
if (sret == 0) {
btrfs_release_path(root, path);
goto again;
} else {
+ btrfs_clear_path_blocking(path);
goto out;
}
}
@@ -3722,16 +3929,20 @@ find_next_key:
unlock_up(path, level, 1);
goto out;
}
+ btrfs_set_path_blocking(path);
cur = read_node_slot(root, cur, slot);
btrfs_tree_lock(cur);
+
path->locks[level - 1] = 1;
path->nodes[level - 1] = cur;
unlock_up(path, level, 1);
+ btrfs_clear_path_blocking(path);
}
out:
if (ret == 0)
memcpy(min_key, &found_key, sizeof(found_key));
+ btrfs_set_path_blocking(path);
return ret;
}
@@ -3827,6 +4038,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
if (ret < 0)
return ret;
+ btrfs_set_path_blocking(path);
nritems = btrfs_header_nritems(path->nodes[0]);
/*
* by releasing the path above we dropped all our locks. A balance
@@ -3857,6 +4069,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
free_extent_buffer(next);
}
+ /* the path was set to blocking above */
if (level == 1 && (path->locks[1] || path->skip_locking) &&
path->reada)
reada_for_search(root, path, level, slot, 0);
@@ -3865,6 +4078,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
if (!path->skip_locking) {
WARN_ON(!btrfs_tree_locked(c));
btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
}
break;
}
@@ -3881,12 +4095,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
path->locks[level] = 1;
if (!level)
break;
+
+ btrfs_set_path_blocking(path);
if (level == 1 && path->locks[1] && path->reada)
reada_for_search(root, path, level, slot, 0);
next = read_node_slot(root, next, 0);
if (!path->skip_locking) {
WARN_ON(!btrfs_tree_locked(path->nodes[level]));
btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
}
}
done:
@@ -3911,6 +4128,7 @@ int btrfs_previous_item(struct btrfs_root *root,
while (1) {
if (path->slots[0] == 0) {
+ btrfs_set_path_blocking(path);
ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f2b8d26b0472..531db112c8bd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1835,6 +1835,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
void btrfs_init_path(struct btrfs_path *p);
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_clear_path_blocking(struct btrfs_path *p);
+void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
+
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int slot, int nr);
int btrfs_del_leaf(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 549271607c17..5aebddd71193 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -799,7 +799,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
if (ret == 0)
- buf->flags |= EXTENT_UPTODATE;
+ set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
else
WARN_ON(1);
return buf;
@@ -813,6 +813,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (btrfs_header_generation(buf) ==
root->fs_info->running_transaction->transid) {
WARN_ON(!btrfs_tree_locked(buf));
+
+ /* ugh, clear_extent_buffer_dirty can be expensive */
+ btrfs_set_lock_blocking(buf);
+
clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
buf);
}
@@ -2311,6 +2315,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
u64 transid = btrfs_header_generation(buf);
struct inode *btree_inode = root->fs_info->btree_inode;
+ btrfs_set_lock_blocking(buf);
+
WARN_ON(!btrfs_tree_locked(buf));
if (transid != root->fs_info->generation) {
printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -2353,7 +2359,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
int ret;
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
if (ret == 0)
- buf->flags |= EXTENT_UPTODATE;
+ set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
return ret;
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7a22f2e6ec47..ed1e25d72483 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3407,7 +3407,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
btrfs_set_header_generation(buf, trans->transid);
btrfs_tree_lock(buf);
clean_tree_block(trans, root, buf);
+
+ btrfs_set_lock_blocking(buf);
btrfs_set_buffer_uptodate(buf);
+
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
set_extent_dirty(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
@@ -3416,6 +3419,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
buf->start + buf->len - 1, GFP_NOFS);
}
trans->blocks_used++;
+ /* this returns a buffer locked for blocking */
return buf;
}
@@ -3752,6 +3756,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
next = read_tree_block(root, bytenr, blocksize, ptr_gen);
btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
&refs);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2ea7f052722c..dd5df53e045a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2990,7 +2990,9 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
eb = kmem_cache_zalloc(extent_buffer_cache, mask);
eb->start = start;
eb->len = len;
- mutex_init(&eb->mutex);
+ spin_lock_init(&eb->lock);
+ init_waitqueue_head(&eb->lock_wq);
+
#if LEAK_DEBUG
spin_lock_irqsave(&leak_lock, flags);
list_add(&eb->leak_list, &buffers);
@@ -3071,8 +3073,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
unlock_page(p);
}
if (uptodate)
- eb->flags |= EXTENT_UPTODATE;
- eb->flags |= EXTENT_BUFFER_FILLED;
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
spin_lock(&tree->buffer_lock);
exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3226,7 +3227,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
unsigned long num_pages;
num_pages = num_extent_pages(eb->start, eb->len);
- eb->flags &= ~EXTENT_UPTODATE;
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
GFP_NOFS);
@@ -3297,7 +3298,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
struct page *page;
int pg_uptodate = 1;
- if (eb->flags & EXTENT_UPTODATE)
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 1;
ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3333,7 +3334,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
struct bio *bio = NULL;
unsigned long bio_flags = 0;
- if (eb->flags & EXTENT_UPTODATE)
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3364,7 +3365,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
}
if (all_uptodate) {
if (start_i == 0)
- eb->flags |= EXTENT_UPTODATE;
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
goto unlock_exit;
}
@@ -3400,7 +3401,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
}
if (!ret)
- eb->flags |= EXTENT_UPTODATE;
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
return ret;
unlock_exit:
@@ -3497,7 +3498,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
unmap_extent_buffer(eb, eb->map_token, km);
eb->map_token = NULL;
save = 1;
- WARN_ON(!mutex_is_locked(&eb->mutex));
}
err = map_private_extent_buffer(eb, start, min_len, token, map,
map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e80c6d96b318..1f9df88afbf6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
/* flags for bio submission */
#define EXTENT_BIO_COMPRESSED 1
+/* these are bit numbers for test/set bit */
+#define EXTENT_BUFFER_UPTODATE 0
+#define EXTENT_BUFFER_BLOCKING 1
+
/*
* page->private values. Every page that is controlled by the extent
* map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
unsigned long map_start;
unsigned long map_len;
struct page *first_page;
+ unsigned long bflags;
atomic_t refs;
- int flags;
struct list_head leak_list;
struct rb_node rb_node;
- struct mutex mutex;
+
+ /* the spinlock is used to protect most operations */
+ spinlock_t lock;
+
+ /*
+ * when we keep the lock held while blocking, waiters go onto
+ * the wq
+ */
+ wait_queue_head_t lock_wq;
};
struct extent_map_tree;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4a79e1c5ebd0..ebd7d6c37df8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -50,6 +50,7 @@
#include "tree-log.h"
#include "ref-cache.h"
#include "compression.h"
+#include "locking.h"
struct btrfs_iget_args {
u64 ino;
@@ -2021,6 +2022,7 @@ void btrfs_read_locked_inode(struct inode *inode)
BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+
BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
alloc_group_block, 0);
btrfs_free_path(path);
@@ -2117,6 +2119,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
goto failed;
}
+ btrfs_unlock_up_safe(path, 1);
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..68fd9ccf1805 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -26,45 +26,215 @@
#include "locking.h"
/*
- * locks the per buffer mutex in an extent buffer. This uses adaptive locks
- * and the spin is not tuned very extensively. The spinning does make a big
- * difference in almost every workload, but spinning for the right amount of
- * time needs some help.
- *
- * In general, we want to spin as long as the lock holder is doing btree
- * searches, and we should give up if they are in more expensive code.
+ * btrfs_header_level() isn't free, so don't call it when lockdep isn't
+ * on
*/
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void spin_nested(struct extent_buffer *eb)
+{
+ spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
+}
+#else
+static inline void spin_nested(struct extent_buffer *eb)
+{
+ spin_lock(&eb->lock);
+}
+#endif
-int btrfs_tree_lock(struct extent_buffer *eb)
+/*
+ * Setting a lock to blocking will drop the spinlock and set the
+ * flag that forces other procs who want the lock to wait. After
+ * this you can safely schedule with the lock held.
+ */
+void btrfs_set_lock_blocking(struct extent_buffer *eb)
{
- int i;
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+ spin_unlock(&eb->lock);
+ }
+ /* exit with the spin lock released and the bit set */
+}
- if (mutex_trylock(&eb->mutex))
- return 0;
+/*
+ * clearing the blocking flag will take the spinlock again.
+ * After this you can't safely schedule
+ */
+void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+ if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+ spin_nested(eb);
+ clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+ smp_mb__after_clear_bit();
+ }
+ /* exit with the spin lock held */
+}
+
+/*
+ * unfortunately, many of the places that currently set a lock to blocking
+ * don't end up blocking for every long, and often they don't block
+ * at all. For a dbench 50 run, if we don't spin one the blocking bit
+ * at all, the context switch rate can jump up to 400,000/sec or more.
+ *
+ * So, we're still stuck with this crummy spin on the blocking bit,
+ * at least until the most common causes of the short blocks
+ * can be dealt with.
+ */
+static int btrfs_spin_on_block(struct extent_buffer *eb)
+{
+ int i;
for (i = 0; i < 512; i++) {
cpu_relax();
- if (mutex_trylock(&eb->mutex))
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ return 1;
+ if (need_resched())
+ break;
+ }
+ return 0;
+}
+
+/*
+ * This is somewhat different from trylock. It will take the
+ * spinlock but if it finds the lock is set to blocking, it will
+ * return without the lock held.
+ *
+ * returns 1 if it was able to take the lock and zero otherwise
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_try_spin_lock(struct extent_buffer *eb)
+{
+ int i;
+
+ spin_nested(eb);
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ return 1;
+ spin_unlock(&eb->lock);
+
+ /* spin for a bit on the BLOCKING flag */
+ for (i = 0; i < 2; i++) {
+ if (!btrfs_spin_on_block(eb))
+ break;
+
+ spin_nested(eb);
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ return 1;
+ spin_unlock(&eb->lock);
+ }
+ return 0;
+}
+
+/*
+ * the autoremove wake function will return 0 if it tried to wake up
+ * a process that was already awake, which means that process won't
+ * count as an exclusive wakeup. The waitq code will continue waking
+ * procs until it finds one that was actually sleeping.
+ *
+ * For btrfs, this isn't quite what we want. We want a single proc
+ * to be notified that the lock is ready for taking. If that proc
+ * already happen to be awake, great, it will loop around and try for
+ * the lock.
+ *
+ * So, btrfs_wake_function always returns 1, even when the proc that we
+ * tried to wake up was already awake.
+ */
+static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ autoremove_wake_function(wait, mode, sync, key);
+ return 1;
+}
+
+/*
+ * returns with the extent buffer spinlocked.
+ *
+ * This will spin and/or wait as required to take the lock, and then
+ * return with the spinlock held.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+ DEFINE_WAIT(wait);
+ wait.func = btrfs_wake_function;
+
+ while(1) {
+ spin_nested(eb);
+
+ /* nobody is blocking, exit with the spinlock held */
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
return 0;
+
+ /*
+ * we have the spinlock, but the real owner is blocking.
+ * wait for them
+ */
+ spin_unlock(&eb->lock);
+
+ /*
+ * spin for a bit, and if the blocking flag goes away,
+ * loop around
+ */
+ if (btrfs_spin_on_block(eb))
+ continue;
+
+ prepare_to_wait_exclusive(&eb->lock_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ schedule();
+
+ finish_wait(&eb->lock_wq, &wait);
}
- cpu_relax();
- mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
return 0;
}
+/*
+ * Very quick trylock, this does not spin or schedule. It returns
+ * 1 with the spinlock held if it was able to take the lock, or it
+ * returns zero if it was unable to take the lock.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
int btrfs_try_tree_lock(struct extent_buffer *eb)
{
- return mutex_trylock(&eb->mutex);
+ if (spin_trylock(&eb->lock)) {
+ if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+ /*
+ * we've got the spinlock, but the real owner is
+ * blocking. Drop the spinlock and return failure
+ */
+ spin_unlock(&eb->lock);
+ return 0;
+ }
+ return 1;
+ }
+ /* someone else has the spinlock giveup */
+ return 0;
}
int btrfs_tree_unlock(struct extent_buffer *eb)
{
- mutex_unlock(&eb->mutex);
+ /*
+ * if we were a blocking owner, we don't have the spinlock held
+ * just clear the bit and look for waiters
+ */
+ if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ smp_mb__after_clear_bit();
+ else
+ spin_unlock(&eb->lock);
+
+ if (waitqueue_active(&eb->lock_wq))
+ wake_up(&eb->lock_wq);
return 0;
}
int btrfs_tree_locked(struct extent_buffer *eb)
{
- return mutex_is_locked(&eb->mutex);
+ return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
+ spin_is_locked(&eb->lock);
}
/*
@@ -75,12 +245,14 @@ int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
{
int i;
struct extent_buffer *eb;
+
for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
eb = path->nodes[i];
if (!eb)
break;
smp_mb();
- if (!list_empty(&eb->mutex.wait_list))
+ if (spin_is_contended(&eb->lock) ||
+ waitqueue_active(&eb->lock_wq))
return 1;
}
return 0;
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef12519..d92e707f5870 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -22,6 +22,12 @@
int btrfs_tree_lock(struct extent_buffer *eb);
int btrfs_tree_unlock(struct extent_buffer *eb);
int btrfs_tree_locked(struct extent_buffer *eb);
+
int btrfs_try_tree_lock(struct extent_buffer *eb);
+int btrfs_try_spin_lock(struct extent_buffer *eb);
+
int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
+
+void btrfs_set_lock_blocking(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking(struct extent_buffer *eb);
#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c36165..98d25fa4570e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u32 nritems;
root_node = btrfs_lock_root_node(root);
+ btrfs_set_lock_blocking(root_node);
nritems = btrfs_header_nritems(root_node);
root->defrag_max.objectid = 0;
/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4f26f3ed0c87..20794290256b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1615,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
btrfs_tree_lock(next);
clean_tree_block(trans, root, next);
+ btrfs_set_lock_blocking(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -1661,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
next = path->nodes[*level];
btrfs_tree_lock(next);
clean_tree_block(trans, root, next);
+ btrfs_set_lock_blocking(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -1718,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
btrfs_tree_lock(next);
clean_tree_block(trans, root, next);
+ btrfs_set_lock_blocking(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -1790,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
btrfs_tree_lock(next);
clean_tree_block(trans, log, next);
+ btrfs_set_lock_blocking(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);