From 4920c9ac9a4bbc6bf9acd8c614987ee6b378e78f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 26 Jan 2007 16:38:42 -0500 Subject: Btrfs: Faster deletes, add Makefile and kerncompat Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 fs/btrfs/Makefile (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile new file mode 100644 index 000000000000..9f84c08baab2 --- /dev/null +++ b/fs/btrfs/Makefile @@ -0,0 +1,7 @@ + +ctree: ctree.o + gcc -g -O2 -Wall -o ctree ctree.c + +clean: + rm ctree ctree.o + -- cgit v1.2.3 From eb60ceac0ebd11e9896ed1ca84fcad71a4b23601 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 2 Feb 2007 09:18:22 -0500 Subject: Btrfs: Add backing store, memory management Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 11 +- fs/btrfs/ctree.c | 432 ++++++++++++++++---------- fs/btrfs/ctree.h | 62 ++++ fs/btrfs/disk-io.c | 174 +++++++++++ fs/btrfs/disk-io.h | 21 ++ fs/btrfs/kerncompat.h | 1 + fs/btrfs/radix-tree.c | 836 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/radix-tree.h | 73 +++++ 8 files changed, 1447 insertions(+), 163 deletions(-) create mode 100644 fs/btrfs/ctree.h create mode 100644 fs/btrfs/disk-io.c create mode 100644 fs/btrfs/disk-io.h create mode 100644 fs/btrfs/radix-tree.c create mode 100644 fs/btrfs/radix-tree.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9f84c08baab2..63360212a075 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,7 +1,12 @@ -ctree: ctree.o - gcc -g -O2 -Wall -o ctree ctree.c +CFLAGS= -g -Wall + +.c.o: + $(CC) $(CFLAGS) -c $< + +ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h + gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o clean: - rm ctree ctree.o + rm ctree *.o diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4bf5e92584bd..6f0522f21082 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1,68 +1,25 @@ #include 
#include #include "kerncompat.h" - -#define BLOCKSIZE 4096 - -struct key { - u64 objectid; - u32 flags; - u64 offset; -} __attribute__ ((__packed__)); - -struct header { - u64 fsid[2]; /* FS specific uuid */ - u64 blocknum; - u64 parentid; - u32 csum; - u32 ham; - u16 nritems; - u16 flags; -} __attribute__ ((__packed__)); - -#define NODEPTRS_PER_BLOCK ((BLOCKSIZE - sizeof(struct header)) / \ - (sizeof(struct key) + sizeof(u64))) - -#define LEVEL_BITS 3 -#define MAX_LEVEL (1 << LEVEL_BITS) -#define node_level(f) ((f) & (MAX_LEVEL-1)) -#define is_leaf(f) (node_level(f) == 0) - -struct ctree_root { - struct node *node; -}; - -struct item { - struct key key; - u16 offset; - u16 size; -} __attribute__ ((__packed__)); - -#define LEAF_DATA_SIZE (BLOCKSIZE - sizeof(struct header)) -struct leaf { - struct header header; - union { - struct item items[LEAF_DATA_SIZE/sizeof(struct item)]; - u8 data[BLOCKSIZE-sizeof(struct header)]; - }; -} __attribute__ ((__packed__)); - -struct node { - struct header header; - struct key keys[NODEPTRS_PER_BLOCK]; - u64 blockptrs[NODEPTRS_PER_BLOCK]; -} __attribute__ ((__packed__)); - -struct ctree_path { - struct node *nodes[MAX_LEVEL]; - int slots[MAX_LEVEL]; -}; +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" static inline void init_path(struct ctree_path *p) { memset(p, 0, sizeof(*p)); } +static void release_path(struct ctree_root *root, struct ctree_path *p) +{ + int i; + for (i = 0; i < MAX_LEVEL; i++) { + if (!p->nodes[i]) + break; + tree_block_release(root, p->nodes[i]); + } +} + static inline unsigned int leaf_data_end(struct leaf *leaf) { unsigned int nr = leaf->header.nritems; @@ -135,26 +92,25 @@ int bin_search(struct node *c, struct key *key, int *slot) return -1; } -void *read_block(u64 blocknum) -{ - return (void *)blocknum; -} - int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p) { - struct node *c = root->node; + struct tree_buffer *b = root->node; + struct node *c; + int slot; 
int ret; int level; - while (c) { + b->count++; + while (b) { + c = &b->node; level = node_level(c->header.flags); - p->nodes[level] = c; + p->nodes[level] = b; ret = bin_search(c, key, &slot); if (!is_leaf(c->header.flags)) { if (ret && slot > 0) slot -= 1; p->slots[level] = slot; - c = read_block(c->blockptrs[slot]); + b = read_tree_block(root, c->blockptrs[slot]); continue; } else { p->slots[level] = slot; @@ -164,17 +120,20 @@ int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p) return -1; } -static void fixup_low_keys(struct ctree_path *path, struct key *key, - int level) +static void fixup_low_keys(struct ctree_root *root, + struct ctree_path *path, struct key *key, + int level) { int i; /* adjust the pointers going up the tree */ for (i = level; i < MAX_LEVEL; i++) { - struct node *t = path->nodes[i]; + struct node *t; int tslot = path->slots[i]; - if (!t) + if (!path->nodes[i]) break; + t = &path->nodes[i]->node; memcpy(t->keys + tslot, key, sizeof(*key)); + write_tree_block(root, path->nodes[i]); if (tslot != 0) break; } @@ -190,27 +149,34 @@ int __insert_ptr(struct ctree_root *root, int nritems; /* need a new root */ if (!path->nodes[level]) { - c = malloc(sizeof(struct node)); + struct tree_buffer *t; + t = alloc_free_block(root); + c = &t->node; memset(c, 0, sizeof(c)); c->header.nritems = 2; c->header.flags = node_level(level); - lower = path->nodes[level-1]; + c->header.blocknr = t->blocknr; + lower = &path->nodes[level-1]->node; if (is_leaf(lower->header.flags)) lower_key = &((struct leaf *)lower)->items[0].key; else lower_key = lower->keys; memcpy(c->keys, lower_key, sizeof(struct key)); memcpy(c->keys + 1, key, sizeof(struct key)); - c->blockptrs[0] = (u64)lower; + c->blockptrs[0] = path->nodes[level-1]->blocknr; c->blockptrs[1] = blocknr; - root->node = c; - path->nodes[level] = c; + /* the path has an extra ref to root->node */ + tree_block_release(root, root->node); + root->node = t; + t->count++; + 
write_tree_block(root, t); + path->nodes[level] = t; path->slots[level] = 0; if (c->keys[1].objectid == 0) BUG(); return 0; } - lower = path->nodes[level]; + lower = &path->nodes[level]->node; nritems = lower->header.nritems; if (slot > nritems) BUG(); @@ -227,6 +193,7 @@ int __insert_ptr(struct ctree_root *root, lower->header.nritems++; if (lower->keys[1].objectid == 0) BUG(); + write_tree_block(root, path->nodes[level]); return 0; } @@ -238,6 +205,8 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level) int push_items = 0; int left_nritems; int right_nritems; + struct tree_buffer *t; + struct tree_buffer *right_buf; if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0) return 1; @@ -245,13 +214,18 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level) if (slot == 0) return 1; - left = read_block(path->nodes[level + 1]->blockptrs[slot - 1]); - right = path->nodes[level]; + t = read_tree_block(root, + path->nodes[level + 1]->node.blockptrs[slot - 1]); + left = &t->node; + right_buf = path->nodes[level]; + right = &right_buf->node; left_nritems = left->header.nritems; right_nritems = right->header.nritems; push_items = NODEPTRS_PER_BLOCK - (left_nritems + 1); - if (push_items <= 0) + if (push_items <= 0) { + tree_block_release(root, t); return 1; + } if (right_nritems < push_items) push_items = right_nritems; @@ -267,15 +241,20 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level) left->header.nritems += push_items; /* adjust the pointers going up the tree */ - fixup_low_keys(path, right->keys, level + 1); + fixup_low_keys(root, path, right->keys, level + 1); + + write_tree_block(root, t); + write_tree_block(root, right_buf); /* then fixup the leaf pointer in the path */ if (path->slots[level] < push_items) { path->slots[level] += left_nritems; - path->nodes[level] = (struct node*)left; + tree_block_release(root, path->nodes[level]); + path->nodes[level] = t; path->slots[level 
+ 1] -= 1; } else { path->slots[level] -= push_items; + tree_block_release(root, t); } return 0; } @@ -283,6 +262,8 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level) int push_node_right(struct ctree_root *root, struct ctree_path *path, int level) { int slot; + struct tree_buffer *t; + struct tree_buffer *src_buffer; struct node *dst; struct node *src; int push_items = 0; @@ -295,16 +276,21 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level) if (slot == NODEPTRS_PER_BLOCK - 1) return 1; - if (slot >= path->nodes[level + 1]->header.nritems -1) + if (slot >= path->nodes[level + 1]->node.header.nritems -1) return 1; - dst = read_block(path->nodes[level + 1]->blockptrs[slot + 1]); - src = path->nodes[level]; + t = read_tree_block(root, + path->nodes[level + 1]->node.blockptrs[slot + 1]); + dst = &t->node; + src_buffer = path->nodes[level]; + src = &src_buffer->node; dst_nritems = dst->header.nritems; src_nritems = src->header.nritems; push_items = NODEPTRS_PER_BLOCK - (dst_nritems + 1); - if (push_items <= 0) + if (push_items <= 0) { + tree_block_release(root, t); return 1; + } if (src_nritems < push_items) push_items = src_nritems; @@ -322,13 +308,21 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level) dst->header.nritems += push_items; /* adjust the pointers going up the tree */ - memcpy(path->nodes[level + 1]->keys + path->slots[level + 1] + 1, + memcpy(path->nodes[level + 1]->node.keys + path->slots[level + 1] + 1, dst->keys, sizeof(struct key)); + + write_tree_block(root, path->nodes[level + 1]); + write_tree_block(root, t); + write_tree_block(root, src_buffer); + /* then fixup the leaf pointer in the path */ if (path->slots[level] >= src->header.nritems) { path->slots[level] -= src->header.nritems; - path->nodes[level] = (struct node*)dst; + tree_block_release(root, path->nodes[level]); + path->nodes[level] = t; path->slots[level + 1] += 1; + } else { + 
tree_block_release(root, t); } return 0; } @@ -337,15 +331,18 @@ int insert_ptr(struct ctree_root *root, struct ctree_path *path, struct key *key, u64 blocknr, int level) { - struct node *c = path->nodes[level]; + struct tree_buffer *t = path->nodes[level]; + struct node *c = &path->nodes[level]->node; struct node *b; - struct node *bal[MAX_LEVEL]; + struct tree_buffer *b_buffer; + struct tree_buffer *bal[MAX_LEVEL]; int bal_level = level; int mid; int bal_start = -1; memset(bal, 0, ARRAY_SIZE(bal)); - while(c && c->header.nritems == NODEPTRS_PER_BLOCK) { + while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) { + c = &t->node; if (push_node_left(root, path, node_level(c->header.flags)) == 0) break; @@ -355,8 +352,10 @@ int insert_ptr(struct ctree_root *root, bal_start = bal_level; if (bal_level == MAX_LEVEL - 1) BUG(); - b = malloc(sizeof(struct node)); + b_buffer = alloc_free_block(root); + b = &b_buffer->node; b->header.flags = c->header.flags; + b->header.blocknr = b_buffer->blocknr; mid = (c->header.nritems + 1) / 2; memcpy(b->keys, c->keys + mid, (c->header.nritems - mid) * sizeof(struct key)); @@ -364,21 +363,28 @@ int insert_ptr(struct ctree_root *root, (c->header.nritems - mid) * sizeof(u64)); b->header.nritems = c->header.nritems - mid; c->header.nritems = mid; - bal[bal_level] = b; + + write_tree_block(root, t); + write_tree_block(root, b_buffer); + + bal[bal_level] = b_buffer; if (bal_level == MAX_LEVEL - 1) break; bal_level += 1; - c = path->nodes[bal_level]; + t = path->nodes[bal_level]; } while(bal_start > 0) { - b = bal[bal_start]; - c = path->nodes[bal_start]; - __insert_ptr(root, path, b->keys, (u64)b, + b_buffer = bal[bal_start]; + c = &path->nodes[bal_start]->node; + __insert_ptr(root, path, b_buffer->node.keys, b_buffer->blocknr, path->slots[bal_start + 1] + 1, bal_start + 1); if (path->slots[bal_start] >= c->header.nritems) { path->slots[bal_start] -= c->header.nritems; - path->nodes[bal_start] = b; + tree_block_release(root, 
path->nodes[bal_start]); + path->nodes[bal_start] = b_buffer; path->slots[bal_start + 1] += 1; + } else { + tree_block_release(root, b_buffer); } bal_start--; if (!bal[bal_start]) @@ -404,7 +410,9 @@ int leaf_space_used(struct leaf *l, int start, int nr) int push_leaf_left(struct ctree_root *root, struct ctree_path *path, int data_size) { - struct leaf *right = (struct leaf *)path->nodes[0]; + struct tree_buffer *right_buf = path->nodes[0]; + struct leaf *right = &right_buf->leaf; + struct tree_buffer *t; struct leaf *left; int slot; int i; @@ -421,9 +429,11 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path, if (!path->nodes[1]) { return 1; } - left = read_block(path->nodes[1]->blockptrs[slot - 1]); + t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]); + left = &t->leaf; free_space = leaf_free_space(left); if (free_space < data_size + sizeof(struct item)) { + tree_block_release(root, t); return 1; } for (i = 0; i < right->header.nritems; i++) { @@ -436,6 +446,7 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path, push_space += item->size + sizeof(*item); } if (push_items == 0) { + tree_block_release(root, t); return 1; } /* push data from right to left */ @@ -446,6 +457,8 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path, right->data + right->items[push_items - 1].offset, push_space); old_left_nritems = left->header.nritems; + BUG_ON(old_left_nritems < 0); + for(i = old_left_nritems; i < old_left_nritems + push_items; i++) { left->items[i].offset -= LEAF_DATA_SIZE - left->items[old_left_nritems -1].offset; @@ -460,30 +473,40 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path, (right->header.nritems - push_items) * sizeof(struct item)); right->header.nritems -= push_items; push_space = LEAF_DATA_SIZE; + for (i = 0; i < right->header.nritems; i++) { right->items[i].offset = push_space - right->items[i].size; push_space = right->items[i].offset; } - fixup_low_keys(path, 
&right->items[0].key, 1); + + write_tree_block(root, t); + write_tree_block(root, right_buf); + + fixup_low_keys(root, path, &right->items[0].key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; - path->nodes[0] = (struct node*)left; + tree_block_release(root, path->nodes[0]); + path->nodes[0] = t; path->slots[1] -= 1; } else { + tree_block_release(root, t); path->slots[0] -= push_items; } + BUG_ON(path->slots[0] < 0); return 0; } int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size) { - struct leaf *l = (struct leaf *)path->nodes[0]; - int nritems = l->header.nritems; - int mid = (nritems + 1)/ 2; - int slot = path->slots[0]; + struct tree_buffer *l_buf = path->nodes[0]; + struct leaf *l = &l_buf->leaf; + int nritems; + int mid; + int slot; struct leaf *right; + struct tree_buffer *right_buffer; int space_needed = data_size + sizeof(struct item); int data_copy_size; int rt_data_off; @@ -491,9 +514,19 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size) int ret; if (push_leaf_left(root, path, data_size) == 0) { - return 0; + l_buf = path->nodes[0]; + l = &l_buf->leaf; + if (leaf_free_space(l) >= sizeof(struct item) + data_size) + return 0; } - right = malloc(sizeof(struct leaf)); + slot = path->slots[0]; + nritems = l->header.nritems; + mid = (nritems + 1)/ 2; + + right_buffer = alloc_free_block(root); + BUG_ON(!right_buffer); + BUG_ON(mid == nritems); + right = &right_buffer->leaf; memset(right, 0, sizeof(*right)); if (mid <= slot) { if (leaf_space_used(l, mid, nritems - mid) + space_needed > @@ -505,6 +538,8 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size) BUG(); } right->header.nritems = nritems - mid; + right->header.blocknr = right_buffer->blocknr; + right->header.flags = node_level(0); data_copy_size = l->items[mid].offset + l->items[mid].size - leaf_data_end(l); memcpy(right->items, l->items + mid, 
@@ -518,12 +553,20 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size) } l->header.nritems = mid; ret = insert_ptr(root, path, &right->items[0].key, - (u64)right, 1); + right_buffer->blocknr, 1); + + write_tree_block(root, right_buffer); + write_tree_block(root, l_buf); + + BUG_ON(path->slots[0] != slot); if (mid <= slot) { - path->nodes[0] = (struct node *)right; + tree_block_release(root, path->nodes[0]); + path->nodes[0] = right_buffer; path->slots[0] -= mid; path->slots[1] += 1; - } + } else + tree_block_release(root, right_buffer); + BUG_ON(path->slots[0] < 0); return ret; } @@ -532,28 +575,48 @@ int insert_item(struct ctree_root *root, struct key *key, { int ret; int slot; + int slot_orig; struct leaf *leaf; + struct tree_buffer *leaf_buf; unsigned int nritems; unsigned int data_end; struct ctree_path path; + if (!root->node) { + struct tree_buffer *t; + t = alloc_free_block(root); + BUG_ON(!t); + t->node.header.nritems = 0; + t->node.header.flags = node_level(0); + t->node.header.blocknr = t->blocknr; + root->node = t; + write_tree_block(root, t); + } init_path(&path); ret = search_slot(root, key, &path); - if (ret == 0) + if (ret == 0) { + release_path(root, &path); return -EEXIST; + } - leaf = (struct leaf *)path.nodes[0]; - if (leaf_free_space(leaf) < sizeof(struct item) + data_size) + slot_orig = path.slots[0]; + leaf_buf = path.nodes[0]; + leaf = &leaf_buf->leaf; + if (leaf_free_space(leaf) < sizeof(struct item) + data_size) { split_leaf(root, &path, data_size); - leaf = (struct leaf *)path.nodes[0]; + leaf_buf = path.nodes[0]; + leaf = &path.nodes[0]->leaf; + } nritems = leaf->header.nritems; data_end = leaf_data_end(leaf); + if (leaf_free_space(leaf) < sizeof(struct item) + data_size) BUG(); slot = path.slots[0]; + BUG_ON(slot < 0); if (slot == 0) - fixup_low_keys(&path, key, 1); + fixup_low_keys(root, &path, key, 1); if (slot != nritems) { int i; unsigned int old_data = leaf->items[slot].offset + @@ -580,21 +643,25 @@ 
int insert_item(struct ctree_root *root, struct key *key, leaf->items[slot].size = data_size; memcpy(leaf->data + data_end - data_size, data, data_size); leaf->header.nritems += 1; + write_tree_block(root, leaf_buf); if (leaf_free_space(leaf) < 0) BUG(); + release_path(root, &path); return 0; } int del_ptr(struct ctree_root *root, struct ctree_path *path, int level) { int slot; + struct tree_buffer *t; struct node *node; int nritems; while(1) { - node = path->nodes[level]; - if (!node) + t = path->nodes[level]; + if (!t) break; + node = &t->node; slot = path->slots[level]; nritems = node->header.nritems; @@ -606,28 +673,34 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level) sizeof(u64) * (nritems - slot - 1)); } node->header.nritems--; + write_tree_block(root, t); if (node->header.nritems != 0) { int tslot; if (slot == 0) - fixup_low_keys(path, node->keys, level + 1); + fixup_low_keys(root, path, node->keys, + level + 1); tslot = path->slots[level+1]; + t->count++; push_node_left(root, path, level); if (node->header.nritems) { push_node_right(root, path, level); } - if (node->header.nritems) + if (node->header.nritems) { + tree_block_release(root, t); break; + } + tree_block_release(root, t); path->slots[level+1] = tslot; } - if (node == root->node) { - printf("root is now null!\n"); - root->node = NULL; + if (t == root->node) { + /* just turn the root into a leaf and break */ + root->node->node.header.flags = node_level(0); + write_tree_block(root, t); break; } level++; if (!path->nodes[level]) BUG(); - free(node); } return 0; } @@ -636,10 +709,12 @@ int del_item(struct ctree_root *root, struct ctree_path *path) { int slot; struct leaf *leaf; + struct tree_buffer *leaf_buf; int doff; int dsize; - leaf = (struct leaf *)path->nodes[0]; + leaf_buf = path->nodes[0]; + leaf = &leaf_buf->leaf; slot = path->slots[0]; doff = leaf->items[slot].offset; dsize = leaf->items[slot].size; @@ -658,14 +733,15 @@ int del_item(struct ctree_root *root, struct 
ctree_path *path) } leaf->header.nritems -= 1; if (leaf->header.nritems == 0) { - if (leaf == (struct leaf *)root->node) - root->node = NULL; - else + if (leaf_buf == root->node) { + leaf->header.flags = node_level(0); + write_tree_block(root, leaf_buf); + } else del_ptr(root, path, 1); - free(leaf); } else { if (slot == 0) - fixup_low_keys(path, &leaf->items[0].key, 1); + fixup_low_keys(root, path, &leaf->items[0].key, 1); + write_tree_block(root, leaf_buf); if (leaf_space_used(leaf, 0, leaf->header.nritems) < LEAF_DATA_SIZE / 4) { /* push_leaf_left fixes the path. @@ -673,12 +749,13 @@ int del_item(struct ctree_root *root, struct ctree_path *path) * for possible call to del_ptr below */ slot = path->slots[1]; + leaf_buf->count++; push_leaf_left(root, path, 1); if (leaf->header.nritems == 0) { - free(leaf); path->slots[1] = slot; del_ptr(root, path, 1); } + tree_block_release(root, leaf_buf); } } return 0; @@ -689,7 +766,7 @@ void print_leaf(struct leaf *l) int i; int nr = l->header.nritems; struct item *item; - printf("leaf %p total ptrs %d free space %d\n", l, nr, + printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr, leaf_free_space(l)); fflush(stdout); for (i = 0 ; i < nr ; i++) { @@ -703,38 +780,45 @@ void print_leaf(struct leaf *l) fflush(stdout); } } -void print_tree(struct node *c) +void print_tree(struct ctree_root *root, struct tree_buffer *t) { int i; int nr; + struct node *c; - if (!c) + if (!t) return; + c = &t->node; nr = c->header.nritems; + if (c->header.blocknr != t->blocknr) + BUG(); if (is_leaf(c->header.flags)) { print_leaf((struct leaf *)c); return; } - printf("node %p level %d total ptrs %d free spc %lu\n", c, + printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr, node_level(c->header.flags), c->header.nritems, NODEPTRS_PER_BLOCK - c->header.nritems); fflush(stdout); for (i = 0; i < nr; i++) { - printf("\tkey %d (%lu %u %lu) block %lx\n", + printf("\tkey %d (%lu %u %lu) block %lu\n", i, c->keys[i].objectid, 
c->keys[i].flags, c->keys[i].offset, c->blockptrs[i]); fflush(stdout); } for (i = 0; i < nr; i++) { - struct node *next = read_block(c->blockptrs[i]); + struct tree_buffer *next_buf = read_tree_block(root, + c->blockptrs[i]); + struct node *next = &next_buf->node; if (is_leaf(next->header.flags) && node_level(c->header.flags) != 1) BUG(); if (node_level(next->header.flags) != node_level(c->header.flags) - 1) BUG(); - print_tree(next); + print_tree(root, next_buf); + tree_block_release(root, next_buf); } } @@ -746,23 +830,24 @@ int next_key(int i, int max_key) { } int main() { - struct leaf *first_node = malloc(sizeof(struct leaf)); - struct ctree_root root; + struct ctree_root *root; struct key ins; struct key last = { (u64)-1, 0, 0}; char *buf; int i; int num; int ret; - int run_size = 100000; + int run_size = 1000000; int max_key = 100000000; int tree_size = 0; struct ctree_path path; + radix_tree_init(); + + + root = open_ctree("dbfile"); srand(55); - root.node = (struct node *)first_node; - memset(first_node, 0, sizeof(*first_node)); for (i = 0; i < run_size; i++) { buf = malloc(64); num = next_key(i, max_key); @@ -772,39 +857,46 @@ int main() { ins.objectid = num; ins.offset = 0; ins.flags = 0; - ret = insert_item(&root, &ins, buf, strlen(buf)); + ret = insert_item(root, &ins, buf, strlen(buf)); if (!ret) tree_size++; } + close_ctree(root); + root = open_ctree("dbfile"); + printf("starting search\n"); srand(55); for (i = 0; i < run_size; i++) { num = next_key(i, max_key); ins.objectid = num; init_path(&path); - ret = search_slot(&root, &ins, &path); + ret = search_slot(root, &ins, &path); if (ret) { - print_tree(root.node); + print_tree(root, root->node); printf("unable to find %d\n", num); exit(1); } - } - printf("node %p level %d total ptrs %d free spc %lu\n", root.node, - node_level(root.node->header.flags), root.node->header.nritems, - NODEPTRS_PER_BLOCK - root.node->header.nritems); - // print_tree(root.node); - printf("all searches good\n"); + 
release_path(root, &path); + } + close_ctree(root); + root = open_ctree("dbfile"); + printf("node %p level %d total ptrs %d free spc %lu\n", root->node, + node_level(root->node->node.header.flags), + root->node->node.header.nritems, + NODEPTRS_PER_BLOCK - root->node->node.header.nritems); + printf("all searches good, deleting some items\n"); i = 0; srand(55); for (i = 0 ; i < run_size/4; i++) { num = next_key(i, max_key); ins.objectid = num; init_path(&path); - ret = search_slot(&root, &ins, &path); + ret = search_slot(root, &ins, &path); if (ret) continue; - ret = del_item(&root, &path); + ret = del_item(root, &path); if (ret != 0) BUG(); + release_path(root, &path); tree_size--; } srand(128); @@ -813,38 +905,58 @@ int main() { num = next_key(i, max_key); sprintf(buf, "string-%d", num); ins.objectid = num; - ret = insert_item(&root, &ins, buf, strlen(buf)); + ret = insert_item(root, &ins, buf, strlen(buf)); if (!ret) tree_size++; } - while(root.node) { + close_ctree(root); + root = open_ctree("dbfile"); + printf("starting search2\n"); + srand(128); + for (i = 0; i < run_size; i++) { + num = next_key(i, max_key); + ins.objectid = num; + init_path(&path); + ret = search_slot(root, &ins, &path); + if (ret) { + print_tree(root, root->node); + printf("unable to find %d\n", num); + exit(1); + } + release_path(root, &path); + } + printf("starting big long delete run\n"); + while(root->node && root->node->node.header.nritems > 0) { struct leaf *leaf; int slot; ins.objectid = (u64)-1; init_path(&path); - ret = search_slot(&root, &ins, &path); + ret = search_slot(root, &ins, &path); if (ret == 0) BUG(); - leaf = (struct leaf *)(path.nodes[0]); + leaf = &path.nodes[0]->leaf; slot = path.slots[0]; if (slot != leaf->header.nritems) BUG(); while(path.slots[0] > 0) { path.slots[0] -= 1; slot = path.slots[0]; - leaf = (struct leaf *)(path.nodes[0]); + leaf = &path.nodes[0]->leaf; if (comp_keys(&last, &leaf->items[slot].key) <= 0) BUG(); memcpy(&last, &leaf->items[slot].key, 
sizeof(last)); - ret = del_item(&root, &path); - if (ret != 0) + ret = del_item(root, &path); + if (ret != 0) { + printf("del_item returned %d\n", ret); BUG(); + } tree_size--; } + release_path(root, &path); } - print_tree(root.node); + close_ctree(root); printf("tree size is now %d\n", tree_size); return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h new file mode 100644 index 000000000000..586bf1866042 --- /dev/null +++ b/fs/btrfs/ctree.h @@ -0,0 +1,62 @@ +#ifndef __CTREE__ +#define __CTREE__ + +#define CTREE_BLOCKSIZE 4096 + +struct key { + u64 objectid; + u32 flags; + u64 offset; +} __attribute__ ((__packed__)); + +struct header { + u64 fsid[2]; /* FS specific uuid */ + u64 blocknr; + u64 parentid; + u32 csum; + u32 ham; + u16 nritems; + u16 flags; +} __attribute__ ((__packed__)); + +#define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \ + (sizeof(struct key) + sizeof(u64))) + +#define LEVEL_BITS 3 +#define MAX_LEVEL (1 << LEVEL_BITS) +#define node_level(f) ((f) & (MAX_LEVEL-1)) +#define is_leaf(f) (node_level(f) == 0) + +struct tree_buffer; +struct ctree_root { + struct tree_buffer *node; + int fp; + struct radix_tree_root cache_radix; +}; + +struct item { + struct key key; + u16 offset; + u16 size; +} __attribute__ ((__packed__)); + +#define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct header)) +struct leaf { + struct header header; + union { + struct item items[LEAF_DATA_SIZE/sizeof(struct item)]; + u8 data[CTREE_BLOCKSIZE-sizeof(struct header)]; + }; +} __attribute__ ((__packed__)); + +struct node { + struct header header; + struct key keys[NODEPTRS_PER_BLOCK]; + u64 blockptrs[NODEPTRS_PER_BLOCK]; +} __attribute__ ((__packed__)); + +struct ctree_path { + struct tree_buffer *nodes[MAX_LEVEL]; + int slots[MAX_LEVEL]; +}; +#endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c new file mode 100644 index 000000000000..8d51a07051d9 --- /dev/null +++ b/fs/btrfs/disk-io.c @@ -0,0 +1,174 @@ +#define _XOPEN_SOURCE 500 +#include 
+#include +#include +#include +#include +#include +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" + +static int allocated_blocks = 0; + +struct ctree_header { + u64 root_block; +} __attribute__ ((__packed__)); + +static int get_free_block(struct ctree_root *root, u64 *block) +{ + struct stat st; + int ret; + + st.st_size = 0; + ret = fstat(root->fp, &st); + if (st.st_size > sizeof(struct ctree_header)) { + *block = (st.st_size - + sizeof(struct ctree_header)) / CTREE_BLOCKSIZE; + } else { + *block = 0; + } + ret = ftruncate(root->fp, sizeof(struct ctree_header) + (*block + 1) * + CTREE_BLOCKSIZE); + return ret; +} + +struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr) +{ + struct tree_buffer *buf; + int ret; + buf = malloc(sizeof(struct tree_buffer)); + if (!buf) + return buf; + allocated_blocks++; + buf->blocknr = blocknr; + buf->count = 1; + radix_tree_preload(GFP_KERNEL); + ret = radix_tree_insert(&root->cache_radix, blocknr, buf); + radix_tree_preload_end(); + if (ret) { + free(buf); + return NULL; + } + return buf; +} + +struct tree_buffer *alloc_free_block(struct ctree_root *root) +{ + u64 free_block; + int ret; + struct tree_buffer * buf; + ret = get_free_block(root, &free_block); + if (ret) { + BUG(); + return NULL; + } + buf = alloc_tree_block(root, free_block); + if (!buf) + BUG(); + return buf; +} + +struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr) +{ + loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header); + struct tree_buffer *buf; + int ret; + + buf = radix_tree_lookup(&root->cache_radix, blocknr); + if (buf) { + buf->count++; + if (buf->blocknr != blocknr) + BUG(); + if (buf->blocknr != buf->node.header.blocknr) + BUG(); + return buf; + } + buf = alloc_tree_block(root, blocknr); + if (!buf) + return NULL; + ret = pread(root->fp, &buf->node, CTREE_BLOCKSIZE, offset); + if (ret != CTREE_BLOCKSIZE) { + free(buf); + return NULL; + } + if 
(buf->blocknr != buf->node.header.blocknr) + BUG(); + return buf; +} + +int write_tree_block(struct ctree_root *root, struct tree_buffer *buf) +{ + u64 blocknr = buf->blocknr; + loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header); + int ret; + + if (buf->blocknr != buf->node.header.blocknr) + BUG(); + ret = pwrite(root->fp, &buf->node, CTREE_BLOCKSIZE, offset); + if (ret != CTREE_BLOCKSIZE) + return ret; + if (buf == root->node) + return update_root_block(root); + return 0; +} + +struct ctree_root *open_ctree(char *filename) +{ + struct ctree_root *root = malloc(sizeof(struct ctree_root)); + int fp; + u64 root_block; + int ret; + + fp = open(filename, O_CREAT | O_RDWR); + if (fp < 0) { + free(root); + return NULL; + } + root->fp = fp; + INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL); + ret = pread(fp, &root_block, sizeof(u64), 0); + if (ret == sizeof(u64)) { + printf("reading root node at block %lu\n", root_block); + root->node = read_tree_block(root, root_block); + } else + root->node = NULL; + return root; +} + +int close_ctree(struct ctree_root *root) +{ + close(root->fp); + if (root->node) + tree_block_release(root, root->node); + free(root); + printf("on close %d blocks are allocated\n", allocated_blocks); + return 0; +} + +int update_root_block(struct ctree_root *root) +{ + int ret; + u64 root_block = root->node->blocknr; + + ret = pwrite(root->fp, &root_block, sizeof(u64), 0); + if (ret != sizeof(u64)) + return ret; + return 0; +} + +void tree_block_release(struct ctree_root *root, struct tree_buffer *buf) +{ + buf->count--; + if (buf->count == 0) { + if (!radix_tree_lookup(&root->cache_radix, buf->blocknr)) + BUG(); + radix_tree_delete(&root->cache_radix, buf->blocknr); + memset(buf, 0, sizeof(*buf)); + free(buf); + BUG_ON(allocated_blocks == 0); + allocated_blocks--; + } +} + diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h new file mode 100644 index 000000000000..ee95fa05c4a3 --- /dev/null +++ b/fs/btrfs/disk-io.h @@ -0,0 +1,21 @@ 
+#ifndef __DISKIO__ +#define __DISKIO__ + +struct tree_buffer { + u64 blocknr; + int count; + union { + struct node node; + struct leaf leaf; + }; +}; + +struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr); +int write_tree_block(struct ctree_root *root, struct tree_buffer *buf); +struct ctree_root *open_ctree(char *filename); +int close_ctree(struct ctree_root *root); +void tree_block_release(struct ctree_root *root, struct tree_buffer *buf); +struct tree_buffer *alloc_free_block(struct ctree_root *root); +int update_root_block(struct ctree_root *root); + +#endif diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h index 3a4bb4d661f9..347ca06205c2 100644 --- a/fs/btrfs/kerncompat.h +++ b/fs/btrfs/kerncompat.h @@ -6,6 +6,7 @@ #define BITS_PER_LONG 64 #define __GFP_BITS_SHIFT 20 #define __GFP_BITS_MASK ((int)((1 << __GFP_BITS_SHIFT) - 1)) +#define GFP_KERNEL 0 #define __read_mostly #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define __force diff --git a/fs/btrfs/radix-tree.c b/fs/btrfs/radix-tree.c new file mode 100644 index 000000000000..baa25ca1c2ac --- /dev/null +++ b/fs/btrfs/radix-tree.c @@ -0,0 +1,836 @@ +/* + * Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * Copyright (C) 2005 SGI, Christoph Lameter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "kerncompat.h" +#include "radix-tree.h" +#ifdef __KERNEL__ +#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) +#else +#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ +#endif + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) + +struct radix_tree_node { + unsigned int count; + void *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; +}; + +struct radix_tree_path { + struct radix_tree_node *node; + int offset; +}; + +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2) + +static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly; + +/* + * Per-cpu pool of preloaded nodes + */ +struct radix_tree_preload { + int nr; + struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH]; +}; +struct radix_tree_preload radix_tree_preloads = { 0, }; + +static inline gfp_t root_gfp_mask(struct radix_tree_root *root) +{ + return root->gfp_mask & __GFP_BITS_MASK; +} + +static int internal_nodes = 0; +/* + * This assumes that the caller has performed appropriate preallocation, and + * that the caller has pinned this thread of control to the current CPU. + */ +static struct radix_tree_node * +radix_tree_node_alloc(struct radix_tree_root *root) +{ + struct radix_tree_node *ret; + ret = malloc(sizeof(struct radix_tree_node)); + if (ret) { + memset(ret, 0, sizeof(struct radix_tree_node)); + internal_nodes++; + } + return ret; +} + +static inline void +radix_tree_node_free(struct radix_tree_node *node) +{ + internal_nodes--; + free(node); +} + +/* + * Load up this CPU's radix_tree_node buffer with sufficient objects to + * ensure that the addition of a single element in the tree cannot fail. On + * success, return zero, with preemption disabled. 
On error, return -ENOMEM + * with preemption not disabled. + */ +int radix_tree_preload(gfp_t gfp_mask) +{ + struct radix_tree_preload *rtp; + struct radix_tree_node *node; + int ret = -ENOMEM; + + preempt_disable(); + rtp = &__get_cpu_var(radix_tree_preloads); + while (rtp->nr < ARRAY_SIZE(rtp->nodes)) { + preempt_enable(); + node = radix_tree_node_alloc(NULL); + if (node == NULL) + goto out; + preempt_disable(); + rtp = &__get_cpu_var(radix_tree_preloads); + if (rtp->nr < ARRAY_SIZE(rtp->nodes)) + rtp->nodes[rtp->nr++] = node; + else + radix_tree_node_free(node); + } + ret = 0; +out: + return ret; +} + +static inline void tag_set(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + __set_bit(offset, node->tags[tag]); +} + +static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + __clear_bit(offset, node->tags[tag]); +} + +static inline int tag_get(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + return test_bit(offset, node->tags[tag]); +} + +static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) +{ + root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT)); +} + + +static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag) +{ + root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT)); +} + +static inline void root_tag_clear_all(struct radix_tree_root *root) +{ + root->gfp_mask &= __GFP_BITS_MASK; +} + +static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) +{ + return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); +} + +/* + * Returns 1 if any slot in the node has this tag set. + * Otherwise returns 0. 
+ */ +static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) +{ + int idx; + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (node->tags[tag][idx]) + return 1; + } + return 0; +} + +/* + * Return the maximum key which can be store into a + * radix tree with height HEIGHT. + */ +static inline unsigned long radix_tree_maxindex(unsigned int height) +{ + return height_to_maxindex[height]; +} + +/* + * Extend a radix tree so it can store key @index. + */ +static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) +{ + struct radix_tree_node *node; + unsigned int height; + int tag; + + /* Figure out what the height should be. */ + height = root->height + 1; + while (index > radix_tree_maxindex(height)) + height++; + + if (root->rnode == NULL) { + root->height = height; + goto out; + } + + do { + if (!(node = radix_tree_node_alloc(root))) + return -ENOMEM; + + /* Increase the height. */ + node->slots[0] = root->rnode; + + /* Propagate the aggregated tag info into the new root */ + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (root_tag_get(root, tag)) + tag_set(node, tag, 0); + } + + node->count = 1; + root->rnode = node; + root->height++; + } while (height > root->height); +out: + return 0; +} + +/** + * radix_tree_insert - insert into a radix tree + * @root: radix tree root + * @index: index key + * @item: item to insert + * + * Insert an item into the radix tree at position @index. + */ +int radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) +{ + struct radix_tree_node *node = NULL, *slot; + unsigned int height, shift; + int offset; + int error; + + /* Make sure the tree is high enough. 
*/ + if (index > radix_tree_maxindex(root->height)) { + error = radix_tree_extend(root, index); + if (error) + return error; + } + + slot = root->rnode; + height = root->height; + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + + offset = 0; /* uninitialised var warning */ + while (height > 0) { + if (slot == NULL) { + /* Have to add a child node. */ + if (!(slot = radix_tree_node_alloc(root))) + return -ENOMEM; + if (node) { + node->slots[offset] = slot; + node->count++; + } else + root->rnode = slot; + } + + /* Go a level down */ + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + node = slot; + slot = node->slots[offset]; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + if (slot != NULL) + return -EEXIST; + + if (node) { + node->count++; + node->slots[offset] = item; + BUG_ON(tag_get(node, 0, offset)); + BUG_ON(tag_get(node, 1, offset)); + } else { + root->rnode = item; + BUG_ON(root_tag_get(root, 0)); + BUG_ON(root_tag_get(root, 1)); + } + + return 0; +} + +static inline void **__lookup_slot(struct radix_tree_root *root, + unsigned long index) +{ + unsigned int height, shift; + struct radix_tree_node **slot; + + height = root->height; + + if (index > radix_tree_maxindex(height)) + return NULL; + + if (height == 0 && root->rnode) + return (void **)&root->rnode; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + slot = &root->rnode; + + while (height > 0) { + if (*slot == NULL) + return NULL; + + slot = (struct radix_tree_node **) + ((*slot)->slots + + ((index >> shift) & RADIX_TREE_MAP_MASK)); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + return (void **)slot; +} + +/** + * radix_tree_lookup_slot - lookup a slot in a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the slot corresponding to the position @index in the radix tree + * @root. This is useful for update-if-exists operations. 
+ */ +void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) +{ + return __lookup_slot(root, index); +} + +/** + * radix_tree_lookup - perform lookup operation on a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. + */ +void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) +{ + void **slot; + + slot = __lookup_slot(root, index); + return slot != NULL ? *slot : NULL; +} + +/** + * radix_tree_tag_set - set a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index + * + * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) + * corresponding to @index in the radix tree. From + * the root all the way down to the leaf node. + * + * Returns the address of the tagged item. Setting a tag on a not-present + * item is a bug. + */ +void *radix_tree_tag_set(struct radix_tree_root *root, + unsigned long index, unsigned int tag) +{ + unsigned int height, shift; + struct radix_tree_node *slot; + + height = root->height; + BUG_ON(index > radix_tree_maxindex(height)); + + slot = root->rnode; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + + while (height > 0) { + int offset; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + if (!tag_get(slot, tag, offset)) + tag_set(slot, tag, offset); + slot = slot->slots[offset]; + BUG_ON(slot == NULL); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + /* set the root's tag bit */ + if (slot && !root_tag_get(root, tag)) + root_tag_set(root, tag); + + return slot; +} + +/** + * radix_tree_tag_clear - clear a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index + * + * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) + * corresponding to @index in the radix tree. If + * this causes the leaf node to have no tags set then clear the tag in the + * next-to-leaf node, etc. 
+ * + * Returns the address of the tagged item on success, else NULL. ie: + * has the same return value and semantics as radix_tree_lookup(). + */ +void *radix_tree_tag_clear(struct radix_tree_root *root, + unsigned long index, unsigned int tag) +{ + struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + struct radix_tree_node *slot = NULL; + unsigned int height, shift; + + height = root->height; + if (index > radix_tree_maxindex(height)) + goto out; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + slot = root->rnode; + + while (height > 0) { + int offset; + + if (slot == NULL) + goto out; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp[1].offset = offset; + pathp[1].node = slot; + slot = slot->slots[offset]; + pathp++; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + if (slot == NULL) + goto out; + + while (pathp->node) { + if (!tag_get(pathp->node, tag, pathp->offset)) + goto out; + tag_clear(pathp->node, tag, pathp->offset); + if (any_tag_set(pathp->node, tag)) + goto out; + pathp--; + } + + /* clear the root's tag bit */ + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); + +out: + return slot; +} + +#ifndef __KERNEL__ /* Only the test harness uses this at present */ +/** + * radix_tree_tag_get - get a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index (< RADIX_TREE_MAX_TAGS) + * + * Return values: + * + * 0: tag not present or not set + * 1: tag set + */ +int radix_tree_tag_get(struct radix_tree_root *root, + unsigned long index, unsigned int tag) +{ + unsigned int height, shift; + struct radix_tree_node *slot; + int saw_unset_tag = 0; + + height = root->height; + if (index > radix_tree_maxindex(height)) + return 0; + + /* check the root's tag bit */ + if (!root_tag_get(root, tag)) + return 0; + + if (height == 0) + return 1; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + for ( ; ; ) { + int offset; + + if (slot == NULL) + return 
0; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + + /* + * This is just a debug check. Later, we can bale as soon as + * we see an unset tag. + */ + if (!tag_get(slot, tag, offset)) + saw_unset_tag = 1; + if (height == 1) { + int ret = tag_get(slot, tag, offset); + + BUG_ON(ret && saw_unset_tag); + return !!ret; + } + slot = slot->slots[offset]; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } +} +#endif + +static unsigned int +__lookup(struct radix_tree_root *root, void **results, unsigned long index, + unsigned int max_items, unsigned long *next_index) +{ + unsigned int nr_found = 0; + unsigned int shift, height; + struct radix_tree_node *slot; + unsigned long i; + + height = root->height; + if (height == 0) { + if (root->rnode && index == 0) + results[nr_found++] = root->rnode; + goto out; + } + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + for ( ; height > 1; height--) { + + for (i = (index >> shift) & RADIX_TREE_MAP_MASK ; + i < RADIX_TREE_MAP_SIZE; i++) { + if (slot->slots[i] != NULL) + break; + index &= ~((1UL << shift) - 1); + index += 1UL << shift; + if (index == 0) + goto out; /* 32-bit wraparound */ + } + if (i == RADIX_TREE_MAP_SIZE) + goto out; + + shift -= RADIX_TREE_MAP_SHIFT; + slot = slot->slots[i]; + } + + /* Bottom level: grab some items */ + for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { + index++; + if (slot->slots[i]) { + results[nr_found++] = slot->slots[i]; + if (nr_found == max_items) + goto out; + } + } +out: + *next_index = index; + return nr_found; +} + +/** + * radix_tree_gang_lookup - perform multiple lookup on a radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items. Places + * them at *@results and returns the number of items which were placed at + * *@results. 
+ * + * The implementation is naive. + */ +unsigned int +radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items) +{ + const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long cur_index = first_index; + unsigned int ret = 0; + + while (ret < max_items) { + unsigned int nr_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup(root, results + ret, cur_index, + max_items - ret, &next_index); + ret += nr_found; + if (next_index == 0) + break; + cur_index = next_index; + } + return ret; +} + +/* + * FIXME: the two tag_get()s here should use find_next_bit() instead of + * open-coding the search. + */ +static unsigned int +__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, + unsigned int max_items, unsigned long *next_index, unsigned int tag) +{ + unsigned int nr_found = 0; + unsigned int shift; + unsigned int height = root->height; + struct radix_tree_node *slot; + + if (height == 0) { + if (root->rnode && index == 0) + results[nr_found++] = root->rnode; + goto out; + } + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + do { + unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; + + for ( ; i < RADIX_TREE_MAP_SIZE; i++) { + if (tag_get(slot, tag, i)) { + BUG_ON(slot->slots[i] == NULL); + break; + } + index &= ~((1UL << shift) - 1); + index += 1UL << shift; + if (index == 0) + goto out; /* 32-bit wraparound */ + } + if (i == RADIX_TREE_MAP_SIZE) + goto out; + height--; + if (height == 0) { /* Bottom level: grab some items */ + unsigned long j = index & RADIX_TREE_MAP_MASK; + + for ( ; j < RADIX_TREE_MAP_SIZE; j++) { + index++; + if (tag_get(slot, tag, j)) { + BUG_ON(slot->slots[j] == NULL); + results[nr_found++] = slot->slots[j]; + if (nr_found == max_items) + goto out; + } + } + } + shift -= RADIX_TREE_MAP_SHIFT; + slot = slot->slots[i]; + } while (height 
> 0); +out: + *next_index = index; + return nr_found; +} + +/** + * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree + * based on a tag + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * @tag: the tag index (< RADIX_TREE_MAX_TAGS) + * + * Performs an index-ascending scan of the tree for present items which + * have the tag indexed by @tag set. Places the items at *@results and + * returns the number of items which were placed at *@results. + */ +unsigned int +radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items, + unsigned int tag) +{ + const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long cur_index = first_index; + unsigned int ret = 0; + + /* check the root's tag bit */ + if (!root_tag_get(root, tag)) + return 0; + + while (ret < max_items) { + unsigned int nr_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup_tag(root, results + ret, cur_index, + max_items - ret, &next_index, tag); + ret += nr_found; + if (next_index == 0) + break; + cur_index = next_index; + } + return ret; +} + +/** + * radix_tree_shrink - shrink height of a radix tree to minimal + * @root radix tree root + */ +static inline void radix_tree_shrink(struct radix_tree_root *root) +{ + /* try to shrink tree height */ + while (root->height > 0 && + root->rnode->count == 1 && + root->rnode->slots[0]) { + struct radix_tree_node *to_free = root->rnode; + + root->rnode = to_free->slots[0]; + root->height--; + /* must only free zeroed nodes into the slab */ + tag_clear(to_free, 0, 0); + tag_clear(to_free, 1, 0); + to_free->slots[0] = NULL; + to_free->count = 0; + radix_tree_node_free(to_free); + } +} + +/** + * radix_tree_delete - delete an item from a radix tree + * @root: 
radix tree root + * @index: index key + * + * Remove the item at @index from the radix tree rooted at @root. + * + * Returns the address of the deleted item, or NULL if it was not present. + */ +void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) +{ + struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + struct radix_tree_node *slot = NULL; + unsigned int height, shift; + int tag; + int offset; + + height = root->height; + if (index > radix_tree_maxindex(height)) + goto out; + + slot = root->rnode; + if (height == 0 && root->rnode) { + root_tag_clear_all(root); + root->rnode = NULL; + goto out; + } + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + + do { + if (slot == NULL) + goto out; + + pathp++; + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp->offset = offset; + pathp->node = slot; + slot = slot->slots[offset]; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } while (height > 0); + + if (slot == NULL) + goto out; + + /* + * Clear all tags associated with the just-deleted item + */ + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (tag_get(pathp->node, tag, pathp->offset)) + radix_tree_tag_clear(root, index, tag); + } + + /* Now free the nodes we do not need anymore */ + while (pathp->node) { + pathp->node->slots[pathp->offset] = NULL; + pathp->node->count--; + + if (pathp->node->count) { + if (pathp->node == root->rnode) + radix_tree_shrink(root); + goto out; + } + + /* Node with zero slots in use so free it */ + radix_tree_node_free(pathp->node); + + pathp--; + } + root_tag_clear_all(root); + root->height = 0; + root->rnode = NULL; + +out: + return slot; +} + +/** + * radix_tree_tagged - test whether any items in the tree are tagged + * @root: radix tree root + * @tag: tag to test + */ +int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) +{ + return root_tag_get(root, tag); +} + +static unsigned long __maxindex(unsigned int height) +{ + unsigned int tmp = height * 
RADIX_TREE_MAP_SHIFT; + unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1; + + if (tmp >= RADIX_TREE_INDEX_BITS) + index = ~0UL; + return index; +} + +static void radix_tree_init_maxindex(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) + height_to_maxindex[i] = __maxindex(i); +} + +void radix_tree_init(void) +{ + radix_tree_init_maxindex(); +} diff --git a/fs/btrfs/radix-tree.h b/fs/btrfs/radix-tree.h new file mode 100644 index 000000000000..c3ce88137f77 --- /dev/null +++ b/fs/btrfs/radix-tree.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ +#ifndef _LINUX_RADIX_TREE_H +#define _LINUX_RADIX_TREE_H + +#define RADIX_TREE_MAX_TAGS 2 + +/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ +struct radix_tree_root { + unsigned int height; + gfp_t gfp_mask; + struct radix_tree_node *rnode; +}; + +#define RADIX_TREE_INIT(mask) { \ + .height = 0, \ + .gfp_mask = (mask), \ + .rnode = NULL, \ +} + +#define RADIX_TREE(name, mask) \ + struct radix_tree_root name = RADIX_TREE_INIT(mask) + +#define INIT_RADIX_TREE(root, mask) \ +do { \ + (root)->height = 0; \ + (root)->gfp_mask = (mask); \ + (root)->rnode = NULL; \ +} while (0) + +int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); +void *radix_tree_lookup(struct radix_tree_root *, unsigned long); +void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +void *radix_tree_delete(struct radix_tree_root *, unsigned long); +unsigned int +radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items); +int radix_tree_preload(gfp_t gfp_mask); +void radix_tree_init(void); +void *radix_tree_tag_set(struct radix_tree_root *root, + unsigned long index, unsigned int tag); +void *radix_tree_tag_clear(struct radix_tree_root *root, + unsigned long index, unsigned int tag); +int radix_tree_tag_get(struct radix_tree_root *root, + unsigned long index, unsigned int tag); +unsigned int +radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items, + unsigned int tag); +int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); + +static inline void radix_tree_preload_end(void) +{ + preempt_enable(); +} + +#endif /* _LINUX_RADIX_TREE_H */ -- cgit v1.2.3 From d97e63b69ef21c02b67e20e41d9968b0e503572e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 20 Feb 2007 16:40:44 -0500 Subject: Btrfs: early extent mapping support Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 4 +- fs/btrfs/ctree.c | 
152 +++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/ctree.h | 34 ++++++++++-- fs/btrfs/disk-io.c | 76 +++++++++++++++++++-------- fs/btrfs/disk-io.h | 3 ++ 5 files changed, 236 insertions(+), 33 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 63360212a075..df065dd2dce7 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -4,8 +4,8 @@ CFLAGS= -g -Wall .c.o: $(CC) $(CFLAGS) -c $< -ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h - gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o +ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h mkfs.o + gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o mkfs.o clean: rm ctree *.o diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6b64f49a0279..2177744dedd3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5,6 +5,8 @@ #include "ctree.h" #include "disk-io.h" +static int refill_alloc_extent(struct ctree_root *root); + static inline void init_path(struct ctree_path *p) { memset(p, 0, sizeof(*p)); @@ -29,7 +31,7 @@ static inline unsigned int leaf_data_end(struct leaf *leaf) { unsigned int nr = leaf->header.nritems; if (nr == 0) - return ARRAY_SIZE(leaf->data); + return sizeof(leaf->data); return leaf->items[nr-1].offset; } @@ -421,7 +423,7 @@ int insert_ptr(struct ctree_root *root, * due to splitting. Once we've done all the splitting required * do the inserts based on the data in the bal array. 
*/ - memset(bal, 0, ARRAY_SIZE(bal)); + memset(bal, 0, sizeof(bal)); while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) { c = &t->node; if (push_node_left(root, path, @@ -756,6 +758,7 @@ int insert_item(struct ctree_root *root, struct key *key, if (leaf_free_space(leaf) < 0) BUG(); release_path(root, &path); + refill_alloc_extent(root); return 0; } @@ -884,6 +887,135 @@ int del_item(struct ctree_root *root, struct ctree_path *path) return 0; } +int next_leaf(struct ctree_root *root, struct ctree_path *path) +{ + int slot; + int level = 1; + u64 blocknr; + struct tree_buffer *c; + struct tree_buffer *next; + + while(level < MAX_LEVEL) { + if (!path->nodes[level]) + return -1; + slot = path->slots[level] + 1; + c = path->nodes[level]; + if (slot >= c->node.header.nritems) { + level++; + continue; + } + blocknr = c->node.blockptrs[slot]; + next = read_tree_block(root, blocknr); + break; + } + path->slots[level] = slot; + while(1) { + level--; + c = path->nodes[level]; + tree_block_release(root, c); + path->nodes[level] = next; + path->slots[level] = 0; + if (!level) + break; + next = read_tree_block(root, next->node.blockptrs[0]); + } + return 0; +} + +int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start, + u64 search_end, u64 owner, struct key *ins) +{ + struct ctree_path path; + struct key *key; + int ret; + u64 hole_size = 0; + int slot = 0; + u64 last_block; + int start_found = 0; + struct leaf *l; + struct extent_item extent_item; + + init_path(&path); + ins->objectid = search_start; + ins->offset = 0; + ins->flags = 0; + + ret = search_slot(root, ins, &path); + while (1) { + l = &path.nodes[0]->leaf; + slot = path.slots[0]; + if (!l) { + // FIXME allocate root + } + if (slot >= l->header.nritems) { + ret = next_leaf(root, &path); + if (ret == 0) + continue; + if (!start_found) { + ins->objectid = search_start; + ins->offset = num_blocks; + hole_size = search_end - search_start; + goto insert; + } + ins->objectid = last_block; + 
ins->offset = num_blocks; + hole_size = search_end - last_block; + goto insert; + } + key = &l->items[slot].key; + if (start_found) { + hole_size = key->objectid - last_block; + if (hole_size > num_blocks) { + ins->objectid = last_block; + ins->offset = num_blocks; + goto insert; + } + } else + start_found = 1; + last_block = key->objectid + key->offset; + path.slots[0]++; + printf("last block is not %lu\n", last_block); + } + // FIXME -ENOSPC +insert: + extent_item.refs = 1; + extent_item.owner = owner; + ret = insert_item(root, ins, &extent_item, sizeof(extent_item)); + return ret; +} + +static int refill_alloc_extent(struct ctree_root *root) +{ + struct alloc_extent *ae = root->alloc_extent; + struct key key; + int ret; + int min_blocks = MAX_LEVEL * 2; + + printf("refill alloc root %p, numused %lu total %lu\n", root, ae->num_used, ae->num_blocks); + if (ae->num_blocks > ae->num_used && ae->num_blocks - ae->num_used > + min_blocks) + return 0; + ae = root->reserve_extent; + if (ae->num_blocks > ae->num_used) { + if (root->alloc_extent->num_blocks == 0) { + /* we should swap reserve/alloc_extent when alloc + * fills up + */ + BUG(); + } + if (ae->num_blocks - ae->num_used < min_blocks) + BUG(); + return 0; + } + // FIXME, this recurses + ret = alloc_extent(root->extent_root, + min_blocks * 2, 0, (unsigned long)-1, 0, &key); + ae->blocknr = key.objectid; + ae->num_blocks = key.offset; + ae->num_used = 0; + return ret; +} + void print_leaf(struct leaf *l) { int i; @@ -948,8 +1080,8 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t) /* for testing only */ int next_key(int i, int max_key) { - return rand() % max_key; - // return i; + // return rand() % max_key; + return i; } int main() { @@ -960,7 +1092,7 @@ int main() { int i; int num; int ret; - int run_size = 25000; + int run_size = 256; int max_key = 100000000; int tree_size = 0; struct ctree_path path; @@ -980,10 +1112,20 @@ int main() { ins.objectid = num; ins.offset = 0; ins.flags = 0; + 
printf("insert %d\n", i); ret = insert_item(root, &ins, buf, strlen(buf)); if (!ret) tree_size++; + printf("done insert %d\n", i); } + printf("root used: %lu\n", root->alloc_extent->num_used); + printf("root tree\n"); + print_tree(root, root->node); + printf("map tree\n"); + printf("map used: %lu\n", root->extent_root->alloc_extent->num_used); + print_tree(root->extent_root, root->extent_root->node); + exit(1); + close_ctree(root); root = open_ctree("dbfile"); printf("starting search\n"); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 586bf1866042..b737925be314 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1,7 +1,7 @@ #ifndef __CTREE__ #define __CTREE__ -#define CTREE_BLOCKSIZE 4096 +#define CTREE_BLOCKSIZE 256 struct key { u64 objectid; @@ -22,18 +22,41 @@ struct header { #define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \ (sizeof(struct key) + sizeof(u64))) -#define LEVEL_BITS 3 -#define MAX_LEVEL (1 << LEVEL_BITS) +#define MAX_LEVEL 8 #define node_level(f) ((f) & (MAX_LEVEL-1)) #define is_leaf(f) (node_level(f) == 0) struct tree_buffer; + +struct alloc_extent { + u64 blocknr; + u64 num_blocks; + u64 num_used; +} __attribute__ ((__packed__)); + struct ctree_root { struct tree_buffer *node; + struct ctree_root *extent_root; + struct alloc_extent *alloc_extent; + struct alloc_extent *reserve_extent; int fp; struct radix_tree_root cache_radix; + struct alloc_extent ai1; + struct alloc_extent ai2; }; +struct ctree_root_info { + u64 fsid[2]; /* FS specific uuid */ + u64 blocknr; /* blocknr of this block */ + u64 objectid; /* inode number of this root */ + u64 tree_root; /* the tree root */ + u32 csum; + u32 ham; + struct alloc_extent alloc_extent; + struct alloc_extent reserve_extent; + u64 snapuuid[2]; /* root specific uuid */ +} __attribute__ ((__packed__)); + struct item { struct key key; u16 offset; @@ -55,6 +78,11 @@ struct node { u64 blockptrs[NODEPTRS_PER_BLOCK]; } __attribute__ ((__packed__)); +struct extent_item { + 
u32 refs; + u64 owner; +} __attribute__ ((__packed__)); + struct ctree_path { struct tree_buffer *nodes[MAX_LEVEL]; int slots[MAX_LEVEL]; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8d51a07051d9..653f18aab330 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -12,25 +12,27 @@ static int allocated_blocks = 0; -struct ctree_header { - u64 root_block; -} __attribute__ ((__packed__)); - static int get_free_block(struct ctree_root *root, u64 *block) { struct stat st; int ret; + if (root->alloc_extent->num_used >= root->alloc_extent->num_blocks) + return -1; + + *block = root->alloc_extent->blocknr + root->alloc_extent->num_used; + root->alloc_extent->num_used += 1; + if (root->alloc_extent->num_used >= root->alloc_extent->num_blocks) { + struct alloc_extent *ae = root->alloc_extent; + root->alloc_extent = root->reserve_extent; + root->reserve_extent = ae; + ae->num_blocks = 0; + } st.st_size = 0; ret = fstat(root->fp, &st); - if (st.st_size > sizeof(struct ctree_header)) { - *block = (st.st_size - - sizeof(struct ctree_header)) / CTREE_BLOCKSIZE; - } else { - *block = 0; - } - ret = ftruncate(root->fp, sizeof(struct ctree_header) + (*block + 1) * - CTREE_BLOCKSIZE); + if (st.st_size < (*block + 1) * CTREE_BLOCKSIZE) + ret = ftruncate(root->fp, + (*block + 1) * CTREE_BLOCKSIZE); return ret; } @@ -72,7 +74,7 @@ struct tree_buffer *alloc_free_block(struct ctree_root *root) struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr) { - loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header); + loff_t offset = blocknr * CTREE_BLOCKSIZE; struct tree_buffer *buf; int ret; @@ -101,7 +103,7 @@ struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr) int write_tree_block(struct ctree_root *root, struct tree_buffer *buf) { u64 blocknr = buf->blocknr; - loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header); + loff_t offset = blocknr * CTREE_BLOCKSIZE; int ret; if (buf->blocknr != 
buf->node.header.blocknr) @@ -114,11 +116,32 @@ int write_tree_block(struct ctree_root *root, struct tree_buffer *buf) return 0; } +struct ctree_super_block { + struct ctree_root_info root_info; + struct ctree_root_info extent_info; +} __attribute__ ((__packed__)); + +static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root, + struct ctree_root_info *info, int fp) +{ + root->fp = fp; + root->node = read_tree_block(root, info->tree_root); + root->extent_root = extent_root; + memcpy(&root->ai1, &info->alloc_extent, sizeof(info->alloc_extent)); + memcpy(&root->ai2, &info->reserve_extent, sizeof(info->reserve_extent)); + root->alloc_extent = &root->ai1; + root->reserve_extent = &root->ai2; + INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL); + printf("setup done reading root %p, used %lu\n", root, root->alloc_extent->num_used); + return 0; +} + struct ctree_root *open_ctree(char *filename) { struct ctree_root *root = malloc(sizeof(struct ctree_root)); + struct ctree_root *extent_root = malloc(sizeof(struct ctree_root)); + struct ctree_super_block super; int fp; - u64 root_block; int ret; fp = open(filename, O_CREAT | O_RDWR); @@ -126,14 +149,20 @@ struct ctree_root *open_ctree(char *filename) free(root); return NULL; } - root->fp = fp; - INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL); - ret = pread(fp, &root_block, sizeof(u64), 0); - if (ret == sizeof(u64)) { - printf("reading root node at block %lu\n", root_block); - root->node = read_tree_block(root, root_block); - } else - root->node = NULL; + ret = pread(fp, &super, sizeof(struct ctree_super_block), + CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE)); + if (ret == 0) { + ret = mkfs(fp); + if (ret) + return NULL; + ret = pread(fp, &super, sizeof(struct ctree_super_block), + CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE)); + if (ret != sizeof(struct ctree_super_block)) + return NULL; + } + BUG_ON(ret < 0); + __setup_root(root, extent_root, &super.root_info, fp); + __setup_root(extent_root, extent_root, 
&super.extent_info, fp); return root; } @@ -160,6 +189,7 @@ int update_root_block(struct ctree_root *root) void tree_block_release(struct ctree_root *root, struct tree_buffer *buf) { + return; buf->count--; if (buf->count == 0) { if (!radix_tree_lookup(&root->cache_radix, buf->blocknr)) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index ee95fa05c4a3..41acb65eee19 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -17,5 +17,8 @@ int close_ctree(struct ctree_root *root); void tree_block_release(struct ctree_root *root, struct tree_buffer *buf); struct tree_buffer *alloc_free_block(struct ctree_root *root); int update_root_block(struct ctree_root *root); +int mkfs(int fd); + +#define CTREE_SUPER_INFO_OFFSET(bs) (16 * (bs)) #endif -- cgit v1.2.3 From 5de08d7d50ba535b968d97101ed2ab07c2b8eb7d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 24 Feb 2007 06:24:44 -0500 Subject: Btrfs: Break up ctree.c a little Extent fixes Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 16 +- fs/btrfs/ctree.c | 446 +++++++++----------------------------------------- fs/btrfs/ctree.h | 12 +- fs/btrfs/disk-io.c | 1 - fs/btrfs/print-tree.c | 72 ++++++++ fs/btrfs/print-tree.h | 3 + 6 files changed, 176 insertions(+), 374 deletions(-) create mode 100644 fs/btrfs/print-tree.c create mode 100644 fs/btrfs/print-tree.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index df065dd2dce7..fe73ab9d81d6 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,12 +1,16 @@ -CFLAGS= -g -Wall +CFLAGS = -g -Wall +headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h +objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o -.c.o: - $(CC) $(CFLAGS) -c $< +#.c.o: +# $(CC) $(CFLAGS) -c $< -ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h mkfs.o - gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o mkfs.o +ctree : $(objects) + gcc $(CFLAGS) -o ctree $(objects) -clean: +$(objects) : $(headers) + 
+clean : rm ctree *.o diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f0abcf1f3939..e497fd963118 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4,23 +4,21 @@ #include "radix-tree.h" #include "ctree.h" #include "disk-io.h" - -#define SEARCH_READ 0 -#define SEARCH_WRITE 1 - -#define CTREE_EXTENT_PENDING 0 +#include "print-tree.h" int split_node(struct ctree_root *root, struct ctree_path *path, int level); int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size); -struct tree_buffer *alloc_free_block(struct ctree_root *root); -int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks); +int push_node_left(struct ctree_root *root, struct ctree_path *path, int level); +int push_node_right(struct ctree_root *root, + struct ctree_path *path, int level); +int del_ptr(struct ctree_root *root, struct ctree_path *path, int level); -static inline void init_path(struct ctree_path *p) +inline void init_path(struct ctree_path *p) { memset(p, 0, sizeof(*p)); } -static void release_path(struct ctree_root *root, struct ctree_path *p) +void release_path(struct ctree_root *root, struct ctree_path *p) { int i; for (i = 0; i < MAX_LEVEL; i++) { @@ -48,7 +46,7 @@ static inline unsigned int leaf_data_end(struct leaf *leaf) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -static inline int leaf_free_space(struct leaf *leaf) +int leaf_free_space(struct leaf *leaf) { int data_end = leaf_data_end(leaf); int nritems = leaf->header.nritems; @@ -133,7 +131,8 @@ int bin_search(struct node *c, struct key *key, int *slot) * If the key isn't found, the path points to the slot where it should * be inserted. 
*/ -int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, int ins_len) +int search_slot(struct ctree_root *root, struct key *key, + struct ctree_path *p, int ins_len) { struct tree_buffer *b = root->node; struct node *c; @@ -151,7 +150,8 @@ int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, if (ret && slot > 0) slot -= 1; p->slots[level] = slot; - if (ins_len && c->header.nritems == NODEPTRS_PER_BLOCK) { + if (ins_len > 0 && + c->header.nritems == NODEPTRS_PER_BLOCK) { int sret = split_node(root, p, level); BUG_ON(sret > 0); if (sret) @@ -159,13 +159,37 @@ int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, b = p->nodes[level]; c = &b->node; slot = p->slots[level]; + } else if (ins_len < 0 && + c->header.nritems <= NODEPTRS_PER_BLOCK/4) { + u64 blocknr = b->blocknr; + slot = p->slots[level +1]; + b->count++; + if (push_node_left(root, p, level)) + push_node_right(root, p, level); + if (c->header.nritems == 0 && + level < MAX_LEVEL - 1 && + p->nodes[level + 1]) { + int tslot = p->slots[level + 1]; + + p->slots[level + 1] = slot; + del_ptr(root, p, level + 1); + p->slots[level + 1] = tslot; + tree_block_release(root, b); + free_extent(root, blocknr, 1); + } else { + tree_block_release(root, b); + } + b = p->nodes[level]; + c = &b->node; + slot = p->slots[level]; } b = read_tree_block(root, c->blockptrs[slot]); continue; } else { struct leaf *l = (struct leaf *)c; p->slots[level] = slot; - if (ins_len && leaf_free_space(l) < sizeof(struct item) + ins_len) { + if (ins_len > 0 && leaf_free_space(l) < + sizeof(struct item) + ins_len) { int sret = split_leaf(root, p, ins_len); BUG_ON(sret > 0); if (sret) @@ -355,7 +379,8 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level) return 0; } -static int insert_new_root(struct ctree_root *root, struct ctree_path *path, int level) +static int insert_new_root(struct ctree_root *root, + struct ctree_path *path, int 
level) { struct tree_buffer *t; struct node *lower; @@ -463,7 +488,7 @@ int split_node(struct ctree_root *root, struct ctree_path *path, int level) write_tree_block(root, split_buffer); insert_ptr(root, path, split->keys, split_buffer->blocknr, path->slots[level + 1] + 1, level + 1); - if (path->slots[level] > mid) { + if (path->slots[level] >= mid) { path->slots[level] -= mid; tree_block_release(root, t); path->nodes[level] = split_buffer; @@ -744,8 +769,7 @@ int insert_item(struct ctree_root *root, struct key *key, } /* - * delete the pointer from a given level in the path. The path is not - * fixed up, so after calling this it is not valid at that level. + * delete the pointer from a given node. * * If the delete empties a node, the node is removed from the tree, * continuing all the way the root if required. The root is converted into @@ -778,22 +802,10 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level) write_tree_block(root, t); blocknr = t->blocknr; if (node->header.nritems != 0) { - int tslot; if (slot == 0) fixup_low_keys(root, path, node->keys, level + 1); - tslot = path->slots[level+1]; - t->count++; - push_node_left(root, path, level); - if (node->header.nritems) { - push_node_right(root, path, level); - } - if (node->header.nritems) { - tree_block_release(root, t); - break; - } - tree_block_release(root, t); - path->slots[level+1] = tslot; + break; } if (t == root->node) { /* just turn the root into a leaf and break */ @@ -850,12 +862,12 @@ int del_item(struct ctree_root *root, struct ctree_path *path) free_extent(root, leaf_buf->blocknr, 1); } } else { + int used = leaf_space_used(leaf, 0, leaf->header.nritems); if (slot == 0) fixup_low_keys(root, path, &leaf->items[0].key, 1); write_tree_block(root, leaf_buf); /* delete the leaf if it is mostly empty */ - if (leaf_space_used(leaf, 0, leaf->header.nritems) < - LEAF_DATA_SIZE / 4) { + if (used < LEAF_DATA_SIZE / 3) { /* push_leaf_left fixes the path. 
* make sure the path still points to our leaf * for possible call to del_ptr below @@ -864,81 +876,19 @@ int del_item(struct ctree_root *root, struct ctree_path *path) leaf_buf->count++; push_leaf_left(root, path, 1); if (leaf->header.nritems == 0) { + u64 blocknr = leaf_buf->blocknr; path->slots[1] = slot; del_ptr(root, path, 1); + tree_block_release(root, leaf_buf); + free_extent(root, blocknr, 1); + } else { + tree_block_release(root, leaf_buf); } - tree_block_release(root, leaf_buf); } } return 0; } -static int del_pending_extents(struct ctree_root *extent_root) -{ - int ret; - struct key key; - struct tree_buffer *gang[4]; - int i; - struct ctree_path path; - - while(1) { - ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix, - (void **)gang, 0, ARRAY_SIZE(gang), - CTREE_EXTENT_PENDING); - if (!ret) - break; - for (i = 0; i < ret; i++) { - key.objectid = gang[i]->blocknr; - key.flags = 0; - key.offset = 1; - init_path(&path); - ret = search_slot(extent_root, &key, &path, 0); - if (ret) { - BUG(); - // FIXME undo it and return sane - return ret; - } - ret = del_item(extent_root, &path); - if (ret) { - BUG(); - return ret; - } - release_path(extent_root, &path); - radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr, - CTREE_EXTENT_PENDING); - tree_block_release(extent_root, gang[i]); - } - } - return 0; -} - -int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks) -{ - struct ctree_path path; - struct key key; - struct ctree_root *extent_root = root->extent_root; - struct tree_buffer *t; - int pending_ret; - int ret; - - key.objectid = blocknr; - key.flags = 0; - key.offset = num_blocks; - if (root == extent_root) { - t = read_tree_block(root, key.objectid); - radix_tree_tag_set(&root->cache_radix, key.objectid, CTREE_EXTENT_PENDING); - return 0; - } - init_path(&path); - ret = search_slot(extent_root, &key, &path, 0); - if (ret) - BUG(); - ret = del_item(extent_root, &path); - release_path(extent_root, &path); - pending_ret = 
del_pending_extents(root->extent_root); - return ret ? ret : pending_ret; -} - int next_leaf(struct ctree_root *root, struct ctree_path *path) { int slot; @@ -976,241 +926,10 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path) return 0; } -int find_free_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start, - u64 search_end, struct key *ins) -{ - struct ctree_path path; - struct key *key; - int ret; - u64 hole_size = 0; - int slot = 0; - u64 last_block; - int start_found = 0; - struct leaf *l; - struct ctree_root * root = orig_root->extent_root; - - init_path(&path); - ins->objectid = search_start; - ins->offset = 0; - ins->flags = 0; - ret = search_slot(root, ins, &path, 0); - while (1) { - l = &path.nodes[0]->leaf; - slot = path.slots[0]; - if (!l) { - // FIXME allocate root - } - if (slot >= l->header.nritems) { - ret = next_leaf(root, &path); - if (ret == 0) - continue; - if (!start_found) { - ins->objectid = search_start; - ins->offset = num_blocks; - hole_size = search_end - search_start; - start_found = 1; - goto insert; - } - ins->objectid = last_block; - ins->offset = num_blocks; - hole_size = search_end - last_block; - goto insert; - } - key = &l->items[slot].key; - if (start_found) { - hole_size = key->objectid - last_block; - if (hole_size > num_blocks) { - ins->objectid = last_block; - ins->offset = num_blocks; - goto insert; - } - } else - start_found = 1; - last_block = key->objectid + key->offset; -insert_failed: - path.slots[0]++; - } - // FIXME -ENOSPC -insert: - if (orig_root->extent_root == orig_root) { - BUG_ON(num_blocks != 1); - if ((root->current_insert.objectid <= ins->objectid && - root->current_insert.objectid + root->current_insert.offset > - ins->objectid) || - (root->current_insert.objectid > ins->objectid && - root->current_insert.objectid <= ins->objectid + ins->offset) || - radix_tree_tag_get(&root->cache_radix, ins->objectid, - CTREE_EXTENT_PENDING)) { - last_block = ins->objectid + 1; - search_start 
= last_block; - goto insert_failed; - } - } - release_path(root, &path); - if (ins->offset != 1) - BUG(); - return 0; -} - -static int insert_pending_extents(struct ctree_root *extent_root) -{ - int ret; - struct key key; - struct extent_item item; - struct tree_buffer *gang[4]; - int i; - - // FIXME -ENOSPC - item.refs = 1; - item.owner = extent_root->node->node.header.parentid; - while(1) { - ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix, - (void **)gang, 0, ARRAY_SIZE(gang), - CTREE_EXTENT_PENDING); - if (!ret) - break; - for (i = 0; i < ret; i++) { - key.objectid = gang[i]->blocknr; - key.flags = 0; - key.offset = 1; - ret = insert_item(extent_root, &key, &item, sizeof(item)); - if (ret) { - BUG(); - // FIXME undo it and return sane - return ret; - } - radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr, - CTREE_EXTENT_PENDING); - tree_block_release(extent_root, gang[i]); - } - } - return 0; -} - -int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start, - u64 search_end, u64 owner, struct key *ins, struct tree_buffer **buf) -{ - int ret; - int pending_ret; - struct extent_item extent_item; - - extent_item.refs = 1; - extent_item.owner = owner; - - ret = find_free_extent(root, num_blocks, search_start, search_end, ins); - if (ret) - return ret; - - if (root != root->extent_root) { - memcpy(&root->extent_root->current_insert, ins, sizeof(*ins)); - ret = insert_item(root->extent_root, ins, &extent_item, sizeof(extent_item)); - memset(&root->extent_root->current_insert, 0, sizeof(struct key)); - pending_ret = insert_pending_extents(root->extent_root); - if (ret) - return ret; - if (pending_ret) - return pending_ret; - *buf = find_tree_block(root, ins->objectid); - return 0; - } - /* we're allocating an extent for the extent tree, don't recurse */ - BUG_ON(ins->offset != 1); - *buf = find_tree_block(root, ins->objectid); - BUG_ON(!*buf); - radix_tree_tag_set(&root->cache_radix, ins->objectid, CTREE_EXTENT_PENDING); - 
(*buf)->count++; - return 0; - -} - -struct tree_buffer *alloc_free_block(struct ctree_root *root) -{ - struct key ins; - int ret; - struct tree_buffer *buf = NULL; - - ret = alloc_extent(root, 1, 0, (unsigned long)-1, root->node->node.header.parentid, - &ins, &buf); - - if (ret) { - BUG(); - return NULL; - } - if (root != root->extent_root) - BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix, buf->blocknr, - CTREE_EXTENT_PENDING)); - return buf; -} - -void print_leaf(struct leaf *l) -{ - int i; - int nr = l->header.nritems; - struct item *item; - struct extent_item *ei; - printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr, - leaf_free_space(l)); - fflush(stdout); - for (i = 0 ; i < nr ; i++) { - item = l->items + i; - printf("\titem %d key (%lu %u %lu) itemoff %d itemsize %d\n", - i, - item->key.objectid, item->key.flags, item->key.offset, - item->offset, item->size); - fflush(stdout); - printf("\t\titem data %.*s\n", item->size, l->data+item->offset); - ei = (struct extent_item *)(l->data + item->offset); - printf("\t\textent data %u %lu\n", ei->refs, ei->owner); - fflush(stdout); - } -} -void print_tree(struct ctree_root *root, struct tree_buffer *t) -{ - int i; - int nr; - struct node *c; - - if (!t) - return; - c = &t->node; - nr = c->header.nritems; - if (c->header.blocknr != t->blocknr) - BUG(); - if (is_leaf(c->header.flags)) { - print_leaf((struct leaf *)c); - return; - } - printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr, - node_level(c->header.flags), c->header.nritems, - NODEPTRS_PER_BLOCK - c->header.nritems); - fflush(stdout); - for (i = 0; i < nr; i++) { - printf("\tkey %d (%lu %u %lu) block %lu\n", - i, - c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset, - c->blockptrs[i]); - fflush(stdout); - } - for (i = 0; i < nr; i++) { - struct tree_buffer *next_buf = read_tree_block(root, - c->blockptrs[i]); - struct node *next = &next_buf->node; - if (is_leaf(next->header.flags) && - 
node_level(c->header.flags) != 1) - BUG(); - if (node_level(next->header.flags) != - node_level(c->header.flags) - 1) - BUG(); - print_tree(root, next_buf); - tree_block_release(root, next_buf); - } - -} - /* for testing only */ int next_key(int i, int max_key) { - // return rand() % max_key; - return i; + return rand() % max_key; + // return i; } int main() { @@ -1221,8 +940,8 @@ int main() { int i; int num; int ret; - int run_size = 10000; - int max_key = 100000000; + int run_size = 20000000; + int max_key = 100000000; int tree_size = 0; struct ctree_path path; struct ctree_super_block super; @@ -1231,11 +950,6 @@ int main() { root = open_ctree("dbfile", &super); - printf("root tree\n"); - print_tree(root, root->node); - printf("map tree\n"); - print_tree(root->extent_root, root->extent_root->node); - fflush(stdout); srand(55); for (i = 0; i < run_size; i++) { @@ -1243,13 +957,15 @@ int main() { num = next_key(i, max_key); // num = i; sprintf(buf, "string-%d", num); - // printf("insert %d\n", num); + if (i % 10000 == 0) + printf("insert %d:%d\n", num, i); ins.objectid = num; ins.offset = 0; ins.flags = 0; ret = insert_item(root, &ins, buf, strlen(buf)); if (!ret) tree_size++; + free(buf); } write_ctree_super(root, &super); close_ctree(root); @@ -1261,6 +977,8 @@ int main() { num = next_key(i, max_key); ins.objectid = num; init_path(&path); + if (i % 10000 == 0) + printf("search %d:%d\n", num, i); ret = search_slot(root, &ins, &path, 0); if (ret) { print_tree(root, root->node); @@ -1283,39 +1001,32 @@ int main() { num = next_key(i, max_key); ins.objectid = num; init_path(&path); - ret = search_slot(root, &ins, &path, 0); - if (ret) - continue; - ret = del_item(root, &path); - if (ret != 0) - BUG(); + ret = search_slot(root, &ins, &path, -1); + if (!ret) { + if (i % 10000 == 0) + printf("del %d:%d\n", num, i); + ret = del_item(root, &path); + if (ret != 0) + BUG(); + tree_size--; + } release_path(root, &path); - tree_size--; } + write_ctree_super(root, &super); + 
close_ctree(root); + root = open_ctree("dbfile", &super); srand(128); for (i = 0; i < run_size; i++) { buf = malloc(64); num = next_key(i, max_key); sprintf(buf, "string-%d", num); ins.objectid = num; + if (i % 10000 == 0) + printf("insert %d:%d\n", num, i); ret = insert_item(root, &ins, buf, strlen(buf)); if (!ret) tree_size++; - if (i >= 5) { - struct key ugh; - ugh.objectid = 5; - ugh.flags = 0; - ugh.offset = 0; - init_path(&path); - ret = search_slot(root, &ugh, &path, 0); - if (ret) { - print_tree(root, root->node); - printf("unable to find 5 %d\n", num); - exit(1); - } - release_path(root, &path); - - } + free(buf); } write_ctree_super(root, &super); close_ctree(root); @@ -1326,6 +1037,8 @@ int main() { num = next_key(i, max_key); ins.objectid = num; init_path(&path); + if (i % 10000 == 0) + printf("search %d:%d\n", num, i); ret = search_slot(root, &ins, &path, 0); if (ret) { print_tree(root, root->node); @@ -1340,7 +1053,7 @@ int main() { int slot; ins.objectid = (u64)-1; init_path(&path); - ret = search_slot(root, &ins, &path, 0); + ret = search_slot(root, &ins, &path, -1); if (ret == 0) BUG(); @@ -1356,6 +1069,8 @@ int main() { if (comp_keys(&last, &leaf->items[slot].key) <= 0) BUG(); memcpy(&last, &leaf->items[slot].key, sizeof(last)); + if (tree_size % 10000 == 0) + printf("big del %d:%d\n", tree_size, i); ret = del_item(root, &path); if (ret != 0) { printf("del_item returned %d\n", ret); @@ -1365,10 +1080,9 @@ int main() { } release_path(root, &path); } - write_ctree_super(root, &super); - close_ctree(root); printf("tree size is now %d\n", tree_size); printf("map tree\n"); - print_tree(root->extent_root, root->extent_root->node); + write_ctree_super(root, &super); + close_ctree(root); return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8c32c0e9267d..b92fbbb5ecd7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1,7 +1,7 @@ #ifndef __CTREE__ #define __CTREE__ -#define CTREE_BLOCKSIZE 256 +#define CTREE_BLOCKSIZE 4096 struct key { 
u64 objectid; @@ -81,4 +81,14 @@ struct ctree_path { struct tree_buffer *nodes[MAX_LEVEL]; int slots[MAX_LEVEL]; }; + +struct tree_buffer *alloc_free_block(struct ctree_root *root); +int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks); +int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, int ins_len); +void release_path(struct ctree_root *root, struct ctree_path *p); +void init_path(struct ctree_path *p); +int del_item(struct ctree_root *root, struct ctree_path *path); +int insert_item(struct ctree_root *root, struct key *key, void *data, int data_size); +int next_leaf(struct ctree_root *root, struct ctree_path *path); +int leaf_free_space(struct leaf *leaf); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 14955e440773..f4c6ff202ba9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -172,7 +172,6 @@ int close_ctree(struct ctree_root *root) void tree_block_release(struct ctree_root *root, struct tree_buffer *buf) { buf->count--; - write_tree_block(root, buf); if (buf->count < 0) BUG(); if (buf->count == 0) { diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c new file mode 100644 index 000000000000..594d23b5b24c --- /dev/null +++ b/fs/btrfs/print-tree.c @@ -0,0 +1,72 @@ +#include +#include +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" + +void print_leaf(struct leaf *l) +{ + int i; + int nr = l->header.nritems; + struct item *item; + struct extent_item *ei; + printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr, + leaf_free_space(l)); + fflush(stdout); + for (i = 0 ; i < nr ; i++) { + item = l->items + i; + printf("\titem %d key (%lu %u %lu) itemoff %d itemsize %d\n", + i, + item->key.objectid, item->key.flags, item->key.offset, + item->offset, item->size); + fflush(stdout); + printf("\t\titem data %.*s\n", item->size, l->data+item->offset); + ei = (struct extent_item *)(l->data + item->offset); + printf("\t\textent data %u 
%lu\n", ei->refs, ei->owner); + fflush(stdout); + } +} +void print_tree(struct ctree_root *root, struct tree_buffer *t) +{ + int i; + int nr; + struct node *c; + + if (!t) + return; + c = &t->node; + nr = c->header.nritems; + if (c->header.blocknr != t->blocknr) + BUG(); + if (is_leaf(c->header.flags)) { + print_leaf((struct leaf *)c); + return; + } + printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr, + node_level(c->header.flags), c->header.nritems, + NODEPTRS_PER_BLOCK - c->header.nritems); + fflush(stdout); + for (i = 0; i < nr; i++) { + printf("\tkey %d (%lu %u %lu) block %lu\n", + i, + c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset, + c->blockptrs[i]); + fflush(stdout); + } + for (i = 0; i < nr; i++) { + struct tree_buffer *next_buf = read_tree_block(root, + c->blockptrs[i]); + struct node *next = &next_buf->node; + if (is_leaf(next->header.flags) && + node_level(c->header.flags) != 1) + BUG(); + if (node_level(next->header.flags) != + node_level(c->header.flags) - 1) + BUG(); + print_tree(root, next_buf); + tree_block_release(root, next_buf); + } + +} + diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h new file mode 100644 index 000000000000..3c1e9a3e0260 --- /dev/null +++ b/fs/btrfs/print-tree.h @@ -0,0 +1,3 @@ + +void print_leaf(struct leaf *l); +void print_tree(struct ctree_root *root, struct tree_buffer *t); -- cgit v1.2.3 From fec577fb7f516e0d12ff821b1af272fd754e120a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 26 Feb 2007 10:40:21 -0500 Subject: Btrfs: Add fsx-style randomized tree tester Add debug-tree command to print the tree Add extent-tree.c to the repo Comment ctree.h Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 10 +- fs/btrfs/ctree.c | 6 +- fs/btrfs/ctree.h | 62 ++++++++++- fs/btrfs/debug-tree.c | 19 ++++ fs/btrfs/extent-tree.c | 296 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/random-test.c | 295 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 680 insertions(+), 
8 deletions(-) create mode 100644 fs/btrfs/debug-tree.c create mode 100644 fs/btrfs/extent-tree.c create mode 100644 fs/btrfs/random-test.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index fe73ab9d81d6..855e8f499e37 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,11 +6,17 @@ objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o #.c.o: # $(CC) $(CFLAGS) -c $< -ctree : $(objects) - gcc $(CFLAGS) -o ctree $(objects) +all: tester debug-tree + +debug-tree: $(objects) debug-tree.o + gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o + +tester: $(objects) random-test.o + gcc $(CFLAGS) -o tester $(objects) random-test.o $(objects) : $(headers) clean : rm ctree *.o + diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7645ab3259ea..25dc7b2f7426 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1047,14 +1047,14 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path) return 0; } +/* some sample code to insert,search & delete items */ +#if 0 /* for testing only */ int next_key(int i, int max_key) { return rand() % max_key; //return i; } - int main() { - struct ctree_root *root; struct key ins; struct key last = { (u64)-1, 0, 0}; char *buf; @@ -1066,6 +1066,7 @@ int main() { int tree_size = 0; struct ctree_path path; struct ctree_super_block super; + struct ctree_root *root; radix_tree_init(); @@ -1207,3 +1208,4 @@ int main() { close_ctree(root); return 0; } +#endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b92fbbb5ecd7..18daccd84535 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1,22 +1,36 @@ #ifndef __CTREE__ #define __CTREE__ -#define CTREE_BLOCKSIZE 4096 +#define CTREE_BLOCKSIZE 1024 +/* + * the key defines the order in the tree, and so it also defines (optimal) + * block layout. objectid corresponds to the inode number. The flags + * tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with flags of 1 might refer to the inode + * data, flags of 2 may point to file data in the btree and flags == 3 + * may point to extents. + * + * offset is the starting byte offset for this key in the stream. + */ struct key { u64 objectid; u32 flags; u64 offset; } __attribute__ ((__packed__)); +/* + * every tree block (leaf or node) starts with this header. + */ struct header { u64 fsid[2]; /* FS specific uuid */ - u64 blocknr; - u64 parentid; + u64 blocknr; /* which block this node is supposed to live in */ + u64 parentid; /* objectid of the tree root */ u32 csum; u32 ham; u16 nritems; u16 flags; + /* generation flags to be added */ } __attribute__ ((__packed__)); #define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \ @@ -28,6 +42,11 @@ struct header { struct tree_buffer; +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. current_insert is used + * only for the extent tree. + */ struct ctree_root { struct tree_buffer *node; struct ctree_root *extent_root; @@ -36,27 +55,46 @@ struct ctree_root { struct radix_tree_root cache_radix; }; +/* + * describes a tree on disk + */ struct ctree_root_info { u64 fsid[2]; /* FS specific uuid */ u64 blocknr; /* blocknr of this block */ u64 objectid; /* inode number of this root */ - u64 tree_root; /* the tree root */ + u64 tree_root; /* the tree root block */ u32 csum; u32 ham; u64 snapuuid[2]; /* root specific uuid */ } __attribute__ ((__packed__)); +/* + * the super block basically lists the main trees of the FS + * it currently lacks any block count etc etc + */ struct ctree_super_block { struct ctree_root_info root_info; struct ctree_root_info extent_info; } __attribute__ ((__packed__)); +/* + * A leaf is full of items. The exact type of item is defined by + * the key flags parameter. 
offset and size tell us where to find + * the item in the leaf (relative to the start of the data area) + */ struct item { struct key key; u16 offset; u16 size; } __attribute__ ((__packed__)); +/* + * leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together + * during searches. + */ #define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct header)) struct leaf { struct header header; @@ -66,17 +104,33 @@ struct leaf { }; } __attribute__ ((__packed__)); +/* + * all non-leaf blocks are nodes, they hold only keys and pointers to + * other blocks + */ struct node { struct header header; struct key keys[NODEPTRS_PER_BLOCK]; u64 blockptrs[NODEPTRS_PER_BLOCK]; } __attribute__ ((__packed__)); +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ struct extent_item { u32 refs; u64 owner; } __attribute__ ((__packed__)); +/* + * ctree_paths remember the path taken from the root down to the leaf. + * level 0 is always the leaf, and nodes[1...MAX_LEVEL] will point + * to any other levels that are present. + * + * The slots array records the index of the item or block pointer + * used while walking the tree. 
+ */ struct ctree_path { struct tree_buffer *nodes[MAX_LEVEL]; int slots[MAX_LEVEL]; diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c new file mode 100644 index 000000000000..b2c5abfdc74a --- /dev/null +++ b/fs/btrfs/debug-tree.c @@ -0,0 +1,19 @@ +#include +#include +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +int main() { + struct ctree_super_block super; + struct ctree_root *root; + radix_tree_init(); + root = open_ctree("dbfile", &super); + printf("root tree\n"); + print_tree(root, root->node); + printf("map tree\n"); + print_tree(root->extent_root, root->extent_root->node); + return 0; +} diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c new file mode 100644 index 000000000000..2b663ef84496 --- /dev/null +++ b/fs/btrfs/extent-tree.c @@ -0,0 +1,296 @@ +#include +#include +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +/* + * pending extents are blocks that we're trying to allocate in the extent + * map while trying to grow the map because of other allocations. To avoid + * recursing, they are tagged in the radix tree and cleaned up after + * other allocations are done. The pending tag is also used in the same + * manner for deletes. 
+ */ +#define CTREE_EXTENT_PENDING 0 + +/* + * find all the blocks marked as pending in the radix tree and remove + * them from the extent map + */ +static int del_pending_extents(struct ctree_root *extent_root) +{ + int ret; + struct key key; + struct tree_buffer *gang[4]; + int i; + struct ctree_path path; + + while(1) { + ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + CTREE_EXTENT_PENDING); + if (!ret) + break; + for (i = 0; i < ret; i++) { + key.objectid = gang[i]->blocknr; + key.flags = 0; + key.offset = 1; + init_path(&path); + ret = search_slot(extent_root, &key, &path, 0); + if (ret) { + print_tree(extent_root, extent_root->node); + printf("unable to find %lu\n", key.objectid); + BUG(); + // FIXME undo it and return sane + return ret; + } + ret = del_item(extent_root, &path); + if (ret) { + BUG(); + return ret; + } + release_path(extent_root, &path); + radix_tree_tag_clear(&extent_root->cache_radix, + gang[i]->blocknr, + CTREE_EXTENT_PENDING); + tree_block_release(extent_root, gang[i]); + } + } + return 0; +} + +/* + * remove an extent from the root, returns 0 on success + */ +int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks) +{ + struct ctree_path path; + struct key key; + struct ctree_root *extent_root = root->extent_root; + struct tree_buffer *t; + int pending_ret; + int ret; + key.objectid = blocknr; + key.flags = 0; + key.offset = num_blocks; + if (root == extent_root) { + t = read_tree_block(root, key.objectid); + radix_tree_tag_set(&root->cache_radix, key.objectid, + CTREE_EXTENT_PENDING); + return 0; + } + init_path(&path); + ret = search_slot(extent_root, &key, &path, 0); + if (ret) { + print_tree(extent_root, extent_root->node); + printf("failed to find %lu\n", key.objectid); + BUG(); + } + ret = del_item(extent_root, &path); + if (ret) + BUG(); + release_path(extent_root, &path); + pending_ret = del_pending_extents(root->extent_root); + return ret ? 
ret : pending_ret; +} + +/* + * walks the btree of allocated extents and find a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == block start + * ins->flags = 0 + * ins->offset == number of blocks + * Any available blocks before search_start are skipped. + */ +int find_free_extent(struct ctree_root *orig_root, u64 num_blocks, + u64 search_start, u64 search_end, struct key *ins) +{ + struct ctree_path path; + struct key *key; + int ret; + u64 hole_size = 0; + int slot = 0; + u64 last_block; + int start_found; + struct leaf *l; + struct ctree_root * root = orig_root->extent_root; + +check_failed: + init_path(&path); + ins->objectid = search_start; + ins->offset = 0; + ins->flags = 0; + start_found = 0; + ret = search_slot(root, ins, &path, 0); + while (1) { + l = &path.nodes[0]->leaf; + slot = path.slots[0]; + if (slot >= l->header.nritems) { + ret = next_leaf(root, &path); + if (ret == 0) + continue; + if (!start_found) { + ins->objectid = search_start; + ins->offset = num_blocks; + start_found = 1; + goto check_pending; + } + ins->objectid = last_block > search_start ? 
+ last_block : search_start; + ins->offset = num_blocks; + goto check_pending; + } + key = &l->items[slot].key; + if (key->objectid >= search_start) { + if (start_found) { + hole_size = key->objectid - last_block; + if (hole_size > num_blocks) { + ins->objectid = last_block; + ins->offset = num_blocks; + goto check_pending; + } + } else + start_found = 1; + last_block = key->objectid + key->offset; + } + path.slots[0]++; + } + // FIXME -ENOSPC +check_pending: + /* we have to make sure we didn't find an extent that has already + * been allocated by the map tree or the original allocation + */ + release_path(root, &path); + BUG_ON(ins->objectid < search_start); + if (orig_root->extent_root == orig_root) { + BUG_ON(num_blocks != 1); + if ((root->current_insert.objectid <= ins->objectid && + root->current_insert.objectid + + root->current_insert.offset > ins->objectid) || + (root->current_insert.objectid > ins->objectid && + root->current_insert.objectid <= ins->objectid + + ins->offset) || + radix_tree_tag_get(&root->cache_radix, ins->objectid, + CTREE_EXTENT_PENDING)) { + search_start = ins->objectid + 1; + goto check_failed; + } + } + if (ins->offset != 1) + BUG(); + return 0; +} + +/* + * insert all of the pending extents reserved during the original + * allocation. (CTREE_EXTENT_PENDING). 
Returns zero if it all worked out + */ +static int insert_pending_extents(struct ctree_root *extent_root) +{ + int ret; + struct key key; + struct extent_item item; + struct tree_buffer *gang[4]; + int i; + + // FIXME -ENOSPC + item.refs = 1; + item.owner = extent_root->node->node.header.parentid; + while(1) { + ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + CTREE_EXTENT_PENDING); + if (!ret) + break; + for (i = 0; i < ret; i++) { + key.objectid = gang[i]->blocknr; + key.flags = 0; + key.offset = 1; + ret = insert_item(extent_root, &key, &item, + sizeof(item)); + if (ret) { + BUG(); + // FIXME undo it and return sane + return ret; + } + radix_tree_tag_clear(&extent_root->cache_radix, + gang[i]->blocknr, + CTREE_EXTENT_PENDING); + tree_block_release(extent_root, gang[i]); + } + } + return 0; +} + +/* + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * + * returns 0 if everything worked, non-zero otherwise. 
+ */ +int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start, + u64 search_end, u64 owner, struct key *ins, + struct tree_buffer **buf) +{ + int ret; + int pending_ret; + struct extent_item extent_item; + extent_item.refs = 1; + extent_item.owner = owner; + + ret = find_free_extent(root, num_blocks, search_start, search_end, ins); + if (ret) + return ret; + if (root != root->extent_root) { + memcpy(&root->extent_root->current_insert, ins, sizeof(*ins)); + ret = insert_item(root->extent_root, ins, &extent_item, + sizeof(extent_item)); + memset(&root->extent_root->current_insert, 0, + sizeof(struct key)); + pending_ret = insert_pending_extents(root->extent_root); + if (ret) + return ret; + if (pending_ret) + return pending_ret; + *buf = find_tree_block(root, ins->objectid); + return 0; + } + /* we're allocating an extent for the extent tree, don't recurse */ + BUG_ON(ins->offset != 1); + *buf = find_tree_block(root, ins->objectid); + BUG_ON(!*buf); + radix_tree_tag_set(&root->cache_radix, ins->objectid, + CTREE_EXTENT_PENDING); + (*buf)->count++; + return 0; + +} + +/* + * helper function to allocate a block for a given tree + * returns the tree buffer or NULL. 
+ */ +struct tree_buffer *alloc_free_block(struct ctree_root *root) +{ + struct key ins; + int ret; + struct tree_buffer *buf = NULL; + + ret = alloc_extent(root, 1, 0, (unsigned long)-1, + root->node->node.header.parentid, + &ins, &buf); + + if (ret) { + BUG(); + return NULL; + } + if (root != root->extent_root) + BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix, + buf->blocknr, CTREE_EXTENT_PENDING)); + return buf; +} diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c new file mode 100644 index 000000000000..3c8c68d55d2f --- /dev/null +++ b/fs/btrfs/random-test.c @@ -0,0 +1,295 @@ +#include +#include +#include +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +int keep_running = 1; + +static int setup_key(struct radix_tree_root *root, struct key *key, int exists) +{ + int num = rand(); + unsigned long res[2]; + int ret; + + key->flags = 0; + key->offset = 0; +again: + ret = radix_tree_gang_lookup(root, (void **)res, num, 2); + if (exists) { + if (ret == 0) + return -1; + num = res[0]; + } else if (ret != 0 && num == res[0]) { + num++; + if (ret > 1 && num == res[1]) { + num++; + goto again; + } + } + key->objectid = num; + return 0; +} + +static int ins_one(struct ctree_root *root, struct radix_tree_root *radix) +{ + struct ctree_path path; + struct key key; + int ret; + char buf[128]; + init_path(&path); + ret = setup_key(radix, &key, 0); + sprintf(buf, "str-%lu\n", key.objectid); + ret = insert_item(root, &key, buf, strlen(buf)); + if (ret) + goto error; + radix_tree_preload(GFP_KERNEL); + ret = radix_tree_insert(radix, key.objectid, + (void *)key.objectid); + radix_tree_preload_end(); + if (ret) + goto error; + return ret; +error: + printf("failed to insert %lu\n", key.objectid); + return -1; +} + +static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix) +{ + struct ctree_path path; + struct key key; + int ret; + char buf[128]; + init_path(&path); + ret 
= setup_key(radix, &key, 1); + if (ret < 0) + return 0; + sprintf(buf, "str-%lu\n", key.objectid); + ret = insert_item(root, &key, buf, strlen(buf)); + if (ret != -EEXIST) { + printf("insert on %lu gave us %d\n", key.objectid, ret); + return 1; + } + return 0; +} + +static int del_one(struct ctree_root *root, struct radix_tree_root *radix) +{ + struct ctree_path path; + struct key key; + int ret; + unsigned long *ptr; + init_path(&path); + ret = setup_key(radix, &key, 1); + if (ret < 0) + return 0; + ret = search_slot(root, &key, &path, -1); + if (ret) + goto error; + ret = del_item(root, &path); + release_path(root, &path); + if (ret != 0) + goto error; + ptr = radix_tree_delete(radix, key.objectid); + if (!ptr) + goto error; + return 0; +error: + printf("failed to delete %lu\n", key.objectid); + return -1; +} + +static int lookup_item(struct ctree_root *root, struct radix_tree_root *radix) +{ + struct ctree_path path; + struct key key; + int ret; + init_path(&path); + ret = setup_key(radix, &key, 1); + if (ret < 0) + return 0; + ret = search_slot(root, &key, &path, 0); + release_path(root, &path); + if (ret) + goto error; + return 0; +error: + printf("unable to find key %lu\n", key.objectid); + return -1; +} + +static int lookup_enoent(struct ctree_root *root, struct radix_tree_root *radix) +{ + struct ctree_path path; + struct key key; + int ret; + init_path(&path); + ret = setup_key(radix, &key, 0); + if (ret < 0) + return ret; + ret = search_slot(root, &key, &path, 0); + release_path(root, &path); + if (ret == 0) + goto error; + return 0; +error: + printf("able to find key that should not exist %lu\n", key.objectid); + return -1; +} + +int (*ops[])(struct ctree_root *root, struct radix_tree_root *radix) = +{ ins_one, insert_dup, del_one, lookup_item, lookup_enoent }; + +static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix) +{ + struct ctree_path path; + struct key key; + u64 found; + int ret; + int slot; + int i; + key.offset = 0; + 
key.flags = 0; + key.objectid = (unsigned long)-1; + while(1) { + init_path(&path); + ret = search_slot(root, &key, &path, 0); + slot = path.slots[0]; + if (ret != 0) { + if (slot == 0) { + release_path(root, &path); + break; + } + slot -= 1; + } + for (i = slot; i >= 0; i--) { + found = path.nodes[0]->leaf.items[i].key.objectid; + radix_tree_preload(GFP_KERNEL); + ret = radix_tree_insert(radix, found, (void *)found); + if (ret) { + fprintf(stderr, + "failed to insert %lu into radix\n", + found); + exit(1); + } + + radix_tree_preload_end(); + } + release_path(root, &path); + key.objectid = found - 1; + if (key.objectid > found) + break; + } + return 0; +} + +void sigstopper(int ignored) +{ + keep_running = 0; + fprintf(stderr, "caught exit signal, stopping\n"); +} + +int print_usage(void) +{ + printf("usage: tester [-ih] [-c count] [-f count]\n"); + printf("\t -c count -- iteration count after filling\n"); + printf("\t -f count -- run this many random inserts before starting\n"); + printf("\t -i -- only do initial fill\n"); + printf("\t -h -- this help text\n"); + exit(1); +} +int main(int ac, char **av) +{ + RADIX_TREE(radix, GFP_KERNEL); + struct ctree_super_block super; + struct ctree_root *root; + int i; + int ret; + int count; + int op; + int iterations = 20000; + int init_fill_count = 800000; + int err = 0; + int initial_only = 0; + radix_tree_init(); + root = open_ctree("dbfile", &super); + fill_radix(root, &radix); + + signal(SIGTERM, sigstopper); + signal(SIGINT, sigstopper); + + for (i = 1 ; i < ac ; i++) { + if (strcmp(av[i], "-i") == 0) { + initial_only = 1; + } else if (strcmp(av[i], "-c") == 0) { + iterations = atoi(av[i+1]); + i++; + } else if (strcmp(av[i], "-f") == 0) { + init_fill_count = atoi(av[i+1]); + i++; + } else { + print_usage(); + } + } + for (i = 0; i < init_fill_count; i++) { + ret = ins_one(root, &radix); + if (ret) { + printf("initial fill failed\n"); + err = ret; + goto out; + } + if (i % 10000 == 0) { + printf("initial fill %d level 
%d count %d\n", i, + node_level(root->node->node.header.flags), + root->node->node.header.nritems); + } + if (keep_running == 0) { + err = 0; + goto out; + } + } + if (initial_only == 1) { + goto out; + } + for (i = 0; i < iterations; i++) { + op = rand() % ARRAY_SIZE(ops); + count = rand() % 128; + if (i % 2000 == 0) { + printf("%d\n", i); + fflush(stdout); + } + if (i && i % 5000 == 0) { + printf("open & close, root level %d nritems %d\n", + node_level(root->node->node.header.flags), + root->node->node.header.nritems); + write_ctree_super(root, &super); + close_ctree(root); + root = open_ctree("dbfile", &super); + } + while(count--) { + ret = ops[op](root, &radix); + if (ret) { + fprintf(stderr, "op %d failed %d:%d\n", + op, i, iterations); + print_tree(root, root->node); + fprintf(stderr, "op %d failed %d:%d\n", + op, i, iterations); + err = ret; + goto out; + } + if (keep_running == 0) { + err = 0; + goto out; + } + } + } +out: + write_ctree_super(root, &super); + close_ctree(root); + return err; +} + -- cgit v1.2.3 From 7cf75962acd27ed0f819df791486e7b803c7109c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 26 Feb 2007 10:55:01 -0500 Subject: Btrfs: u64 cleanups Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/kerncompat.h | 2 +- fs/btrfs/print-tree.c | 10 +++++----- fs/btrfs/random-test.c | 16 ++++++++-------- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 855e8f499e37..ea9b6665da34 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -17,6 +17,6 @@ tester: $(objects) random-test.o $(objects) : $(headers) clean : - rm ctree *.o + rm debug-tree tester *.o diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2b663ef84496..26321524c186 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -42,7 +42,7 @@ static int del_pending_extents(struct ctree_root *extent_root) ret = 
search_slot(extent_root, &key, &path, 0); if (ret) { print_tree(extent_root, extent_root->node); - printf("unable to find %lu\n", key.objectid); + printf("unable to find %Lu\n", key.objectid); BUG(); // FIXME undo it and return sane return ret; @@ -86,7 +86,7 @@ int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks) ret = search_slot(extent_root, &key, &path, 0); if (ret) { print_tree(extent_root, extent_root->node); - printf("failed to find %lu\n", key.objectid); + printf("failed to find %Lu\n", key.objectid); BUG(); } ret = del_item(extent_root, &path); diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h index 347ca06205c2..f5efc5f0fff0 100644 --- a/fs/btrfs/kerncompat.h +++ b/fs/btrfs/kerncompat.h @@ -15,7 +15,7 @@ #define BUG() abort() typedef unsigned int u32; -typedef unsigned long u64; +typedef unsigned long long u64; typedef unsigned char u8; typedef unsigned short u16; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 594d23b5b24c..1d591270f4c4 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -11,19 +11,19 @@ void print_leaf(struct leaf *l) int nr = l->header.nritems; struct item *item; struct extent_item *ei; - printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr, + printf("leaf %Lu total ptrs %d free space %d\n", l->header.blocknr, nr, leaf_free_space(l)); fflush(stdout); for (i = 0 ; i < nr ; i++) { item = l->items + i; - printf("\titem %d key (%lu %u %lu) itemoff %d itemsize %d\n", + printf("\titem %d key (%Lu %u %Lu) itemoff %d itemsize %d\n", i, item->key.objectid, item->key.flags, item->key.offset, item->offset, item->size); fflush(stdout); printf("\t\titem data %.*s\n", item->size, l->data+item->offset); ei = (struct extent_item *)(l->data + item->offset); - printf("\t\textent data %u %lu\n", ei->refs, ei->owner); + printf("\t\textent data %u %Lu\n", ei->refs, ei->owner); fflush(stdout); } } @@ -43,12 +43,12 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t) 
print_leaf((struct leaf *)c); return; } - printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr, + printf("node %Lu level %d total ptrs %d free spc %lu\n", t->blocknr, node_level(c->header.flags), c->header.nritems, NODEPTRS_PER_BLOCK - c->header.nritems); fflush(stdout); for (i = 0; i < nr; i++) { - printf("\tkey %d (%lu %u %lu) block %lu\n", + printf("\tkey %d (%Lu %u %Lu) block %Lu\n", i, c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset, c->blockptrs[i]); diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c index 3c8c68d55d2f..111a248bd5c2 100644 --- a/fs/btrfs/random-test.c +++ b/fs/btrfs/random-test.c @@ -42,7 +42,7 @@ static int ins_one(struct ctree_root *root, struct radix_tree_root *radix) char buf[128]; init_path(&path); ret = setup_key(radix, &key, 0); - sprintf(buf, "str-%lu\n", key.objectid); + sprintf(buf, "str-%Lu\n", key.objectid); ret = insert_item(root, &key, buf, strlen(buf)); if (ret) goto error; @@ -54,7 +54,7 @@ static int ins_one(struct ctree_root *root, struct radix_tree_root *radix) goto error; return ret; error: - printf("failed to insert %lu\n", key.objectid); + printf("failed to insert %Lu\n", key.objectid); return -1; } @@ -68,10 +68,10 @@ static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix) ret = setup_key(radix, &key, 1); if (ret < 0) return 0; - sprintf(buf, "str-%lu\n", key.objectid); + sprintf(buf, "str-%Lu\n", key.objectid); ret = insert_item(root, &key, buf, strlen(buf)); if (ret != -EEXIST) { - printf("insert on %lu gave us %d\n", key.objectid, ret); + printf("insert on %Lu gave us %d\n", key.objectid, ret); return 1; } return 0; @@ -99,7 +99,7 @@ static int del_one(struct ctree_root *root, struct radix_tree_root *radix) goto error; return 0; error: - printf("failed to delete %lu\n", key.objectid); + printf("failed to delete %Lu\n", key.objectid); return -1; } @@ -118,7 +118,7 @@ static int lookup_item(struct ctree_root *root, struct radix_tree_root *radix) goto error; return 
0; error: - printf("unable to find key %lu\n", key.objectid); + printf("unable to find key %Lu\n", key.objectid); return -1; } @@ -137,7 +137,7 @@ static int lookup_enoent(struct ctree_root *root, struct radix_tree_root *radix) goto error; return 0; error: - printf("able to find key that should not exist %lu\n", key.objectid); + printf("able to find key that should not exist %Lu\n", key.objectid); return -1; } @@ -148,7 +148,7 @@ static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix) { struct ctree_path path; struct key key; - u64 found; + unsigned long found; int ret; int slot; int i; -- cgit v1.2.3 From af86d07efe83e2f2da3658702a720bc48b1a6d78 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 28 Feb 2007 09:40:58 -0500 Subject: Btrfs: Add sparse checking to Makefile Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 9 +++++++-- fs/btrfs/debug-tree.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ea9b6665da34..078061c321b0 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -3,8 +3,13 @@ CFLAGS = -g -Wall headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o -#.c.o: -# $(CC) $(CFLAGS) -c $< +# if you don't have sparse installed, use ls instead +check=sparse +#check=ls + +.c.o: + $(check) $< + $(CC) $(CFLAGS) -c $< all: tester debug-tree diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c index b2c5abfdc74a..21f607d8b050 100644 --- a/fs/btrfs/debug-tree.c +++ b/fs/btrfs/debug-tree.c @@ -6,7 +6,7 @@ #include "disk-io.h" #include "print-tree.h" -int main() { +int main(int ac, char **av) { struct ctree_super_block super; struct ctree_root *root; radix_tree_init(); -- cgit v1.2.3 From aa5d6bed255d7f8c655a8f10d760f4247bc8385c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 28 Feb 2007 16:35:06 -0500 Subject: Btrfs: return code checking 
Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 5 +- fs/btrfs/ctree.c | 480 ++++++++++++++++++++++++++----------------------- fs/btrfs/extent-tree.c | 5 + fs/btrfs/random-test.c | 7 +- 4 files changed, 269 insertions(+), 228 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 078061c321b0..58d4260a04b2 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,10 +1,13 @@ +CC=gcc CFLAGS = -g -Wall headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o # if you don't have sparse installed, use ls instead -check=sparse +CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \ + -Wcontext -Wcast-truncate -Wuninitialized -Wshadow -Wundef +check=sparse $(CHECKFLAGS) #check=ls .c.o: diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0aea94224ba3..be2be0272513 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -6,12 +6,15 @@ #include "disk-io.h" #include "print-tree.h" -int split_node(struct ctree_root *root, struct ctree_path *path, int level); -int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size); -int push_node_left(struct ctree_root *root, struct ctree_path *path, int level); -int push_node_right(struct ctree_root *root, +static int split_node(struct ctree_root *root, struct ctree_path *path, + int level); +static int split_leaf(struct ctree_root *root, struct ctree_path *path, + int data_size); +static int push_node_left(struct ctree_root *root, struct ctree_path *path, + int level); +static int push_node_right(struct ctree_root *root, struct ctree_path *path, int level); -int del_ptr(struct ctree_root *root, struct ctree_path *path, int level); +static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level); inline void init_path(struct ctree_path *p) { @@ -26,6 +29,7 @@ void release_path(struct ctree_root *root, struct ctree_path *p) break; 
tree_block_release(root, p->nodes[i]); } + memset(p, 0, sizeof(*p)); } /* @@ -74,6 +78,67 @@ int comp_keys(struct key *k1, struct key *k2) return 0; } +int check_node(struct ctree_path *path, int level) +{ + int i; + struct node *parent = NULL; + struct node *node = &path->nodes[level]->node; + int parent_slot; + + if (path->nodes[level + 1]) + parent = &path->nodes[level + 1]->node; + parent_slot = path->slots[level + 1]; + if (parent && node->header.nritems > 0) { + struct key *parent_key; + parent_key = &parent->keys[parent_slot]; + BUG_ON(memcmp(parent_key, node->keys, sizeof(struct key))); + BUG_ON(parent->blockptrs[parent_slot] != node->header.blocknr); + } + BUG_ON(node->header.nritems > NODEPTRS_PER_BLOCK); + for (i = 0; i < node->header.nritems - 2; i++) { + BUG_ON(comp_keys(&node->keys[i], &node->keys[i+1]) >= 0); + } + return 0; +} + +int check_leaf(struct ctree_path *path, int level) +{ + int i; + struct leaf *leaf = &path->nodes[level]->leaf; + struct node *parent = NULL; + int parent_slot; + + if (path->nodes[level + 1]) + parent = &path->nodes[level + 1]->node; + parent_slot = path->slots[level + 1]; + if (parent && leaf->header.nritems > 0) { + struct key *parent_key; + parent_key = &parent->keys[parent_slot]; + BUG_ON(memcmp(parent_key, &leaf->items[0].key, + sizeof(struct key))); + BUG_ON(parent->blockptrs[parent_slot] != leaf->header.blocknr); + } + for (i = 0; i < leaf->header.nritems - 2; i++) { + BUG_ON(comp_keys(&leaf->items[i].key, + &leaf->items[i+1].key) >= 0); + BUG_ON(leaf->items[i].offset != leaf->items[i + 1].offset + + leaf->items[i + 1].size); + if (i == 0) { + BUG_ON(leaf->items[i].offset + leaf->items[i].size != + LEAF_DATA_SIZE); + } + } + BUG_ON(leaf_free_space(leaf) < 0); + return 0; +} + +int check_block(struct ctree_path *path, int level) +{ + if (level == 0) + return check_leaf(path, level); + return check_node(path, level); +} + /* * search for key in the array p. 
items p are item_size apart * and there are 'max' items in p @@ -133,7 +198,8 @@ int bin_search(struct node *c, struct key *key, int *slot) * level of the path (level 0) * * If the key isn't found, the path points to the slot where it should - * be inserted. + * be inserted, and 1 is returned. If there are other errors during the + * search a negative error number is returned. * * if ins_len > 0, nodes and leaves will be split as we walk down the * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if @@ -153,6 +219,9 @@ int search_slot(struct ctree_root *root, struct key *key, c = &b->node; level = node_level(c->header.flags); p->nodes[level] = b; + ret = check_block(p, level); + if (ret) + return -1; ret = bin_search(c, key, &slot); if (!is_leaf(c->header.flags)) { if (ret && slot > 0) @@ -183,7 +252,7 @@ int search_slot(struct ctree_root *root, struct key *key, return ret; } } - return -1; + return 1; } /* @@ -192,12 +261,17 @@ int search_slot(struct ctree_root *root, struct key *key, * This is used after shifting pointers to the left, so it stops * fixing up pointers when a given leaf/node is not in slot 0 of the * higher levels + * + * If this fails to write a tree block, it returns -1, but continues + * fixing up the blocks in ram so the tree is consistent. 
*/ -static void fixup_low_keys(struct ctree_root *root, +static int fixup_low_keys(struct ctree_root *root, struct ctree_path *path, struct key *key, int level) { int i; + int ret = 0; + int wret; for (i = level; i < MAX_LEVEL; i++) { struct node *t; int tslot = path->slots[i]; @@ -205,10 +279,13 @@ static void fixup_low_keys(struct ctree_root *root, break; t = &path->nodes[i]->node; memcpy(t->keys + tslot, key, sizeof(*key)); - write_tree_block(root, path->nodes[i]); + wret = write_tree_block(root, path->nodes[i]); + if (wret) + ret = wret; if (tslot != 0) break; } + return ret; } /* @@ -220,8 +297,12 @@ static void fixup_low_keys(struct ctree_root *root, * be modified to reflect the push. * * The path is altered to reflect the push. + * + * returns 0 if some ptrs were pushed left, < 0 if there was some horrible + * error, and > 0 if there was no room in the left hand block. */ -int push_node_left(struct ctree_root *root, struct ctree_path *path, int level) +static int push_node_left(struct ctree_root *root, struct ctree_path *path, + int level) { int slot; struct node *left; @@ -231,6 +312,8 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level) int right_nritems; struct tree_buffer *t; struct tree_buffer *right_buf; + int ret = 0; + int wret; if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0) return 1; @@ -265,10 +348,17 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level) left->header.nritems += push_items; /* adjust the pointers going up the tree */ - fixup_low_keys(root, path, right->keys, level + 1); + wret = fixup_low_keys(root, path, right->keys, level + 1); + if (wret < 0) + ret = wret; - write_tree_block(root, t); - write_tree_block(root, right_buf); + wret = write_tree_block(root, t); + if (wret < 0) + ret = wret; + + wret = write_tree_block(root, right_buf); + if (wret < 0) + ret = wret; /* then fixup the leaf pointer in the path */ if (path->slots[level] < push_items) { @@ -280,7 +370,7 
@@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level) path->slots[level] -= push_items; tree_block_release(root, t); } - return 0; + return ret; } /* @@ -292,8 +382,12 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level) * be modified to reflect the push. * * The path is altered to reflect the push. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. */ -int push_node_right(struct ctree_root *root, struct ctree_path *path, int level) +static int push_node_right(struct ctree_root *root, struct ctree_path *path, + int level) { int slot; struct tree_buffer *t; @@ -368,6 +462,8 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level) * helper function to insert a new root level in the tree. * A new node is allocated, and a single item is inserted to * point to the existing root + * + * returns zero on success or < 0 on failure. */ static int insert_new_root(struct ctree_root *root, struct ctree_path *path, int level) @@ -410,8 +506,10 @@ static int insert_new_root(struct ctree_root *root, * * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. + * + * returns zero on success and < 0 on any error */ -int insert_ptr(struct ctree_root *root, +static int insert_ptr(struct ctree_root *root, struct ctree_path *path, struct key *key, u64 blocknr, int slot, int level) { @@ -446,8 +544,11 @@ int insert_ptr(struct ctree_root *root, * * Before splitting this tries to make some room in the node by pushing * left and right, if either one works, it returns right away. 
+ * + * returns 0 on success and < 0 on failure */ -int split_node(struct ctree_root *root, struct ctree_path *path, int level) +static int split_node(struct ctree_root *root, struct ctree_path *path, + int level) { struct tree_buffer *t; struct node *c; @@ -455,13 +556,18 @@ int split_node(struct ctree_root *root, struct ctree_path *path, int level) struct node *split; int mid; int ret; + int wret; ret = push_node_left(root, path, level); if (!ret) return 0; + if (ret < 0) + return ret; ret = push_node_right(root, path, level); if (!ret) return 0; + if (ret < 0) + return ret; t = path->nodes[level]; c = &t->node; if (t == root->node) { @@ -482,10 +588,19 @@ int split_node(struct ctree_root *root, struct ctree_path *path, int level) (c->header.nritems - mid) * sizeof(u64)); split->header.nritems = c->header.nritems - mid; c->header.nritems = mid; - write_tree_block(root, t); - write_tree_block(root, split_buffer); - insert_ptr(root, path, split->keys, split_buffer->blocknr, - path->slots[level + 1] + 1, level + 1); + ret = 0; + + wret = write_tree_block(root, t); + if (wret) + ret = wret; + wret = write_tree_block(root, split_buffer); + if (wret) + ret = wret; + wret = insert_ptr(root, path, split->keys, split_buffer->blocknr, + path->slots[level + 1] + 1, level + 1); + if (wret) + ret = wret; + if (path->slots[level] >= mid) { path->slots[level] -= mid; tree_block_release(root, t); @@ -494,7 +609,7 @@ int split_node(struct ctree_root *root, struct ctree_path *path, int level) } else { tree_block_release(root, split_buffer); } - return 0; + return ret; } /* @@ -502,7 +617,7 @@ int split_node(struct ctree_root *root, struct ctree_path *path, int level) * and nr indicate which items in the leaf to check. 
This totals up the * space used both by the item structs and the item data */ -int leaf_space_used(struct leaf *l, int start, int nr) +static int leaf_space_used(struct leaf *l, int start, int nr) { int data_len; int end = start + nr - 1; @@ -518,9 +633,12 @@ int leaf_space_used(struct leaf *l, int start, int nr) /* * push some data in the path leaf to the right, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. */ -int push_leaf_right(struct ctree_root *root, struct ctree_path *path, - int data_size) +static int push_leaf_right(struct ctree_root *root, struct ctree_path *path, + int data_size) { struct tree_buffer *left_buf = path->nodes[0]; struct leaf *left = &left_buf->leaf; @@ -609,8 +727,8 @@ int push_leaf_right(struct ctree_root *root, struct ctree_path *path, * push some data in the path leaf to the left, trying to free up at * least data_size bytes. 
returns zero if the push worked, nonzero otherwise */ -int push_leaf_left(struct ctree_root *root, struct ctree_path *path, - int data_size) +static int push_leaf_left(struct ctree_root *root, struct ctree_path *path, + int data_size) { struct tree_buffer *right_buf = path->nodes[0]; struct leaf *right = &right_buf->leaf; @@ -623,6 +741,8 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path, int push_items = 0; struct item *item; int old_left_nritems; + int ret = 0; + int wret; slot = path->slots[1]; if (slot == 0) { @@ -681,10 +801,16 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path, push_space = right->items[i].offset; } - write_tree_block(root, t); - write_tree_block(root, right_buf); + wret = write_tree_block(root, t); + if (wret) + ret = wret; + wret = write_tree_block(root, right_buf); + if (wret) + ret = wret; - fixup_low_keys(root, path, &right->items[0].key, 1); + wret = fixup_low_keys(root, path, &right->items[0].key, 1); + if (wret) + ret = wret; /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -697,17 +823,20 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path, path->slots[0] -= push_items; } BUG_ON(path->slots[0] < 0); - return 0; + return ret; } /* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. 
*/ -int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size) +static int split_leaf(struct ctree_root *root, struct ctree_path *path, + int data_size) { - struct tree_buffer *l_buf = path->nodes[0]; - struct leaf *l = &l_buf->leaf; + struct tree_buffer *l_buf; + struct leaf *l; int nritems; int mid; int slot; @@ -718,14 +847,23 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size) int rt_data_off; int i; int ret; - - if (push_leaf_left(root, path, data_size) == 0 || - push_leaf_right(root, path, data_size) == 0) { - l_buf = path->nodes[0]; - l = &l_buf->leaf; - if (leaf_free_space(l) >= sizeof(struct item) + data_size) - return 0; + int wret; + + wret = push_leaf_left(root, path, data_size); + if (wret < 0) + return wret; + if (wret) { + wret = push_leaf_right(root, path, data_size); + if (wret < 0) + return wret; } + l_buf = path->nodes[0]; + l = &l_buf->leaf; + + /* did the pushes work? */ + if (leaf_free_space(l) >= sizeof(struct item) + data_size) + return 0; + if (!path->nodes[1]) { ret = insert_new_root(root, path, 1); if (ret) @@ -768,10 +906,17 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size) right->items[i].offset += rt_data_off; l->header.nritems = mid; - ret = insert_ptr(root, path, &right->items[0].key, + ret = 0; + wret = insert_ptr(root, path, &right->items[0].key, right_buffer->blocknr, path->slots[1] + 1, 1); - write_tree_block(root, right_buffer); - write_tree_block(root, l_buf); + if (wret) + ret = wret; + wret = write_tree_block(root, right_buffer); + if (wret) + ret = wret; + wret = write_tree_block(root, l_buf); + if (wret) + ret = wret; BUG_ON(path->slots[0] != slot); if (mid <= slot) { @@ -792,7 +937,8 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size) int insert_item(struct ctree_root *root, struct key *key, void *data, int data_size) { - int ret; + int ret = 0; + int wret; int slot; int slot_orig; struct leaf *leaf; @@ 
-810,6 +956,10 @@ int insert_item(struct ctree_root *root, struct key *key, release_path(root, &path); return -EEXIST; } + if (ret < 0) { + release_path(root, &path); + return ret; + } slot_orig = path.slots[0]; leaf_buf = path.nodes[0]; @@ -850,13 +1000,19 @@ int insert_item(struct ctree_root *root, struct key *key, leaf->items[slot].size = data_size; memcpy(leaf->data + data_end - data_size, data, data_size); leaf->header.nritems += 1; - write_tree_block(root, leaf_buf); + + ret = 0; if (slot == 0) - fixup_low_keys(root, &path, key, 1); + ret = fixup_low_keys(root, &path, key, 1); + + wret = write_tree_block(root, leaf_buf); + if (wret) + ret = wret; + if (leaf_free_space(leaf) < 0) BUG(); release_path(root, &path); - return 0; + return ret; } /* @@ -866,13 +1022,15 @@ int insert_item(struct ctree_root *root, struct key *key, * continuing all the way the root if required. The root is converted into * a leaf if all the nodes are emptied. */ -int del_ptr(struct ctree_root *root, struct ctree_path *path, int level) +static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level) { int slot; struct tree_buffer *t; struct node *node; int nritems; u64 blocknr; + int wret; + int ret = 0; while(1) { t = path->nodes[level]; @@ -894,13 +1052,27 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level) write_tree_block(root, t); if (node->header.nritems != 0) { int tslot; - if (slot == 0) - fixup_low_keys(root, path, node->keys, - level + 1); + if (slot == 0) { + wret = fixup_low_keys(root, path, + node->keys, + level + 1); + if (wret) + ret = wret; + } tslot = path->slots[level + 1]; t->count++; - if (push_node_left(root, path, level)) - push_node_right(root, path, level); + wret = push_node_left(root, path, level); + if (wret < 0) { + ret = wret; + break; + } + if (node->header.nritems != 0) { + wret = push_node_right(root, path, level); + if (wret < 0) { + ret = wret; + break; + } + } path->slots[level + 1] = tslot; if 
(node->header.nritems != 0) { tree_block_release(root, t); @@ -919,7 +1091,7 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level) if (!path->nodes[level]) BUG(); } - return 0; + return ret; } /* @@ -933,6 +1105,8 @@ int del_item(struct ctree_root *root, struct ctree_path *path) struct tree_buffer *leaf_buf; int doff; int dsize; + int ret = 0; + int wret; leaf_buf = path->nodes[0]; leaf = &leaf_buf->leaf; @@ -959,14 +1133,23 @@ int del_item(struct ctree_root *root, struct ctree_path *path) leaf->header.flags = node_level(0); write_tree_block(root, leaf_buf); } else { - del_ptr(root, path, 1); + wret = del_ptr(root, path, 1); + if (wret) + ret = wret; free_extent(root, leaf_buf->blocknr, 1); } } else { int used = leaf_space_used(leaf, 0, leaf->header.nritems); - if (slot == 0) - fixup_low_keys(root, path, &leaf->items[0].key, 1); - write_tree_block(root, leaf_buf); + if (slot == 0) { + wret = fixup_low_keys(root, path, + &leaf->items[0].key, 1); + if (wret) + ret = wret; + } + wret = write_tree_block(root, leaf_buf); + if (wret) + ret = wret; + /* delete the leaf if it is mostly empty */ if (used < LEAF_DATA_SIZE / 3) { /* push_leaf_left fixes the path. 
@@ -975,13 +1158,20 @@ int del_item(struct ctree_root *root, struct ctree_path *path) */ slot = path->slots[1]; leaf_buf->count++; - push_leaf_left(root, path, 1); - if (leaf->header.nritems) - push_leaf_right(root, path, 1); + wret = push_leaf_left(root, path, 1); + if (wret < 0) + ret = wret; + if (leaf->header.nritems) { + wret = push_leaf_right(root, path, 1); + if (wret < 0) + ret = wret; + } if (leaf->header.nritems == 0) { u64 blocknr = leaf_buf->blocknr; path->slots[1] = slot; - del_ptr(root, path, 1); + wret = del_ptr(root, path, 1); + if (wret) + ret = wret; tree_block_release(root, leaf_buf); free_extent(root, blocknr, 1); } else { @@ -989,7 +1179,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path) } } } - return 0; + return ret; } /* @@ -1033,165 +1223,3 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path) return 0; } -/* some sample code to insert,search & delete items */ -#if 0 -/* for testing only */ -int next_key(int i, int max_key) { - return rand() % max_key; - //return i; -} -int main() { - struct key ins; - struct key last = { (u64)-1, 0, 0}; - char *buf; - int i; - int num; - int ret; - int run_size = 20000000; - int max_key = 100000000; - int tree_size = 0; - struct ctree_path path; - struct ctree_super_block super; - struct ctree_root *root; - - radix_tree_init(); - - - root = open_ctree("dbfile", &super); - srand(55); - for (i = 0; i < run_size; i++) { - buf = malloc(64); - num = next_key(i, max_key); - // num = i; - sprintf(buf, "string-%d", num); - if (i % 10000 == 0) - fprintf(stderr, "insert %d:%d\n", num, i); - ins.objectid = num; - ins.offset = 0; - ins.flags = 0; - ret = insert_item(root, &ins, buf, strlen(buf)); - if (!ret) - tree_size++; - free(buf); - } - write_ctree_super(root, &super); - close_ctree(root); - - root = open_ctree("dbfile", &super); - printf("starting search\n"); - srand(55); - for (i = 0; i < run_size; i++) { - num = next_key(i, max_key); - ins.objectid = num; - init_path(&path); - if 
(i % 10000 == 0) - fprintf(stderr, "search %d:%d\n", num, i); - ret = search_slot(root, &ins, &path, 0); - if (ret) { - print_tree(root, root->node); - printf("unable to find %d\n", num); - exit(1); - } - release_path(root, &path); - } - write_ctree_super(root, &super); - close_ctree(root); - root = open_ctree("dbfile", &super); - printf("node %p level %d total ptrs %d free spc %lu\n", root->node, - node_level(root->node->node.header.flags), - root->node->node.header.nritems, - NODEPTRS_PER_BLOCK - root->node->node.header.nritems); - printf("all searches good, deleting some items\n"); - i = 0; - srand(55); - for (i = 0 ; i < run_size/4; i++) { - num = next_key(i, max_key); - ins.objectid = num; - init_path(&path); - ret = search_slot(root, &ins, &path, -1); - if (!ret) { - if (i % 10000 == 0) - fprintf(stderr, "del %d:%d\n", num, i); - ret = del_item(root, &path); - if (ret != 0) - BUG(); - tree_size--; - } - release_path(root, &path); - } - write_ctree_super(root, &super); - close_ctree(root); - root = open_ctree("dbfile", &super); - srand(128); - for (i = 0; i < run_size; i++) { - buf = malloc(64); - num = next_key(i, max_key); - sprintf(buf, "string-%d", num); - ins.objectid = num; - if (i % 10000 == 0) - fprintf(stderr, "insert %d:%d\n", num, i); - ret = insert_item(root, &ins, buf, strlen(buf)); - if (!ret) - tree_size++; - free(buf); - } - write_ctree_super(root, &super); - close_ctree(root); - root = open_ctree("dbfile", &super); - srand(128); - printf("starting search2\n"); - for (i = 0; i < run_size; i++) { - num = next_key(i, max_key); - ins.objectid = num; - init_path(&path); - if (i % 10000 == 0) - fprintf(stderr, "search %d:%d\n", num, i); - ret = search_slot(root, &ins, &path, 0); - if (ret) { - print_tree(root, root->node); - printf("unable to find %d\n", num); - exit(1); - } - release_path(root, &path); - } - printf("starting big long delete run\n"); - while(root->node && root->node->node.header.nritems > 0) { - struct leaf *leaf; - int slot; - 
ins.objectid = (u64)-1; - init_path(&path); - ret = search_slot(root, &ins, &path, -1); - if (ret == 0) - BUG(); - - leaf = &path.nodes[0]->leaf; - slot = path.slots[0]; - if (slot != leaf->header.nritems) - BUG(); - while(path.slots[0] > 0) { - path.slots[0] -= 1; - slot = path.slots[0]; - leaf = &path.nodes[0]->leaf; - - if (comp_keys(&last, &leaf->items[slot].key) <= 0) - BUG(); - memcpy(&last, &leaf->items[slot].key, sizeof(last)); - if (tree_size % 10000 == 0) - printf("big del %d:%d\n", tree_size, i); - ret = del_item(root, &path); - if (ret != 0) { - printf("del_item returned %d\n", ret); - BUG(); - } - tree_size--; - } - release_path(root, &path); - } - printf("tree size is now %d\n", tree_size); - printf("map tree\n"); - print_tree(root->extent_root, root->extent_root->node); - write_ctree_super(root, &super); - close_ctree(root); - return 0; -} -#endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 26321524c186..97715bae7847 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -125,6 +125,11 @@ check_failed: ins->flags = 0; start_found = 0; ret = search_slot(root, ins, &path, 0); + if (ret < 0) { + release_path(root, &path); + return ret; + } + while (1) { l = &path.nodes[0]->leaf; slot = path.slots[0]; diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c index 7078263acc3d..cebaf6483927 100644 --- a/fs/btrfs/random-test.c +++ b/fs/btrfs/random-test.c @@ -134,7 +134,7 @@ static int lookup_enoent(struct ctree_root *root, struct radix_tree_root *radix) return ret; ret = search_slot(root, &key, &path, 0); release_path(root, &path); - if (ret == 0) + if (ret <= 0) goto error; return 0; error: @@ -153,12 +153,17 @@ static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix) int ret; int slot; int i; + key.offset = 0; key.flags = 0; key.objectid = (unsigned long)-1; while(1) { init_path(&path); ret = search_slot(root, &key, &path, 0); + if (ret < 0) { + release_path(root, &path); + return ret; + } 
slot = path.slots[0]; if (ret != 0) { if (slot == 0) { -- cgit v1.2.3 From 79f95c82dca7665f32bafd68b7cdf4a01fab0840 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 1 Mar 2007 15:16:26 -0500 Subject: Btrfs: Fixup the code to merge during path walks Add a bulk insert/remove test to random-test Add the quick-test code back as another regression test Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 5 +- fs/btrfs/ctree.c | 137 +++++++++++++++++++++++++++++++++------- fs/btrfs/quick-test.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/random-test.c | 117 +++++++++++++++++++++++++++++------ 4 files changed, 381 insertions(+), 43 deletions(-) create mode 100644 fs/btrfs/quick-test.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 58d4260a04b2..756b1cd4cf4d 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -14,7 +14,7 @@ check=sparse $(CHECKFLAGS) $(check) $< $(CC) $(CFLAGS) -c $< -all: tester debug-tree +all: tester debug-tree quick-test debug-tree: $(objects) debug-tree.o gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o @@ -22,6 +22,9 @@ debug-tree: $(objects) debug-tree.o tester: $(objects) random-test.o gcc $(CFLAGS) -o tester $(objects) random-test.o +quick-test: $(objects) quick-test.o + gcc $(CFLAGS) -o quick-test $(objects) quick-test.o + $(objects) : $(headers) clean : diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index df4a19d65206..afa5bc5c7c1a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -12,6 +12,9 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size); static int push_node_left(struct ctree_root *root, struct tree_buffer *dst, struct tree_buffer *src); +static int balance_node_right(struct ctree_root *root, + struct tree_buffer *dst_buf, + struct tree_buffer *src_buf); static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level, int slot); @@ -217,15 +220,16 @@ static int balance_level(struct ctree_root *root, 
struct ctree_path *path, int ret = 0; int wret; int pslot; - int used = 0; - int count; int orig_slot = path->slots[level]; + u64 orig_ptr; if (level == 0) return 0; mid_buf = path->nodes[level]; mid = &mid_buf->node; + orig_ptr = mid->blockptrs[orig_slot]; + if (level < MAX_LEVEL - 1) parent_buf = path->nodes[level + 1]; pslot = path->slots[level + 1]; @@ -253,24 +257,26 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, if (mid->header.nritems > NODEPTRS_PER_BLOCK / 4) return 0; - // print_tree(root, root->node); left_buf = read_node_slot(root, parent_buf, pslot - 1); right_buf = read_node_slot(root, parent_buf, pslot + 1); - if (right_buf) { - right = &right_buf->node; - used = right->header.nritems; - count = 1; - } + + /* first, try to make some room in the middle buffer */ if (left_buf) { left = &left_buf->node; - used += left->header.nritems; orig_slot += left->header.nritems; - count++; + wret = push_node_left(root, left_buf, mid_buf); + if (wret < 0) + ret = wret; } - if (left_buf) - push_node_left(root, left_buf, mid_buf); + + /* + * then try to empty the right most buffer into the middle + */ if (right_buf) { - push_node_left(root, mid_buf, right_buf); + right = &right_buf->node; + wret = push_node_left(root, mid_buf, right_buf); + if (wret < 0) + ret = wret; if (right->header.nritems == 0) { u64 blocknr = right_buf->blocknr; tree_block_release(root, right_buf); @@ -285,9 +291,29 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, } else { memcpy(parent->keys + pslot + 1, right->keys, sizeof(struct key)); + wret = write_tree_block(root, parent_buf); + if (wret) + ret = wret; } } + if (mid->header.nritems == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. 
+ * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + BUG_ON(!left_buf); + wret = balance_node_right(root, mid_buf, left_buf); + if (wret < 0) + ret = wret; + BUG_ON(wret == 1); + } if (mid->header.nritems == 0) { + /* we've managed to empty the middle node, drop it */ u64 blocknr = mid_buf->blocknr; tree_block_release(root, mid_buf); mid_buf = NULL; @@ -298,11 +324,17 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, wret = free_extent(root, blocknr, 1); if (wret) ret = wret; - } else + } else { + /* update the parent key to reflect our changes */ memcpy(parent->keys + pslot, mid->keys, sizeof(struct key)); + wret = write_tree_block(root, parent_buf); + if (wret) + ret = wret; + } + /* update the path */ if (left_buf) { - if (left->header.nritems >= orig_slot) { + if (left->header.nritems > orig_slot) { left_buf->count++; // released below path->nodes[level] = left_buf; path->slots[level + 1] -= 1; @@ -314,12 +346,15 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, path->slots[level] = orig_slot; } } + /* double check we haven't messed things up */ + check_block(path, level); + if (orig_ptr != path->nodes[level]->node.blockptrs[path->slots[level]]) + BUG(); if (right_buf) tree_block_release(root, right_buf); if (left_buf) tree_block_release(root, left_buf); - return ret; } @@ -378,6 +413,7 @@ again: goto again; c = &b->node; slot = p->slots[level]; + BUG_ON(c->header.nritems == 1); } b = read_tree_block(root, c->blockptrs[slot]); } else { @@ -433,13 +469,7 @@ static int fixup_low_keys(struct ctree_root *root, /* * try to push data from one node into the next node left in the - * tree. The src node is found at specified level in the path. - * If some bytes were pushed, return 0, otherwise return 1. - * - * Lower nodes/leaves in the path are not touched, higher nodes may - * be modified to reflect the push. 
- * - * The path is altered to reflect the push. + * tree. * * returns 0 if some ptrs were pushed left, < 0 if there was some horrible * error, and > 0 if there was no room in the left hand block. @@ -463,7 +493,8 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf, } if (src_nritems < push_items) - push_items =src_nritems; + push_items = src_nritems; + memcpy(dst->keys + dst_nritems, src->keys, push_items * sizeof(struct key)); memcpy(dst->blockptrs + dst_nritems, src->blockptrs, @@ -487,6 +518,64 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf, return ret; } +/* + * try to push data from one node into the next node right in the + * tree. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. + * + * this will only push up to 1/2 the contents of the left node over + */ +static int balance_node_right(struct ctree_root *root, + struct tree_buffer *dst_buf, + struct tree_buffer *src_buf) +{ + struct node *src = &src_buf->node; + struct node *dst = &dst_buf->node; + int push_items = 0; + int max_push; + int src_nritems; + int dst_nritems; + int ret = 0; + int wret; + + src_nritems = src->header.nritems; + dst_nritems = dst->header.nritems; + push_items = NODEPTRS_PER_BLOCK - dst_nritems; + if (push_items <= 0) { + return 1; + } + + max_push = src_nritems / 2 + 1; + /* don't try to empty the node */ + if (max_push > src_nritems) + return 1; + if (max_push < push_items) + push_items = max_push; + + memmove(dst->keys + push_items, dst->keys, + dst_nritems * sizeof(struct key)); + memmove(dst->blockptrs + push_items, dst->blockptrs, + dst_nritems * sizeof(u64)); + memcpy(dst->keys, src->keys + src_nritems - push_items, + push_items * sizeof(struct key)); + memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items, + push_items * sizeof(u64)); + + src->header.nritems -= push_items; + dst->header.nritems += push_items; + 
+ wret = write_tree_block(root, src_buf); + if (wret < 0) + ret = wret; + + wret = write_tree_block(root, dst_buf); + if (wret < 0) + ret = wret; + return ret; +} + /* * helper function to insert a new root level in the tree. * A new node is allocated, and a single item is inserted to diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c new file mode 100644 index 000000000000..dbd00c3b7ab4 --- /dev/null +++ b/fs/btrfs/quick-test.c @@ -0,0 +1,165 @@ +#include <stdio.h> +#include <stdlib.h> +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +/* for testing only */ +int next_key(int i, int max_key) { + return rand() % max_key; + //return i; +} + +int main(int ac, char **av) { + struct key ins; + struct key last = { (u64)-1, 0, 0}; + char *buf; + int i; + int num; + int ret; + int run_size = 100000; + int max_key = 100000000; + int tree_size = 0; + struct ctree_path path; + struct ctree_super_block super; + struct ctree_root *root; + + radix_tree_init(); + + root = open_ctree("dbfile", &super); + srand(55); + for (i = 0; i < run_size; i++) { + buf = malloc(64); + num = next_key(i, max_key); + // num = i; + sprintf(buf, "string-%d", num); + if (i % 10000 == 0) + fprintf(stderr, "insert %d:%d\n", num, i); + ins.objectid = num; + ins.offset = 0; + ins.flags = 0; + ret = insert_item(root, &ins, buf, strlen(buf)); + if (!ret) + tree_size++; + free(buf); + } + write_ctree_super(root, &super); + close_ctree(root); + + root = open_ctree("dbfile", &super); + printf("starting search\n"); + srand(55); + for (i = 0; i < run_size; i++) { + num = next_key(i, max_key); + ins.objectid = num; + init_path(&path); + if (i % 10000 == 0) + fprintf(stderr, "search %d:%d\n", num, i); + ret = search_slot(root, &ins, &path, 0); + if (ret) { + print_tree(root, root->node); + printf("unable to find %d\n", num); + exit(1); + } + release_path(root, &path); + } + write_ctree_super(root, &super); + close_ctree(root); + root = open_ctree("dbfile", 
&super); + printf("node %p level %d total ptrs %d free spc %lu\n", root->node, + node_level(root->node->node.header.flags), + root->node->node.header.nritems, + NODEPTRS_PER_BLOCK - root->node->node.header.nritems); + printf("all searches good, deleting some items\n"); + i = 0; + srand(55); + for (i = 0 ; i < run_size/4; i++) { + num = next_key(i, max_key); + ins.objectid = num; + init_path(&path); + ret = search_slot(root, &ins, &path, -1); + if (!ret) { + if (i % 10000 == 0) + fprintf(stderr, "del %d:%d\n", num, i); + ret = del_item(root, &path); + if (ret != 0) + BUG(); + tree_size--; + } + release_path(root, &path); + } + write_ctree_super(root, &super); + close_ctree(root); + root = open_ctree("dbfile", &super); + srand(128); + for (i = 0; i < run_size; i++) { + buf = malloc(64); + num = next_key(i, max_key); + sprintf(buf, "string-%d", num); + ins.objectid = num; + if (i % 10000 == 0) + fprintf(stderr, "insert %d:%d\n", num, i); + ret = insert_item(root, &ins, buf, strlen(buf)); + if (!ret) + tree_size++; + free(buf); + } + write_ctree_super(root, &super); + close_ctree(root); + root = open_ctree("dbfile", &super); + srand(128); + printf("starting search2\n"); + for (i = 0; i < run_size; i++) { + num = next_key(i, max_key); + ins.objectid = num; + init_path(&path); + if (i % 10000 == 0) + fprintf(stderr, "search %d:%d\n", num, i); + ret = search_slot(root, &ins, &path, 0); + if (ret) { + print_tree(root, root->node); + printf("unable to find %d\n", num); + exit(1); + } + release_path(root, &path); + } + printf("starting big long delete run\n"); + while(root->node && root->node->node.header.nritems > 0) { + struct leaf *leaf; + int slot; + ins.objectid = (u64)-1; + init_path(&path); + ret = search_slot(root, &ins, &path, -1); + if (ret == 0) + BUG(); + + leaf = &path.nodes[0]->leaf; + slot = path.slots[0]; + if (slot != leaf->header.nritems) + BUG(); + while(path.slots[0] > 0) { + path.slots[0] -= 1; + slot = path.slots[0]; + leaf = &path.nodes[0]->leaf; + + 
memcpy(&last, &leaf->items[slot].key, sizeof(last)); + if (tree_size % 10000 == 0) + printf("big del %d:%d\n", tree_size, i); + ret = del_item(root, &path); + if (ret != 0) { + printf("del_item returned %d\n", ret); + BUG(); + } + tree_size--; + } + release_path(root, &path); + } + printf("tree size is now %d\n", tree_size); + printf("map tree\n"); + print_tree(root->extent_root, root->extent_root->node); + write_ctree_super(root, &super); + close_ctree(root); + return 0; +} diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c index cebaf6483927..bbd554e88db2 100644 --- a/fs/btrfs/random-test.c +++ b/fs/btrfs/random-test.c @@ -142,8 +142,98 @@ error: return -1; } +static int empty_tree(struct ctree_root *root, struct radix_tree_root *radix, + int nr) +{ + struct ctree_path path; + struct key key; + unsigned long found = 0; + int ret; + int slot; + int *ptr; + int count = 0; + + key.offset = 0; + key.flags = 0; + key.objectid = (unsigned long)-1; + while(nr-- >= 0) { + init_path(&path); + ret = search_slot(root, &key, &path, -1); + if (ret < 0) { + release_path(root, &path); + return ret; + } + if (ret != 0) { + if (path.slots[0] == 0) { + release_path(root, &path); + break; + } + path.slots[0] -= 1; + } + slot = path.slots[0]; + found = path.nodes[0]->leaf.items[slot].key.objectid; + ret = del_item(root, &path); + count++; + if (ret) { + fprintf(stderr, + "failed to remove %lu from tree\n", + found); + return -1; + } + release_path(root, &path); + ptr = radix_tree_delete(radix, found); + if (!ptr) + goto error; + if (!keep_running) + break; + } + return 0; +error: + fprintf(stderr, "failed to delete from the radix %lu\n", found); + return -1; +} + +static int fill_tree(struct ctree_root *root, struct radix_tree_root *radix, + int count) +{ + int i; + int err; + int ret = 0; + for (i = 0; i < count; i++) { + ret = ins_one(root, radix); + if (ret) { + printf("fill failed\n"); + err = ret; + goto out; + } + if (!keep_running) + break; + } +out: + return ret; 
+} + +static int bulk_op(struct ctree_root *root, struct radix_tree_root *radix) +{ + int ret; + int nr = rand() % 20000; + static int run_nr = 0; + + /* do the bulk op much less frequently */ + if (run_nr++ % 100) + return 0; + ret = empty_tree(root, radix, nr); + if (ret) + return ret; + ret = fill_tree(root, radix, nr); + if (ret) + return ret; + return 0; +} + + int (*ops[])(struct ctree_root *root, struct radix_tree_root *radix) = -{ ins_one, insert_dup, del_one, lookup_item, lookup_enoent }; +{ ins_one, insert_dup, del_one, lookup_item, lookup_enoent, bulk_op }; static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix) { @@ -192,7 +282,6 @@ static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix) } return 0; } - void sigstopper(int ignored) { keep_running = 0; @@ -241,22 +330,12 @@ int main(int ac, char **av) print_usage(); } } - for (i = 0; i < init_fill_count; i++) { - ret = ins_one(root, &radix); - if (ret) { - printf("initial fill failed\n"); - err = ret; - goto out; - } - if (i % 10000 == 0) { - printf("initial fill %d level %d count %d\n", i, - node_level(root->node->node.header.flags), - root->node->node.header.nritems); - } - if (keep_running == 0) { - err = 0; - goto out; - } + printf("initial fill\n"); + ret = fill_tree(root, &radix, init_fill_count); + printf("starting run\n"); + if (ret) { + err = ret; + goto out; } if (initial_only == 1) { goto out; @@ -287,6 +366,8 @@ int main(int ac, char **av) err = ret; goto out; } + if (ops[op] == bulk_op) + break; if (keep_running == 0) { err = 0; goto out; -- cgit v1.2.3 From a28ec19775d62d673b034082128aca95780d3737 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 6 Mar 2007 20:08:01 -0500 Subject: Btrfs: Fixup reference counting on cows Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.c | 9 +- fs/btrfs/ctree.h | 4 + fs/btrfs/disk-io.c | 31 +++++- fs/btrfs/disk-io.h | 4 +- fs/btrfs/extent-tree.c | 291 
++++++++++++++++++++++++++++++++++--------------- fs/btrfs/print-tree.c | 6 +- fs/btrfs/quick-test.c | 26 +++-- fs/btrfs/random-test.c | 21 ++-- 9 files changed, 267 insertions(+), 127 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 756b1cd4cf4d..ae7f4c00c39c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,7 +1,7 @@ CC=gcc CFLAGS = -g -Wall -headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h +headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o # if you don't have sparse installed, use ls instead diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1118986d7569..72816381d203 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -50,16 +50,17 @@ int btrfs_cow_block(struct ctree_root *root, memcpy(&cow->node, &buf->node, sizeof(buf->node)); cow->node.header.blocknr = cow->blocknr; *cow_ret = cow; + btrfs_inc_ref(root, buf); if (buf == root->node) { root->node = cow; cow->count++; + if (buf != root->commit_root) + free_extent(root, buf->blocknr, 1); tree_block_release(root, buf); } else { parent->node.blockptrs[parent_slot] = cow->blocknr; BUG_ON(list_empty(&parent->dirty)); - } - if (0 && root != root->extent_root && !is_leaf(cow->node.header.flags)) { - btrfs_inc_ref(root, cow); + free_extent(root, buf->blocknr, 1); } tree_block_release(root, buf); return 0; @@ -1018,7 +1019,6 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path, slot = path->slots[0]; nritems = l->header.nritems; mid = (nritems + 1)/ 2; - right_buffer = alloc_free_block(root); BUG_ON(!right_buffer); BUG_ON(mid == nritems); @@ -1170,7 +1170,6 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level, node = &parent->node; nritems = node->header.nritems; - if (slot != nritems -1) { memmove(node->keys + slot, node->keys + slot + 1, sizeof(struct key) * (nritems - slot - 1)); 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9fe8ba6e25c1..4a7bc4e6e747 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -51,10 +51,12 @@ struct tree_buffer; */ struct ctree_root { struct tree_buffer *node; + struct tree_buffer *commit_root; struct ctree_root *extent_root; struct key current_insert; int fp; struct radix_tree_root cache_radix; + struct radix_tree_root pinned_radix; struct list_head trans; struct list_head cache; int cache_size; @@ -151,4 +153,6 @@ int del_item(struct ctree_root *root, struct ctree_path *path); int insert_item(struct ctree_root *root, struct key *key, void *data, int data_size); int next_leaf(struct ctree_root *root, struct ctree_path *path); int leaf_free_space(struct leaf *leaf); +int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap); +int btrfs_finish_extent_commit(struct ctree_root *root); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0e1c31e682fb..2fe31c3508c1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -153,13 +153,24 @@ static int __commit_transaction(struct ctree_root *root) return ret; } -int commit_transaction(struct ctree_root *root) +int commit_transaction(struct ctree_root *root, struct ctree_super_block *s) { - int ret; + int ret = 0; + ret = __commit_transaction(root); if (!ret && root != root->extent_root) ret = __commit_transaction(root->extent_root); BUG_ON(ret); + if (root->commit_root != root->node) { + struct tree_buffer *snap = root->commit_root; + root->commit_root = root->node; + root->node->count++; + ret = btrfs_drop_snapshot(root, snap); + BUG_ON(ret); + tree_block_release(root, snap); + } + write_ctree_super(root, s); + btrfs_finish_extent_commit(root); return ret; } @@ -168,10 +179,13 @@ static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root, { INIT_LIST_HEAD(&root->trans); INIT_LIST_HEAD(&root->cache); + root->cache_size = 0; root->fp = fp; root->node = NULL; - root->node = read_tree_block(root, 
info->tree_root); root->extent_root = extent_root; + root->commit_root = NULL; + root->node = read_tree_block(root, info->tree_root); + memset(&root->current_insert, 0, sizeof(root->current_insert)); return 0; } @@ -188,6 +202,8 @@ struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super) return NULL; } INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL); + INIT_RADIX_TREE(&root->pinned_radix, GFP_KERNEL); + INIT_RADIX_TREE(&extent_root->pinned_radix, GFP_KERNEL); INIT_RADIX_TREE(&extent_root->cache_radix, GFP_KERNEL); ret = pread(fp, super, sizeof(struct ctree_super_block), CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE)); @@ -204,6 +220,8 @@ struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super) BUG_ON(ret < 0); __setup_root(root, extent_root, &super->root_info, fp); __setup_root(extent_root, extent_root, &super->extent_info, fp); + root->commit_root = root->node; + root->node->count++; return root; } @@ -236,9 +254,11 @@ static int drop_cache(struct ctree_root *root) } return 0; } -int close_ctree(struct ctree_root *root) +int close_ctree(struct ctree_root *root, struct ctree_super_block *s) { - commit_transaction(root); + commit_transaction(root, s); + __commit_transaction(root->extent_root); + write_ctree_super(root, s); drop_cache(root->extent_root); drop_cache(root); BUG_ON(!list_empty(&root->trans)); @@ -249,6 +269,7 @@ int close_ctree(struct ctree_root *root) tree_block_release(root, root->node); if (root->extent_root->node) tree_block_release(root->extent_root, root->extent_root->node); + tree_block_release(root, root->commit_root); free(root); printf("on close %d blocks are allocated\n", allocated_blocks); return 0; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b5dee2fae4da..1c0af7c56c21 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -18,9 +18,9 @@ struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr); int write_tree_block(struct ctree_root *root, struct tree_buffer *buf); 
int dirty_tree_block(struct ctree_root *root, struct tree_buffer *buf); int clean_tree_block(struct ctree_root *root, struct tree_buffer *buf); -int commit_transaction(struct ctree_root *root); +int commit_transaction(struct ctree_root *root, struct ctree_super_block *s); struct ctree_root *open_ctree(char *filename, struct ctree_super_block *s); -int close_ctree(struct ctree_root *root); +int close_ctree(struct ctree_root *root, struct ctree_super_block *s); void tree_block_release(struct ctree_root *root, struct tree_buffer *buf); int write_ctree_super(struct ctree_root *root, struct ctree_super_block *s); int mkfs(int fd); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 25d9cd169209..0723b7f3f0c3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -13,7 +13,8 @@ * other allocations are done. The pending tag is also used in the same * manner for deletes. */ -#define CTREE_EXTENT_PENDING 0 +#define CTREE_EXTENT_PENDING_ADD 0 +#define CTREE_EXTENT_PENDING_DEL 1 static int inc_block_ref(struct ctree_root *root, u64 blocknr) { @@ -27,20 +28,51 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr) key.flags = 0; key.offset = 1; ret = search_slot(root->extent_root, &key, &path, 0, 1); + if (ret != 0) + BUG(); BUG_ON(ret != 0); l = &path.nodes[0]->leaf; item = (struct extent_item *)(l->data + l->items[path.slots[0]].offset); item->refs++; + BUG_ON(list_empty(&path.nodes[0]->dirty)); release_path(root->extent_root, &path); return 0; } +static int lookup_block_ref(struct ctree_root *root, u64 blocknr, int *refs) +{ + struct ctree_path path; + int ret; + struct key key; + struct leaf *l; + struct extent_item *item; + init_path(&path); + key.objectid = blocknr; + key.flags = 0; + key.offset = 1; + ret = search_slot(root->extent_root, &key, &path, 0, 0); + if (ret != 0) + BUG(); + l = &path.nodes[0]->leaf; + item = (struct extent_item *)(l->data + + l->items[path.slots[0]].offset); + *refs = item->refs; + 
release_path(root->extent_root, &path); + return 0; +} + int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf) { u64 blocknr; int i; + + if (root == root->extent_root) + return 0; + if (is_leaf(buf->node.header.flags)) + return 0; + for (i = 0; i < buf->node.header.nritems; i++) { blocknr = buf->node.blockptrs[i]; inc_block_ref(root, blocknr); @@ -48,85 +80,187 @@ int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf) return 0; } +int btrfs_finish_extent_commit(struct ctree_root *root) +{ + struct ctree_root *extent_root = root->extent_root; + unsigned long gang[8]; + int ret; + int i; + + while(1) { + ret = radix_tree_gang_lookup(&extent_root->pinned_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + radix_tree_delete(&extent_root->pinned_radix, gang[i]); + } + return 0; +} + /* - * find all the blocks marked as pending in the radix tree and remove - * them from the extent map + * remove an extent from the root, returns 0 on success */ -static int del_pending_extents(struct ctree_root *extent_root) +int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks) +{ + struct ctree_path path; + struct key key; + struct ctree_root *extent_root = root->extent_root; + int ret; + struct item *item; + struct extent_item *ei; + key.objectid = blocknr; + key.flags = 0; + key.offset = num_blocks; + + init_path(&path); + ret = search_slot(extent_root, &key, &path, -1, 1); + if (ret) { + printf("failed to find %Lu\n", key.objectid); + print_tree(extent_root, extent_root->node); + printf("failed to find %Lu\n", key.objectid); + BUG(); + } + item = path.nodes[0]->leaf.items + path.slots[0]; + ei = (struct extent_item *)(path.nodes[0]->leaf.data + item->offset); + BUG_ON(ei->refs == 0); + ei->refs--; + if (ei->refs == 0) { + if (root == extent_root) { + int err; + radix_tree_preload(GFP_KERNEL); + err = radix_tree_insert(&extent_root->pinned_radix, + blocknr, (void *)blocknr); + BUG_ON(err); + 
radix_tree_preload_end(); + } + ret = del_item(extent_root, &path); + if (ret) + BUG(); + } + release_path(extent_root, &path); + return ret; +} + +/* + * insert all of the pending extents reserved during the original + * allocation. (CTREE_EXTENT_PENDING). Returns zero if it all worked out + */ +static int insert_pending_extents(struct ctree_root *extent_root) { int ret; struct key key; + struct extent_item item; struct tree_buffer *gang[4]; int i; - struct ctree_path path; + // FIXME -ENOSPC + item.owner = extent_root->node->node.header.parentid; + item.refs = 1; while(1) { ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix, (void **)gang, 0, ARRAY_SIZE(gang), - CTREE_EXTENT_PENDING); + CTREE_EXTENT_PENDING_ADD); if (!ret) break; for (i = 0; i < ret; i++) { key.objectid = gang[i]->blocknr; key.flags = 0; key.offset = 1; - init_path(&path); - ret = search_slot(extent_root, &key, &path, -1, 1); + ret = insert_item(extent_root, &key, &item, + sizeof(item)); if (ret) { + printf("%Lu already in tree\n", key.objectid); print_tree(extent_root, extent_root->node); - printf("unable to find %Lu\n", key.objectid); BUG(); // FIXME undo it and return sane return ret; } - ret = del_item(extent_root, &path); - if (ret) { - BUG(); - return ret; - } - release_path(extent_root, &path); + radix_tree_tag_clear(&extent_root->cache_radix, + gang[i]->blocknr, + CTREE_EXTENT_PENDING_ADD); + tree_block_release(extent_root, gang[i]); + } + } + return 0; +} + +/* + * find all the blocks marked as pending in the radix tree and remove + * them from the extent map + */ +static int del_pending_extents(struct ctree_root *extent_root) +{ + int ret; + struct tree_buffer *gang[4]; + int i; + + while(1) { + ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + CTREE_EXTENT_PENDING_DEL); + if (!ret) + break; + for (i = 0; i < ret; i++) { + ret = __free_extent(extent_root, gang[i]->blocknr, 1); radix_tree_tag_clear(&extent_root->cache_radix, 
gang[i]->blocknr, - CTREE_EXTENT_PENDING); + CTREE_EXTENT_PENDING_DEL); tree_block_release(extent_root, gang[i]); } } return 0; } +static int run_pending(struct ctree_root *extent_root) +{ + while(radix_tree_tagged(&extent_root->cache_radix, + CTREE_EXTENT_PENDING_DEL) || + radix_tree_tagged(&extent_root->cache_radix, + CTREE_EXTENT_PENDING_ADD)) { + insert_pending_extents(extent_root); + del_pending_extents(extent_root); + } + return 0; +} + + /* * remove an extent from the root, returns 0 on success */ int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks) { - struct ctree_path path; struct key key; struct ctree_root *extent_root = root->extent_root; struct tree_buffer *t; int pending_ret; int ret; - key.objectid = blocknr; - key.flags = 0; - key.offset = num_blocks; + if (root == extent_root) { - t = read_tree_block(root, key.objectid); - radix_tree_tag_set(&root->cache_radix, key.objectid, - CTREE_EXTENT_PENDING); + t = find_tree_block(root, blocknr); + if (radix_tree_tag_get(&root->cache_radix, blocknr, + CTREE_EXTENT_PENDING_ADD)) { + radix_tree_tag_clear(&root->cache_radix, + blocknr, + CTREE_EXTENT_PENDING_ADD); + /* once for us */ + tree_block_release(root, t); + /* once for the pending add */ + tree_block_release(root, t); + } else { + radix_tree_tag_set(&root->cache_radix, blocknr, + CTREE_EXTENT_PENDING_DEL); + } return 0; } - init_path(&path); - ret = search_slot(extent_root, &key, &path, -1, 1); - if (ret) { - print_tree(extent_root, extent_root->node); - printf("failed to find %Lu\n", key.objectid); - BUG(); - } - ret = del_item(extent_root, &path); - if (ret) - BUG(); - release_path(extent_root, &path); - pending_ret = del_pending_extents(root->extent_root); + key.objectid = blocknr; + key.flags = 0; + key.offset = num_blocks; + ret = __free_extent(root, blocknr, num_blocks); + pending_ret = run_pending(root->extent_root); return ret ? 
ret : pending_ret; } @@ -203,7 +337,7 @@ check_pending: */ release_path(root, &path); BUG_ON(ins->objectid < search_start); - if (orig_root->extent_root == orig_root) { + if (1 || orig_root->extent_root == orig_root) { BUG_ON(num_blocks != 1); if ((root->current_insert.objectid <= ins->objectid && root->current_insert.objectid + @@ -211,8 +345,9 @@ check_pending: (root->current_insert.objectid > ins->objectid && root->current_insert.objectid <= ins->objectid + ins->offset) || + radix_tree_lookup(&root->pinned_radix, ins->objectid) || radix_tree_tag_get(&root->cache_radix, ins->objectid, - CTREE_EXTENT_PENDING)) { + CTREE_EXTENT_PENDING_ADD)) { search_start = ins->objectid + 1; goto check_failed; } @@ -225,51 +360,6 @@ error: return ret; } -/* - * insert all of the pending extents reserved during the original - * allocation. (CTREE_EXTENT_PENDING). Returns zero if it all worked out - */ -static int insert_pending_extents(struct ctree_root *extent_root) -{ - int ret; - struct key key; - struct extent_item item; - struct tree_buffer *gang[4]; - int i; - - // FIXME -ENOSPC - item.refs = 1; - item.owner = extent_root->node->node.header.parentid; - while(1) { - ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix, - (void **)gang, 0, - ARRAY_SIZE(gang), - CTREE_EXTENT_PENDING); - if (!ret) - break; - for (i = 0; i < ret; i++) { - key.objectid = gang[i]->blocknr; - key.flags = 0; - key.offset = 1; - ret = insert_item(extent_root, &key, &item, - sizeof(item)); - if (ret) { - printf("%Lu already in tree\n", key.objectid); - print_tree(extent_root, extent_root->node); - BUG(); - // FIXME undo it and return sane - return ret; - } - radix_tree_tag_clear(&extent_root->cache_radix, - gang[i]->blocknr, - CTREE_EXTENT_PENDING); - printf("%Lu is not pending\n", gang[i]->blocknr); - tree_block_release(extent_root, gang[i]); - } - } - return 0; -} - /* * finds a free extent and does all the dirty work required for allocation * returns the key for the extent through ins, and a 
tree buffer for @@ -296,7 +386,7 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start, sizeof(extent_item)); memset(&root->extent_root->current_insert, 0, sizeof(struct key)); - pending_ret = insert_pending_extents(root->extent_root); + pending_ret = run_pending(root->extent_root); if (ret) return ret; if (pending_ret) @@ -309,9 +399,8 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start, BUG_ON(ins->offset != 1); *buf = find_tree_block(root, ins->objectid); BUG_ON(!*buf); - printf("%Lu is pending\n", ins->objectid); radix_tree_tag_set(&root->cache_radix, ins->objectid, - CTREE_EXTENT_PENDING); + CTREE_EXTENT_PENDING_ADD); (*buf)->count++; dirty_tree_block(root, *buf); return 0; @@ -331,13 +420,41 @@ struct tree_buffer *alloc_free_block(struct ctree_root *root) ret = alloc_extent(root, 1, 0, (unsigned long)-1, root->node->node.header.parentid, &ins, &buf); - if (ret) { BUG(); return NULL; } if (root != root->extent_root) BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix, - buf->blocknr, CTREE_EXTENT_PENDING)); + buf->blocknr, + CTREE_EXTENT_PENDING_ADD)); return buf; } + +int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap) +{ + int ret; + int level; + int refs; + u64 blocknr = snap->blocknr; + + level = node_level(snap->node.header.flags); + ret = lookup_block_ref(root, snap->blocknr, &refs); + BUG_ON(ret); + if (refs == 1 && level != 0) { + struct node *n = &snap->node; + struct tree_buffer *b; + int i; + for (i = 0; i < n->header.nritems; i++) { + b = read_tree_block(root, n->blockptrs[i]); + /* FIXME, don't recurse here */ + ret = btrfs_drop_snapshot(root, b); + BUG_ON(ret); + tree_block_release(root, b); + } + } + ret = free_extent(root, blocknr, 1); + BUG_ON(ret); + return 0; +} + diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index dda08f32c154..e32a959dd3e5 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -21,9 +21,11 @@ void print_leaf(struct leaf 
*l) item->key.objectid, item->key.flags, item->key.offset, item->offset, item->size); fflush(stdout); - printf("\t\titem data %.*s\n", item->size, l->data+item->offset); + printf("\t\titem data %.*s\n", item->size, + l->data+item->offset); ei = (struct extent_item *)(l->data + item->offset); - printf("\t\textent data %u %Lu\n", ei->refs, ei->owner); + printf("\t\textent data refs %u owner %Lu\n", ei->refs, + ei->owner); fflush(stdout); } } diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c index 8255f79ceca5..6400c7100a6a 100644 --- a/fs/btrfs/quick-test.c +++ b/fs/btrfs/quick-test.c @@ -19,7 +19,7 @@ int main(int ac, char **av) { int i; int num; int ret; - int run_size = 1024; + int run_size = 100000; int max_key = 100000000; int tree_size = 0; struct ctree_path path; @@ -44,9 +44,9 @@ int main(int ac, char **av) { if (!ret) tree_size++; free(buf); + } - write_ctree_super(root, &super); - close_ctree(root); + close_ctree(root, &super); root = open_ctree("dbfile", &super); printf("starting search\n"); @@ -65,8 +65,7 @@ int main(int ac, char **av) { } release_path(root, &path); } - write_ctree_super(root, &super); - close_ctree(root); + close_ctree(root, &super); root = open_ctree("dbfile", &super); printf("node %p level %d total ptrs %d free spc %lu\n", root->node, node_level(root->node->node.header.flags), @@ -90,8 +89,7 @@ int main(int ac, char **av) { } release_path(root, &path); } - write_ctree_super(root, &super); - close_ctree(root); + close_ctree(root, &super); root = open_ctree("dbfile", &super); srand(128); for (i = 0; i < run_size; i++) { @@ -106,8 +104,7 @@ int main(int ac, char **av) { tree_size++; free(buf); } - write_ctree_super(root, &super); - close_ctree(root); + close_ctree(root, &super); root = open_ctree("dbfile", &super); srand(128); printf("starting search2\n"); @@ -156,10 +153,17 @@ int main(int ac, char **av) { } release_path(root, &path); } + /* + printf("previous tree:\n"); + print_tree(root, root->commit_root); + printf("map 
before commit\n"); + print_tree(root->extent_root, root->extent_root->node); + */ + commit_transaction(root, &super); printf("tree size is now %d\n", tree_size); + printf("root %p commit root %p\n", root->node, root->commit_root); printf("map tree\n"); print_tree(root->extent_root, root->extent_root->node); - write_ctree_super(root, &super); - close_ctree(root); + close_ctree(root, &super); return 0; } diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c index dcc852ad6737..7b37b6bae105 100644 --- a/fs/btrfs/random-test.c +++ b/fs/btrfs/random-test.c @@ -8,6 +8,7 @@ #include "print-tree.h" int keep_running = 1; +struct ctree_super_block super; static int setup_key(struct radix_tree_root *root, struct key *key, int exists) { @@ -59,11 +60,6 @@ error: return -1; } -static int run_commit(struct ctree_root *root, struct radix_tree_root *radix) -{ - return commit_transaction(root); -} - static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix) { struct ctree_path path; @@ -210,7 +206,7 @@ static int fill_tree(struct ctree_root *root, struct radix_tree_root *radix, goto out; } if (i % 1000 == 0) { - ret = commit_transaction(root); + ret = commit_transaction(root, &super); if (ret) { fprintf(stderr, "fill commit failed\n"); return ret; @@ -229,7 +225,7 @@ out: static int bulk_op(struct ctree_root *root, struct radix_tree_root *radix) { int ret; - int nr = rand() % 20000; + int nr = rand() % 5000; static int run_nr = 0; /* do the bulk op much less frequently */ @@ -247,7 +243,7 @@ static int bulk_op(struct ctree_root *root, struct radix_tree_root *radix) int (*ops[])(struct ctree_root *root, struct radix_tree_root *radix) = { ins_one, insert_dup, del_one, lookup_item, - lookup_enoent, bulk_op, run_commit }; + lookup_enoent, bulk_op }; static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix) { @@ -314,7 +310,6 @@ int print_usage(void) int main(int ac, char **av) { RADIX_TREE(radix, GFP_KERNEL); - struct ctree_super_block 
super; struct ctree_root *root; int i; int ret; @@ -365,8 +360,7 @@ int main(int ac, char **av) printf("open & close, root level %d nritems %d\n", node_level(root->node->node.header.flags), root->node->node.header.nritems); - write_ctree_super(root, &super); - close_ctree(root); + close_ctree(root, &super); root = open_ctree("dbfile", &super); } while(count--) { @@ -380,7 +374,7 @@ int main(int ac, char **av) err = ret; goto out; } - if (ops[op] == bulk_op || ops[op] == run_commit) + if (ops[op] == bulk_op) break; if (keep_running == 0) { err = 0; @@ -389,8 +383,7 @@ int main(int ac, char **av) } } out: - write_ctree_super(root, &super); - close_ctree(root); + close_ctree(root, &super); return err; } -- cgit v1.2.3 From 037e6390488af8ab96137e1e5cccc15ad14ef887 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 7 Mar 2007 11:50:24 -0500 Subject: Btrfs: get rid of add recursion Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.c | 9 --- fs/btrfs/extent-tree.c | 209 ++++++++++++++++++++++--------------------------- 3 files changed, 96 insertions(+), 124 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ae7f4c00c39c..d92d08dde0ff 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,6 +1,6 @@ CC=gcc -CFLAGS = -g -Wall +CFLAGS = -Wall headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 72816381d203..729d4ddb3746 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -995,15 +995,6 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path, int ret; int wret; - wret = push_leaf_left(root, path, data_size); - if (wret < 0) - return wret; - if (wret) { - wret = push_leaf_right(root, path, data_size); - if (wret < 0) - return wret; - } - l_buf = path->nodes[0]; l = &l_buf->leaf; diff --git a/fs/btrfs/extent-tree.c 
b/fs/btrfs/extent-tree.c index 0723b7f3f0c3..8a2b8aaf9b86 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6,6 +6,11 @@ #include "disk-io.h" #include "print-tree.h" +static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks, + u64 search_start, u64 search_end, struct key *ins); +static int finish_current_insert(struct ctree_root *extent_root); +static int run_pending(struct ctree_root *extent_root); + /* * pending extents are blocks that we're trying to allocate in the extent * map while trying to grow the map because of other allocations. To avoid @@ -13,8 +18,7 @@ * other allocations are done. The pending tag is also used in the same * manner for deletes. */ -#define CTREE_EXTENT_PENDING_ADD 0 -#define CTREE_EXTENT_PENDING_DEL 1 +#define CTREE_EXTENT_PENDING_DEL 0 static int inc_block_ref(struct ctree_root *root, u64 blocknr) { @@ -23,6 +27,9 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr) struct key key; struct leaf *l; struct extent_item *item; + struct key ins; + + find_free_extent(root->extent_root, 0, 0, (u64)-1, &ins); init_path(&path); key.objectid = blocknr; key.flags = 0; @@ -38,6 +45,8 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr) BUG_ON(list_empty(&path.nodes[0]->dirty)); release_path(root->extent_root, &path); + finish_current_insert(root->extent_root); + run_pending(root->extent_root); return 0; } @@ -99,6 +108,28 @@ int btrfs_finish_extent_commit(struct ctree_root *root) return 0; } +static int finish_current_insert(struct ctree_root *extent_root) +{ + struct key ins; + struct extent_item extent_item; + int i; + int ret; + + extent_item.refs = 1; + extent_item.owner = extent_root->node->node.header.parentid; + ins.offset = 1; + ins.flags = 0; + + for (i = 0; i < extent_root->current_insert.flags; i++) { + ins.objectid = extent_root->current_insert.objectid + i; + ret = insert_item(extent_root, &ins, &extent_item, + sizeof(extent_item)); + BUG_ON(ret); + } + 
extent_root->current_insert.offset = 0; + return 0; +} + /* * remove an extent from the root, returns 0 on success */ @@ -110,10 +141,13 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks) int ret; struct item *item; struct extent_item *ei; + struct key ins; + key.objectid = blocknr; key.flags = 0; key.offset = num_blocks; + find_free_extent(root, 0, 0, (u64)-1, &ins); init_path(&path); ret = search_slot(extent_root, &key, &path, -1, 1); if (ret) { @@ -140,53 +174,10 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks) BUG(); } release_path(extent_root, &path); + finish_current_insert(extent_root); return ret; } -/* - * insert all of the pending extents reserved during the original - * allocation. (CTREE_EXTENT_PENDING). Returns zero if it all worked out - */ -static int insert_pending_extents(struct ctree_root *extent_root) -{ - int ret; - struct key key; - struct extent_item item; - struct tree_buffer *gang[4]; - int i; - - // FIXME -ENOSPC - item.owner = extent_root->node->node.header.parentid; - item.refs = 1; - while(1) { - ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix, - (void **)gang, 0, - ARRAY_SIZE(gang), - CTREE_EXTENT_PENDING_ADD); - if (!ret) - break; - for (i = 0; i < ret; i++) { - key.objectid = gang[i]->blocknr; - key.flags = 0; - key.offset = 1; - ret = insert_item(extent_root, &key, &item, - sizeof(item)); - if (ret) { - printf("%Lu already in tree\n", key.objectid); - print_tree(extent_root, extent_root->node); - BUG(); - // FIXME undo it and return sane - return ret; - } - radix_tree_tag_clear(&extent_root->cache_radix, - gang[i]->blocknr, - CTREE_EXTENT_PENDING_ADD); - tree_block_release(extent_root, gang[i]); - } - } - return 0; -} - /* * find all the blocks marked as pending in the radix tree and remove * them from the extent map @@ -218,12 +209,8 @@ static int del_pending_extents(struct ctree_root *extent_root) static int run_pending(struct ctree_root *extent_root) { 
while(radix_tree_tagged(&extent_root->cache_radix, - CTREE_EXTENT_PENDING_DEL) || - radix_tree_tagged(&extent_root->cache_radix, - CTREE_EXTENT_PENDING_ADD)) { - insert_pending_extents(extent_root); + CTREE_EXTENT_PENDING_DEL)) del_pending_extents(extent_root); - } return 0; } @@ -241,19 +228,8 @@ int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks) if (root == extent_root) { t = find_tree_block(root, blocknr); - if (radix_tree_tag_get(&root->cache_radix, blocknr, - CTREE_EXTENT_PENDING_ADD)) { - radix_tree_tag_clear(&root->cache_radix, - blocknr, - CTREE_EXTENT_PENDING_ADD); - /* once for us */ - tree_block_release(root, t); - /* once for the pending add */ - tree_block_release(root, t); - } else { - radix_tree_tag_set(&root->cache_radix, blocknr, + radix_tree_tag_set(&root->cache_radix, blocknr, CTREE_EXTENT_PENDING_DEL); - } return 0; } key.objectid = blocknr; @@ -281,9 +257,11 @@ static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks, u64 hole_size = 0; int slot = 0; u64 last_block; + u64 test_block; int start_found; struct leaf *l; struct ctree_root * root = orig_root->extent_root; + int total_needed = num_blocks + MAX_LEVEL * 3; check_failed: init_path(&path); @@ -306,22 +284,34 @@ check_failed: goto error; if (!start_found) { ins->objectid = search_start; - ins->offset = num_blocks; + ins->offset = (u64)-1; start_found = 1; goto check_pending; } ins->objectid = last_block > search_start ? 
last_block : search_start; - ins->offset = num_blocks; + ins->offset = (u64)-1; goto check_pending; } + if (slot == 0) { + int last_slot = l->header.nritems - 1; + u64 span = l->items[last_slot].key.objectid; + span -= l->items[slot].key.objectid; + if (span + total_needed > last_slot - slot) { + path.slots[0] = last_slot + 1; + key = &l->items[last_slot].key; + last_block = key->objectid + key->offset; + start_found = 1; + continue; + } + } key = &l->items[slot].key; if (key->objectid >= search_start) { if (start_found) { hole_size = key->objectid - last_block; - if (hole_size > num_blocks) { + if (hole_size > total_needed) { ins->objectid = last_block; - ins->offset = num_blocks; + ins->offset = hole_size; goto check_pending; } } else @@ -337,23 +327,18 @@ check_pending: */ release_path(root, &path); BUG_ON(ins->objectid < search_start); - if (1 || orig_root->extent_root == orig_root) { - BUG_ON(num_blocks != 1); - if ((root->current_insert.objectid <= ins->objectid && - root->current_insert.objectid + - root->current_insert.offset > ins->objectid) || - (root->current_insert.objectid > ins->objectid && - root->current_insert.objectid <= ins->objectid + - ins->offset) || - radix_tree_lookup(&root->pinned_radix, ins->objectid) || - radix_tree_tag_get(&root->cache_radix, ins->objectid, - CTREE_EXTENT_PENDING_ADD)) { - search_start = ins->objectid + 1; + for (test_block = ins->objectid; + test_block < ins->objectid + total_needed; test_block++) { + if (radix_tree_lookup(&root->pinned_radix, test_block)) { + search_start = test_block + 1; goto check_failed; } } - if (ins->offset != 1) - BUG(); + BUG_ON(root->current_insert.offset); + root->current_insert.offset = total_needed; + root->current_insert.objectid = ins->objectid + num_blocks; + root->current_insert.flags = 0; + ins->offset = num_blocks; return 0; error: release_path(root, &path); @@ -368,43 +353,41 @@ error: * returns 0 if everything worked, non-zero otherwise. 
*/ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start, - u64 search_end, u64 owner, struct key *ins, - struct tree_buffer **buf) + u64 search_end, u64 owner, struct key *ins) { int ret; int pending_ret; + struct ctree_root *extent_root = root->extent_root; struct extent_item extent_item; + extent_item.refs = 1; extent_item.owner = owner; - ret = find_free_extent(root, num_blocks, search_start, search_end, ins); - if (ret) - return ret; - if (root != root->extent_root) { - memcpy(&root->extent_root->current_insert, ins, sizeof(*ins)); - ret = insert_item(root->extent_root, ins, &extent_item, - sizeof(extent_item)); - memset(&root->extent_root->current_insert, 0, - sizeof(struct key)); - pending_ret = run_pending(root->extent_root); - if (ret) - return ret; - if (pending_ret) - return pending_ret; - *buf = find_tree_block(root, ins->objectid); - dirty_tree_block(root, *buf); + if (root == extent_root) { + BUG_ON(extent_root->current_insert.offset == 0); + BUG_ON(num_blocks != 1); + BUG_ON(extent_root->current_insert.flags == + extent_root->current_insert.offset); + ins->offset = 1; + ins->objectid = extent_root->current_insert.objectid + + extent_root->current_insert.flags++; return 0; } - /* we're allocating an extent for the extent tree, don't recurse */ - BUG_ON(ins->offset != 1); - *buf = find_tree_block(root, ins->objectid); - BUG_ON(!*buf); - radix_tree_tag_set(&root->cache_radix, ins->objectid, - CTREE_EXTENT_PENDING_ADD); - (*buf)->count++; - dirty_tree_block(root, *buf); - return 0; + ret = find_free_extent(root, num_blocks, search_start, + search_end, ins); + if (ret) + return ret; + ret = insert_item(extent_root, ins, &extent_item, + sizeof(extent_item)); + + finish_current_insert(extent_root); + pending_ret = run_pending(extent_root); + if (ret) + return ret; + if (pending_ret) + return pending_ret; + return 0; } /* @@ -415,19 +398,17 @@ struct tree_buffer *alloc_free_block(struct ctree_root *root) { struct key ins; int ret; - 
struct tree_buffer *buf = NULL; + struct tree_buffer *buf; ret = alloc_extent(root, 1, 0, (unsigned long)-1, root->node->node.header.parentid, - &ins, &buf); + &ins); if (ret) { BUG(); return NULL; } - if (root != root->extent_root) - BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix, - buf->blocknr, - CTREE_EXTENT_PENDING_ADD)); + buf = find_tree_block(root, ins.objectid); + dirty_tree_block(root, buf); return buf; } -- cgit v1.2.3 From 0579da4280812f34f382fb0f8004d7b0219e7a33 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 7 Mar 2007 16:15:30 -0500 Subject: Btrfs: Fixup last found extent caching Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 39 +++++++++++++++++++++------------------ 4 files changed, 24 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d92d08dde0ff..ae7f4c00c39c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,6 +1,6 @@ CC=gcc -CFLAGS = -Wall +CFLAGS = -g -Wall headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4a7bc4e6e747..518326fa3694 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -54,6 +54,7 @@ struct ctree_root { struct tree_buffer *commit_root; struct ctree_root *extent_root; struct key current_insert; + struct key last_insert; int fp; struct radix_tree_root cache_radix; struct radix_tree_root pinned_radix; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2fe31c3508c1..997cc578a185 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -186,6 +186,7 @@ static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root, root->commit_root = NULL; root->node = read_tree_block(root, info->tree_root); memset(&root->current_insert, 0, sizeof(root->current_insert)); + 
memset(&root->last_insert, 0, sizeof(root->last_insert)); return 0; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8a2b8aaf9b86..dd11532cb2f6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -102,9 +102,12 @@ int btrfs_finish_extent_commit(struct ctree_root *root) ARRAY_SIZE(gang)); if (!ret) break; - for (i = 0; i < ret; i++) + for (i = 0; i < ret; i++) { radix_tree_delete(&extent_root->pinned_radix, gang[i]); + } } + extent_root->last_insert.objectid = 0; + extent_root->last_insert.offset = 0; return 0; } @@ -170,6 +173,9 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks) radix_tree_preload_end(); } ret = del_item(extent_root, &path); + if (root != extent_root && + extent_root->last_insert.objectid < blocknr) + extent_root->last_insert.objectid = blocknr; if (ret) BUG(); } @@ -261,8 +267,11 @@ static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks, int start_found; struct leaf *l; struct ctree_root * root = orig_root->extent_root; - int total_needed = num_blocks + MAX_LEVEL * 3; + int total_needed = num_blocks; + total_needed += (node_level(root->node->node.header.flags) + 1) * 3; + if (root->last_insert.objectid > search_start) + search_start = root->last_insert.objectid; check_failed: init_path(&path); ins->objectid = search_start; @@ -273,6 +282,9 @@ check_failed: if (ret < 0) goto error; + if (path.slots[0] > 0) + path.slots[0]--; + while (1) { l = &path.nodes[0]->leaf; slot = path.slots[0]; @@ -293,31 +305,21 @@ check_failed: ins->offset = (u64)-1; goto check_pending; } - if (slot == 0) { - int last_slot = l->header.nritems - 1; - u64 span = l->items[last_slot].key.objectid; - span -= l->items[slot].key.objectid; - if (span + total_needed > last_slot - slot) { - path.slots[0] = last_slot + 1; - key = &l->items[last_slot].key; - last_block = key->objectid + key->offset; - start_found = 1; - continue; - } - } key = &l->items[slot].key; if (key->objectid >= search_start) { if 
(start_found) { + if (last_block < search_start) + last_block = search_start; hole_size = key->objectid - last_block; if (hole_size > total_needed) { ins->objectid = last_block; ins->offset = hole_size; goto check_pending; } - } else - start_found = 1; - last_block = key->objectid + key->offset; + } } + start_found = 1; + last_block = key->objectid + key->offset; path.slots[0]++; } // FIXME -ENOSPC @@ -335,9 +337,10 @@ check_pending: } } BUG_ON(root->current_insert.offset); - root->current_insert.offset = total_needed; + root->current_insert.offset = total_needed - num_blocks; root->current_insert.objectid = ins->objectid + num_blocks; root->current_insert.flags = 0; + root->last_insert.objectid = ins->objectid; ins->offset = num_blocks; return 0; error: -- cgit v1.2.3 From 7518a238ea0152dc849d1ed76d3cae8b44e12f46 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 12 Mar 2007 12:01:18 -0400 Subject: Btrfs: get/set for struct header fields Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/TODO | 1 + fs/btrfs/ctree.c | 224 +++++++++++++++++++++++++++---------------------- fs/btrfs/ctree.h | 63 +++++++++++++- fs/btrfs/disk-io.c | 7 +- fs/btrfs/extent-tree.c | 53 +++--------- fs/btrfs/mkfs.c | 10 +-- fs/btrfs/print-tree.c | 26 +++--- fs/btrfs/quick-test.c | 12 +-- fs/btrfs/random-test.c | 4 +- 10 files changed, 230 insertions(+), 172 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ae7f4c00c39c..f8532200e9ad 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -25,7 +25,7 @@ tester: $(objects) random-test.o quick-test: $(objects) quick-test.o gcc $(CFLAGS) -o quick-test $(objects) quick-test.o -$(objects) : $(headers) +$(objects): $(headers) clean : rm debug-tree tester *.o diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO index 5c4395c3784a..52d2a7a071f4 100644 --- a/fs/btrfs/TODO +++ b/fs/btrfs/TODO @@ -1,4 +1,5 @@ * cleanup, add more error checking, get rid of BUG_ONs +* endian fixes * Make IO 
functions look more like the page cache * Fix ENOSPC handling * make blocksize a mkfs parameter instead of #define diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 729d4ddb3746..e43c827e0dfd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -48,7 +48,7 @@ int btrfs_cow_block(struct ctree_root *root, } cow = alloc_free_block(root); memcpy(&cow->node, &buf->node, sizeof(buf->node)); - cow->node.header.blocknr = cow->blocknr; + btrfs_set_header_blocknr(&cow->node.header, cow->blocknr); *cow_ret = cow; btrfs_inc_ref(root, buf); if (buf == root->node) { @@ -73,7 +73,7 @@ int btrfs_cow_block(struct ctree_root *root, */ static inline unsigned int leaf_data_end(struct leaf *leaf) { - unsigned int nr = leaf->header.nritems; + u32 nr = btrfs_header_nritems(&leaf->header); if (nr == 0) return sizeof(leaf->data); return leaf->items[nr-1].offset; @@ -87,7 +87,7 @@ static inline unsigned int leaf_data_end(struct leaf *leaf) int leaf_free_space(struct leaf *leaf) { int data_end = leaf_data_end(leaf); - int nritems = leaf->header.nritems; + int nritems = btrfs_header_nritems(&leaf->header); char *items_end = (char *)(leaf->items + nritems + 1); return (char *)(leaf->data + data_end) - (char *)items_end; } @@ -118,18 +118,21 @@ int check_node(struct ctree_path *path, int level) struct node *parent = NULL; struct node *node = &path->nodes[level]->node; int parent_slot; + u32 nritems = btrfs_header_nritems(&node->header); if (path->nodes[level + 1]) parent = &path->nodes[level + 1]->node; parent_slot = path->slots[level + 1]; - if (parent && node->header.nritems > 0) { + BUG_ON(nritems == 0); + if (parent) { struct key *parent_key; parent_key = &parent->keys[parent_slot]; BUG_ON(memcmp(parent_key, node->keys, sizeof(struct key))); - BUG_ON(parent->blockptrs[parent_slot] != node->header.blocknr); + BUG_ON(parent->blockptrs[parent_slot] != + btrfs_header_blocknr(&node->header)); } - BUG_ON(node->header.nritems > NODEPTRS_PER_BLOCK); - for (i = 0; i < node->header.nritems - 
2; i++) { + BUG_ON(nritems > NODEPTRS_PER_BLOCK); + for (i = 0; nritems > 1 && i < nritems - 2; i++) { BUG_ON(comp_keys(&node->keys[i], &node->keys[i+1]) >= 0); } return 0; @@ -141,18 +144,25 @@ int check_leaf(struct ctree_path *path, int level) struct leaf *leaf = &path->nodes[level]->leaf; struct node *parent = NULL; int parent_slot; + u32 nritems = btrfs_header_nritems(&leaf->header); if (path->nodes[level + 1]) parent = &path->nodes[level + 1]->node; parent_slot = path->slots[level + 1]; - if (parent && leaf->header.nritems > 0) { + BUG_ON(leaf_free_space(leaf) < 0); + + if (nritems == 0) + return 0; + + if (parent) { struct key *parent_key; parent_key = &parent->keys[parent_slot]; BUG_ON(memcmp(parent_key, &leaf->items[0].key, sizeof(struct key))); - BUG_ON(parent->blockptrs[parent_slot] != leaf->header.blocknr); + BUG_ON(parent->blockptrs[parent_slot] != + btrfs_header_blocknr(&leaf->header)); } - for (i = 0; i < leaf->header.nritems - 2; i++) { + for (i = 0; nritems > 1 && i < nritems - 2; i++) { BUG_ON(comp_keys(&leaf->items[i].key, &leaf->items[i+1].key) >= 0); BUG_ON(leaf->items[i].offset != leaf->items[i + 1].offset + @@ -162,7 +172,6 @@ int check_leaf(struct ctree_path *path, int level) LEAF_DATA_SIZE); } } - BUG_ON(leaf_free_space(leaf) < 0); return 0; } @@ -215,13 +224,15 @@ int generic_bin_search(char *p, int item_size, struct key *key, */ int bin_search(struct node *c, struct key *key, int *slot) { - if (is_leaf(c->header.flags)) { + if (btrfs_is_leaf(c)) { struct leaf *l = (struct leaf *)c; return generic_bin_search((void *)l->items, sizeof(struct item), - key, c->header.nritems, slot); + key, btrfs_header_nritems(&c->header), + slot); } else { return generic_bin_search((void *)c->keys, sizeof(struct key), - key, c->header.nritems, slot); + key, btrfs_header_nritems(&c->header), + slot); } return -1; } @@ -233,7 +244,7 @@ struct tree_buffer *read_node_slot(struct ctree_root *root, struct node *node = &parent_buf->node; if (slot < 0) return NULL; - 
if (slot >= node->header.nritems) + if (slot >= btrfs_header_nritems(&node->header)) return NULL; return read_tree_block(root, node->blockptrs[slot]); } @@ -270,7 +281,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, struct tree_buffer *child; u64 blocknr = mid_buf->blocknr; - if (mid->header.nritems != 1) + if (btrfs_header_nritems(&mid->header) != 1) return 0; /* promote the child to a root */ @@ -287,7 +298,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, } parent = &parent_buf->node; - if (mid->header.nritems > NODEPTRS_PER_BLOCK / 4) + if (btrfs_header_nritems(&mid->header) > NODEPTRS_PER_BLOCK / 4) return 0; left_buf = read_node_slot(root, parent_buf, pslot - 1); @@ -298,7 +309,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, btrfs_cow_block(root, left_buf, parent_buf, pslot - 1, &left_buf); left = &left_buf->node; - orig_slot += left->header.nritems; + orig_slot += btrfs_header_nritems(&left->header); wret = push_node_left(root, left_buf, mid_buf); if (wret < 0) ret = wret; @@ -314,7 +325,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, wret = push_node_left(root, mid_buf, right_buf); if (wret < 0) ret = wret; - if (right->header.nritems == 0) { + if (btrfs_header_nritems(&right->header) == 0) { u64 blocknr = right_buf->blocknr; tree_block_release(root, right_buf); clean_tree_block(root, right_buf); @@ -332,7 +343,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, BUG_ON(list_empty(&parent_buf->dirty)); } } - if (mid->header.nritems == 1) { + if (btrfs_header_nritems(&mid->header) == 1) { /* * we're not allowed to leave a node with one item in the * tree during a delete. 
A deletion from lower in the tree @@ -348,7 +359,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, ret = wret; BUG_ON(wret == 1); } - if (mid->header.nritems == 0) { + if (btrfs_header_nritems(&mid->header) == 0) { /* we've managed to empty the middle node, drop it */ u64 blocknr = mid_buf->blocknr; tree_block_release(root, mid_buf); @@ -369,7 +380,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, /* update the path */ if (left_buf) { - if (left->header.nritems > orig_slot) { + if (btrfs_header_nritems(&left->header) > orig_slot) { left_buf->count++; // released below path->nodes[level] = left_buf; path->slots[level + 1] -= 1; @@ -377,7 +388,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path, if (mid_buf) tree_block_release(root, mid_buf); } else { - orig_slot -= left->header.nritems; + orig_slot -= btrfs_header_nritems(&left->header); path->slots[level] = orig_slot; } } @@ -420,7 +431,7 @@ again: b = root->node; b->count++; while (b) { - level = node_level(b->node.header.flags); + level = btrfs_header_level(&b->node.header); if (cow) { int wret; wret = btrfs_cow_block(root, b, p->nodes[level + 1], @@ -434,12 +445,12 @@ again: if (ret) return -1; ret = bin_search(c, key, &slot); - if (!is_leaf(c->header.flags)) { + if (!btrfs_is_leaf(c)) { if (ret && slot > 0) slot -= 1; p->slots[level] = slot; - if (ins_len > 0 && - c->header.nritems == NODEPTRS_PER_BLOCK) { + if (ins_len > 0 && btrfs_header_nritems(&c->header) == + NODEPTRS_PER_BLOCK) { int sret = split_node(root, p, level); BUG_ON(sret > 0); if (sret) @@ -456,7 +467,7 @@ again: goto again; c = &b->node; slot = p->slots[level]; - BUG_ON(c->header.nritems == 1); + BUG_ON(btrfs_header_nritems(&c->header) == 1); } b = read_tree_block(root, c->blockptrs[slot]); } else { @@ -524,8 +535,8 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf, int dst_nritems; int ret = 0; - src_nritems = 
src->header.nritems; - dst_nritems = dst->header.nritems; + src_nritems = btrfs_header_nritems(&src->header); + dst_nritems = btrfs_header_nritems(&dst->header); push_items = NODEPTRS_PER_BLOCK - dst_nritems; if (push_items <= 0) { return 1; @@ -544,9 +555,8 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf, memmove(src->blockptrs, src->blockptrs + push_items, (src_nritems - push_items) * sizeof(u64)); } - src->header.nritems -= push_items; - dst->header.nritems += push_items; - + btrfs_set_header_nritems(&src->header, src_nritems - push_items); + btrfs_set_header_nritems(&dst->header, dst_nritems + push_items); BUG_ON(list_empty(&src_buf->dirty)); BUG_ON(list_empty(&dst_buf->dirty)); return ret; @@ -573,8 +583,8 @@ static int balance_node_right(struct ctree_root *root, int dst_nritems; int ret = 0; - src_nritems = src->header.nritems; - dst_nritems = dst->header.nritems; + src_nritems = btrfs_header_nritems(&src->header); + dst_nritems = btrfs_header_nritems(&dst->header); push_items = NODEPTRS_PER_BLOCK - dst_nritems; if (push_items <= 0) { return 1; @@ -596,8 +606,8 @@ static int balance_node_right(struct ctree_root *root, memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items, push_items * sizeof(u64)); - src->header.nritems -= push_items; - dst->header.nritems += push_items; + btrfs_set_header_nritems(&src->header, src_nritems - push_items); + btrfs_set_header_nritems(&dst->header, dst_nritems + push_items); BUG_ON(list_empty(&src_buf->dirty)); BUG_ON(list_empty(&dst_buf->dirty)); @@ -625,12 +635,13 @@ static int insert_new_root(struct ctree_root *root, t = alloc_free_block(root); c = &t->node; memset(c, 0, sizeof(c)); - c->header.nritems = 1; - c->header.flags = node_level(level); - c->header.blocknr = t->blocknr; - c->header.parentid = root->node->node.header.parentid; + btrfs_set_header_nritems(&c->header, 1); + btrfs_set_header_level(&c->header, level); + btrfs_set_header_blocknr(&c->header, t->blocknr); + 
btrfs_set_header_parentid(&c->header, + btrfs_header_parentid(&root->node->node.header)); lower = &path->nodes[level-1]->node; - if (is_leaf(lower->header.flags)) + if (btrfs_is_leaf(lower)) lower_key = &((struct leaf *)lower)->items[0].key; else lower_key = lower->keys; @@ -663,7 +674,7 @@ static int insert_ptr(struct ctree_root *root, BUG_ON(!path->nodes[level]); lower = &path->nodes[level]->node; - nritems = lower->header.nritems; + nritems = btrfs_header_nritems(&lower->header); if (slot > nritems) BUG(); if (nritems == NODEPTRS_PER_BLOCK) @@ -676,7 +687,7 @@ static int insert_ptr(struct ctree_root *root, } memcpy(lower->keys + slot, key, sizeof(struct key)); lower->blockptrs[slot] = blocknr; - lower->header.nritems++; + btrfs_set_header_nritems(&lower->header, nritems + 1); if (lower->keys[1].objectid == 0) BUG(); BUG_ON(list_empty(&path->nodes[level]->dirty)); @@ -702,6 +713,7 @@ static int split_node(struct ctree_root *root, struct ctree_path *path, int mid; int ret; int wret; + u32 c_nritems; t = path->nodes[level]; c = &t->node; @@ -711,18 +723,20 @@ static int split_node(struct ctree_root *root, struct ctree_path *path, if (ret) return ret; } + c_nritems = btrfs_header_nritems(&c->header); split_buffer = alloc_free_block(root); split = &split_buffer->node; - split->header.flags = c->header.flags; - split->header.blocknr = split_buffer->blocknr; - split->header.parentid = root->node->node.header.parentid; - mid = (c->header.nritems + 1) / 2; + btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header)); + btrfs_set_header_blocknr(&split->header, split_buffer->blocknr); + btrfs_set_header_parentid(&split->header, + btrfs_header_parentid(&root->node->node.header)); + mid = (c_nritems + 1) / 2; memcpy(split->keys, c->keys + mid, - (c->header.nritems - mid) * sizeof(struct key)); + (c_nritems - mid) * sizeof(struct key)); memcpy(split->blockptrs, c->blockptrs + mid, - (c->header.nritems - mid) * sizeof(u64)); - split->header.nritems = 
c->header.nritems - mid; - c->header.nritems = mid; + (c_nritems - mid) * sizeof(u64)); + btrfs_set_header_nritems(&split->header, c_nritems - mid); + btrfs_set_header_nritems(&c->header, mid); ret = 0; BUG_ON(list_empty(&t->dirty)); @@ -781,13 +795,15 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path, int push_space = 0; int push_items = 0; struct item *item; + u32 left_nritems; + u32 right_nritems; slot = path->slots[1]; if (!path->nodes[1]) { return 1; } upper = path->nodes[1]; - if (slot >= upper->node.header.nritems - 1) { + if (slot >= btrfs_header_nritems(&upper->node.header) - 1) { return 1; } right_buf = read_tree_block(root, upper->node.blockptrs[slot + 1]); @@ -806,7 +822,8 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path, return 1; } - for (i = left->header.nritems - 1; i >= 0; i--) { + left_nritems = btrfs_header_nritems(&left->header); + for (i = left_nritems - 1; i >= 0; i--) { item = left->items + i; if (path->slots[0] == i) push_space += data_size + sizeof(*item); @@ -819,9 +836,10 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path, tree_block_release(root, right_buf); return 1; } + right_nritems = btrfs_header_nritems(&right->header); /* push left to right */ - push_space = left->items[left->header.nritems - push_items].offset + - left->items[left->header.nritems - push_items].size; + push_space = left->items[left_nritems - push_items].offset + + left->items[left_nritems - push_items].size; push_space -= leaf_data_end(left); /* make room in the right data area */ memmove(right->data + leaf_data_end(right) - push_space, @@ -832,19 +850,21 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path, left->data + leaf_data_end(left), push_space); memmove(right->items + push_items, right->items, - right->header.nritems * sizeof(struct item)); + right_nritems * sizeof(struct item)); /* copy the items from left to right */ - memcpy(right->items, 
left->items + left->header.nritems - push_items, + memcpy(right->items, left->items + left_nritems - push_items, push_items * sizeof(struct item)); /* update the item pointers */ - right->header.nritems += push_items; + right_nritems += push_items; + btrfs_set_header_nritems(&right->header, right_nritems); push_space = LEAF_DATA_SIZE; - for (i = 0; i < right->header.nritems; i++) { + for (i = 0; i < right_nritems; i++) { right->items[i].offset = push_space - right->items[i].size; push_space = right->items[i].offset; } - left->header.nritems -= push_items; + left_nritems -= push_items; + btrfs_set_header_nritems(&left->header, left_nritems); BUG_ON(list_empty(&left_buf->dirty)); BUG_ON(list_empty(&right_buf->dirty)); @@ -853,8 +873,8 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path, BUG_ON(list_empty(&upper->dirty)); /* then fixup the leaf pointer in the path */ - if (path->slots[0] >= left->header.nritems) { - path->slots[0] -= left->header.nritems; + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; tree_block_release(root, path->nodes[0]); path->nodes[0] = right_buf; path->slots[1] += 1; @@ -880,7 +900,7 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path, int push_space = 0; int push_items = 0; struct item *item; - int old_left_nritems; + u32 old_left_nritems; int ret = 0; int wret; @@ -908,7 +928,7 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path, return 1; } - for (i = 0; i < right->header.nritems; i++) { + for (i = 0; i < btrfs_header_nritems(&right->header); i++) { item = right->items + i; if (path->slots[0] == i) push_space += data_size + sizeof(*item); @@ -922,31 +942,34 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path, return 1; } /* push data from right to left */ - memcpy(left->items + left->header.nritems, + memcpy(left->items + btrfs_header_nritems(&left->header), right->items, push_items * sizeof(struct item)); 
push_space = LEAF_DATA_SIZE - right->items[push_items -1].offset; memcpy(left->data + leaf_data_end(left) - push_space, right->data + right->items[push_items - 1].offset, push_space); - old_left_nritems = left->header.nritems; + old_left_nritems = btrfs_header_nritems(&left->header); BUG_ON(old_left_nritems < 0); for(i = old_left_nritems; i < old_left_nritems + push_items; i++) { left->items[i].offset -= LEAF_DATA_SIZE - left->items[old_left_nritems -1].offset; } - left->header.nritems += push_items; + btrfs_set_header_nritems(&left->header, old_left_nritems + push_items); /* fixup right node */ push_space = right->items[push_items-1].offset - leaf_data_end(right); memmove(right->data + LEAF_DATA_SIZE - push_space, right->data + leaf_data_end(right), push_space); memmove(right->items, right->items + push_items, - (right->header.nritems - push_items) * sizeof(struct item)); - right->header.nritems -= push_items; + (btrfs_header_nritems(&right->header) - push_items) * + sizeof(struct item)); + btrfs_set_header_nritems(&right->header, + btrfs_header_nritems(&right->header) - + push_items); push_space = LEAF_DATA_SIZE; - for (i = 0; i < right->header.nritems; i++) { + for (i = 0; i < btrfs_header_nritems(&right->header); i++) { right->items[i].offset = push_space - right->items[i].size; push_space = right->items[i].offset; } @@ -983,7 +1006,7 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path, { struct tree_buffer *l_buf; struct leaf *l; - int nritems; + u32 nritems; int mid; int slot; struct leaf *right; @@ -1008,7 +1031,7 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path, return ret; } slot = path->slots[0]; - nritems = l->header.nritems; + nritems = btrfs_header_nritems(&l->header); mid = (nritems + 1)/ 2; right_buffer = alloc_free_block(root); BUG_ON(!right_buffer); @@ -1026,10 +1049,11 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path, LEAF_DATA_SIZE) BUG(); } - right->header.nritems = nritems 
- mid; - right->header.blocknr = right_buffer->blocknr; - right->header.flags = node_level(0); - right->header.parentid = root->node->node.header.parentid; + btrfs_set_header_nritems(&right->header, nritems - mid); + btrfs_set_header_blocknr(&right->header, right_buffer->blocknr); + btrfs_set_header_level(&right->header, 0); + btrfs_set_header_parentid(&right->header, + btrfs_header_parentid(&root->node->node.header)); data_copy_size = l->items[mid].offset + l->items[mid].size - leaf_data_end(l); memcpy(right->items, l->items + mid, @@ -1039,10 +1063,10 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path, rt_data_off = LEAF_DATA_SIZE - (l->items[mid].offset + l->items[mid].size); - for (i = 0; i < right->header.nritems; i++) + for (i = 0; i < btrfs_header_nritems(&right->header); i++) right->items[i].offset += rt_data_off; - l->header.nritems = mid; + btrfs_set_header_nritems(&l->header, mid); ret = 0; wret = insert_ptr(root, path, &right->items[0].key, right_buffer->blocknr, path->slots[1] + 1, 1); @@ -1074,7 +1098,7 @@ int insert_item(struct ctree_root *root, struct key *key, int slot_orig; struct leaf *leaf; struct tree_buffer *leaf_buf; - unsigned int nritems; + u32 nritems; unsigned int data_end; struct ctree_path path; @@ -1094,7 +1118,7 @@ int insert_item(struct ctree_root *root, struct key *key, leaf_buf = path.nodes[0]; leaf = &leaf_buf->leaf; - nritems = leaf->header.nritems; + nritems = btrfs_header_nritems(&leaf->header); data_end = leaf_data_end(leaf); if (leaf_free_space(leaf) < sizeof(struct item) + data_size) @@ -1128,7 +1152,7 @@ int insert_item(struct ctree_root *root, struct key *key, leaf->items[slot].offset = data_end - data_size; leaf->items[slot].size = data_size; memcpy(leaf->data + data_end - data_size, data, data_size); - leaf->header.nritems += 1; + btrfs_set_header_nritems(&leaf->header, nritems + 1); ret = 0; if (slot == 0) @@ -1155,12 +1179,12 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, 
int level, { struct node *node; struct tree_buffer *parent = path->nodes[level]; - int nritems; + u32 nritems; int ret = 0; int wret; node = &parent->node; - nritems = node->header.nritems; + nritems = btrfs_header_nritems(&node->header); if (slot != nritems -1) { memmove(node->keys + slot, node->keys + slot + 1, sizeof(struct key) * (nritems - slot - 1)); @@ -1168,11 +1192,12 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level, node->blockptrs + slot + 1, sizeof(u64) * (nritems - slot - 1)); } - node->header.nritems--; - if (node->header.nritems == 0 && parent == root->node) { - BUG_ON(node_level(root->node->node.header.flags) != 1); + nritems--; + btrfs_set_header_nritems(&node->header, nritems); + if (nritems == 0 && parent == root->node) { + BUG_ON(btrfs_header_level(&root->node->node.header) != 1); /* just turn the root into a leaf and break */ - root->node->node.header.flags = node_level(0); + btrfs_set_header_level(&root->node->node.header, 0); } else if (slot == 0) { wret = fixup_low_keys(root, path, node->keys, level + 1); if (wret) @@ -1195,30 +1220,33 @@ int del_item(struct ctree_root *root, struct ctree_path *path) int dsize; int ret = 0; int wret; + u32 nritems; leaf_buf = path->nodes[0]; leaf = &leaf_buf->leaf; slot = path->slots[0]; doff = leaf->items[slot].offset; dsize = leaf->items[slot].size; + nritems = btrfs_header_nritems(&leaf->header); - if (slot != leaf->header.nritems - 1) { + if (slot != nritems - 1) { int i; int data_end = leaf_data_end(leaf); memmove(leaf->data + data_end + dsize, leaf->data + data_end, doff - data_end); - for (i = slot + 1; i < leaf->header.nritems; i++) + for (i = slot + 1; i < nritems; i++) leaf->items[i].offset += dsize; memmove(leaf->items + slot, leaf->items + slot + 1, sizeof(struct item) * - (leaf->header.nritems - slot - 1)); + (nritems - slot - 1)); } - leaf->header.nritems -= 1; + btrfs_set_header_nritems(&leaf->header, nritems - 1); + nritems--; /* delete the leaf if we've 
emptied it */ - if (leaf->header.nritems == 0) { + if (nritems == 0) { if (leaf_buf == root->node) { - leaf->header.flags = node_level(0); + btrfs_set_header_level(&leaf->header, 0); BUG_ON(list_empty(&leaf_buf->dirty)); } else { clean_tree_block(root, leaf_buf); @@ -1230,7 +1258,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path) ret = wret; } } else { - int used = leaf_space_used(leaf, 0, leaf->header.nritems); + int used = leaf_space_used(leaf, 0, nritems); if (slot == 0) { wret = fixup_low_keys(root, path, &leaf->items[0].key, 1); @@ -1251,12 +1279,12 @@ int del_item(struct ctree_root *root, struct ctree_path *path) if (wret < 0) ret = wret; if (path->nodes[0] == leaf_buf && - leaf->header.nritems) { + btrfs_header_nritems(&leaf->header)) { wret = push_leaf_right(root, path, 1); if (wret < 0) ret = wret; } - if (leaf->header.nritems == 0) { + if (btrfs_header_nritems(&leaf->header) == 0) { u64 blocknr = leaf_buf->blocknr; clean_tree_block(root, leaf_buf); wret = del_ptr(root, path, 1, slot); @@ -1292,7 +1320,7 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path) return 1; slot = path->slots[level] + 1; c = path->nodes[level]; - if (slot >= c->node.header.nritems) { + if (slot >= btrfs_header_nritems(&c->node.header)) { level++; continue; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 518326fa3694..60c21fe1566a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,13 +35,10 @@ struct header { /* generation flags to be added */ } __attribute__ ((__packed__)); +#define MAX_LEVEL 8 #define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \ (sizeof(struct key) + sizeof(u64))) -#define MAX_LEVEL 8 -#define node_level(f) ((f) & (MAX_LEVEL-1)) -#define is_leaf(f) (node_level(f) == 0) - struct tree_buffer; /* @@ -144,6 +141,64 @@ struct ctree_path { int slots[MAX_LEVEL]; }; +static inline u64 btrfs_header_blocknr(struct header *h) +{ + return h->blocknr; +} + +static inline void btrfs_set_header_blocknr(struct 
header *h, u64 blocknr) +{ + h->blocknr = blocknr; +} + +static inline u64 btrfs_header_parentid(struct header *h) +{ + return h->parentid; +} + +static inline void btrfs_set_header_parentid(struct header *h, u64 parentid) +{ + h->parentid = parentid; +} + +static inline u32 btrfs_header_nritems(struct header *h) +{ + return h->nritems; +} + +static inline void btrfs_set_header_nritems(struct header *h, u32 val) +{ + h->nritems = val; +} + +static inline u32 btrfs_header_flags(struct header *h) +{ + return h->flags; +} + +static inline void btrfs_set_header_flags(struct header *h, u32 val) +{ + h->flags = val; +} + +static inline int btrfs_header_level(struct header *h) +{ + return btrfs_header_flags(h) & (MAX_LEVEL - 1); +} + +static inline void btrfs_set_header_level(struct header *h, int level) +{ + u32 flags; + BUG_ON(level > MAX_LEVEL); + flags = btrfs_header_flags(h) & ~(MAX_LEVEL - 1); + btrfs_set_header_flags(h, flags | level); +} + +static inline int btrfs_is_leaf(struct node *n) +{ + return (btrfs_header_level(&n->header) == 0); +} + struct tree_buffer *alloc_free_block(struct ctree_root *root); int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf); int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c9201ec81fef..065e888d2c0b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -15,9 +15,10 @@ int cache_max = 10000; static int check_tree_block(struct ctree_root *root, struct tree_buffer *buf) { - if (buf->blocknr != buf->node.header.blocknr) + if (buf->blocknr != btrfs_header_blocknr(&buf->node.header)) BUG(); - if (root->node && buf->node.header.parentid != root->node->node.header.parentid) + if (root->node && btrfs_header_parentid(&buf->node.header) != + btrfs_header_parentid(&root->node->node.header)) BUG(); return 0; } @@ -129,7 +130,7 @@ int write_tree_block(struct ctree_root *root, struct tree_buffer *buf) loff_t offset = blocknr * CTREE_BLOCKSIZE; 
int ret; - if (buf->blocknr != buf->node.header.blocknr) + if (buf->blocknr != btrfs_header_blocknr(&buf->node.header)) BUG(); ret = pwrite(root->fp, &buf->node, CTREE_BLOCKSIZE, offset); if (ret != CTREE_BLOCKSIZE) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e7b8a70b5e52..524c11fea24a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -79,10 +79,10 @@ int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf) if (root == root->extent_root) return 0; - if (is_leaf(buf->node.header.flags)) + if (btrfs_is_leaf(&buf->node)) return 0; - for (i = 0; i < buf->node.header.nritems; i++) { + for (i = 0; i < btrfs_header_nritems(&buf->node.header); i++) { blocknr = buf->node.blockptrs[i]; inc_block_ref(root, blocknr); } @@ -119,7 +119,8 @@ static int finish_current_insert(struct ctree_root *extent_root) int ret; extent_item.refs = 1; - extent_item.owner = extent_root->node->node.header.parentid; + extent_item.owner = + btrfs_header_parentid(&extent_root->node->node.header); ins.offset = 1; ins.flags = 0; @@ -269,7 +270,7 @@ static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks, struct ctree_root * root = orig_root->extent_root; int total_needed = num_blocks; - total_needed += (node_level(root->node->node.header.flags) + 1) * 3; + total_needed += (btrfs_header_level(&root->node->node.header) + 1) * 3; if (root->last_insert.objectid > search_start) search_start = root->last_insert.objectid; check_failed: @@ -288,7 +289,7 @@ check_failed: while (1) { l = &path.nodes[0]->leaf; slot = path.slots[0]; - if (slot >= l->header.nritems) { + if (slot >= btrfs_header_nritems(&l->header)) { ret = next_leaf(root, &path); if (ret == 0) continue; @@ -404,7 +405,7 @@ struct tree_buffer *alloc_free_block(struct ctree_root *root) struct tree_buffer *buf; ret = alloc_extent(root, 1, 0, (unsigned long)-1, - root->node->node.header.parentid, + btrfs_header_parentid(&root->node->node.header), &ins); if (ret) { BUG(); @@ -429,7 
+430,8 @@ int walk_down_tree(struct ctree_root *root, struct ctree_path *path, int *level) goto out; while(*level > 0) { cur = path->nodes[*level]; - if (path->slots[*level] >= cur->node.header.nritems) + if (path->slots[*level] >= + btrfs_header_nritems(&cur->node.header)) break; blocknr = cur->node.blockptrs[path->slots[*level]]; ret = lookup_block_ref(root, blocknr, &refs); @@ -444,7 +446,7 @@ int walk_down_tree(struct ctree_root *root, struct ctree_path *path, int *level) if (path->nodes[*level-1]) tree_block_release(root, path->nodes[*level-1]); path->nodes[*level-1] = next; - *level = node_level(next->node.header.flags); + *level = btrfs_header_level(&next->node.header); path->slots[*level] = 0; } out: @@ -463,7 +465,8 @@ int walk_up_tree(struct ctree_root *root, struct ctree_path *path, int *level) int ret; for(i = *level; i < MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot < path->nodes[i]->node.header.nritems - 1) { + if (slot < + btrfs_header_nritems(&path->nodes[i]->node.header)- 1) { path->slots[i]++; *level = i; return 0; @@ -489,7 +492,7 @@ int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap) init_path(&path); - level = node_level(snap->node.header.flags); + level = btrfs_header_level(&snap->node.header); orig_level = level; path.nodes[level] = snap; path.slots[level] = 0; @@ -509,33 +512,3 @@ int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap) return 0; } - - -#if 0 -int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap) -{ - int ret; - int level; - int refs; - u64 blocknr = snap->blocknr; - - level = node_level(snap->node.header.flags); - ret = lookup_block_ref(root, snap->blocknr, &refs); - BUG_ON(ret); - if (refs == 1 && level != 0) { - struct node *n = &snap->node; - struct tree_buffer *b; - int i; - for (i = 0; i < n->header.nritems; i++) { - b = read_tree_block(root, n->blockptrs[i]); - /* FIXME, don't recurse here */ - ret = btrfs_drop_snapshot(root, b); 
- BUG_ON(ret); - tree_block_release(root, b); - } - } - ret = free_extent(root, blocknr, 1); - BUG_ON(ret); - return 0; -} -#endif diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c index fd4e5dea7e1b..b2ba94696297 100644 --- a/fs/btrfs/mkfs.c +++ b/fs/btrfs/mkfs.c @@ -34,15 +34,15 @@ int mkfs(int fd) /* create leaves for the tree root and extent root */ memset(&empty_leaf, 0, sizeof(empty_leaf)); - empty_leaf.header.parentid = 1; - empty_leaf.header.blocknr = 17; + btrfs_set_header_parentid(&empty_leaf.header, 1); + btrfs_set_header_blocknr(&empty_leaf.header, 17); ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 17 * CTREE_BLOCKSIZE); if (ret != sizeof(empty_leaf)) return -1; - empty_leaf.header.parentid = 2; - empty_leaf.header.blocknr = 18; - empty_leaf.header.nritems = 3; + btrfs_set_header_parentid(&empty_leaf.header, 2); + btrfs_set_header_blocknr(&empty_leaf.header, 18); + btrfs_set_header_nritems(&empty_leaf.header, 3); /* item1, reserve blocks 0-16 */ item.key.objectid = 0; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index e32a959dd3e5..7df16b1e4733 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -8,11 +8,11 @@ void print_leaf(struct leaf *l) { int i; - int nr = l->header.nritems; + u32 nr = btrfs_header_nritems(&l->header); struct item *item; struct extent_item *ei; - printf("leaf %Lu total ptrs %d free space %d\n", l->header.blocknr, nr, - leaf_free_space(l)); + printf("leaf %Lu total ptrs %d free space %d\n", + btrfs_header_blocknr(&l->header), nr, leaf_free_space(l)); fflush(stdout); for (i = 0 ; i < nr ; i++) { item = l->items + i; @@ -32,22 +32,20 @@ void print_leaf(struct leaf *l) void print_tree(struct ctree_root *root, struct tree_buffer *t) { int i; - int nr; + u32 nr; struct node *c; if (!t) return; c = &t->node; - nr = c->header.nritems; - if (c->header.blocknr != t->blocknr) - BUG(); - if (is_leaf(c->header.flags)) { + nr = btrfs_header_nritems(&c->header); + if (btrfs_is_leaf(c)) { print_leaf((struct leaf *)c); 
return; } printf("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr, - node_level(c->header.flags), c->header.nritems, - (u32)NODEPTRS_PER_BLOCK - c->header.nritems); + btrfs_header_level(&c->header), nr, + (u32)NODEPTRS_PER_BLOCK - nr); fflush(stdout); for (i = 0; i < nr; i++) { printf("\tkey %d (%Lu %u %Lu) block %Lu\n", @@ -60,11 +58,11 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t) struct tree_buffer *next_buf = read_tree_block(root, c->blockptrs[i]); struct node *next = &next_buf->node; - if (is_leaf(next->header.flags) && - node_level(c->header.flags) != 1) + if (btrfs_is_leaf(next) && + btrfs_header_level(&c->header) != 1) BUG(); - if (node_level(next->header.flags) != - node_level(c->header.flags) - 1) + if (btrfs_header_level(&next->header) != + btrfs_header_level(&c->header) - 1) BUG(); print_tree(root, next_buf); tree_block_release(root, next_buf); diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c index ccca9b2c14a3..07fd71b77627 100644 --- a/fs/btrfs/quick-test.c +++ b/fs/btrfs/quick-test.c @@ -71,9 +71,10 @@ int main(int ac, char **av) { close_ctree(root, &super); root = open_ctree("dbfile", &super); printf("node %p level %d total ptrs %d free spc %lu\n", root->node, - node_level(root->node->node.header.flags), - root->node->node.header.nritems, - NODEPTRS_PER_BLOCK - root->node->node.header.nritems); + btrfs_header_level(&root->node->node.header), + btrfs_header_nritems(&root->node->node.header), + NODEPTRS_PER_BLOCK - + btrfs_header_nritems(&root->node->node.header)); printf("all searches good, deleting some items\n"); i = 0; srand(55); @@ -126,7 +127,8 @@ int main(int ac, char **av) { release_path(root, &path); } printf("starting big long delete run\n"); - while(root->node && root->node->node.header.nritems > 0) { + while(root->node && + btrfs_header_nritems(&root->node->node.header) > 0) { struct leaf *leaf; int slot; ins.objectid = (u64)-1; @@ -137,7 +139,7 @@ int main(int ac, char **av) { leaf = 
&path.nodes[0]->leaf; slot = path.slots[0]; - if (slot != leaf->header.nritems) + if (slot != btrfs_header_nritems(&leaf->header)) BUG(); while(path.slots[0] > 0) { path.slots[0] -= 1; diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c index 7b37b6bae105..5d99b730a69c 100644 --- a/fs/btrfs/random-test.c +++ b/fs/btrfs/random-test.c @@ -358,8 +358,8 @@ int main(int ac, char **av) } if (i && i % 5000 == 0) { printf("open & close, root level %d nritems %d\n", - node_level(root->node->node.header.flags), - root->node->node.header.nritems); + btrfs_header_level(&root->node->node.header), + btrfs_header_nritems(&root->node->node.header)); close_ctree(root, &super); root = open_ctree("dbfile", &super); } -- cgit v1.2.3 From 3768f3689fc76ecea17414936dff7a02746a4355 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 Mar 2007 16:47:54 -0400 Subject: Btrfs: Change the super to point to a tree of trees to enable persistent snapshots Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 3 +- fs/btrfs/TODO | 1 + fs/btrfs/ctree.h | 123 ++++++++++++++++++++++++++++++++++++++------ fs/btrfs/debug-tree.c | 4 +- fs/btrfs/disk-io.c | 135 +++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/disk-io.h | 3 +- fs/btrfs/extent-tree.c | 15 +++--- fs/btrfs/mkfs.c | 128 +++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/print-tree.c | 10 ++-- fs/btrfs/root-tree.c | 88 ++++++++++++++++++++++++++++++++ 10 files changed, 445 insertions(+), 65 deletions(-) create mode 100644 fs/btrfs/root-tree.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index f8532200e9ad..13ed6313ac85 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -2,7 +2,8 @@ CC=gcc CFLAGS = -g -Wall headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h -objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o +objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \ + root-tree.o # if you don't have sparse 
installed, use ls instead CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \ diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO index ad29267b8738..ff90e5eaf431 100644 --- a/fs/btrfs/TODO +++ b/fs/btrfs/TODO @@ -5,6 +5,7 @@ * make a real mkfs and superblock * Do checksumming * Define FS objects in terms of different item types +* add inode tree * Add block mapping tree (simple dm layer) * Add simple tree locking (semaphore per tree) * Make allocator smarter diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5dfcc90812c1..0968899fb7f9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4,8 +4,13 @@ #include "list.h" #include "kerncompat.h" +#define BTRFS_MAGIC "_BtRfS_M" #define BTRFS_BLOCKSIZE 1024 +#define BTRFS_ROOT_TREE_OBJECTID 1 +#define BTRFS_EXTENT_TREE_OBJECTID 2 +#define BTRFS_FS_TREE_OBJECTID 3 + /* * the key defines the order in the tree, and so it also defines (optimal) * block layout. objectid corresonds to the inode number. The flags @@ -36,7 +41,7 @@ struct btrfs_key { * every tree block (leaf or node) starts with this header. */ struct btrfs_header { - __le64 fsid[2]; /* FS specific uuid */ + u8 fsid[16]; /* FS specific uuid */ __le64 blocknr; /* which block this node is supposed to live in */ __le64 parentid; /* objectid of the tree root */ __le32 csum; @@ -52,6 +57,14 @@ struct btrfs_header { struct btrfs_buffer; +struct btrfs_root_item { + __le64 blocknr; + __le32 flags; + __le64 block_limit; + __le64 blocks_used; + __le32 refs; +}; + /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. 
current_insert is used @@ -61,6 +74,7 @@ struct btrfs_root { struct btrfs_buffer *node; struct btrfs_buffer *commit_root; struct btrfs_root *extent_root; + struct btrfs_root *tree_root; struct btrfs_key current_insert; struct btrfs_key last_insert; int fp; @@ -69,28 +83,25 @@ struct btrfs_root { struct list_head trans; struct list_head cache; int cache_size; + int ref_cows; + struct btrfs_root_item root_item; + struct btrfs_key root_key; }; -/* - * describes a tree on disk - */ -struct btrfs_root_info { - u64 fsid[2]; /* FS specific uuid */ - u64 blocknr; /* blocknr of this block */ - u64 objectid; /* inode number of this root */ - u64 tree_root; /* the tree root block */ - u32 csum; - u32 ham; - u64 snapuuid[2]; /* root specific uuid */ -} __attribute__ ((__packed__)); - /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ struct btrfs_super_block { - struct btrfs_root_info root_info; - struct btrfs_root_info extent_info; + u8 fsid[16]; /* FS specific uuid */ + __le64 blocknr; /* this block number */ + __le32 csum; + __le64 magic; + __le16 blocksize; + __le64 generation; + __le64 root; + __le64 total_blocks; + __le64 blocks_used; } __attribute__ ((__packed__)); /* @@ -317,6 +328,79 @@ static inline int btrfs_is_leaf(struct btrfs_node *n) return (btrfs_header_level(&n->header) == 0); } +static inline u64 btrfs_root_blocknr(struct btrfs_root_item *item) +{ + return le64_to_cpu(item->blocknr); +} + +static inline void btrfs_set_root_blocknr(struct btrfs_root_item *item, u64 val) +{ + item->blocknr = cpu_to_le64(val); +} + +static inline u32 btrfs_root_refs(struct btrfs_root_item *item) +{ + return le32_to_cpu(item->refs); +} + +static inline void btrfs_set_root_refs(struct btrfs_root_item *item, u32 val) +{ + item->refs = cpu_to_le32(val); +} + +static inline u64 btrfs_super_blocknr(struct btrfs_super_block *s) +{ + return le64_to_cpu(s->blocknr); +} + +static inline void btrfs_set_super_blocknr(struct 
btrfs_super_block *s, u64 val) +{ + s->blocknr = cpu_to_le64(val); +} + +static inline u64 btrfs_super_root(struct btrfs_super_block *s) +{ + return le64_to_cpu(s->root); +} + +static inline void btrfs_set_super_root(struct btrfs_super_block *s, u64 val) +{ + s->root = cpu_to_le64(val); +} + +static inline u64 btrfs_super_total_blocks(struct btrfs_super_block *s) +{ + return le64_to_cpu(s->total_blocks); +} + +static inline void btrfs_set_super_total_blocks(struct btrfs_super_block *s, + u64 val) +{ + s->total_blocks = cpu_to_le64(val); +} + +static inline u64 btrfs_super_blocks_used(struct btrfs_super_block *s) +{ + return le64_to_cpu(s->blocks_used); +} + +static inline void btrfs_set_super_blocks_used(struct btrfs_super_block *s, + u64 val) +{ + s->blocks_used = cpu_to_le64(val); +} + +static inline u16 btrfs_super_blocksize(struct btrfs_super_block *s) +{ + return le16_to_cpu(s->blocksize); +} + +static inline void btrfs_set_super_blocksize(struct btrfs_super_block *s, + u16 val) +{ + s->blocksize = cpu_to_le16(val); +} + struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root); int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf); int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks); @@ -331,4 +415,11 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_leaf *leaf); int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap); int btrfs_finish_extent_commit(struct btrfs_root *root); +int btrfs_del_root(struct btrfs_root *root, struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_root_item *item); +int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_root_item *item); +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + struct btrfs_root_item *item, struct btrfs_key *key); #endif diff --git a/fs/btrfs/debug-tree.c 
b/fs/btrfs/debug-tree.c index 6da0a7aa0f47..de45fb4dfddd 100644 --- a/fs/btrfs/debug-tree.c +++ b/fs/btrfs/debug-tree.c @@ -11,9 +11,11 @@ int main(int ac, char **av) { struct btrfs_root *root; radix_tree_init(); root = open_ctree("dbfile", &super); - printf("root tree\n"); + printf("fs tree\n"); btrfs_print_tree(root, root->node); printf("map tree\n"); btrfs_print_tree(root->extent_root, root->extent_root->node); + printf("root tree\n"); + btrfs_print_tree(root->tree_root, root->tree_root->node); return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c34c0c60935f..3d4bf6833f2a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -154,41 +154,96 @@ static int __commit_transaction(struct btrfs_root *root) return ret; } +static int commit_extent_and_tree_roots(struct btrfs_root *tree_root, + struct btrfs_root *extent_root) +{ + int ret; + u64 old_extent_block; + + while(1) { + old_extent_block = btrfs_root_blocknr(&extent_root->root_item); + if (old_extent_block == extent_root->node->blocknr) + break; + btrfs_set_root_blocknr(&extent_root->root_item, + extent_root->node->blocknr); + ret = btrfs_update_root(tree_root, + &extent_root->root_key, + &extent_root->root_item); + BUG_ON(ret); + } + __commit_transaction(extent_root); + __commit_transaction(tree_root); + return 0; +} + int btrfs_commit_transaction(struct btrfs_root *root, struct btrfs_super_block *s) { int ret = 0; + struct btrfs_buffer *snap = root->commit_root; + struct btrfs_key snap_key; ret = __commit_transaction(root); - if (!ret && root != root->extent_root) - ret = __commit_transaction(root->extent_root); BUG_ON(ret); - if (root->commit_root != root->node) { - struct btrfs_buffer *snap = root->commit_root; - root->commit_root = root->node; - root->node->count++; - ret = btrfs_drop_snapshot(root, snap); - BUG_ON(ret); - // btrfs_block_release(root, snap); - } + + if (root->commit_root == root->node) + return 0; + + memcpy(&snap_key, &root->root_key, sizeof(snap_key)); + 
root->root_key.offset++; + + btrfs_set_root_blocknr(&root->root_item, root->node->blocknr); + ret = btrfs_insert_root(root->tree_root, &root->root_key, + &root->root_item); + BUG_ON(ret); + + ret = commit_extent_and_tree_roots(root->tree_root, root->extent_root); + BUG_ON(ret); + write_ctree_super(root, s); - btrfs_finish_extent_commit(root); + btrfs_finish_extent_commit(root->extent_root); + btrfs_finish_extent_commit(root->tree_root); + + root->commit_root = root->node; + root->node->count++; + ret = btrfs_drop_snapshot(root, snap); + BUG_ON(ret); + + ret = btrfs_del_root(root->tree_root, &snap_key); + BUG_ON(ret); + return ret; } -static int __setup_root(struct btrfs_root *root, struct btrfs_root *extent_root, - struct btrfs_root_info *info, int fp) +static int __setup_root(struct btrfs_root *root, u64 objectid, int fp) { INIT_LIST_HEAD(&root->trans); INIT_LIST_HEAD(&root->cache); root->cache_size = 0; root->fp = fp; root->node = NULL; - root->extent_root = extent_root; root->commit_root = NULL; - root->node = read_tree_block(root, info->tree_root); memset(&root->current_insert, 0, sizeof(root->current_insert)); memset(&root->last_insert, 0, sizeof(root->last_insert)); + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + return 0; +} + +static int find_and_setup_root(struct btrfs_root *tree_root, u64 objectid, + struct btrfs_root *root, int fp) +{ + int ret; + + __setup_root(root, objectid, fp); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); + BUG_ON(ret); + + root->node = read_tree_block(root, + btrfs_root_blocknr(&root->root_item)); + root->ref_cows = 0; + BUG_ON(!root->node); return 0; } @@ -196,9 +251,19 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super) { struct btrfs_root *root = malloc(sizeof(struct btrfs_root)); struct btrfs_root *extent_root = malloc(sizeof(struct btrfs_root)); + struct btrfs_root *tree_root = 
malloc(sizeof(struct btrfs_root)); int fp; int ret; + root->extent_root = extent_root; + root->tree_root = tree_root; + + extent_root->extent_root = extent_root; + extent_root->tree_root = tree_root; + + tree_root->extent_root = extent_root; + tree_root->tree_root = tree_root; + fp = open(filename, O_CREAT | O_RDWR, 0600); if (fp < 0) { free(root); @@ -208,11 +273,14 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super) INIT_RADIX_TREE(&root->pinned_radix, GFP_KERNEL); INIT_RADIX_TREE(&extent_root->pinned_radix, GFP_KERNEL); INIT_RADIX_TREE(&extent_root->cache_radix, GFP_KERNEL); + INIT_RADIX_TREE(&tree_root->pinned_radix, GFP_KERNEL); + INIT_RADIX_TREE(&tree_root->cache_radix, GFP_KERNEL); + ret = pread(fp, super, sizeof(struct btrfs_super_block), BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE)); - if (ret == 0 || super->root_info.tree_root == 0) { + if (ret == 0 || btrfs_super_root(super) == 0) { printf("making new FS!\n"); - ret = mkfs(fp); + ret = mkfs(fp, 0, BTRFS_BLOCKSIZE); if (ret) return NULL; ret = pread(fp, super, sizeof(struct btrfs_super_block), @@ -221,24 +289,29 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super) return NULL; } BUG_ON(ret < 0); - __setup_root(root, extent_root, &super->root_info, fp); - __setup_root(extent_root, extent_root, &super->extent_info, fp); + + __setup_root(tree_root, BTRFS_ROOT_TREE_OBJECTID, fp); + tree_root->node = read_tree_block(tree_root, btrfs_super_root(super)); + BUG_ON(!tree_root->node); + + ret = find_and_setup_root(tree_root, BTRFS_EXTENT_TREE_OBJECTID, + extent_root, fp); + BUG_ON(ret); + + ret = find_and_setup_root(tree_root, BTRFS_FS_TREE_OBJECTID, + root, fp); + BUG_ON(ret); + root->commit_root = root->node; root->node->count++; + root->ref_cows = 1; return root; } -static int __update_root(struct btrfs_root *root, struct btrfs_root_info *info) -{ - info->tree_root = root->node->blocknr; - return 0; -} - int write_ctree_super(struct btrfs_root *root, struct 
btrfs_super_block *s) { int ret; - __update_root(root, &s->root_info); - __update_root(root->extent_root, &s->extent_info); + btrfs_set_super_root(s, root->tree_root->node->blocknr); ret = pwrite(root->fp, s, sizeof(*s), BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE)); if (ret != sizeof(*s)) { @@ -260,19 +333,25 @@ static int drop_cache(struct btrfs_root *root) } int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s) { + int ret; btrfs_commit_transaction(root, s); - __commit_transaction(root->extent_root); + ret = commit_extent_and_tree_roots(root->tree_root, root->extent_root); + BUG_ON(ret); write_ctree_super(root, s); drop_cache(root->extent_root); + drop_cache(root->tree_root); drop_cache(root); BUG_ON(!list_empty(&root->trans)); BUG_ON(!list_empty(&root->extent_root->trans)); + BUG_ON(!list_empty(&root->tree_root->trans)); close(root->fp); if (root->node) btrfs_block_release(root, root->node); if (root->extent_root->node) btrfs_block_release(root->extent_root, root->extent_root->node); + if (root->tree_root->node) + btrfs_block_release(root->tree_root, root->tree_root->node); btrfs_block_release(root, root->commit_root); free(root); printf("on close %d blocks are allocated\n", allocated_blocks); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b391335864b1..c22a61f9233f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -24,7 +24,8 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *s); int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s); void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf); int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s); -int mkfs(int fd); +int mkfs(int fd, u64 num_blocks, u16 blocksize); + #define BTRFS_SUPER_INFO_OFFSET(bs) (16 * (bs)) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a6969538bca2..21f39b404697 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -82,7 +82,7 @@ int 
btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf) u64 blocknr; int i; - if (root == root->extent_root) + if (!root->ref_cows) return 0; if (btrfs_is_leaf(&buf->node)) return 0; @@ -96,23 +96,22 @@ int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf) int btrfs_finish_extent_commit(struct btrfs_root *root) { - struct btrfs_root *extent_root = root->extent_root; unsigned long gang[8]; int ret; int i; while(1) { - ret = radix_tree_gang_lookup(&extent_root->pinned_radix, + ret = radix_tree_gang_lookup(&root->pinned_radix, (void **)gang, 0, ARRAY_SIZE(gang)); if (!ret) break; for (i = 0; i < ret; i++) { - radix_tree_delete(&extent_root->pinned_radix, gang[i]); + radix_tree_delete(&root->pinned_radix, gang[i]); } } - extent_root->last_insert.objectid = 0; - extent_root->last_insert.offset = 0; + root->last_insert.objectid = 0; + root->last_insert.offset = 0; return 0; } @@ -173,7 +172,7 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks) refs = btrfs_extent_refs(ei) - 1; btrfs_set_extent_refs(ei, refs); if (refs == 0) { - if (root == extent_root) { + if (!root->ref_cows) { int err; radix_tree_preload(GFP_KERNEL); err = radix_tree_insert(&extent_root->pinned_radix, @@ -513,7 +512,7 @@ static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, */ int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap) { - int ret = 0;; + int ret = 0; int wret; int level; struct btrfs_path path; diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c index fc1923320de3..dd14ed4fea6c 100644 --- a/fs/btrfs/mkfs.c +++ b/fs/btrfs/mkfs.c @@ -10,6 +10,120 @@ #include "ctree.h" #include "disk-io.h" +int mkfs(int fd, u64 num_blocks, u16 blocksize) +{ + struct btrfs_super_block super; + struct btrfs_leaf empty_leaf; + struct btrfs_root_item root_item; + struct btrfs_item item; + struct btrfs_extent_item extent_item; + char *block; + int ret; + u16 itemoff; + + btrfs_set_super_blocknr(&super, 16); + 
btrfs_set_super_root(&super, 17); + strcpy((char *)(&super.magic), BTRFS_MAGIC); + btrfs_set_super_blocksize(&super, blocksize); + btrfs_set_super_total_blocks(&super, num_blocks); + btrfs_set_super_blocks_used(&super, 0); + + block = malloc(blocksize); + memset(block, 0, blocksize); + BUG_ON(sizeof(super) > blocksize); + memcpy(block, &super, sizeof(super)); + ret = pwrite(fd, block, blocksize, BTRFS_SUPER_INFO_OFFSET(blocksize)); + BUG_ON(ret != blocksize); + + /* create the tree of root objects */ + memset(&empty_leaf, 0, sizeof(empty_leaf)); + btrfs_set_header_parentid(&empty_leaf.header, BTRFS_ROOT_TREE_OBJECTID); + btrfs_set_header_blocknr(&empty_leaf.header, 17); + btrfs_set_header_nritems(&empty_leaf.header, 2); + + /* create the items for the root tree */ + btrfs_set_root_blocknr(&root_item, 18); + btrfs_set_root_refs(&root_item, 1); + itemoff = LEAF_DATA_SIZE - sizeof(root_item); + btrfs_set_item_offset(&item, itemoff); + btrfs_set_item_size(&item, sizeof(root_item)); + btrfs_set_key_objectid(&item.key, BTRFS_EXTENT_TREE_OBJECTID); + btrfs_set_key_offset(&item.key, 0); + btrfs_set_key_flags(&item.key, 0); + memcpy(empty_leaf.items, &item, sizeof(item)); + memcpy(empty_leaf.data + itemoff, &root_item, sizeof(root_item)); + + btrfs_set_root_blocknr(&root_item, 19); + itemoff = itemoff - sizeof(root_item); + btrfs_set_item_offset(&item, itemoff); + btrfs_set_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID); + memcpy(empty_leaf.items + 1, &item, sizeof(item)); + memcpy(empty_leaf.data + itemoff, &root_item, sizeof(root_item)); + ret = pwrite(fd, &empty_leaf, blocksize, 17 * blocksize); + + /* create the items for the extent tree */ + btrfs_set_header_parentid(&empty_leaf.header, + BTRFS_EXTENT_TREE_OBJECTID); + btrfs_set_header_blocknr(&empty_leaf.header, 18); + btrfs_set_header_nritems(&empty_leaf.header, 4); + + /* item1, reserve blocks 0-16 */ + btrfs_set_key_objectid(&item.key, 0); + btrfs_set_key_offset(&item.key, 17); + btrfs_set_key_flags(&item.key, 0); 
+ itemoff = LEAF_DATA_SIZE - sizeof(struct btrfs_extent_item); + btrfs_set_item_offset(&item, itemoff); + btrfs_set_item_size(&item, sizeof(struct btrfs_extent_item)); + btrfs_set_extent_refs(&extent_item, 1); + btrfs_set_extent_owner(&extent_item, 0); + memcpy(empty_leaf.items, &item, sizeof(item)); + memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item, + btrfs_item_size(&item)); + + /* item2, give block 17 to the root */ + btrfs_set_key_objectid(&item.key, 17); + btrfs_set_key_offset(&item.key, 1); + itemoff = itemoff - sizeof(struct btrfs_extent_item); + btrfs_set_item_offset(&item, itemoff); + btrfs_set_extent_owner(&extent_item, BTRFS_ROOT_TREE_OBJECTID); + memcpy(empty_leaf.items + 1, &item, sizeof(item)); + memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item, + btrfs_item_size(&item)); + + /* item3, give block 18 to the extent root */ + btrfs_set_key_objectid(&item.key, 18); + btrfs_set_key_offset(&item.key, 1); + itemoff = itemoff - sizeof(struct btrfs_extent_item); + btrfs_set_item_offset(&item, itemoff); + btrfs_set_extent_owner(&extent_item, BTRFS_EXTENT_TREE_OBJECTID); + memcpy(empty_leaf.items + 2, &item, sizeof(item)); + memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item, + btrfs_item_size(&item)); + + /* item4, give block 19 to the FS root */ + btrfs_set_key_objectid(&item.key, 19); + btrfs_set_key_offset(&item.key, 1); + itemoff = itemoff - sizeof(struct btrfs_extent_item); + btrfs_set_item_offset(&item, itemoff); + btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID); + memcpy(empty_leaf.items + 3, &item, sizeof(item)); + memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item, + btrfs_item_size(&item)); + ret = pwrite(fd, &empty_leaf, blocksize, 18 * blocksize); + if (ret != sizeof(empty_leaf)) + return -1; + + /* finally create the FS root */ + btrfs_set_header_parentid(&empty_leaf.header, BTRFS_FS_TREE_OBJECTID); + btrfs_set_header_blocknr(&empty_leaf.header, 19); + 
btrfs_set_header_nritems(&empty_leaf.header, 0); + ret = pwrite(fd, &empty_leaf, blocksize, 19 * blocksize); + if (ret != sizeof(empty_leaf)) + return -1; + return 0; +} + +#if 0 int mkfs(int fd) { struct btrfs_root_info info[2]; @@ -20,13 +134,14 @@ int mkfs(int fd) /* setup the super block area */ memset(info, 0, sizeof(info)); - info[0].blocknr = 16; - info[0].objectid = 1; - info[0].tree_root = 17; + btrfs_set_root_blocknr(info, 16); + btrfs_set_root_objectid(info, 1); + btrfs_set_root_tree_root(info, 17); + + btrfs_set_root_blocknr(info + 1, 16); + btrfs_set_root_objectid(info + 1, 2); + btrfs_set_root_tree_root(info + 1, 18); - info[1].blocknr = 16; - info[1].objectid = 2; - info[1].tree_root = 18; ret = pwrite(fd, info, sizeof(info), BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE)); if (ret != sizeof(info)) @@ -81,3 +196,4 @@ int mkfs(int fd) return -1; return 0; } +#endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index c95c85640aa9..64829b6b90f9 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -11,6 +11,7 @@ void btrfs_print_leaf(struct btrfs_leaf *l) u32 nr = btrfs_header_nritems(&l->header); struct btrfs_item *item; struct btrfs_extent_item *ei; + struct btrfs_root_item *ri; printf("leaf %Lu total ptrs %d free space %d\n", btrfs_header_blocknr(&l->header), nr, btrfs_leaf_free_space(l)); fflush(stdout); @@ -23,13 +24,15 @@ void btrfs_print_leaf(struct btrfs_leaf *l) btrfs_key_offset(&item->key), btrfs_item_offset(item), btrfs_item_size(item)); - fflush(stdout); printf("\t\titem data %.*s\n", btrfs_item_size(item), l->data + btrfs_item_offset(item)); ei = (struct btrfs_extent_item *)(l->data + btrfs_item_offset(item)); - printf("\t\textent data refs %u owner %Lu\n", ei->refs, - ei->owner); + printf("\t\textent data refs %u owner %Lu\n", + btrfs_extent_refs(ei), btrfs_extent_owner(ei)); + ri = (struct btrfs_root_item *)ei; + printf("\t\troot data blocknr %Lu refs %u\n", + btrfs_root_blocknr(ri), btrfs_root_refs(ri)); 
fflush(stdout); } } @@ -71,6 +74,5 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t) btrfs_print_tree(root, next_buf); btrfs_block_release(root, next_buf); } - } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c new file mode 100644 index 000000000000..0ab90cfea98f --- /dev/null +++ b/fs/btrfs/root-tree.c @@ -0,0 +1,88 @@ +#include +#include +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + struct btrfs_root_item *item, struct btrfs_key *key) +{ + struct btrfs_path path; + struct btrfs_key search_key; + struct btrfs_leaf *l; + int ret; + int slot; + + search_key.objectid = objectid; + search_key.flags = (u32)-1; + search_key.offset = (u32)-1; + + btrfs_init_path(&path); + ret = btrfs_search_slot(root, &search_key, &path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + l = &path.nodes[0]->leaf; + BUG_ON(path.slots[0] == 0); + slot = path.slots[0] - 1; + if (btrfs_key_objectid(&l->items[slot].key) != objectid) { + ret = 1; + goto out; + } + memcpy(item, l->data + btrfs_item_offset(l->items + slot), + sizeof(*item)); + btrfs_disk_key_to_cpu(key, &l->items[slot].key); + btrfs_release_path(root, &path); + ret = 0; +out: + return ret; +} + +int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_root_item *item) +{ + struct btrfs_path path; + struct btrfs_leaf *l; + int ret; + int slot; + + btrfs_init_path(&path); + ret = btrfs_search_slot(root, key, &path, 0, 1); + if (ret < 0) + goto out; + BUG_ON(ret != 0); + l = &path.nodes[0]->leaf; + slot = path.slots[0]; + memcpy(l->data + btrfs_item_offset(l->items + slot), item, + sizeof(*item)); +out: + btrfs_release_path(root, &path); + return ret; +} + +int btrfs_insert_root(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_root_item *item) +{ + int ret; + ret = btrfs_insert_item(root, key, item, sizeof(*item)); 
+ BUG_ON(ret); + return ret; +} + +int btrfs_del_root(struct btrfs_root *root, struct btrfs_key *key) +{ + struct btrfs_path path; + int ret; + + btrfs_init_path(&path); + ret = btrfs_search_slot(root, key, &path, -1, 1); + if (ret < 0) + goto out; + BUG_ON(ret != 0); + ret = btrfs_del_item(root, &path); +out: + btrfs_release_path(root, &path); + return ret; +} -- cgit v1.2.3 From 62e2749e03a855d98855f9ce032dbe72d5fad148 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 15 Mar 2007 12:56:47 -0400 Subject: Btrfs: Use a chunk of the key flags to record the item type. Add (untested and simple) directory item code Fix comp_keys to use the new key ordering Add btrfs_insert_empty_item Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/TODO | 1 + fs/btrfs/ctree.c | 51 +++++++++----- fs/btrfs/ctree.h | 182 ++++++++++++++++++++++++++++++++++--------------- fs/btrfs/dir-item.c | 102 +++++++++++++++++++++++++++ fs/btrfs/extent-tree.c | 17 +++-- fs/btrfs/hash.c | 86 +++++++++++++++++++++++ fs/btrfs/hasher.c | 23 +++++++ fs/btrfs/kerncompat.h | 1 + fs/btrfs/mkfs.c | 28 ++++---- fs/btrfs/print-tree.c | 36 +++++++--- fs/btrfs/quick-test.c | 3 +- fs/btrfs/random-test.c | 10 ++- fs/btrfs/root-tree.c | 2 +- 14 files changed, 435 insertions(+), 109 deletions(-) create mode 100644 fs/btrfs/dir-item.c create mode 100644 fs/btrfs/hash.c create mode 100644 fs/btrfs/hasher.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 13ed6313ac85..ee78cdd08c04 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -3,7 +3,7 @@ CC=gcc CFLAGS = -g -Wall headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \ - root-tree.o + root-tree.o dir-item.o hash.o # if you don't have sparse installed, use ls instead CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \ diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO index 
8142be88d71c..249e2e90bea8 100644 --- a/fs/btrfs/TODO +++ b/fs/btrfs/TODO @@ -8,6 +8,7 @@ * Add block mapping tree (simple dm layer) * Add simple tree locking (semaphore per tree) * Make allocator smarter +* Do actual block accounting * Port into the kernel * Add virtual filesystems, mountable snapshots * Get rid of struct ctree_path, limiting tree levels held at one time diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ece8de7f38ef..7edfbd468042 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -107,14 +107,14 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) return 1; if (k1.objectid < k2->objectid) return -1; - if (k1.flags > k2->flags) - return 1; - if (k1.flags < k2->flags) - return -1; if (k1.offset > k2->offset) return 1; if (k1.offset < k2->offset) return -1; + if (k1.flags > k2->flags) + return 1; + if (k1.flags < k2->flags) + return -1; return 0; } @@ -1122,8 +1122,8 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path, * Given a key and some data, insert an item into the tree. * This does all the path init required, making room in the tree if needed. 
*/ -int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key, - void *data, int data_size) +int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 data_size) { int ret = 0; int slot; @@ -1132,7 +1132,6 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key, struct btrfs_buffer *leaf_buf; u32 nritems; unsigned int data_end; - struct btrfs_path path; struct btrfs_disk_key disk_key; btrfs_cpu_key_to_disk(&disk_key, cpu_key); @@ -1140,17 +1139,16 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key, /* create a root if there isn't one */ if (!root->node) BUG(); - btrfs_init_path(&path); - ret = btrfs_search_slot(root, cpu_key, &path, data_size, 1); + ret = btrfs_search_slot(root, cpu_key, path, data_size, 1); if (ret == 0) { - btrfs_release_path(root, &path); + btrfs_release_path(root, path); return -EEXIST; } if (ret < 0) goto out; - slot_orig = path.slots[0]; - leaf_buf = path.nodes[0]; + slot_orig = path->slots[0]; + leaf_buf = path->nodes[0]; leaf = &leaf_buf->leaf; nritems = btrfs_header_nritems(&leaf->header); @@ -1160,7 +1158,7 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key, sizeof(struct btrfs_item) + data_size) BUG(); - slot = path.slots[0]; + slot = path->slots[0]; BUG_ON(slot < 0); if (slot != nritems) { int i; @@ -1186,23 +1184,42 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key, data_end, old_data - data_end); data_end = old_data; } - /* copy the new data in */ + /* setup the item for the new data */ memcpy(&leaf->items[slot].key, &disk_key, sizeof(struct btrfs_disk_key)); btrfs_set_item_offset(leaf->items + slot, data_end - data_size); btrfs_set_item_size(leaf->items + slot, data_size); - memcpy(btrfs_leaf_data(leaf) + data_end - data_size, data, data_size); btrfs_set_header_nritems(&leaf->header, nritems + 1); ret = 0; if (slot == 0) - ret = fixup_low_keys(root, &path, &disk_key, 1); + 
ret = fixup_low_keys(root, path, &disk_key, 1); BUG_ON(list_empty(&leaf_buf->dirty)); if (btrfs_leaf_free_space(root, leaf) < 0) BUG(); - check_leaf(root, &path, 0); + check_leaf(root, path, 0); out: + return ret; +} + +/* + * Given a key and some data, insert an item into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key, + void *data, u32 data_size) +{ + int ret = 0; + struct btrfs_path path; + u8 *ptr; + + btrfs_init_path(&path); + ret = btrfs_insert_empty_item(root, &path, cpu_key, data_size); + if (!ret) { + ptr = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], u8); + memcpy(ptr, data, data_size); + } btrfs_release_path(root, &path); return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 73ebc824924a..e8a26fd8ea9f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -58,39 +58,6 @@ struct btrfs_header { #define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->blocksize)) struct btrfs_buffer; - -struct btrfs_root_item { - __le64 blocknr; - __le32 flags; - __le64 block_limit; - __le64 blocks_used; - __le32 refs; -}; - -/* - * in ram representation of the tree. extent_root is used for all allocations - * and for the extent tree extent_root root. current_insert is used - * only for the extent tree. 
- */ -struct btrfs_root { - struct btrfs_buffer *node; - struct btrfs_buffer *commit_root; - struct btrfs_root *extent_root; - struct btrfs_root *tree_root; - struct btrfs_key current_insert; - struct btrfs_key last_insert; - int fp; - struct radix_tree_root cache_radix; - struct radix_tree_root pinned_radix; - struct list_head trans; - struct list_head cache; - int cache_size; - int ref_cows; - struct btrfs_root_item root_item; - struct btrfs_key root_key; - u32 blocksize; -}; - /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc @@ -108,8 +75,7 @@ struct btrfs_super_block { } __attribute__ ((__packed__)); /* - * A leaf is full of items. The exact type of item is defined by - * the key flags parameter. offset and size tell us where to find + * A leaf is full of items. offset and size tell us where to find * the item in the leaf (relative to the start of the data area) */ struct btrfs_item { @@ -144,15 +110,6 @@ struct btrfs_node { struct btrfs_key_ptr ptrs[]; } __attribute__ ((__packed__)); -/* - * items in the extent btree are used to record the objectid of the - * owner of the block and the number of references - */ -struct btrfs_extent_item { - __le32 refs; - __le64 owner; -} __attribute__ ((__packed__)); - /* * btrfs_paths remember the path taken from the root down to the leaf. 
* level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point @@ -166,6 +123,94 @@ struct btrfs_path { int slots[BTRFS_MAX_LEVEL]; }; +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ +struct btrfs_extent_item { + __le32 refs; + __le64 owner; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + __le64 objectid; + __le16 flags; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_root_item { + __le64 blocknr; + __le32 flags; + __le64 block_limit; + __le64 blocks_used; + __le32 refs; +}; + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. current_insert is used + * only for the extent tree. + */ +struct btrfs_root { + struct btrfs_buffer *node; + struct btrfs_buffer *commit_root; + struct btrfs_root *extent_root; + struct btrfs_root *tree_root; + struct btrfs_key current_insert; + struct btrfs_key last_insert; + int fp; + struct radix_tree_root cache_radix; + struct radix_tree_root pinned_radix; + struct list_head trans; + struct list_head cache; + int cache_size; + int ref_cows; + struct btrfs_root_item root_item; + struct btrfs_key root_key; + u32 blocksize; +}; + + +/* the lower bits in the key flags defines the item type */ +#define BTRFS_KEY_TYPE_MAX 256 +#define BTRFS_KEY_TYPE_MASK (BTRFS_KEY_TYPE_MAX - 1) +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_DIR_ITEM_KEY 2 +#define BTRFS_ROOT_ITEM_KEY 3 +#define BTRFS_EXTENT_ITEM_KEY 4 +#define BTRFS_STRING_ITEM_KEY 5 + +static inline u64 btrfs_dir_objectid(struct btrfs_dir_item *d) +{ + return le64_to_cpu(d->objectid); +} + +static inline void btrfs_set_dir_objectid(struct btrfs_dir_item *d, u64 val) +{ + d->objectid = cpu_to_le64(val); +} + +static inline u16 btrfs_dir_flags(struct btrfs_dir_item *d) +{ + return le16_to_cpu(d->flags); +} + +static inline void btrfs_set_dir_flags(struct btrfs_dir_item *d, u16 val) +{ + d->flags = 
cpu_to_le16(val); +} + +static inline u8 btrfs_dir_type(struct btrfs_dir_item *d) +{ + return d->type; +} + +static inline void btrfs_set_dir_type(struct btrfs_dir_item *d, u8 val) +{ + d->type = val; +} + + static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei) { return le64_to_cpu(ei->owner); @@ -238,39 +283,65 @@ static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, disk->objectid = cpu_to_le64(cpu->objectid); } -static inline u64 btrfs_key_objectid(struct btrfs_disk_key *disk) +static inline u64 btrfs_disk_key_objectid(struct btrfs_disk_key *disk) { return le64_to_cpu(disk->objectid); } -static inline void btrfs_set_key_objectid(struct btrfs_disk_key *disk, - u64 val) +static inline void btrfs_set_disk_key_objectid(struct btrfs_disk_key *disk, + u64 val) { disk->objectid = cpu_to_le64(val); } -static inline u64 btrfs_key_offset(struct btrfs_disk_key *disk) +static inline u64 btrfs_disk_key_offset(struct btrfs_disk_key *disk) { return le64_to_cpu(disk->offset); } -static inline void btrfs_set_key_offset(struct btrfs_disk_key *disk, - u64 val) +static inline void btrfs_set_disk_key_offset(struct btrfs_disk_key *disk, + u64 val) { disk->offset = cpu_to_le64(val); } -static inline u32 btrfs_key_flags(struct btrfs_disk_key *disk) +static inline u32 btrfs_disk_key_flags(struct btrfs_disk_key *disk) { return le32_to_cpu(disk->flags); } -static inline void btrfs_set_key_flags(struct btrfs_disk_key *disk, - u32 val) +static inline void btrfs_set_disk_key_flags(struct btrfs_disk_key *disk, + u32 val) { disk->flags = cpu_to_le32(val); } +static inline u32 btrfs_key_type(struct btrfs_key *key) +{ + return key->flags & BTRFS_KEY_TYPE_MASK; +} + +static inline u32 btrfs_disk_key_type(struct btrfs_disk_key *key) +{ + return le32_to_cpu(key->flags) & BTRFS_KEY_TYPE_MASK; +} + +static inline void btrfs_set_key_type(struct btrfs_key *key, u32 type) +{ + BUG_ON(type >= BTRFS_KEY_TYPE_MAX); + key->flags = (key->flags & ~((u64)BTRFS_KEY_TYPE_MASK)) | 
type; +} + +static inline void btrfs_set_disk_key_type(struct btrfs_disk_key *key, u32 type) +{ + u32 flags = btrfs_disk_key_flags(key); + BUG_ON(type >= BTRFS_KEY_TYPE_MAX); + flags = (flags & ~((u64)BTRFS_KEY_TYPE_MASK)) | type; + btrfs_set_disk_key_flags(key, flags); +} + + + static inline u64 btrfs_header_blocknr(struct btrfs_header *h) { return le64_to_cpu(h->blocknr); @@ -407,7 +478,6 @@ static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l) { return (u8 *)l->items; } - /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_leaf_data(leaf) + \ @@ -422,7 +492,9 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); void btrfs_init_path(struct btrfs_path *p); int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path); int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *key, - void *data, int data_size); + void *data, u32 data_size); +int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 data_size); int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf); int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c new file mode 100644 index 000000000000..2a888e97e1a2 --- /dev/null +++ b/fs/btrfs/dir-item.c @@ -0,0 +1,102 @@ +#include +#include +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" + +int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len, + u64 dir, u64 objectid, u8 type) +{ + int ret = 0; + struct btrfs_path path; + struct btrfs_dir_item *dir_item; + char *name_ptr; + struct btrfs_key key; + u32 data_size; + + key.objectid = dir; + key.flags = 0; + ret = btrfs_name_hash(name, name_len, &key.offset); + BUG_ON(ret); + btrfs_init_path(&path); + 
data_size = sizeof(*dir_item) + name_len; + ret = btrfs_insert_empty_item(root, &path, &key, data_size); + if (ret) + goto out; + + dir_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], + struct btrfs_dir_item); + btrfs_set_dir_objectid(dir_item, objectid); + btrfs_set_dir_type(dir_item, type); + btrfs_set_dir_flags(dir_item, 0); + name_ptr = (char *)(dir_item + 1); + memcpy(name_ptr, name, name_len); +out: + btrfs_release_path(root, &path); + return ret; +} + +int btrfs_del_dir_item(struct btrfs_root *root, u64 dir, char *name, + int name_len) +{ + int ret = 0; + struct btrfs_path path; + struct btrfs_key key; + + key.objectid = dir; + key.flags = 0; + ret = btrfs_name_hash(name, name_len, &key.offset); + BUG_ON(ret); + btrfs_init_path(&path); + ret = btrfs_search_slot(root, &key, &path, 0, 1); + if (ret) + goto out; + ret = btrfs_del_item(root, &path); +out: + btrfs_release_path(root, &path); + return ret; +} + +int btrfs_lookup_dir_item(struct btrfs_root *root, u64 dir, char *name, + int name_len, u64 *objectid) +{ + int ret = 0; + struct btrfs_path path; + struct btrfs_dir_item *dir_item; + char *name_ptr; + struct btrfs_key key; + u32 item_len; + struct btrfs_item *item; + + key.objectid = dir; + key.flags = 0; + ret = btrfs_name_hash(name, name_len, &key.offset); + BUG_ON(ret); + btrfs_init_path(&path); + ret = btrfs_search_slot(root, &key, &path, 0, 0); + if (ret) + goto out; + + dir_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], + struct btrfs_dir_item); + + item = path.nodes[0]->leaf.items + path.slots[0]; + item_len = btrfs_item_size(item); + if (item_len != name_len + sizeof(struct btrfs_dir_item)) { + BUG(); + ret = 1; + goto out; + } + name_ptr = (char *)(dir_item + 1); + if (memcmp(name_ptr, name, name_len)) { + BUG(); + ret = 1; + goto out; + } + *objectid = btrfs_dir_objectid(dir_item); +out: + btrfs_release_path(root, &path); + return ret; +} diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 
d4f1ec328399..c81e14162ef1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,6 +35,7 @@ static int inc_block_ref(struct btrfs_root *root, u64 blocknr) btrfs_init_path(&path); key.objectid = blocknr; key.flags = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); key.offset = 1; ret = btrfs_search_slot(root->extent_root, &key, &path, 0, 1); if (ret != 0) @@ -61,8 +62,9 @@ static int lookup_block_ref(struct btrfs_root *root, u64 blocknr, u32 *refs) struct btrfs_extent_item *item; btrfs_init_path(&path); key.objectid = blocknr; - key.flags = 0; key.offset = 1; + key.flags = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); ret = btrfs_search_slot(root->extent_root, &key, &path, 0, 0); if (ret != 0) BUG(); @@ -123,6 +125,7 @@ static int finish_current_insert(struct btrfs_root *extent_root) btrfs_header_parentid(&extent_root->node->node.header)); ins.offset = 1; ins.flags = 0; + btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY); for (i = 0; i < extent_root->current_insert.flags; i++) { ins.objectid = extent_root->current_insert.objectid + i; @@ -149,6 +152,7 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks) key.objectid = blocknr; key.flags = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); key.offset = num_blocks; find_free_extent(root, 0, 0, (u64)-1, &ins); @@ -228,7 +232,6 @@ static int run_pending(struct btrfs_root *extent_root) */ int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks) { - struct btrfs_key key; struct btrfs_root *extent_root = root->extent_root; struct btrfs_buffer *t; int pending_ret; @@ -240,9 +243,6 @@ int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks) CTREE_EXTENT_PENDING_DEL); return 0; } - key.objectid = blocknr; - key.flags = 0; - key.offset = num_blocks; ret = __free_extent(root, blocknr, num_blocks); pending_ret = run_pending(root->extent_root); return ret ? 
ret : pending_ret; @@ -252,7 +252,7 @@ int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks) * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: * ins->objectid == block start - * ins->flags = 0 + * ins->flags = BTRFS_EXTENT_ITEM_KEY * ins->offset == number of blocks * Any available blocks before search_start are skipped. */ @@ -275,11 +275,14 @@ static int find_free_extent(struct btrfs_root *orig_root, u64 num_blocks, total_needed += (btrfs_header_level(&root->node->node.header) + 1) * 3; if (root->last_insert.objectid > search_start) search_start = root->last_insert.objectid; + + ins->flags = 0; + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + check_failed: btrfs_init_path(&path); ins->objectid = search_start; ins->offset = 0; - ins->flags = 0; start_found = 0; ret = btrfs_search_slot(root, ins, &path, 0, 0); if (ret < 0) diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c new file mode 100644 index 000000000000..92c37eece1f2 --- /dev/null +++ b/fs/btrfs/hash.c @@ -0,0 +1,86 @@ +/* + * Original copy from: + * linux/fs/ext3/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. 
+ */ + +#include "kerncompat.h" +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[2], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while(--n); + + buf[0] += b0; + buf[1] += b1; +} + +static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i=0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +int btrfs_name_hash(const char *name, int len, u64 *hash_result) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + __u32 in[8], buf[2]; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + *hash_result = buf[0]; + *hash_result <<= 32; + *hash_result |= buf[1]; + return 0; +} diff --git a/fs/btrfs/hasher.c b/fs/btrfs/hasher.c new file mode 100644 index 000000000000..96702da4329c --- /dev/null +++ b/fs/btrfs/hasher.c @@ -0,0 +1,23 @@ +#include +#include +#include +#include "kerncompat.h" +#include "hash.h" + +int main() { + u64 result; + int ret; + char line[255]; + char *p; + while(1) { + p = fgets(line, 255, stdin); + if (!p) + break; + if (strlen(line) == 0) + continue; + ret = btrfs_name_hash(line, strlen(line), &result); + BUG_ON(ret); + printf("hash returns %Lu\n", result); + } + return 0; +} diff --git a/fs/btrfs/kerncompat.h 
b/fs/btrfs/kerncompat.h index 56239b922ba6..105d3f584089 100644 --- a/fs/btrfs/kerncompat.h +++ b/fs/btrfs/kerncompat.h @@ -21,6 +21,7 @@ #endif typedef unsigned int u32; +typedef u32 __u32; typedef unsigned long long u64; typedef unsigned char u8; typedef unsigned short u16; diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c index 317d20ce759d..9aa900811c33 100644 --- a/fs/btrfs/mkfs.c +++ b/fs/btrfs/mkfs.c @@ -50,9 +50,10 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) itemoff = __BTRFS_LEAF_DATA_SIZE(blocksize) - sizeof(root_item); btrfs_set_item_offset(&item, itemoff); btrfs_set_item_size(&item, sizeof(root_item)); - btrfs_set_key_objectid(&item.key, BTRFS_EXTENT_TREE_OBJECTID); - btrfs_set_key_offset(&item.key, 0); - btrfs_set_key_flags(&item.key, 0); + btrfs_set_disk_key_objectid(&item.key, BTRFS_EXTENT_TREE_OBJECTID); + btrfs_set_disk_key_offset(&item.key, 0); + btrfs_set_disk_key_flags(&item.key, 0); + btrfs_set_disk_key_type(&item.key, BTRFS_ROOT_ITEM_KEY); memcpy(empty_leaf->items, &item, sizeof(item)); memcpy(btrfs_leaf_data(empty_leaf) + itemoff, &root_item, sizeof(root_item)); @@ -60,7 +61,7 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) btrfs_set_root_blocknr(&root_item, start_block + 3); itemoff = itemoff - sizeof(root_item); btrfs_set_item_offset(&item, itemoff); - btrfs_set_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID); + btrfs_set_disk_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID); memcpy(empty_leaf->items + 1, &item, sizeof(item)); memcpy(btrfs_leaf_data(empty_leaf) + itemoff, &root_item, sizeof(root_item)); @@ -73,9 +74,10 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) btrfs_set_header_nritems(&empty_leaf->header, 4); /* item1, reserve blocks 0-16 */ - btrfs_set_key_objectid(&item.key, 0); - btrfs_set_key_offset(&item.key, start_block + 1); - btrfs_set_key_flags(&item.key, 0); + btrfs_set_disk_key_objectid(&item.key, 0); + btrfs_set_disk_key_offset(&item.key, start_block + 1); + btrfs_set_disk_key_flags(&item.key, 0); + 
btrfs_set_disk_key_type(&item.key, BTRFS_EXTENT_ITEM_KEY); itemoff = __BTRFS_LEAF_DATA_SIZE(blocksize) - sizeof(struct btrfs_extent_item); btrfs_set_item_offset(&item, itemoff); @@ -87,8 +89,8 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) &extent_item, btrfs_item_size(&item)); /* item2, give block 17 to the root */ - btrfs_set_key_objectid(&item.key, start_block + 1); - btrfs_set_key_offset(&item.key, 1); + btrfs_set_disk_key_objectid(&item.key, start_block + 1); + btrfs_set_disk_key_offset(&item.key, 1); itemoff = itemoff - sizeof(struct btrfs_extent_item); btrfs_set_item_offset(&item, itemoff); btrfs_set_extent_owner(&extent_item, BTRFS_ROOT_TREE_OBJECTID); @@ -97,8 +99,8 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) &extent_item, btrfs_item_size(&item)); /* item3, give block 18 to the extent root */ - btrfs_set_key_objectid(&item.key, start_block + 2); - btrfs_set_key_offset(&item.key, 1); + btrfs_set_disk_key_objectid(&item.key, start_block + 2); + btrfs_set_disk_key_offset(&item.key, 1); itemoff = itemoff - sizeof(struct btrfs_extent_item); btrfs_set_item_offset(&item, itemoff); btrfs_set_extent_owner(&extent_item, BTRFS_EXTENT_TREE_OBJECTID); @@ -107,8 +109,8 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) &extent_item, btrfs_item_size(&item)); /* item4, give block 19 to the FS root */ - btrfs_set_key_objectid(&item.key, start_block + 3); - btrfs_set_key_offset(&item.key, 1); + btrfs_set_disk_key_objectid(&item.key, start_block + 3); + btrfs_set_disk_key_offset(&item.key, 1); itemoff = itemoff - sizeof(struct btrfs_extent_item); btrfs_set_item_offset(&item, itemoff); btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 4c710190343d..f2745b247473 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -12,27 +12,41 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) struct btrfs_item *item; struct btrfs_extent_item *ei; struct btrfs_root_item 
*ri; + u32 type; printf("leaf %Lu total ptrs %d free space %d\n", btrfs_header_blocknr(&l->header), nr, btrfs_leaf_free_space(root, l)); fflush(stdout); for (i = 0 ; i < nr ; i++) { item = l->items + i; + type = btrfs_disk_key_type(&item->key); printf("\titem %d key (%Lu %Lu %u) itemoff %d itemsize %d\n", i, - btrfs_key_objectid(&item->key), - btrfs_key_offset(&item->key), - btrfs_key_flags(&item->key), + btrfs_disk_key_objectid(&item->key), + btrfs_disk_key_offset(&item->key), + btrfs_disk_key_flags(&item->key), btrfs_item_offset(item), btrfs_item_size(item)); - printf("\t\titem data %.*s\n", btrfs_item_size(item), - btrfs_leaf_data(l) + btrfs_item_offset(item)); - ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); - printf("\t\textent data refs %u owner %Lu\n", - btrfs_extent_refs(ei), btrfs_extent_owner(ei)); - ri = btrfs_item_ptr(l, i, struct btrfs_root_item); - printf("\t\troot data blocknr %Lu refs %u\n", - btrfs_root_blocknr(ri), btrfs_root_refs(ri)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + break; + case BTRFS_DIR_ITEM_KEY: + break; + case BTRFS_ROOT_ITEM_KEY: + ri = btrfs_item_ptr(l, i, struct btrfs_root_item); + printf("\t\troot data blocknr %Lu refs %u\n", + btrfs_root_blocknr(ri), btrfs_root_refs(ri)); + break; + case BTRFS_EXTENT_ITEM_KEY: + ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); + printf("\t\textent data refs %u owner %Lu\n", + btrfs_extent_refs(ei), btrfs_extent_owner(ei)); + break; + case BTRFS_STRING_ITEM_KEY: + printf("\t\titem data %.*s\n", btrfs_item_size(item), + btrfs_leaf_data(l) + btrfs_item_offset(item)); + break; + }; fflush(stdout); } } diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c index d9287cad35c9..3a2571617121 100644 --- a/fs/btrfs/quick-test.c +++ b/fs/btrfs/quick-test.c @@ -30,6 +30,8 @@ int main(int ac, char **av) { root = open_ctree("dbfile", &super); srand(55); + ins.flags = 0; + btrfs_set_key_type(&ins, BTRFS_STRING_ITEM_KEY); for (i = 0; i < run_size; i++) { buf = malloc(64); num = 
next_key(i, max_key); @@ -39,7 +41,6 @@ int main(int ac, char **av) { fprintf(stderr, "insert %d:%d\n", num, i); ins.objectid = num; ins.offset = 0; - ins.flags = 0; ret = btrfs_insert_item(root, &ins, buf, strlen(buf)); if (!ret) tree_size++; diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c index a8ef0478991b..f05135f13f9a 100644 --- a/fs/btrfs/random-test.c +++ b/fs/btrfs/random-test.c @@ -18,6 +18,7 @@ static int setup_key(struct radix_tree_root *root, struct btrfs_key *key, int ret; key->flags = 0; + btrfs_set_key_type(key, BTRFS_STRING_ITEM_KEY); key->offset = 0; again: ret = radix_tree_gang_lookup(root, (void **)res, num, 2); @@ -157,6 +158,7 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix, key.offset = 0; key.flags = 0; + btrfs_set_key_type(&key, BTRFS_STRING_ITEM_KEY); key.objectid = (unsigned long)-1; while(nr-- >= 0) { btrfs_init_path(&path); @@ -173,7 +175,8 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix, path.slots[0] -= 1; } slot = path.slots[0]; - found=btrfs_key_objectid(&path.nodes[0]->leaf.items[slot].key); + found = btrfs_disk_key_objectid( + &path.nodes[0]->leaf.items[slot].key); ret = btrfs_del_item(root, &path); count++; if (ret) { @@ -257,6 +260,7 @@ static int fill_radix(struct btrfs_root *root, struct radix_tree_root *radix) key.offset = 0; key.flags = 0; + btrfs_set_key_type(&key, BTRFS_STRING_ITEM_KEY); key.objectid = (unsigned long)-1; while(1) { btrfs_init_path(&path); @@ -274,8 +278,8 @@ static int fill_radix(struct btrfs_root *root, struct radix_tree_root *radix) slot -= 1; } for (i = slot; i >= 0; i--) { - found = btrfs_key_objectid(&path.nodes[0]-> - leaf.items[i].key); + found = btrfs_disk_key_objectid(&path.nodes[0]-> + leaf.items[i].key); radix_tree_preload(GFP_KERNEL); ret = radix_tree_insert(radix, found, (void *)found); if (ret) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 03aa7c2c6336..a24b47274037 100644 --- a/fs/btrfs/root-tree.c 
+++ b/fs/btrfs/root-tree.c @@ -27,7 +27,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, l = &path.nodes[0]->leaf; BUG_ON(path.slots[0] == 0); slot = path.slots[0] - 1; - if (btrfs_key_objectid(&l->items[slot].key) != objectid) { + if (btrfs_disk_key_objectid(&l->items[slot].key) != objectid) { ret = 1; goto out; } -- cgit v1.2.3 From 1d4f6404de26df49eb8452f8fdf7672b59f407fc Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 15 Mar 2007 15:18:43 -0400 Subject: Btrfs: directory testing code and dir item fixes Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 5 +- fs/btrfs/ctree.h | 72 +++++---- fs/btrfs/dir-item.c | 58 +++----- fs/btrfs/dir-test.c | 404 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/print-tree.c | 12 ++ 5 files changed, 477 insertions(+), 74 deletions(-) create mode 100644 fs/btrfs/dir-test.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ee78cdd08c04..ea3e1493baa9 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,4 +1,3 @@ - CC=gcc CFLAGS = -g -Wall headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h @@ -15,7 +14,7 @@ check=sparse $(CHECKFLAGS) $(check) $< $(CC) $(CFLAGS) -c $< -all: tester debug-tree quick-test +all: tester debug-tree quick-test dir-test debug-tree: $(objects) debug-tree.o gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o @@ -23,6 +22,8 @@ debug-tree: $(objects) debug-tree.o tester: $(objects) random-test.o gcc $(CFLAGS) -o tester $(objects) random-test.o +dir-test: $(objects) dir-test.o + gcc $(CFLAGS) -o dir-test $(objects) dir-test.o quick-test: $(objects) quick-test.o gcc $(CFLAGS) -o quick-test $(objects) quick-test.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e8a26fd8ea9f..7a3492d5888e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -180,37 +180,6 @@ struct btrfs_root { #define BTRFS_EXTENT_ITEM_KEY 4 #define BTRFS_STRING_ITEM_KEY 5 -static inline u64 btrfs_dir_objectid(struct btrfs_dir_item 
*d) -{ - return le64_to_cpu(d->objectid); -} - -static inline void btrfs_set_dir_objectid(struct btrfs_dir_item *d, u64 val) -{ - d->objectid = cpu_to_le64(val); -} - -static inline u16 btrfs_dir_flags(struct btrfs_dir_item *d) -{ - return le16_to_cpu(d->flags); -} - -static inline void btrfs_set_dir_flags(struct btrfs_dir_item *d, u16 val) -{ - d->flags = cpu_to_le16(val); -} - -static inline u8 btrfs_dir_type(struct btrfs_dir_item *d) -{ - return d->type; -} - -static inline void btrfs_set_dir_type(struct btrfs_dir_item *d, u8 val) -{ - d->type = val; -} - - static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei) { return le64_to_cpu(ei->owner); @@ -267,6 +236,41 @@ static inline void btrfs_set_item_size(struct btrfs_item *item, u16 val) item->size = cpu_to_le16(val); } +static inline u64 btrfs_dir_objectid(struct btrfs_dir_item *d) +{ + return le64_to_cpu(d->objectid); +} + +static inline void btrfs_set_dir_objectid(struct btrfs_dir_item *d, u64 val) +{ + d->objectid = cpu_to_le64(val); +} + +static inline u16 btrfs_dir_flags(struct btrfs_dir_item *d) +{ + return le16_to_cpu(d->flags); +} + +static inline void btrfs_set_dir_flags(struct btrfs_dir_item *d, u16 val) +{ + d->flags = cpu_to_le16(val); +} + +static inline u8 btrfs_dir_type(struct btrfs_dir_item *d) +{ + return d->type; +} + +static inline void btrfs_set_dir_type(struct btrfs_dir_item *d, u8 val) +{ + d->type = val; +} + +static inline u32 btrfs_dir_name_len(struct btrfs_item *i) +{ + return btrfs_item_size(i) - sizeof(struct btrfs_dir_item); +} + static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, struct btrfs_disk_key *disk) { @@ -506,4 +510,10 @@ int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); +int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len, + u64 dir, u64 objectid, u8 type); 
+int btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path, + u64 dir, char *name, int name_len, int mod); +int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, + char *name, int name_len); #endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 2a888e97e1a2..8043b2ef10d1 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -18,6 +18,7 @@ int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len, key.objectid = dir; key.flags = 0; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); ret = btrfs_name_hash(name, name_len, &key.offset); BUG_ON(ret); btrfs_init_path(&path); @@ -38,65 +39,40 @@ out: return ret; } -int btrfs_del_dir_item(struct btrfs_root *root, u64 dir, char *name, - int name_len) +int btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path, + u64 dir, char *name, int name_len, int mod) { - int ret = 0; - struct btrfs_path path; + int ret; struct btrfs_key key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; key.objectid = dir; key.flags = 0; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); ret = btrfs_name_hash(name, name_len, &key.offset); BUG_ON(ret); - btrfs_init_path(&path); - ret = btrfs_search_slot(root, &key, &path, 0, 1); - if (ret) - goto out; - ret = btrfs_del_item(root, &path); -out: - btrfs_release_path(root, &path); + ret = btrfs_search_slot(root, &key, path, ins_len, cow); return ret; } -int btrfs_lookup_dir_item(struct btrfs_root *root, u64 dir, char *name, - int name_len, u64 *objectid) +int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, + char *name, int name_len) { - int ret = 0; - struct btrfs_path path; + struct btrfs_item *item; struct btrfs_dir_item *dir_item; char *name_ptr; - struct btrfs_key key; u32 item_len; - struct btrfs_item *item; - - key.objectid = dir; - key.flags = 0; - ret = btrfs_name_hash(name, name_len, &key.offset); - BUG_ON(ret); - btrfs_init_path(&path); - ret = btrfs_search_slot(root, &key, &path, 0, 0); - if (ret) - goto out; - - dir_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], - struct btrfs_dir_item); - - item = path.nodes[0]->leaf.items + path.slots[0]; + item = path->nodes[0]->leaf.items + path->slots[0]; item_len = btrfs_item_size(item); if (item_len != name_len + sizeof(struct btrfs_dir_item)) { - BUG(); - ret = 1; - goto out; + return 0; } + dir_item = btrfs_item_ptr(&path->nodes[0]->leaf, path->slots[0], + struct btrfs_dir_item); name_ptr = (char *)(dir_item + 1); if (memcmp(name_ptr, name, name_len)) { - BUG(); - ret = 1; - goto out; + return 0; } - *objectid = btrfs_dir_objectid(dir_item); -out: - btrfs_release_path(root, &path); - return ret; + return 1; } diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c new file mode 100644 index 000000000000..b482b8f49f8a --- /dev/null +++ b/fs/btrfs/dir-test.c @@ -0,0 +1,404 @@ +#include +#include +#include +#include +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include 
"disk-io.h" +#include "print-tree.h" +#include "hash.h" + +int keep_running = 1; +struct btrfs_super_block super; +static u64 dir_oid = 44556; +static u64 file_oid = 33778; + +static int find_num(struct radix_tree_root *root, unsigned long *num_ret, + int exists) +{ + unsigned long num = rand(); + unsigned long res[2]; + int ret; + +again: + ret = radix_tree_gang_lookup(root, (void **)res, num, 2); + if (exists) { + if (ret == 0) + return -1; + num = res[0]; + } else if (ret != 0 && num == res[0]) { + num++; + if (ret > 1 && num == res[1]) { + num++; + goto again; + } + } + *num_ret = num; + return 0; +} + +static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix) +{ + int ret; + char buf[128]; + unsigned long oid; + struct btrfs_path path; + + find_num(radix, &oid, 0); + sprintf(buf, "str-%lu", oid); + + ret = btrfs_insert_dir_item(root, buf, strlen(buf), dir_oid, file_oid, + 1); + if (ret) + goto error; + + radix_tree_preload(GFP_KERNEL); + ret = radix_tree_insert(radix, oid, (void *)oid); + radix_tree_preload_end(); + if (ret) + goto error; + return ret; +error: + if (ret != -EEXIST) + goto fatal; + + /* + * if we got an EEXIST, it may be due to hash collision, double + * check + */ + btrfs_init_path(&path); + ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0); + if (ret) + goto fatal_release; + if (!btrfs_match_dir_item_name(root, &path, buf, strlen(buf))) { + struct btrfs_dir_item *di; + char *found; + u32 found_len; + u64 myhash; + u64 foundhash; + + di = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], + struct btrfs_dir_item); + found = (char *)(di + 1); + found_len = btrfs_dir_name_len(path.nodes[0]->leaf.items + + path.slots[0]); + btrfs_name_hash(buf, strlen(buf), &myhash); + btrfs_name_hash(found, found_len, &foundhash); + if (myhash != foundhash) + goto fatal_release; + btrfs_release_path(root, &path); + return 0; + } +fatal_release: + btrfs_release_path(root, &path); +fatal: + printf("failed to insert %lu ret 
%d\n", oid, ret); + return -1; +} + +static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix) +{ + int ret; + char buf[128]; + unsigned long oid; + + ret = find_num(radix, &oid, 1); + if (ret < 0) + return 0; + sprintf(buf, "str-%lu", oid); + + ret = btrfs_insert_dir_item(root, buf, strlen(buf), dir_oid, file_oid, + 1); + if (ret != -EEXIST) { + printf("insert on %s gave us %d\n", buf, ret); + return 1; + } + return 0; +} + +static int del_one(struct btrfs_root *root, struct radix_tree_root *radix) +{ + int ret; + char buf[128]; + unsigned long oid; + struct btrfs_path path; + unsigned long *ptr; + + ret = find_num(radix, &oid, 1); + if (ret < 0) + return 0; + sprintf(buf, "str-%lu", oid); + btrfs_init_path(&path); + ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), -1); + if (ret) + goto out_release; + ret = btrfs_del_item(root, &path); + if (ret) + goto out_release; + btrfs_release_path(root, &path); + ptr = radix_tree_delete(radix, oid); + if (!ptr) { + ret = -5555; + goto out; + } + return 0; +out_release: + btrfs_release_path(root, &path); +out: + printf("failed to delete %lu %d\n", oid, ret); + return -1; +} + +static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix) +{ + struct btrfs_path path; + char buf[128]; + int ret; + unsigned long oid; + + ret = find_num(radix, &oid, 1); + if (ret < 0) + return 0; + sprintf(buf, "str-%lu", oid); + btrfs_init_path(&path); + ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0); + btrfs_release_path(root, &path); + if (ret) { + printf("unable to find key %lu\n", oid); + return -1; + } + return 0; +} + +static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix) +{ + struct btrfs_path path; + char buf[128]; + int ret; + unsigned long oid; + + ret = find_num(radix, &oid, 0); + if (ret < 0) + return 0; + sprintf(buf, "str-%lu", oid); + btrfs_init_path(&path); + ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, 
strlen(buf), 0); + btrfs_release_path(root, &path); + if (!ret) { + printf("able to find key that should not exist %lu\n", oid); + return -1; + } + return 0; +} + +static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix, + int nr) +{ + struct btrfs_path path; + struct btrfs_key key; + unsigned long found = 0; + u32 found_len; + int ret; + int slot; + int *ptr; + int count = 0; + char buf[128]; + struct btrfs_dir_item *di; + + key.offset = (u64)-1; + key.flags = 0; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.objectid = dir_oid; + while(nr-- >= 0) { + btrfs_init_path(&path); + ret = btrfs_search_slot(root, &key, &path, -1, 1); + if (ret < 0) { + btrfs_release_path(root, &path); + return ret; + } + if (ret != 0) { + if (path.slots[0] == 0) { + btrfs_release_path(root, &path); + break; + } + path.slots[0] -= 1; + } + slot = path.slots[0]; + di = btrfs_item_ptr(&path.nodes[0]->leaf, slot, + struct btrfs_dir_item); + found_len = btrfs_dir_name_len(path.nodes[0]->leaf.items + + slot); + memcpy(buf, (char *)(di + 1), found_len); + BUG_ON(found_len > 128); + buf[found_len] = '\0'; + found = atoi(buf + 4); + ret = btrfs_del_item(root, &path); + count++; + if (ret) { + fprintf(stderr, + "failed to remove %lu from tree\n", + found); + return -1; + } + btrfs_release_path(root, &path); + ptr = radix_tree_delete(radix, found); + if (!ptr) + goto error; + if (!keep_running) + break; + } + return 0; +error: + fprintf(stderr, "failed to delete from the radix %lu\n", found); + return -1; +} + +static int fill_tree(struct btrfs_root *root, struct radix_tree_root *radix, + int count) +{ + int i; + int ret = 0; + for (i = 0; i < count; i++) { + ret = ins_one(root, radix); + if (ret) { + fprintf(stderr, "fill failed\n"); + goto out; + } + if (i % 1000 == 0) { + ret = btrfs_commit_transaction(root, &super); + if (ret) { + fprintf(stderr, "fill commit failed\n"); + return ret; + } + } + if (i && i % 10000 == 0) { + printf("bigfill %d\n", i); + } + if 
(!keep_running) + break; + } +out: + return ret; +} + +static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix) +{ + int ret; + int nr = rand() % 5000; + static int run_nr = 0; + + /* do the bulk op much less frequently */ + if (run_nr++ % 100) + return 0; + ret = empty_tree(root, radix, nr); + if (ret) + return ret; + ret = fill_tree(root, radix, nr); + if (ret) + return ret; + return 0; +} + + +int (*ops[])(struct btrfs_root *root, struct radix_tree_root *radix) = + { ins_one, insert_dup, del_one, lookup_item, + lookup_enoent, bulk_op }; + +void sigstopper(int ignored) +{ + keep_running = 0; + fprintf(stderr, "caught exit signal, stopping\n"); +} + +int print_usage(void) +{ + printf("usage: tester [-ih] [-c count] [-f count]\n"); + printf("\t -c count -- iteration count after filling\n"); + printf("\t -f count -- run this many random inserts before starting\n"); + printf("\t -i -- only do initial fill\n"); + printf("\t -h -- this help text\n"); + exit(1); +} +int main(int ac, char **av) +{ + RADIX_TREE(radix, GFP_KERNEL); + struct btrfs_root *root; + int i; + int ret; + int count; + int op; + int iterations = 20000; + int init_fill_count = 800000; + int err = 0; + int initial_only = 0; + radix_tree_init(); + + printf("removing old tree\n"); + unlink("dbfile"); + root = open_ctree("dbfile", &super); + + signal(SIGTERM, sigstopper); + signal(SIGINT, sigstopper); + + for (i = 1 ; i < ac ; i++) { + if (strcmp(av[i], "-i") == 0) { + initial_only = 1; + } else if (strcmp(av[i], "-c") == 0) { + iterations = atoi(av[i+1]); + i++; + } else if (strcmp(av[i], "-f") == 0) { + init_fill_count = atoi(av[i+1]); + i++; + } else { + print_usage(); + } + } + printf("initial fill\n"); + ret = fill_tree(root, &radix, init_fill_count); + printf("starting run\n"); + if (ret) { + err = ret; + goto out; + } + if (initial_only == 1) { + goto out; + } + for (i = 0; i < iterations; i++) { + op = rand() % ARRAY_SIZE(ops); + count = rand() % 128; + if (i % 2000 == 0) { + 
printf("%d\n", i); + fflush(stdout); + } + if (i && i % 5000 == 0) { + printf("open & close, root level %d nritems %d\n", + btrfs_header_level(&root->node->node.header), + btrfs_header_nritems(&root->node->node.header)); + close_ctree(root, &super); + root = open_ctree("dbfile", &super); + } + while(count--) { + ret = ops[op](root, &radix); + if (ret) { + fprintf(stderr, "op %d failed %d:%d\n", + op, i, iterations); + btrfs_print_tree(root, root->node); + fprintf(stderr, "op %d failed %d:%d\n", + op, i, iterations); + err = ret; + goto out; + } + if (ops[op] == bulk_op) + break; + if (keep_running == 0) { + err = 0; + goto out; + } + } + } +out: + close_ctree(root, &super); + return err; +} + diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index f2745b247473..ad244d658117 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -12,7 +12,10 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) struct btrfs_item *item; struct btrfs_extent_item *ei; struct btrfs_root_item *ri; + struct btrfs_dir_item *di; u32 type; + u32 namelen; + printf("leaf %Lu total ptrs %d free space %d\n", btrfs_header_blocknr(&l->header), nr, btrfs_leaf_free_space(root, l)); @@ -31,6 +34,15 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) case BTRFS_INODE_ITEM_KEY: break; case BTRFS_DIR_ITEM_KEY: + namelen = btrfs_item_size(l->items + i) - sizeof(*di); + di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + printf("\t\tdir oid %Lu flags %u type %u\n", + btrfs_dir_objectid(di), + btrfs_dir_flags(di), + btrfs_dir_type(di)); + printf("\t\tname %.*s\n", + namelen, (char *)(di + 1)); + break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); -- cgit v1.2.3 From 1e1d27017c5986c1ea81181506042cf9cba3f6ea Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 15 Mar 2007 19:03:33 -0400 Subject: Btrfs: add inode item Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/TODO | 1 + fs/btrfs/ctree.h | 175 
++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/file-item.c | 7 ++ fs/btrfs/inode-item.c | 37 +++++++++++ 5 files changed, 215 insertions(+), 7 deletions(-) create mode 100644 fs/btrfs/file-item.c create mode 100644 fs/btrfs/inode-item.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ea3e1493baa9..36964f54bb47 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -2,7 +2,7 @@ CC=gcc CFLAGS = -g -Wall headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \ - root-tree.o dir-item.o hash.o + root-tree.o dir-item.o hash.o file-item.o inode-item.o # if you don't have sparse installed, use ls instead CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \ diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO index 249e2e90bea8..2ae4b3aae1e5 100644 --- a/fs/btrfs/TODO +++ b/fs/btrfs/TODO @@ -9,6 +9,7 @@ * Add simple tree locking (semaphore per tree) * Make allocator smarter * Do actual block accounting +* Check compat and incompat flags on the inode * Port into the kernel * Add virtual filesystems, mountable snapshots * Get rid of struct ctree_path, limiting tree levels held at one time diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7a3492d5888e..dbf3917833fe 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -132,6 +132,37 @@ struct btrfs_extent_item { __le64 owner; } __attribute__ ((__packed__)); +struct btrfs_inode_timespec { + __le32 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +/* + * there is no padding here on purpose. 
If you want to extent the inode, + * make a new item type + */ +struct btrfs_inode_item { + __le64 generation; + __le64 size; + __le64 nblocks; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le32 rdev; + __le16 flags; + __le16 compat_flags; + struct btrfs_inode_timespec atime; + struct btrfs_inode_timespec ctime; + struct btrfs_inode_timespec mtime; + struct btrfs_inode_timespec otime; +} __attribute__ ((__packed__)); + +/* inline data is just a blob of bytes */ +struct btrfs_inline_data_item { + u8 data; +} __attribute__ ((__packed__)); + struct btrfs_dir_item { __le64 objectid; __le16 flags; @@ -170,15 +201,149 @@ struct btrfs_root { u32 blocksize; }; - /* the lower bits in the key flags defines the item type */ #define BTRFS_KEY_TYPE_MAX 256 #define BTRFS_KEY_TYPE_MASK (BTRFS_KEY_TYPE_MAX - 1) + +/* + * inode items have the data typically returned from stat and store other + * info about object characteristics. There is one for every file and dir in + * the FS + */ #define BTRFS_INODE_ITEM_KEY 1 + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. + */ #define BTRFS_DIR_ITEM_KEY 2 -#define BTRFS_ROOT_ITEM_KEY 3 -#define BTRFS_EXTENT_ITEM_KEY 4 -#define BTRFS_STRING_ITEM_KEY 5 +/* + * inline data is file data that fits in the btree. + */ +#define BTRFS_INLINE_DATA_KEY 3 +/* + * extent data is for data that can't fit in the btree. It points to + * a (hopefully) huge chunk of disk + */ +#define BTRFS_EXTENT_DATA_KEY 4 +/* + * root items point to tree roots. There are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 5 +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 6 +/* + * string items are for debugging. 
They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 7 + +static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i) +{ + return le64_to_cpu(i->generation); +} + +static inline void btrfs_set_inode_generation(struct btrfs_inode_item *i, + u64 val) +{ + i->generation = cpu_to_le64(val); +} + +static inline u64 btrfs_inode_size(struct btrfs_inode_item *i) +{ + return le64_to_cpu(i->size); +} + +static inline void btrfs_set_inode_size(struct btrfs_inode_item *i, u64 val) +{ + i->size = cpu_to_le64(val); +} + +static inline u64 btrfs_inode_nblocks(struct btrfs_inode_item *i) +{ + return le64_to_cpu(i->nblocks); +} + +static inline void btrfs_set_inode_nblocks(struct btrfs_inode_item *i, u64 val) +{ + i->nblocks = cpu_to_le64(val); +} + +static inline u32 btrfs_inode_nlink(struct btrfs_inode_item *i) +{ + return le32_to_cpu(i->nlink); +} + +static inline void btrfs_set_inode_nlink(struct btrfs_inode_item *i, u32 val) +{ + i->nlink = cpu_to_le32(val); +} + +static inline u32 btrfs_inode_uid(struct btrfs_inode_item *i) +{ + return le32_to_cpu(i->uid); +} + +static inline void btrfs_set_inode_uid(struct btrfs_inode_item *i, u32 val) +{ + i->uid = cpu_to_le32(val); +} + +static inline u32 btrfs_inode_gid(struct btrfs_inode_item *i) +{ + return le32_to_cpu(i->gid); +} + +static inline void btrfs_set_inode_gid(struct btrfs_inode_item *i, u32 val) +{ + i->gid = cpu_to_le32(val); +} + +static inline u32 btrfs_inode_mode(struct btrfs_inode_item *i) +{ + return le32_to_cpu(i->mode); +} + +static inline void btrfs_set_inode_mode(struct btrfs_inode_item *i, u32 val) +{ + i->mode = cpu_to_le32(val); +} + +static inline u32 btrfs_inode_rdev(struct btrfs_inode_item *i) +{ + return le32_to_cpu(i->rdev); +} + +static inline void btrfs_set_inode_rdev(struct btrfs_inode_item *i, u32 val) +{ + i->rdev = cpu_to_le32(val); +} + +static inline u16 btrfs_inode_flags(struct btrfs_inode_item *i) +{ + return le16_to_cpu(i->flags); +} + +static inline 
void btrfs_set_inode_flags(struct btrfs_inode_item *i, u16 val) +{ + i->flags = cpu_to_le16(val); +} + +static inline u16 btrfs_inode_compat_flags(struct btrfs_inode_item *i) +{ + return le16_to_cpu(i->compat_flags); +} + +static inline void btrfs_set_inode_compat_flags(struct btrfs_inode_item *i, + u16 val) +{ + i->compat_flags = cpu_to_le16(val); +} + static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei) { @@ -344,8 +509,6 @@ static inline void btrfs_set_disk_key_type(struct btrfs_disk_key *key, u32 type) btrfs_set_disk_key_flags(key, flags); } - - static inline u64 btrfs_header_blocknr(struct btrfs_header *h) { return le64_to_cpu(h->blocknr); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c new file mode 100644 index 000000000000..f5b36c732c5e --- /dev/null +++ b/fs/btrfs/file-item.c @@ -0,0 +1,7 @@ +#include +#include +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" + diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c new file mode 100644 index 000000000000..4908f9afdf86 --- /dev/null +++ b/fs/btrfs/inode-item.c @@ -0,0 +1,37 @@ +#include +#include +#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" + +int btrfs_insert_inode(struct btrfs_root *root, u64 objectid, + struct btrfs_inode_item *inode_item) +{ + struct btrfs_path path; + struct btrfs_key key; + int ret; + key.objectid = objectid; + key.flags = 0; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + btrfs_init_path(&path); + ret = btrfs_insert_item(root, &key, inode_item, sizeof(*inode_item)); + btrfs_release_path(root, &path); + return ret; +} + +int btrfs_lookup_inode(struct btrfs_root *root, struct btrfs_path *path, + u64 objectid, int mod) +{ + struct btrfs_key key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + + key.objectid = objectid; + key.flags = 0; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + return btrfs_search_slot(root, &key, path, ins_len, cow); +} -- cgit v1.2.3 From e089f05c18ab36ed5fa7e2319052e03ab800d518 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 16 Mar 2007 16:20:31 -0400 Subject: Btrfs: transaction handles everywhere Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 4 +- fs/btrfs/TODO | 2 + fs/btrfs/ctree.c | 204 +++++++++++++++++++++++++------------------------ fs/btrfs/ctree.h | 63 +++++++++------ fs/btrfs/debug-tree.c | 1 + fs/btrfs/dir-item.c | 20 +++-- fs/btrfs/dir-test.c | 70 ++++++++++------- fs/btrfs/disk-io.c | 60 +++++++++------ fs/btrfs/disk-io.h | 16 ++-- fs/btrfs/extent-tree.c | 135 +++++++++++++++++--------------- fs/btrfs/inode-item.c | 15 ++-- fs/btrfs/print-tree.h | 4 +- fs/btrfs/quick-test.c | 23 +++--- fs/btrfs/random-test.c | 62 ++++++++------- fs/btrfs/root-tree.c | 23 +++--- fs/btrfs/transaction.h | 27 +++++++ 16 files changed, 426 insertions(+), 303 deletions(-) create mode 100644 fs/btrfs/transaction.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 36964f54bb47..76bcd87d8575 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,5 +1,5 @@ CC=gcc -CFLAGS = -g -Wall +CFLAGS = -g -Wall -Werror headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \ root-tree.o dir-item.o hash.o file-item.o inode-item.o @@ -14,7 +14,7 @@ check=sparse $(CHECKFLAGS) $(check) $< $(CC) $(CFLAGS) -c $< -all: tester debug-tree quick-test dir-test +all: tester debug-tree quick-test dir-test tags debug-tree: $(objects) debug-tree.o gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO index 2ae4b3aae1e5..e9894999322e 100644 --- a/fs/btrfs/TODO +++ b/fs/btrfs/TODO @@ -8,6 +8,8 @@ * Add block mapping tree (simple 
dm layer) * Add simple tree locking (semaphore per tree) * Make allocator smarter +* Only pin blocks allocated in this transaction +* Add transaction handles * Do actual block accounting * Check compat and incompat flags on the inode * Port into the kernel diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 13128b5ed65c..17a3ff2f1828 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -6,17 +6,18 @@ #include "disk-io.h" #include "print-tree.h" -static int split_node(struct btrfs_root *root, struct btrfs_path *path, - int level); -static int split_leaf(struct btrfs_root *root, struct btrfs_path *path, - int data_size); -static int push_node_left(struct btrfs_root *root, struct btrfs_buffer *dst, - struct btrfs_buffer *src); -static int balance_node_right(struct btrfs_root *root, - struct btrfs_buffer *dst_buf, +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size); +static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_buffer *dst, struct btrfs_buffer + *src); +static int balance_node_right(struct btrfs_trans_handle *trans, struct + btrfs_root *root, struct btrfs_buffer *dst_buf, struct btrfs_buffer *src_buf); -static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, - int slot); +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); inline void btrfs_init_path(struct btrfs_path *p) { @@ -34,11 +35,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) memset(p, 0, sizeof(*p)); } -static int btrfs_cow_block(struct btrfs_root *root, - struct btrfs_buffer *buf, - struct btrfs_buffer *parent, - int parent_slot, - struct btrfs_buffer **cow_ret) +static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct 
btrfs_root + *root, struct btrfs_buffer *buf, struct btrfs_buffer + *parent, int parent_slot, struct btrfs_buffer + **cow_ret) { struct btrfs_buffer *cow; @@ -46,22 +46,22 @@ static int btrfs_cow_block(struct btrfs_root *root, *cow_ret = buf; return 0; } - cow = btrfs_alloc_free_block(root); + cow = btrfs_alloc_free_block(trans, root); memcpy(&cow->node, &buf->node, root->blocksize); btrfs_set_header_blocknr(&cow->node.header, cow->blocknr); *cow_ret = cow; - btrfs_inc_ref(root, buf); + btrfs_inc_ref(trans, root, buf); if (buf == root->node) { root->node = cow; cow->count++; if (buf != root->commit_root) - btrfs_free_extent(root, buf->blocknr, 1, 1); + btrfs_free_extent(trans, root, buf->blocknr, 1, 1); btrfs_block_release(root, buf); } else { btrfs_set_node_blockptr(&parent->node, parent_slot, cow->blocknr); BUG_ON(list_empty(&parent->dirty)); - btrfs_free_extent(root, buf->blocknr, 1, 1); + btrfs_free_extent(trans, root, buf->blocknr, 1, 1); } btrfs_block_release(root, buf); return 0; @@ -266,8 +266,8 @@ static struct btrfs_buffer *read_node_slot(struct btrfs_root *root, return read_tree_block(root, btrfs_node_blockptr(node, slot)); } -static int balance_level(struct btrfs_root *root, struct btrfs_path *path, - int level) +static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level) { struct btrfs_buffer *right_buf; struct btrfs_buffer *mid_buf; @@ -310,8 +310,8 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path, btrfs_block_release(root, mid_buf); /* once for the root ptr */ btrfs_block_release(root, mid_buf); - clean_tree_block(root, mid_buf); - return btrfs_free_extent(root, blocknr, 1, 1); + clean_tree_block(trans, root, mid_buf); + return btrfs_free_extent(trans, root, blocknr, 1, 1); } parent = &parent_buf->node; @@ -324,11 +324,11 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path, /* first, try to make some room in the middle buffer */ if 
(left_buf) { - btrfs_cow_block(root, left_buf, parent_buf, - pslot - 1, &left_buf); + btrfs_cow_block(trans, root, left_buf, parent_buf, pslot - 1, + &left_buf); left = &left_buf->node; orig_slot += btrfs_header_nritems(&left->header); - wret = push_node_left(root, left_buf, mid_buf); + wret = push_node_left(trans, root, left_buf, mid_buf); if (wret < 0) ret = wret; } @@ -337,22 +337,23 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path, * then try to empty the right most buffer into the middle */ if (right_buf) { - btrfs_cow_block(root, right_buf, parent_buf, - pslot + 1, &right_buf); + btrfs_cow_block(trans, root, right_buf, parent_buf, pslot + 1, + &right_buf); right = &right_buf->node; - wret = push_node_left(root, mid_buf, right_buf); + wret = push_node_left(trans, root, mid_buf, right_buf); if (wret < 0) ret = wret; if (btrfs_header_nritems(&right->header) == 0) { u64 blocknr = right_buf->blocknr; btrfs_block_release(root, right_buf); - clean_tree_block(root, right_buf); + clean_tree_block(trans, root, right_buf); right_buf = NULL; right = NULL; - wret = del_ptr(root, path, level + 1, pslot + 1); + wret = del_ptr(trans, root, path, level + 1, pslot + + 1); if (wret) ret = wret; - wret = btrfs_free_extent(root, blocknr, 1, 1); + wret = btrfs_free_extent(trans, root, blocknr, 1, 1); if (wret) ret = wret; } else { @@ -373,7 +374,7 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path, * right */ BUG_ON(!left_buf); - wret = balance_node_right(root, mid_buf, left_buf); + wret = balance_node_right(trans, root, mid_buf, left_buf); if (wret < 0) ret = wret; BUG_ON(wret == 1); @@ -382,13 +383,13 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path, /* we've managed to empty the middle node, drop it */ u64 blocknr = mid_buf->blocknr; btrfs_block_release(root, mid_buf); - clean_tree_block(root, mid_buf); + clean_tree_block(trans, root, mid_buf); mid_buf = NULL; mid = NULL; - wret = del_ptr(root, 
path, level + 1, pslot); + wret = del_ptr(trans, root, path, level + 1, pslot); if (wret) ret = wret; - wret = btrfs_free_extent(root, blocknr, 1, 1); + wret = btrfs_free_extent(trans, root, blocknr, 1, 1); if (wret) ret = wret; } else { @@ -438,8 +439,9 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path, * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if * possible) */ -int btrfs_search_slot(struct btrfs_root *root, struct btrfs_key *key, - struct btrfs_path *p, int ins_len, int cow) +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow) { struct btrfs_buffer *b; struct btrfs_buffer *cow_buf; @@ -455,8 +457,9 @@ again: level = btrfs_header_level(&b->node.header); if (cow) { int wret; - wret = btrfs_cow_block(root, b, p->nodes[level + 1], - p->slots[level + 1], &cow_buf); + wret = btrfs_cow_block(trans, root, b, p->nodes[level + + 1], p->slots[level + 1], + &cow_buf); b = cow_buf; } BUG_ON(!cow && ins_len); @@ -472,7 +475,7 @@ again: p->slots[level] = slot; if (ins_len > 0 && btrfs_header_nritems(&c->header) == BTRFS_NODEPTRS_PER_BLOCK(root)) { - int sret = split_node(root, p, level); + int sret = split_node(trans, root, p, level); BUG_ON(sret > 0); if (sret) return sret; @@ -480,7 +483,8 @@ again: c = &b->node; slot = p->slots[level]; } else if (ins_len < 0) { - int sret = balance_level(root, p, level); + int sret = balance_level(trans, root, p, + level); if (sret) return sret; b = p->nodes[level]; @@ -496,7 +500,7 @@ again: p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, l) < sizeof(struct btrfs_item) + ins_len) { - int sret = split_leaf(root, p, ins_len); + int sret = split_leaf(trans, root, p, ins_len); BUG_ON(sret > 0); if (sret) return sret; @@ -519,9 +523,9 @@ again: * If this fails to write a tree block, it returns -1, but continues * fixing up the blocks in ram so the tree is consistent. 
*/ -static int fixup_low_keys(struct btrfs_root *root, - struct btrfs_path *path, struct btrfs_disk_key *key, - int level) +static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_disk_key + *key, int level) { int i; int ret = 0; @@ -546,8 +550,9 @@ static int fixup_low_keys(struct btrfs_root *root, * returns 0 if some ptrs were pushed left, < 0 if there was some horrible * error, and > 0 if there was no room in the left hand block. */ -static int push_node_left(struct btrfs_root *root, struct btrfs_buffer *dst_buf, - struct btrfs_buffer *src_buf) +static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_buffer *dst_buf, struct + btrfs_buffer *src_buf) { struct btrfs_node *src = &src_buf->node; struct btrfs_node *dst = &dst_buf->node; @@ -589,8 +594,8 @@ static int push_node_left(struct btrfs_root *root, struct btrfs_buffer *dst_buf, * * this will only push up to 1/2 the contents of the left node over */ -static int balance_node_right(struct btrfs_root *root, - struct btrfs_buffer *dst_buf, +static int balance_node_right(struct btrfs_trans_handle *trans, struct + btrfs_root *root, struct btrfs_buffer *dst_buf, struct btrfs_buffer *src_buf) { struct btrfs_node *src = &src_buf->node; @@ -635,8 +640,8 @@ static int balance_node_right(struct btrfs_root *root, * * returns zero on success or < 0 on failure. 
*/ -static int insert_new_root(struct btrfs_root *root, - struct btrfs_path *path, int level) +static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level) { struct btrfs_buffer *t; struct btrfs_node *lower; @@ -646,7 +651,7 @@ static int insert_new_root(struct btrfs_root *root, BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); - t = btrfs_alloc_free_block(root); + t = btrfs_alloc_free_block(trans, root); c = &t->node; memset(c, 0, root->blocksize); btrfs_set_header_nritems(&c->header, 1); @@ -679,9 +684,9 @@ static int insert_new_root(struct btrfs_root *root, * * returns zero on success and < 0 on any error */ -static int insert_ptr(struct btrfs_root *root, - struct btrfs_path *path, struct btrfs_disk_key *key, - u64 blocknr, int slot, int level) +static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_disk_key + *key, u64 blocknr, int slot, int level) { struct btrfs_node *lower; int nritems; @@ -713,8 +718,8 @@ static int insert_ptr(struct btrfs_root *root, * * returns 0 on success and < 0 on failure */ -static int split_node(struct btrfs_root *root, struct btrfs_path *path, - int level) +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level) { struct btrfs_buffer *t; struct btrfs_node *c; @@ -729,12 +734,12 @@ static int split_node(struct btrfs_root *root, struct btrfs_path *path, c = &t->node; if (t == root->node) { /* trying to split the root, lets make a new one */ - ret = insert_new_root(root, path, level + 1); + ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; } c_nritems = btrfs_header_nritems(&c->header); - split_buffer = btrfs_alloc_free_block(root); + split_buffer = btrfs_alloc_free_block(trans, root); split = &split_buffer->node; btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header)); 
btrfs_set_header_blocknr(&split->header, split_buffer->blocknr); @@ -748,7 +753,7 @@ static int split_node(struct btrfs_root *root, struct btrfs_path *path, ret = 0; BUG_ON(list_empty(&t->dirty)); - wret = insert_ptr(root, path, &split->ptrs[0].key, + wret = insert_ptr(trans, root, path, &split->ptrs[0].key, split_buffer->blocknr, path->slots[level + 1] + 1, level + 1); if (wret) @@ -790,8 +795,8 @@ static int leaf_space_used(struct btrfs_leaf *l, int start, int nr) * returns 1 if the push failed because the other node didn't have enough * room, 0 if everything worked out and < 0 if there were major errors. */ -static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path, - int data_size) +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size) { struct btrfs_buffer *left_buf = path->nodes[0]; struct btrfs_leaf *left = &left_buf->leaf; @@ -824,7 +829,7 @@ static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path, return 1; } /* cow and double check */ - btrfs_cow_block(root, right_buf, upper, slot + 1, &right_buf); + btrfs_cow_block(trans, root, right_buf, upper, slot + 1, &right_buf); right = &right_buf->leaf; free_space = btrfs_leaf_free_space(root, right); if (free_space < data_size + sizeof(struct btrfs_item)) { @@ -897,8 +902,8 @@ static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path, * push some data in the path leaf to the left, trying to free up at * least data_size bytes. 
returns zero if the push worked, nonzero otherwise */ -static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path, - int data_size) +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size) { struct btrfs_buffer *right_buf = path->nodes[0]; struct btrfs_leaf *right = &right_buf->leaf; @@ -931,7 +936,7 @@ static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path, } /* cow and double check */ - btrfs_cow_block(root, t, path->nodes[1], slot - 1, &t); + btrfs_cow_block(trans, root, t, path->nodes[1], slot - 1, &t); left = &t->leaf; free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size + sizeof(struct btrfs_item)) { @@ -997,7 +1002,7 @@ static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(list_empty(&t->dirty)); BUG_ON(list_empty(&right_buf->dirty)); - wret = fixup_low_keys(root, path, &right->items[0].key, 1); + wret = fixup_low_keys(trans, root, path, &right->items[0].key, 1); if (wret) ret = wret; @@ -1021,8 +1026,8 @@ static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path, * * returns 0 if all went well and < 0 on failure. 
*/ -static int split_leaf(struct btrfs_root *root, struct btrfs_path *path, - int data_size) +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size) { struct btrfs_buffer *l_buf; struct btrfs_leaf *l; @@ -1038,11 +1043,11 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path, int ret; int wret; - wret = push_leaf_left(root, path, data_size); + wret = push_leaf_left(trans, root, path, data_size); if (wret < 0) return wret; if (wret) { - wret = push_leaf_right(root, path, data_size); + wret = push_leaf_right(trans, root, path, data_size); if (wret < 0) return wret; } @@ -1055,14 +1060,14 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path, return 0; if (!path->nodes[1]) { - ret = insert_new_root(root, path, 1); + ret = insert_new_root(trans, root, path, 1); if (ret) return ret; } slot = path->slots[0]; nritems = btrfs_header_nritems(&l->header); mid = (nritems + 1)/ 2; - right_buffer = btrfs_alloc_free_block(root); + right_buffer = btrfs_alloc_free_block(trans, root); BUG_ON(!right_buffer); BUG_ON(mid == nritems); right = &right_buffer->leaf; @@ -1100,7 +1105,7 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_header_nritems(&l->header, mid); ret = 0; - wret = insert_ptr(root, path, &right->items[0].key, + wret = insert_ptr(trans, root, path, &right->items[0].key, right_buffer->blocknr, path->slots[1] + 1, 1); if (wret) ret = wret; @@ -1122,8 +1127,9 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path, * Given a key and some data, insert an item into the tree. * This does all the path init required, making room in the tree if needed. 
*/ -int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 data_size) +int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_key + *cpu_key, u32 data_size) { int ret = 0; int slot; @@ -1139,7 +1145,7 @@ int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path, /* create a root if there isn't one */ if (!root->node) BUG(); - ret = btrfs_search_slot(root, cpu_key, path, data_size, 1); + ret = btrfs_search_slot(trans, root, cpu_key, path, data_size, 1); if (ret == 0) { btrfs_release_path(root, path); return -EEXIST; @@ -1193,7 +1199,7 @@ int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path, ret = 0; if (slot == 0) - ret = fixup_low_keys(root, path, &disk_key, 1); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); BUG_ON(list_empty(&leaf_buf->dirty)); if (btrfs_leaf_free_space(root, leaf) < 0) @@ -1207,15 +1213,16 @@ out: * Given a key and some data, insert an item into the tree. * This does all the path init required, making room in the tree if needed. */ -int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key, - void *data, u32 data_size) +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *cpu_key, void *data, u32 + data_size) { int ret = 0; struct btrfs_path path; u8 *ptr; btrfs_init_path(&path); - ret = btrfs_insert_empty_item(root, &path, cpu_key, data_size); + ret = btrfs_insert_empty_item(trans, root, &path, cpu_key, data_size); if (!ret) { ptr = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], u8); memcpy(ptr, data, data_size); @@ -1231,8 +1238,8 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key, * continuing all the way the root if required. The root is converted into * a leaf if all the nodes are emptied. 
*/ -static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, - int slot) +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) { struct btrfs_node *node; struct btrfs_buffer *parent = path->nodes[level]; @@ -1253,7 +1260,7 @@ static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, /* just turn the root into a leaf and break */ btrfs_set_header_level(&root->node->node.header, 0); } else if (slot == 0) { - wret = fixup_low_keys(root, path, &node->ptrs[0].key, + wret = fixup_low_keys(trans, root, path, &node->ptrs[0].key, level + 1); if (wret) ret = wret; @@ -1266,7 +1273,8 @@ static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, * delete the item at the leaf level in path. If that empties * the leaf, remove it from the tree */ -int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path) +int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path) { int slot; struct btrfs_leaf *leaf; @@ -1306,19 +1314,20 @@ int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path) btrfs_set_header_level(&leaf->header, 0); BUG_ON(list_empty(&leaf_buf->dirty)); } else { - clean_tree_block(root, leaf_buf); - wret = del_ptr(root, path, 1, path->slots[1]); + clean_tree_block(trans, root, leaf_buf); + wret = del_ptr(trans, root, path, 1, path->slots[1]); if (wret) ret = wret; - wret = btrfs_free_extent(root, leaf_buf->blocknr, 1, 1); + wret = btrfs_free_extent(trans, root, + leaf_buf->blocknr, 1, 1); if (wret) ret = wret; } } else { int used = leaf_space_used(leaf, 0, nritems); if (slot == 0) { - wret = fixup_low_keys(root, path, - &leaf->items[0].key, 1); + wret = fixup_low_keys(trans, root, path, + &leaf->items[0].key, 1); if (wret) ret = wret; } @@ -1332,23 +1341,24 @@ int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path) */ slot = path->slots[1]; leaf_buf->count++; 
- wret = push_leaf_left(root, path, 1); + wret = push_leaf_left(trans, root, path, 1); if (wret < 0) ret = wret; if (path->nodes[0] == leaf_buf && btrfs_header_nritems(&leaf->header)) { - wret = push_leaf_right(root, path, 1); + wret = push_leaf_right(trans, root, path, 1); if (wret < 0) ret = wret; } if (btrfs_header_nritems(&leaf->header) == 0) { u64 blocknr = leaf_buf->blocknr; - clean_tree_block(root, leaf_buf); - wret = del_ptr(root, path, 1, slot); + clean_tree_block(trans, root, leaf_buf); + wret = del_ptr(trans, root, path, 1, slot); if (wret) ret = wret; btrfs_block_release(root, leaf_buf); - wret = btrfs_free_extent(root, blocknr, 1, 1); + wret = btrfs_free_extent(trans, root, blocknr, + 1, 1); if (wret) ret = wret; } else { @@ -1401,5 +1411,3 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) } return 0; } - - diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d15a2ed95076..68f0af39777f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4,6 +4,8 @@ #include "list.h" #include "kerncompat.h" +struct btrfs_trans_handle; + #define BTRFS_MAGIC "_BtRfS_M" #define BTRFS_ROOT_TREE_OBJECTID 1 @@ -200,6 +202,7 @@ struct btrfs_root { struct btrfs_root_item root_item; struct btrfs_key root_key; u32 blocksize; + struct btrfs_trans_handle *running_transaction; }; /* the lower bits in the key flags defines the item type */ @@ -656,34 +659,46 @@ static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l) ((type *)(btrfs_leaf_data(leaf) + \ btrfs_item_offset((leaf)->items + (slot)))) -struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root); -int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf); -int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks, - int pin); -int btrfs_search_slot(struct btrfs_root *root, struct btrfs_key *key, - struct btrfs_path *p, int ins_len, int cow); +struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int 
btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_buffer *buf); +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root + *root, u64 blocknr, u64 num_blocks, int pin); +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow); void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); void btrfs_init_path(struct btrfs_path *p); -int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path); -int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *key, - void *data, u32 data_size); -int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 data_size); +int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path); +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, void *data, u32 data_size); +int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_key + *cpu_key, u32 data_size); int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf); -int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap); -int btrfs_finish_extent_commit(struct btrfs_root *root); -int btrfs_del_root(struct btrfs_root *root, struct btrfs_key *key); -int btrfs_insert_root(struct btrfs_root *root, struct btrfs_key *key, - struct btrfs_root_item *item); -int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key, - struct btrfs_root_item *item); -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, - struct btrfs_root_item *item, struct btrfs_key *key); -int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len, - u64 dir, u64 objectid, u8 type); -int 
btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path, - u64 dir, char *name, int name_len, int mod); +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_buffer *snap); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct + btrfs_root *root); +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + btrfs_root_item *item, struct btrfs_key *key); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, char *name, int name_len, u64 dir, u64 + objectid, u8 type); +int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, u64 dir, char *name, + int name_len, int mod); int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, char *name, int name_len); #endif diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c index de45fb4dfddd..91dea7a0a476 100644 --- a/fs/btrfs/debug-tree.c +++ b/fs/btrfs/debug-tree.c @@ -5,6 +5,7 @@ #include "ctree.h" #include "disk-io.h" #include "print-tree.h" +#include "transaction.h" int main(int ac, char **av) { struct btrfs_super_block super; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index a42a67b99753..949c4e526798 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -5,9 +5,11 @@ #include "ctree.h" #include "disk-io.h" #include "hash.h" +#include "transaction.h" -int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len, - u64 dir, u64 objectid, u8 type) +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 
struct btrfs_root + *root, char *name, int name_len, u64 dir, u64 + objectid, u8 type) { int ret = 0; struct btrfs_path path; @@ -23,7 +25,7 @@ int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len, BUG_ON(ret); btrfs_init_path(&path); data_size = sizeof(*dir_item) + name_len; - ret = btrfs_insert_empty_item(root, &path, &key, data_size); + ret = btrfs_insert_empty_item(trans, root, &path, &key, data_size); if (ret) goto out; @@ -40,8 +42,9 @@ out: return ret; } -int btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path, - u64 dir, char *name, int name_len, int mod) +int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, u64 dir, char *name, + int name_len, int mod) { int ret; struct btrfs_key key; @@ -53,12 +56,13 @@ int btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); ret = btrfs_name_hash(name, name_len, &key.offset); BUG_ON(ret); - ret = btrfs_search_slot(root, &key, path, ins_len, cow); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); return ret; } -int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, - char *name, int name_len) +int btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, char + *name, int name_len) { struct btrfs_dir_item *dir_item; char *name_ptr; diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c index f73aa7623398..e908c0c588cc 100644 --- a/fs/btrfs/dir-test.c +++ b/fs/btrfs/dir-test.c @@ -8,6 +8,7 @@ #include "disk-io.h" #include "print-tree.h" #include "hash.h" +#include "transaction.h" int keep_running = 1; struct btrfs_super_block super; @@ -38,7 +39,8 @@ again: return 0; } -static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix) +static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct radix_tree_root *radix) { int ret; char buf[128]; @@ -48,8 +50,8 @@ 
static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix) find_num(radix, &oid, 0); sprintf(buf, "str-%lu", oid); - ret = btrfs_insert_dir_item(root, buf, strlen(buf), dir_oid, file_oid, - 1); + ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid, + file_oid, 1); if (ret) goto error; @@ -68,7 +70,8 @@ error: * check */ btrfs_init_path(&path); - ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0); + ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf, + strlen(buf), 0); if (ret) goto fatal_release; if (!btrfs_match_dir_item_name(root, &path, buf, strlen(buf))) { @@ -96,7 +99,8 @@ fatal: return -1; } -static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix) +static int insert_dup(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct radix_tree_root *radix) { int ret; char buf[128]; @@ -107,8 +111,8 @@ static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix) return 0; sprintf(buf, "str-%lu", oid); - ret = btrfs_insert_dir_item(root, buf, strlen(buf), dir_oid, file_oid, - 1); + ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid, + file_oid, 1); if (ret != -EEXIST) { printf("insert on %s gave us %d\n", buf, ret); return 1; @@ -116,7 +120,8 @@ static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix) return 0; } -static int del_one(struct btrfs_root *root, struct radix_tree_root *radix) +static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct radix_tree_root *radix) { int ret; char buf[128]; @@ -129,10 +134,11 @@ static int del_one(struct btrfs_root *root, struct radix_tree_root *radix) return 0; sprintf(buf, "str-%lu", oid); btrfs_init_path(&path); - ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), -1); + ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf, + strlen(buf), -1); if (ret) goto out_release; - ret = btrfs_del_item(root, &path); + ret = 
btrfs_del_item(trans, root, &path); if (ret) goto out_release; btrfs_release_path(root, &path); @@ -149,7 +155,8 @@ out: return -1; } -static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix) +static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct radix_tree_root *radix) { struct btrfs_path path; char buf[128]; @@ -161,7 +168,8 @@ static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix) return 0; sprintf(buf, "str-%lu", oid); btrfs_init_path(&path); - ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0); + ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf, + strlen(buf), 0); btrfs_release_path(root, &path); if (ret) { printf("unable to find key %lu\n", oid); @@ -170,7 +178,8 @@ static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix) return 0; } -static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix) +static int lookup_enoent(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct radix_tree_root *radix) { struct btrfs_path path; char buf[128]; @@ -182,7 +191,8 @@ static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix) return 0; sprintf(buf, "str-%lu", oid); btrfs_init_path(&path); - ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0); + ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf, + strlen(buf), 0); btrfs_release_path(root, &path); if (!ret) { printf("able to find key that should not exist %lu\n", oid); @@ -191,8 +201,8 @@ static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix) return 0; } -static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix, - int nr) +static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct radix_tree_root *radix, int nr) { struct btrfs_path path; struct btrfs_key key; @@ -211,7 +221,7 @@ static int empty_tree(struct btrfs_root *root, 
struct radix_tree_root *radix, key.objectid = dir_oid; while(nr-- >= 0) { btrfs_init_path(&path); - ret = btrfs_search_slot(root, &key, &path, -1, 1); + ret = btrfs_search_slot(trans, root, &key, &path, -1, 1); if (ret < 0) { btrfs_release_path(root, &path); return ret; @@ -231,7 +241,7 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix, BUG_ON(found_len > 128); buf[found_len] = '\0'; found = atoi(buf + 4); - ret = btrfs_del_item(root, &path); + ret = btrfs_del_item(trans, root, &path); count++; if (ret) { fprintf(stderr, @@ -252,19 +262,19 @@ error: return -1; } -static int fill_tree(struct btrfs_root *root, struct radix_tree_root *radix, - int count) +static int fill_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct radix_tree_root *radix, int count) { int i; int ret = 0; for (i = 0; i < count; i++) { - ret = ins_one(root, radix); + ret = ins_one(trans, root, radix); if (ret) { fprintf(stderr, "fill failed\n"); goto out; } if (i % 1000 == 0) { - ret = btrfs_commit_transaction(root, &super); + ret = btrfs_commit_transaction(trans, root, &super); if (ret) { fprintf(stderr, "fill commit failed\n"); return ret; @@ -280,7 +290,8 @@ out: return ret; } -static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix) +static int bulk_op(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct radix_tree_root *radix) { int ret; int nr = rand() % 5000; @@ -289,17 +300,18 @@ static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix) /* do the bulk op much less frequently */ if (run_nr++ % 100) return 0; - ret = empty_tree(root, radix, nr); + ret = empty_tree(trans, root, radix, nr); if (ret) return ret; - ret = fill_tree(root, radix, nr); + ret = fill_tree(trans, root, radix, nr); if (ret) return ret; return 0; } -int (*ops[])(struct btrfs_root *root, struct radix_tree_root *radix) = +int (*ops[])(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct + radix_tree_root 
*radix) = { ins_one, insert_dup, del_one, lookup_item, lookup_enoent, bulk_op }; @@ -330,11 +342,13 @@ int main(int ac, char **av) int init_fill_count = 800000; int err = 0; int initial_only = 0; + struct btrfs_trans_handle *trans; radix_tree_init(); printf("removing old tree\n"); unlink("dbfile"); root = open_ctree("dbfile", &super); + trans = btrfs_start_transaction(root, 1); signal(SIGTERM, sigstopper); signal(SIGINT, sigstopper); @@ -353,7 +367,7 @@ int main(int ac, char **av) } } printf("initial fill\n"); - ret = fill_tree(root, &radix, init_fill_count); + ret = fill_tree(trans, root, &radix, init_fill_count); printf("starting run\n"); if (ret) { err = ret; @@ -377,7 +391,7 @@ int main(int ac, char **av) root = open_ctree("dbfile", &super); } while(count--) { - ret = ops[op](root, &radix); + ret = ops[op](trans, root, &radix); if (ret) { fprintf(stderr, "op %d failed %d:%d\n", op, i, iterations); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8d9457b5aef5..25ce07908ee3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -9,6 +9,7 @@ #include "radix-tree.h" #include "ctree.h" #include "disk-io.h" +#include "transaction.h" static int allocated_blocks = 0; int cache_max = 10000; @@ -107,7 +108,8 @@ struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr) return buf; } -int dirty_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf) +int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_buffer *buf) { if (!list_empty(&buf->dirty)) return 0; @@ -116,7 +118,8 @@ int dirty_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf) return 0; } -int clean_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf) +int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_buffer *buf) { if (!list_empty(&buf->dirty)) { list_del_init(&buf->dirty); @@ -125,7 +128,8 @@ int clean_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf) return 
0; } -int write_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf) +int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_buffer *buf) { u64 blocknr = buf->blocknr; loff_t offset = blocknr * root->blocksize; @@ -139,7 +143,8 @@ int write_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf) return 0; } -static int __commit_transaction(struct btrfs_root *root) +static int __commit_transaction(struct btrfs_trans_handle *trans, struct + btrfs_root *root) { struct btrfs_buffer *b; int ret = 0; @@ -147,7 +152,7 @@ static int __commit_transaction(struct btrfs_root *root) while(!list_empty(&root->trans)) { b = list_entry(root->trans.next, struct btrfs_buffer, dirty); list_del_init(&b->dirty); - wret = write_tree_block(root, b); + wret = write_tree_block(trans, root, b); if (wret) ret = wret; btrfs_block_release(root, b); @@ -155,8 +160,9 @@ static int __commit_transaction(struct btrfs_root *root) return ret; } -static int commit_extent_and_tree_roots(struct btrfs_root *tree_root, - struct btrfs_root *extent_root) +static int commit_extent_and_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, struct + btrfs_root *extent_root) { int ret; u64 old_extent_block; @@ -167,24 +173,24 @@ static int commit_extent_and_tree_roots(struct btrfs_root *tree_root, break; btrfs_set_root_blocknr(&extent_root->root_item, extent_root->node->blocknr); - ret = btrfs_update_root(tree_root, + ret = btrfs_update_root(trans, tree_root, &extent_root->root_key, &extent_root->root_item); BUG_ON(ret); } - __commit_transaction(extent_root); - __commit_transaction(tree_root); + __commit_transaction(trans, extent_root); + __commit_transaction(trans, tree_root); return 0; } -int btrfs_commit_transaction(struct btrfs_root *root, - struct btrfs_super_block *s) +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct + btrfs_root *root, struct btrfs_super_block *s) { int ret = 0; struct btrfs_buffer *snap = 
root->commit_root; struct btrfs_key snap_key; - ret = __commit_transaction(root); + ret = __commit_transaction(trans, root); BUG_ON(ret); if (root->commit_root == root->node) @@ -194,23 +200,24 @@ int btrfs_commit_transaction(struct btrfs_root *root, root->root_key.offset++; btrfs_set_root_blocknr(&root->root_item, root->node->blocknr); - ret = btrfs_insert_root(root->tree_root, &root->root_key, + ret = btrfs_insert_root(trans, root->tree_root, &root->root_key, &root->root_item); BUG_ON(ret); - ret = commit_extent_and_tree_roots(root->tree_root, root->extent_root); + ret = commit_extent_and_tree_roots(trans, root->tree_root, + root->extent_root); BUG_ON(ret); - write_ctree_super(root, s); - btrfs_finish_extent_commit(root->extent_root); - btrfs_finish_extent_commit(root->tree_root); + write_ctree_super(trans, root, s); + btrfs_finish_extent_commit(trans, root->extent_root); + btrfs_finish_extent_commit(trans, root->tree_root); root->commit_root = root->node; root->node->count++; - ret = btrfs_drop_snapshot(root, snap); + ret = btrfs_drop_snapshot(trans, root, snap); BUG_ON(ret); - ret = btrfs_del_root(root->tree_root, &snap_key); + ret = btrfs_del_root(trans, root->tree_root, &snap_key); BUG_ON(ret); return ret; @@ -312,7 +319,8 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super) return root; } -int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s) +int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_super_block *s) { int ret; btrfs_set_super_root(s, root->tree_root->node->blocknr); @@ -338,10 +346,14 @@ static int drop_cache(struct btrfs_root *root) int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s) { int ret; - btrfs_commit_transaction(root, s); - ret = commit_extent_and_tree_roots(root->tree_root, root->extent_root); + struct btrfs_trans_handle *trans; + + trans = root->running_transaction; + btrfs_commit_transaction(trans, root, s); + ret = 
commit_extent_and_tree_roots(trans, root->tree_root, + root->extent_root); BUG_ON(ret); - write_ctree_super(root, s); + write_ctree_super(trans, root, s); drop_cache(root->extent_root); drop_cache(root->tree_root); drop_cache(root); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5771bb90acb2..24a9e77c8311 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -15,15 +15,19 @@ struct btrfs_buffer { struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr); struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr); -int write_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf); -int dirty_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf); -int clean_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf); -int btrfs_commit_transaction(struct btrfs_root *root, - struct btrfs_super_block *s); +int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_buffer *buf); +int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_buffer *buf); +int clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_buffer *buf); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_super_block *s); struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *s); int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s); void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf); -int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s); +int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_super_block *s); int mkfs(int fd, u64 num_blocks, u32 blocksize); #define BTRFS_SUPER_INFO_OFFSET (16 * 1024) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4a40282b45f7..c29b92d440e0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c 
@@ -5,12 +5,15 @@ #include "ctree.h" #include "disk-io.h" #include "print-tree.h" +#include "transaction.h" -static int find_free_extent(struct btrfs_root *orig_root, u64 num_blocks, - u64 search_start, u64 search_end, - struct btrfs_key *ins); -static int finish_current_insert(struct btrfs_root *extent_root); -static int run_pending(struct btrfs_root *extent_root); +static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root + *orig_root, u64 num_blocks, u64 search_start, u64 + search_end, struct btrfs_key *ins); +static int finish_current_insert(struct btrfs_trans_handle *trans, struct + btrfs_root *extent_root); +static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root + *extent_root); /* * pending extents are blocks that we're trying to allocate in the extent @@ -21,7 +24,8 @@ static int run_pending(struct btrfs_root *extent_root); */ #define CTREE_EXTENT_PENDING_DEL 0 -static int inc_block_ref(struct btrfs_root *root, u64 blocknr) +static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root + *root, u64 blocknr) { struct btrfs_path path; int ret; @@ -31,13 +35,13 @@ static int inc_block_ref(struct btrfs_root *root, u64 blocknr) struct btrfs_key ins; u32 refs; - find_free_extent(root->extent_root, 0, 0, (u64)-1, &ins); + find_free_extent(trans, root->extent_root, 0, 0, (u64)-1, &ins); btrfs_init_path(&path); key.objectid = blocknr; key.flags = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); key.offset = 1; - ret = btrfs_search_slot(root->extent_root, &key, &path, 0, 1); + ret = btrfs_search_slot(trans, root->extent_root, &key, &path, 0, 1); if (ret != 0) BUG(); BUG_ON(ret != 0); @@ -48,12 +52,13 @@ static int inc_block_ref(struct btrfs_root *root, u64 blocknr) BUG_ON(list_empty(&path.nodes[0]->dirty)); btrfs_release_path(root->extent_root, &path); - finish_current_insert(root->extent_root); - run_pending(root->extent_root); + finish_current_insert(trans, root->extent_root); + run_pending(trans, 
root->extent_root); return 0; } -static int lookup_block_ref(struct btrfs_root *root, u64 blocknr, u32 *refs) +static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root + *root, u64 blocknr, u32 *refs) { struct btrfs_path path; int ret; @@ -65,7 +70,7 @@ static int lookup_block_ref(struct btrfs_root *root, u64 blocknr, u32 *refs) key.offset = 1; key.flags = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - ret = btrfs_search_slot(root->extent_root, &key, &path, 0, 0); + ret = btrfs_search_slot(trans, root->extent_root, &key, &path, 0, 0); if (ret != 0) BUG(); l = &path.nodes[0]->leaf; @@ -75,7 +80,8 @@ static int lookup_block_ref(struct btrfs_root *root, u64 blocknr, u32 *refs) return 0; } -int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf) +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_buffer *buf) { u64 blocknr; int i; @@ -87,12 +93,13 @@ int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf) for (i = 0; i < btrfs_header_nritems(&buf->node.header); i++) { blocknr = btrfs_node_blockptr(&buf->node, i); - inc_block_ref(root, blocknr); + inc_block_ref(trans, root, blocknr); } return 0; } -int btrfs_finish_extent_commit(struct btrfs_root *root) +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct + btrfs_root *root) { unsigned long gang[8]; u64 first = 0; @@ -116,7 +123,8 @@ int btrfs_finish_extent_commit(struct btrfs_root *root) return 0; } -static int finish_current_insert(struct btrfs_root *extent_root) +static int finish_current_insert(struct btrfs_trans_handle *trans, struct + btrfs_root *extent_root) { struct btrfs_key ins; struct btrfs_extent_item extent_item; @@ -132,8 +140,8 @@ static int finish_current_insert(struct btrfs_root *extent_root) for (i = 0; i < extent_root->current_insert.flags; i++) { ins.objectid = extent_root->current_insert.objectid + i; - ret = btrfs_insert_item(extent_root, &ins, &extent_item, - sizeof(extent_item)); + 
ret = btrfs_insert_item(trans, extent_root, &ins, &extent_item, + sizeof(extent_item)); BUG_ON(ret); } extent_root->current_insert.offset = 0; @@ -143,8 +151,8 @@ static int finish_current_insert(struct btrfs_root *extent_root) /* * remove an extent from the root, returns 0 on success */ -static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks, - int pin) +static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root + *root, u64 blocknr, u64 num_blocks, int pin) { struct btrfs_path path; struct btrfs_key key; @@ -160,9 +168,9 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks, btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); key.offset = num_blocks; - find_free_extent(root, 0, 0, (u64)-1, &ins); + find_free_extent(trans, root, 0, 0, (u64)-1, &ins); btrfs_init_path(&path); - ret = btrfs_search_slot(extent_root, &key, &path, -1, 1); + ret = btrfs_search_slot(trans, extent_root, &key, &path, -1, 1); if (ret) { printf("failed to find %Lu\n", key.objectid); btrfs_print_tree(extent_root, extent_root->node); @@ -183,14 +191,14 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks, BUG_ON(err); radix_tree_preload_end(); } - ret = btrfs_del_item(extent_root, &path); + ret = btrfs_del_item(trans, extent_root, &path); if (!pin && extent_root->last_insert.objectid > blocknr) extent_root->last_insert.objectid = blocknr; if (ret) BUG(); } btrfs_release_path(extent_root, &path); - finish_current_insert(extent_root); + finish_current_insert(trans, extent_root); return ret; } @@ -198,7 +206,8 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks, * find all the blocks marked as pending in the radix tree and remove * them from the extent map */ -static int del_pending_extents(struct btrfs_root *extent_root) +static int del_pending_extents(struct btrfs_trans_handle *trans, struct + btrfs_root *extent_root) { int ret; struct btrfs_buffer *gang[4]; @@ -212,7 +221,7 
@@ static int del_pending_extents(struct btrfs_root *extent_root) if (!ret) break; for (i = 0; i < ret; i++) { - ret = __free_extent(extent_root, + ret = __free_extent(trans, extent_root, gang[i]->blocknr, 1, 1); radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr, @@ -223,11 +232,12 @@ static int del_pending_extents(struct btrfs_root *extent_root) return 0; } -static int run_pending(struct btrfs_root *extent_root) +static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root + *extent_root) { while(radix_tree_tagged(&extent_root->cache_radix, CTREE_EXTENT_PENDING_DEL)) - del_pending_extents(extent_root); + del_pending_extents(trans, extent_root); return 0; } @@ -235,8 +245,8 @@ static int run_pending(struct btrfs_root *extent_root) /* * remove an extent from the root, returns 0 on success */ -int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks, - int pin) +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root + *root, u64 blocknr, u64 num_blocks, int pin) { struct btrfs_root *extent_root = root->extent_root; struct btrfs_buffer *t; @@ -249,8 +259,8 @@ int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks, CTREE_EXTENT_PENDING_DEL); return 0; } - ret = __free_extent(root, blocknr, num_blocks, pin); - pending_ret = run_pending(root->extent_root); + ret = __free_extent(trans, root, blocknr, num_blocks, pin); + pending_ret = run_pending(trans, root->extent_root); return ret ? ret : pending_ret; } @@ -262,9 +272,9 @@ int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks, * ins->offset == number of blocks * Any available blocks before search_start are skipped. 
*/ -static int find_free_extent(struct btrfs_root *orig_root, u64 num_blocks, - u64 search_start, u64 search_end, - struct btrfs_key *ins) +static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root + *orig_root, u64 num_blocks, u64 search_start, u64 + search_end, struct btrfs_key *ins) { struct btrfs_path path; struct btrfs_key key; @@ -290,7 +300,7 @@ check_failed: ins->objectid = search_start; ins->offset = 0; start_found = 0; - ret = btrfs_search_slot(root, ins, &path, 0, 0); + ret = btrfs_search_slot(trans, root, ins, &path, 0, 0); if (ret < 0) goto error; @@ -367,9 +377,9 @@ error: * * returns 0 if everything worked, non-zero otherwise. */ -static int alloc_extent(struct btrfs_root *root, u64 num_blocks, - u64 search_start, u64 search_end, u64 owner, - struct btrfs_key *ins) +static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root + *root, u64 num_blocks, u64 search_start, u64 + search_end, u64 owner, struct btrfs_key *ins) { int ret; int pending_ret; @@ -389,16 +399,16 @@ static int alloc_extent(struct btrfs_root *root, u64 num_blocks, extent_root->current_insert.flags++; return 0; } - ret = find_free_extent(root, num_blocks, search_start, + ret = find_free_extent(trans, root, num_blocks, search_start, search_end, ins); if (ret) return ret; - ret = btrfs_insert_item(extent_root, ins, &extent_item, - sizeof(extent_item)); + ret = btrfs_insert_item(trans, extent_root, ins, &extent_item, + sizeof(extent_item)); - finish_current_insert(extent_root); - pending_ret = run_pending(extent_root); + finish_current_insert(trans, extent_root); + pending_ret = run_pending(trans, extent_root); if (ret) return ret; if (pending_ret) @@ -410,13 +420,14 @@ static int alloc_extent(struct btrfs_root *root, u64 num_blocks, * helper function to allocate a block for a given tree * returns the tree buffer or NULL. 
*/ -struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root) +struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_key ins; int ret; struct btrfs_buffer *buf; - ret = alloc_extent(root, 1, 0, (unsigned long)-1, + ret = alloc_extent(trans, root, 1, 0, (unsigned long)-1, btrfs_header_parentid(&root->node->node.header), &ins); if (ret) { @@ -424,7 +435,7 @@ struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root) return NULL; } buf = find_tree_block(root, ins.objectid); - dirty_tree_block(root, buf); + dirty_tree_block(trans, root, buf); return buf; } @@ -432,8 +443,8 @@ struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root) * helper function for drop_snapshot, this walks down the tree dropping ref * counts as it goes. */ -static int walk_down_tree(struct btrfs_root *root, - struct btrfs_path *path, int *level) +static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int *level) { struct btrfs_buffer *next; struct btrfs_buffer *cur; @@ -441,7 +452,8 @@ static int walk_down_tree(struct btrfs_root *root, int ret; u32 refs; - ret = lookup_block_ref(root, path->nodes[*level]->blocknr, &refs); + ret = lookup_block_ref(trans, root, path->nodes[*level]->blocknr, + &refs); BUG_ON(ret); if (refs > 1) goto out; @@ -454,10 +466,10 @@ static int walk_down_tree(struct btrfs_root *root, btrfs_header_nritems(&cur->node.header)) break; blocknr = btrfs_node_blockptr(&cur->node, path->slots[*level]); - ret = lookup_block_ref(root, blocknr, &refs); + ret = lookup_block_ref(trans, root, blocknr, &refs); if (refs != 1 || *level == 1) { path->slots[*level]++; - ret = btrfs_free_extent(root, blocknr, 1, 1); + ret = btrfs_free_extent(trans, root, blocknr, 1, 1); BUG_ON(ret); continue; } @@ -470,7 +482,8 @@ static int walk_down_tree(struct btrfs_root *root, path->slots[*level] = 0; } out: - ret = btrfs_free_extent(root, 
path->nodes[*level]->blocknr, 1, 1); + ret = btrfs_free_extent(trans, root, path->nodes[*level]->blocknr, 1, + 1); btrfs_block_release(root, path->nodes[*level]); path->nodes[*level] = NULL; *level += 1; @@ -483,8 +496,8 @@ out: * to find the first node higher up where we haven't yet gone through * all the slots */ -static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, - int *level) +static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int *level) { int i; int slot; @@ -497,8 +510,9 @@ static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, *level = i; return 0; } else { - ret = btrfs_free_extent(root, - path->nodes[*level]->blocknr, 1, 1); + ret = btrfs_free_extent(trans, root, + path->nodes[*level]->blocknr, + 1, 1); btrfs_block_release(root, path->nodes[*level]); path->nodes[*level] = NULL; *level = i + 1; @@ -513,7 +527,8 @@ static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, * the tree freeing any blocks that have a ref count of zero after being * decremented. 
*/ -int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap) +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_buffer *snap) { int ret = 0; int wret; @@ -529,13 +544,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap) path.nodes[level] = snap; path.slots[level] = 0; while(1) { - wret = walk_down_tree(root, &path, &level); + wret = walk_down_tree(trans, root, &path, &level); if (wret > 0) break; if (wret < 0) ret = wret; - wret = walk_up_tree(root, &path, &level); + wret = walk_up_tree(trans, root, &path, &level); if (wret > 0) break; if (wret < 0) diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 4908f9afdf86..7caeb11e8756 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -4,9 +4,11 @@ #include "radix-tree.h" #include "ctree.h" #include "disk-io.h" +#include "transaction.h" -int btrfs_insert_inode(struct btrfs_root *root, u64 objectid, - struct btrfs_inode_item *inode_item) +int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, u64 objectid, struct btrfs_inode_item + *inode_item) { struct btrfs_path path; struct btrfs_key key; @@ -17,13 +19,14 @@ int btrfs_insert_inode(struct btrfs_root *root, u64 objectid, key.offset = 0; btrfs_init_path(&path); - ret = btrfs_insert_item(root, &key, inode_item, sizeof(*inode_item)); + ret = btrfs_insert_item(trans, root, &key, inode_item, + sizeof(*inode_item)); btrfs_release_path(root, &path); return ret; } -int btrfs_lookup_inode(struct btrfs_root *root, struct btrfs_path *path, - u64 objectid, int mod) +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, u64 objectid, int mod) { struct btrfs_key key; int ins_len = mod < 0 ? 
-1 : 0; @@ -33,5 +36,5 @@ int btrfs_lookup_inode(struct btrfs_root *root, struct btrfs_path *path, key.flags = 0; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - return btrfs_search_slot(root, &key, path, ins_len, cow); + return btrfs_search_slot(trans, root, &key, path, ins_len, cow); } diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 9236abda2a74..0882ca904eca 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -1,3 +1,5 @@ - +#ifndef __PRINT_TREE_ +#define __PRINT_TREE_ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l); void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t); +#endif diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c index 3a2571617121..d676577185d5 100644 --- a/fs/btrfs/quick-test.c +++ b/fs/btrfs/quick-test.c @@ -5,6 +5,7 @@ #include "ctree.h" #include "disk-io.h" #include "print-tree.h" +#include "transaction.h" /* for testing only */ int next_key(int i, int max_key) { @@ -25,10 +26,12 @@ int main(int ac, char **av) { struct btrfs_path path; struct btrfs_super_block super; struct btrfs_root *root; + struct btrfs_trans_handle *trans; radix_tree_init(); root = open_ctree("dbfile", &super); + trans = btrfs_start_transaction(root, 1); srand(55); ins.flags = 0; btrfs_set_key_type(&ins, BTRFS_STRING_ITEM_KEY); @@ -41,12 +44,12 @@ int main(int ac, char **av) { fprintf(stderr, "insert %d:%d\n", num, i); ins.objectid = num; ins.offset = 0; - ret = btrfs_insert_item(root, &ins, buf, strlen(buf)); + ret = btrfs_insert_item(trans, root, &ins, buf, strlen(buf)); if (!ret) tree_size++; free(buf); if (i == run_size - 5) { - btrfs_commit_transaction(root, &super); + btrfs_commit_transaction(trans, root, &super); } } @@ -61,7 +64,7 @@ int main(int ac, char **av) { btrfs_init_path(&path); if (i % 10000 == 0) fprintf(stderr, "search %d:%d\n", num, i); - ret = btrfs_search_slot(root, &ins, &path, 0, 0); + ret = btrfs_search_slot(trans, root, &ins, &path, 0, 0); if (ret) { 
btrfs_print_tree(root, root->node); printf("unable to find %d\n", num); @@ -83,11 +86,11 @@ int main(int ac, char **av) { num = next_key(i, max_key); ins.objectid = num; btrfs_init_path(&path); - ret = btrfs_search_slot(root, &ins, &path, -1, 1); + ret = btrfs_search_slot(trans, root, &ins, &path, -1, 1); if (!ret) { if (i % 10000 == 0) fprintf(stderr, "del %d:%d\n", num, i); - ret = btrfs_del_item(root, &path); + ret = btrfs_del_item(trans, root, &path); if (ret != 0) BUG(); tree_size--; @@ -104,7 +107,7 @@ int main(int ac, char **av) { ins.objectid = num; if (i % 10000 == 0) fprintf(stderr, "insert %d:%d\n", num, i); - ret = btrfs_insert_item(root, &ins, buf, strlen(buf)); + ret = btrfs_insert_item(trans, root, &ins, buf, strlen(buf)); if (!ret) tree_size++; free(buf); @@ -119,7 +122,7 @@ int main(int ac, char **av) { btrfs_init_path(&path); if (i % 10000 == 0) fprintf(stderr, "search %d:%d\n", num, i); - ret = btrfs_search_slot(root, &ins, &path, 0, 0); + ret = btrfs_search_slot(trans, root, &ins, &path, 0, 0); if (ret) { btrfs_print_tree(root, root->node); printf("unable to find %d\n", num); @@ -134,7 +137,7 @@ int main(int ac, char **av) { int slot; ins.objectid = (u64)-1; btrfs_init_path(&path); - ret = btrfs_search_slot(root, &ins, &path, -1, 1); + ret = btrfs_search_slot(trans, root, &ins, &path, -1, 1); if (ret == 0) BUG(); @@ -150,7 +153,7 @@ int main(int ac, char **av) { btrfs_disk_key_to_cpu(&last, &leaf->items[slot].key); if (tree_size % 10000 == 0) printf("big del %d:%d\n", tree_size, i); - ret = btrfs_del_item(root, &path); + ret = btrfs_del_item(trans, root, &path); if (ret != 0) { printf("del_item returned %d\n", ret); BUG(); @@ -165,7 +168,7 @@ int main(int ac, char **av) { printf("map before commit\n"); btrfs_print_tree(root->extent_root, root->extent_root->node); */ - btrfs_commit_transaction(root, &super); + btrfs_commit_transaction(trans, root, &super); printf("tree size is now %d\n", tree_size); printf("root %p commit root %p\n", root->node, 
root->commit_root); printf("map tree\n"); diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c index f05135f13f9a..3a38ae7a886d 100644 --- a/fs/btrfs/random-test.c +++ b/fs/btrfs/random-test.c @@ -6,6 +6,7 @@ #include "ctree.h" #include "disk-io.h" #include "print-tree.h" +#include "transaction.h" int keep_running = 1; struct btrfs_super_block super; @@ -37,7 +38,8 @@ again: return 0; } -static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix) +static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct radix_tree_root *radix) { struct btrfs_path path; struct btrfs_key key; @@ -47,7 +49,7 @@ static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix) btrfs_init_path(&path); ret = setup_key(radix, &key, 0); sprintf(buf, "str-%Lu\n", key.objectid); - ret = btrfs_insert_item(root, &key, buf, strlen(buf)); + ret = btrfs_insert_item(trans, root, &key, buf, strlen(buf)); if (ret) goto error; oid = (unsigned long)key.objectid; @@ -62,7 +64,8 @@ error: return -1; } -static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix) +static int insert_dup(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct radix_tree_root *radix) { struct btrfs_path path; struct btrfs_key key; @@ -73,7 +76,7 @@ static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix) if (ret < 0) return 0; sprintf(buf, "str-%Lu\n", key.objectid); - ret = btrfs_insert_item(root, &key, buf, strlen(buf)); + ret = btrfs_insert_item(trans, root, &key, buf, strlen(buf)); if (ret != -EEXIST) { printf("insert on %Lu gave us %d\n", key.objectid, ret); return 1; @@ -81,7 +84,8 @@ static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix) return 0; } -static int del_one(struct btrfs_root *root, struct radix_tree_root *radix) +static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct radix_tree_root *radix) { struct btrfs_path path; struct btrfs_key key; @@ 
-91,10 +95,10 @@ static int del_one(struct btrfs_root *root, struct radix_tree_root *radix) ret = setup_key(radix, &key, 1); if (ret < 0) return 0; - ret = btrfs_search_slot(root, &key, &path, -1, 1); + ret = btrfs_search_slot(trans, root, &key, &path, -1, 1); if (ret) goto error; - ret = btrfs_del_item(root, &path); + ret = btrfs_del_item(trans, root, &path); btrfs_release_path(root, &path); if (ret != 0) goto error; @@ -107,7 +111,8 @@ error: return -1; } -static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix) +static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct radix_tree_root *radix) { struct btrfs_path path; struct btrfs_key key; @@ -116,7 +121,7 @@ static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix) ret = setup_key(radix, &key, 1); if (ret < 0) return 0; - ret = btrfs_search_slot(root, &key, &path, 0, 1); + ret = btrfs_search_slot(trans, root, &key, &path, 0, 1); btrfs_release_path(root, &path); if (ret) goto error; @@ -126,7 +131,8 @@ error: return -1; } -static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix) +static int lookup_enoent(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct radix_tree_root *radix) { struct btrfs_path path; struct btrfs_key key; @@ -135,7 +141,7 @@ static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix) ret = setup_key(radix, &key, 0); if (ret < 0) return ret; - ret = btrfs_search_slot(root, &key, &path, 0, 0); + ret = btrfs_search_slot(trans, root, &key, &path, 0, 0); btrfs_release_path(root, &path); if (ret <= 0) goto error; @@ -145,8 +151,8 @@ error: return -1; } -static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix, - int nr) +static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct radix_tree_root *radix, int nr) { struct btrfs_path path; struct btrfs_key key; @@ -162,7 +168,7 @@ static int empty_tree(struct 
btrfs_root *root, struct radix_tree_root *radix, key.objectid = (unsigned long)-1; while(nr-- >= 0) { btrfs_init_path(&path); - ret = btrfs_search_slot(root, &key, &path, -1, 1); + ret = btrfs_search_slot(trans, root, &key, &path, -1, 1); if (ret < 0) { btrfs_release_path(root, &path); return ret; @@ -177,7 +183,7 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix, slot = path.slots[0]; found = btrfs_disk_key_objectid( &path.nodes[0]->leaf.items[slot].key); - ret = btrfs_del_item(root, &path); + ret = btrfs_del_item(trans, root, &path); count++; if (ret) { fprintf(stderr, @@ -198,19 +204,19 @@ error: return -1; } -static int fill_tree(struct btrfs_root *root, struct radix_tree_root *radix, - int count) +static int fill_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct radix_tree_root *radix, int count) { int i; int ret = 0; for (i = 0; i < count; i++) { - ret = ins_one(root, radix); + ret = ins_one(trans, root, radix); if (ret) { fprintf(stderr, "fill failed\n"); goto out; } if (i % 1000 == 0) { - ret = btrfs_commit_transaction(root, &super); + ret = btrfs_commit_transaction(trans, root, &super); if (ret) { fprintf(stderr, "fill commit failed\n"); return ret; @@ -226,7 +232,8 @@ out: return ret; } -static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix) +static int bulk_op(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct radix_tree_root *radix) { int ret; int nr = rand() % 5000; @@ -235,17 +242,18 @@ static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix) /* do the bulk op much less frequently */ if (run_nr++ % 100) return 0; - ret = empty_tree(root, radix, nr); + ret = empty_tree(trans, root, radix, nr); if (ret) return ret; - ret = fill_tree(root, radix, nr); + ret = fill_tree(trans, root, radix, nr); if (ret) return ret; return 0; } -int (*ops[])(struct btrfs_root *root, struct radix_tree_root *radix) = +int (*ops[])(struct btrfs_trans_handle *, + struct 
btrfs_root *root, struct radix_tree_root *radix) = { ins_one, insert_dup, del_one, lookup_item, lookup_enoent, bulk_op }; @@ -264,7 +272,7 @@ static int fill_radix(struct btrfs_root *root, struct radix_tree_root *radix) key.objectid = (unsigned long)-1; while(1) { btrfs_init_path(&path); - ret = btrfs_search_slot(root, &key, &path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); if (ret < 0) { btrfs_release_path(root, &path); return ret; @@ -325,6 +333,7 @@ int main(int ac, char **av) int init_fill_count = 800000; int err = 0; int initial_only = 0; + struct btrfs_trans_handle *trans; radix_tree_init(); root = open_ctree("dbfile", &super); fill_radix(root, &radix); @@ -346,7 +355,8 @@ int main(int ac, char **av) } } printf("initial fill\n"); - ret = fill_tree(root, &radix, init_fill_count); + trans = btrfs_start_transaction(root, 1); + ret = fill_tree(trans, root, &radix, init_fill_count); printf("starting run\n"); if (ret) { err = ret; @@ -370,7 +380,7 @@ int main(int ac, char **av) root = open_ctree("dbfile", &super); } while(count--) { - ret = ops[op](root, &radix); + ret = ops[op](trans, root, &radix); if (ret) { fprintf(stderr, "op %d failed %d:%d\n", op, i, iterations); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index a24b47274037..9cccecc0f431 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -20,7 +20,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, search_key.offset = (u32)-1; btrfs_init_path(&path); - ret = btrfs_search_slot(root, &search_key, &path, 0, 0); + ret = btrfs_search_slot(NULL, root, &search_key, &path, 0, 0); if (ret < 0) goto out; BUG_ON(ret == 0); @@ -40,8 +40,9 @@ out: return ret; } -int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key, - struct btrfs_root_item *item) +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) { struct btrfs_path path; struct btrfs_leaf *l; @@ -49,7 
+50,7 @@ int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key, int slot; btrfs_init_path(&path); - ret = btrfs_search_slot(root, key, &path, 0, 1); + ret = btrfs_search_slot(trans, root, key, &path, 0, 1); if (ret < 0) goto out; BUG_ON(ret != 0); @@ -62,26 +63,28 @@ out: return ret; } -int btrfs_insert_root(struct btrfs_root *root, struct btrfs_key *key, - struct btrfs_root_item *item) +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) { int ret; - ret = btrfs_insert_item(root, key, item, sizeof(*item)); + ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); BUG_ON(ret); return ret; } -int btrfs_del_root(struct btrfs_root *root, struct btrfs_key *key) +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) { struct btrfs_path path; int ret; btrfs_init_path(&path); - ret = btrfs_search_slot(root, key, &path, -1, 1); + ret = btrfs_search_slot(trans, root, key, &path, -1, 1); if (ret < 0) goto out; BUG_ON(ret != 0); - ret = btrfs_del_item(root, &path); + ret = btrfs_del_item(trans, root, &path); out: btrfs_release_path(root, &path); return ret; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h new file mode 100644 index 000000000000..3adb6e69fe43 --- /dev/null +++ b/fs/btrfs/transaction.h @@ -0,0 +1,27 @@ +#ifndef __TRANSACTION__ +#define __TRANSACTION__ + +struct btrfs_trans_handle { + u64 transid; + unsigned long blocks_reserved; + unsigned long blocks_used; +}; + +static inline struct btrfs_trans_handle * +btrfs_start_transaction(struct btrfs_root *root, int num_blocks) +{ + struct btrfs_trans_handle *h = malloc(sizeof(*h)); + h->transid = root->root_key.offset; + h->blocks_reserved = num_blocks; + h->blocks_used = 0; + return h; +} + +static inline void btrfs_free_transaction(struct btrfs_root *root, + struct btrfs_trans_handle *handle) +{ + memset(handle, 0, sizeof(*handle)); + free(handle); 
+} + +#endif -- cgit v1.2.3 From 631d7d950a2f2706f5c86858402c1ec50b9801f1 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 20 Mar 2007 09:13:49 -0400 Subject: Btrfs: add transaction.h to the Makefile Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 76bcd87d8575..7953b41848c1 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,6 +1,7 @@ CC=gcc CFLAGS = -g -Wall -Werror -headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h +headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h \ + transaction.h objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \ root-tree.o dir-item.o hash.o file-item.o inode-item.o -- cgit v1.2.3 From 9f5fae2fe6dc35b46bf56183f11398451851cb3f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 20 Mar 2007 14:38:32 -0400 Subject: Btrfs: Add inode map, and the start of file extent items Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 3 +- fs/btrfs/ctree.h | 129 ++++++++++++++++++++++++++++++++---- fs/btrfs/debug-tree.c | 9 ++- fs/btrfs/dir-test.c | 91 ++++++++++++++++++++----- fs/btrfs/disk-io.c | 176 +++++++++++++++++++++++++++---------------------- fs/btrfs/extent-tree.c | 105 +++++++++++++++-------------- fs/btrfs/file-item.c | 6 ++ fs/btrfs/inode-map.c | 136 ++++++++++++++++++++++++++++++++++++++ fs/btrfs/mkfs.c | 44 +++++++++++-- fs/btrfs/print-tree.c | 13 ++-- fs/btrfs/quick-test.c | 3 +- 11 files changed, 542 insertions(+), 173 deletions(-) create mode 100644 fs/btrfs/inode-map.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 7953b41848c1..636c63efde6f 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -3,7 +3,8 @@ CFLAGS = -g -Wall -Werror headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h \ transaction.h objects = ctree.o disk-io.o radix-tree.o 
mkfs.o extent-tree.o print-tree.o \ - root-tree.o dir-item.o hash.o file-item.o inode-item.o + root-tree.o dir-item.o hash.o file-item.o inode-item.o \ + inode-map.o \ # if you don't have sparse installed, use ls instead CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 68f0af39777f..68e3da274f17 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -10,7 +10,8 @@ struct btrfs_trans_handle; #define BTRFS_ROOT_TREE_OBJECTID 1 #define BTRFS_EXTENT_TREE_OBJECTID 2 -#define BTRFS_FS_TREE_OBJECTID 3 +#define BTRFS_INODE_MAP_OBJECTID 3 +#define BTRFS_FS_TREE_OBJECTID 4 /* * the key defines the order in the tree, and so it also defines (optimal) @@ -178,31 +179,65 @@ struct btrfs_root_item { __le64 block_limit; __le64 blocks_used; __le32 refs; -}; +} __attribute__ ((__packed__)); -/* - * in ram representation of the tree. extent_root is used for all allocations - * and for the extent tree extent_root root. current_insert is used - * only for the extent tree. - */ -struct btrfs_root { - struct btrfs_buffer *node; - struct btrfs_buffer *commit_root; +struct btrfs_file_extent_item { + /* + * disk space consumed by the extent, checksum blocks are included + * in these numbers + */ + __le64 disk_blocknr; + __le64 disk_num_blocks; + /* + * the logical offset in file bytes (no csums) + * this extent record is for. 
This allows a file extent to point + * into the middle of an existing extent on disk, sharing it + * between two snapshots (useful if some bytes in the middle of the + * extent have changed + */ + __le64 offset; + /* + * the logical number of file blocks (no csums included) + */ + __le64 num_blocks; +} __attribute__ ((__packed__)); + +struct btrfs_inode_map_item { + struct btrfs_disk_key key; +} __attribute__ ((__packed__)); + +struct btrfs_fs_info { + struct btrfs_root *fs_root; struct btrfs_root *extent_root; struct btrfs_root *tree_root; + struct btrfs_root *inode_root; struct btrfs_key current_insert; struct btrfs_key last_insert; - int fp; struct radix_tree_root cache_radix; struct radix_tree_root pinned_radix; struct list_head trans; struct list_head cache; + u64 last_inode_alloc; + u64 last_inode_alloc_dirid; int cache_size; - int ref_cows; + int fp; + struct btrfs_trans_handle *running_transaction; +}; + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. current_insert is used + * only for the extent tree. + */ +struct btrfs_root { + struct btrfs_buffer *node; + struct btrfs_buffer *commit_root; struct btrfs_root_item root_item; struct btrfs_key root_key; + struct btrfs_fs_info *fs_info; u32 blocksize; - struct btrfs_trans_handle *running_transaction; + int ref_cows; + u32 type; }; /* the lower bits in the key flags defines the item type */ @@ -240,11 +275,17 @@ struct btrfs_root { * are used, and how many references there are to each block */ #define BTRFS_EXTENT_ITEM_KEY 6 + +/* + * the inode map records which inode numbers are in use and where + * they actually live on disk + */ +#define BTRFS_INODE_MAP_ITEM_KEY 7 /* * string items are for debugging. 
They just store a short string of * data in the FS */ -#define BTRFS_STRING_ITEM_KEY 7 +#define BTRFS_STRING_ITEM_KEY 8 static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i) { @@ -654,6 +695,57 @@ static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l) { return (u8 *)l->items; } + +static inline u64 btrfs_file_extent_disk_blocknr(struct btrfs_file_extent_item + *e) +{ + return le64_to_cpu(e->disk_blocknr); +} + +static inline void btrfs_set_file_extent_disk_blocknr(struct + btrfs_file_extent_item + *e, u64 val) +{ + e->disk_blocknr = cpu_to_le64(val); +} + +static inline u64 btrfs_file_extent_disk_num_blocks(struct + btrfs_file_extent_item *e) +{ + return le64_to_cpu(e->disk_num_blocks); +} + +static inline void btrfs_set_file_extent_disk_num_blocks(struct + btrfs_file_extent_item + *e, u64 val) +{ + e->disk_num_blocks = cpu_to_le64(val); +} + +static inline u64 btrfs_file_extent_offset(struct btrfs_file_extent_item *e) +{ + return le64_to_cpu(e->offset); +} + +static inline void btrfs_set_file_extent_offset(struct btrfs_file_extent_item + *e, u64 val) +{ + e->offset = cpu_to_le64(val); +} + +static inline u64 btrfs_file_extent_num_blocks(struct btrfs_file_extent_item + *e) +{ + return le64_to_cpu(e->num_blocks); +} + +static inline void btrfs_set_file_extent_num_blocks(struct + btrfs_file_extent_item *e, + u64 val) +{ + e->num_blocks = cpu_to_le64(val); +} + /* helper function to cast into the data area of the leaf. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_leaf_data(leaf) + \ @@ -701,4 +793,13 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root int name_len, int mod); int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, char *name, int name_len); +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *fs_root, + u64 dirid, u64 *objectid); +int btrfs_insert_inode_map(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, struct btrfs_key *location); +int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u64 objectid, int mod); #endif diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c index 91dea7a0a476..d9f36efc3dd1 100644 --- a/fs/btrfs/debug-tree.c +++ b/fs/btrfs/debug-tree.c @@ -15,8 +15,13 @@ int main(int ac, char **av) { printf("fs tree\n"); btrfs_print_tree(root, root->node); printf("map tree\n"); - btrfs_print_tree(root->extent_root, root->extent_root->node); + btrfs_print_tree(root->fs_info->extent_root, + root->fs_info->extent_root->node); + printf("inode tree\n"); + btrfs_print_tree(root->fs_info->inode_root, + root->fs_info->inode_root->node); printf("root tree\n"); - btrfs_print_tree(root->tree_root, root->tree_root->node); + btrfs_print_tree(root->fs_info->tree_root, + root->fs_info->tree_root->node); return 0; } diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c index e908c0c588cc..56f06c3ba573 100644 --- a/fs/btrfs/dir-test.c +++ b/fs/btrfs/dir-test.c @@ -45,13 +45,26 @@ static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; char buf[128]; unsigned long oid; + u64 objectid; struct btrfs_path path; + struct btrfs_key inode_map; find_num(radix, &oid, 0); sprintf(buf, "str-%lu", oid); + ret = btrfs_find_free_objectid(trans, root, dir_oid + 1, &objectid); + if (ret) + goto error; + + inode_map.objectid = objectid; + inode_map.flags = 0; + 
inode_map.offset = 0; + + ret = btrfs_insert_inode_map(trans, root, objectid, &inode_map); + if (ret) + goto error; ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid, - file_oid, 1); + objectid, 1); if (ret) goto error; @@ -120,6 +133,53 @@ static int insert_dup(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } +static int del_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct radix_tree_root *radix, + unsigned long radix_index, + struct btrfs_path *path) +{ + int ret; + unsigned long *ptr; + u64 file_objectid; + struct btrfs_dir_item *di; + struct btrfs_path map_path; + + /* find the inode number of the file */ + di = btrfs_item_ptr(&path->nodes[0]->leaf, path->slots[0], + struct btrfs_dir_item); + file_objectid = btrfs_dir_objectid(di); + + /* delete the directory item */ + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + /* delete the inode mapping */ + btrfs_init_path(&map_path); + ret = btrfs_lookup_inode_map(trans, root, &map_path, file_objectid, -1); + if (ret) + goto out_release; + ret = btrfs_del_item(trans, root->fs_info->inode_root, &map_path); + if (ret) + goto out_release; + + if (root->fs_info->last_inode_alloc > file_objectid) + root->fs_info->last_inode_alloc = file_objectid; + btrfs_release_path(root, &map_path); + ptr = radix_tree_delete(radix, radix_index); + if (!ptr) { + ret = -5555; + goto out; + } + return 0; +out_release: + btrfs_release_path(root, &map_path); +out: + printf("failed to delete %lu %d\n", radix_index, ret); + return -1; +} + static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct radix_tree_root *radix) { @@ -127,7 +187,6 @@ static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, char buf[128]; unsigned long oid; struct btrfs_path path; - unsigned long *ptr; ret = find_num(radix, &oid, 1); if (ret < 0) @@ -138,19 +197,14 @@ static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, 
strlen(buf), -1); if (ret) goto out_release; - ret = btrfs_del_item(trans, root, &path); + + ret = del_dir_item(trans, root, radix, oid, &path); if (ret) goto out_release; btrfs_release_path(root, &path); - ptr = radix_tree_delete(radix, oid); - if (!ptr) { - ret = -5555; - goto out; - } - return 0; + return ret; out_release: btrfs_release_path(root, &path); -out: printf("failed to delete %lu %d\n", oid, ret); return -1; } @@ -162,6 +216,8 @@ static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root char buf[128]; int ret; unsigned long oid; + u64 objectid; + struct btrfs_dir_item *di; ret = find_num(radix, &oid, 1); if (ret < 0) @@ -170,6 +226,14 @@ static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_init_path(&path); ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf, strlen(buf), 0); + if (!ret) { + di = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], + struct btrfs_dir_item); + objectid = btrfs_dir_objectid(di); + btrfs_release_path(root, &path); + btrfs_init_path(&path); + ret = btrfs_lookup_inode_map(trans, root, &path, objectid, 0); + } btrfs_release_path(root, &path); if (ret) { printf("unable to find key %lu\n", oid); @@ -210,7 +274,6 @@ static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root u32 found_len; int ret; int slot; - int *ptr; int count = 0; char buf[128]; struct btrfs_dir_item *di; @@ -241,7 +304,7 @@ static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root BUG_ON(found_len > 128); buf[found_len] = '\0'; found = atoi(buf + 4); - ret = btrfs_del_item(trans, root, &path); + ret = del_dir_item(trans, root, radix, found, &path); count++; if (ret) { fprintf(stderr, @@ -250,14 +313,10 @@ static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root return -1; } btrfs_release_path(root, &path); - ptr = radix_tree_delete(radix, found); - if (!ptr) - goto error; if (!keep_running) break; } return 0; -error: fprintf(stderr, "failed to delete from 
the radix %lu\n", found); return -1; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 25ce07908ee3..1849a99690c8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -28,15 +28,15 @@ static int free_some_buffers(struct btrfs_root *root) { struct list_head *node, *next; struct btrfs_buffer *b; - if (root->cache_size < cache_max) + if (root->fs_info->cache_size < cache_max) return 0; - list_for_each_safe(node, next, &root->cache) { + list_for_each_safe(node, next, &root->fs_info->cache) { b = list_entry(node, struct btrfs_buffer, cache); if (b->count == 1) { BUG_ON(!list_empty(&b->dirty)); list_del_init(&b->cache); btrfs_block_release(root, b); - if (root->cache_size < cache_max) + if (root->fs_info->cache_size < cache_max) break; } } @@ -57,10 +57,10 @@ struct btrfs_buffer *alloc_tree_block(struct btrfs_root *root, u64 blocknr) INIT_LIST_HEAD(&buf->dirty); free_some_buffers(root); radix_tree_preload(GFP_KERNEL); - ret = radix_tree_insert(&root->cache_radix, blocknr, buf); + ret = radix_tree_insert(&root->fs_info->cache_radix, blocknr, buf); radix_tree_preload_end(); - list_add_tail(&buf->cache, &root->cache); - root->cache_size++; + list_add_tail(&buf->cache, &root->fs_info->cache); + root->fs_info->cache_size++; if (ret) { free(buf); return NULL; @@ -71,7 +71,7 @@ struct btrfs_buffer *alloc_tree_block(struct btrfs_root *root, u64 blocknr) struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr) { struct btrfs_buffer *buf; - buf = radix_tree_lookup(&root->cache_radix, blocknr); + buf = radix_tree_lookup(&root->fs_info->cache_radix, blocknr); if (buf) { buf->count++; } else { @@ -90,14 +90,15 @@ struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr) struct btrfs_buffer *buf; int ret; - buf = radix_tree_lookup(&root->cache_radix, blocknr); + buf = radix_tree_lookup(&root->fs_info->cache_radix, blocknr); if (buf) { buf->count++; } else { buf = alloc_tree_block(root, blocknr); if (!buf) return NULL; - ret = 
pread(root->fp, &buf->node, root->blocksize, offset); + ret = pread(root->fs_info->fp, &buf->node, root->blocksize, + offset); if (ret != root->blocksize) { free(buf); return NULL; @@ -113,7 +114,7 @@ int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, { if (!list_empty(&buf->dirty)) return 0; - list_add_tail(&buf->dirty, &root->trans); + list_add_tail(&buf->dirty, &root->fs_info->trans); buf->count++; return 0; } @@ -137,7 +138,7 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (buf->blocknr != btrfs_header_blocknr(&buf->node.header)) BUG(); - ret = pwrite(root->fp, &buf->node, root->blocksize, offset); + ret = pwrite(root->fs_info->fp, &buf->node, root->blocksize, offset); if (ret != root->blocksize) return ret; return 0; @@ -149,8 +150,9 @@ static int __commit_transaction(struct btrfs_trans_handle *trans, struct struct btrfs_buffer *b; int ret = 0; int wret; - while(!list_empty(&root->trans)) { - b = list_entry(root->trans.next, struct btrfs_buffer, dirty); + while(!list_empty(&root->fs_info->trans)) { + b = list_entry(root->fs_info->trans.next, struct btrfs_buffer, + dirty); list_del_init(&b->dirty); wret = write_tree_block(trans, root, b); if (wret) @@ -160,13 +162,21 @@ static int __commit_transaction(struct btrfs_trans_handle *trans, struct return ret; } -static int commit_extent_and_tree_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *tree_root, struct - btrfs_root *extent_root) +static int commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { int ret; u64 old_extent_block; - + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *inode_root = fs_info->inode_root; + + btrfs_set_root_blocknr(&inode_root->root_item, + inode_root->node->blocknr); + ret = btrfs_update_root(trans, tree_root, + &inode_root->root_key, + &inode_root->root_item); + BUG_ON(ret); while(1) { 
old_extent_block = btrfs_root_blocknr(&extent_root->root_item); if (old_extent_block == extent_root->node->blocknr) @@ -178,8 +188,6 @@ static int commit_extent_and_tree_roots(struct btrfs_trans_handle *trans, &extent_root->root_item); BUG_ON(ret); } - __commit_transaction(trans, extent_root); - __commit_transaction(trans, tree_root); return 0; } @@ -190,9 +198,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct struct btrfs_buffer *snap = root->commit_root; struct btrfs_key snap_key; - ret = __commit_transaction(trans, root); - BUG_ON(ret); - if (root->commit_root == root->node) return 0; @@ -200,54 +205,55 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct root->root_key.offset++; btrfs_set_root_blocknr(&root->root_item, root->node->blocknr); - ret = btrfs_insert_root(trans, root->tree_root, &root->root_key, - &root->root_item); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + BUG_ON(ret); + + ret = commit_tree_roots(trans, root->fs_info); BUG_ON(ret); - ret = commit_extent_and_tree_roots(trans, root->tree_root, - root->extent_root); + ret = __commit_transaction(trans, root); BUG_ON(ret); write_ctree_super(trans, root, s); - btrfs_finish_extent_commit(trans, root->extent_root); - btrfs_finish_extent_commit(trans, root->tree_root); + btrfs_finish_extent_commit(trans, root->fs_info->extent_root); + btrfs_finish_extent_commit(trans, root->fs_info->tree_root); root->commit_root = root->node; root->node->count++; ret = btrfs_drop_snapshot(trans, root, snap); BUG_ON(ret); - ret = btrfs_del_root(trans, root->tree_root, &snap_key); + ret = btrfs_del_root(trans, root->fs_info->tree_root, &snap_key); BUG_ON(ret); return ret; } static int __setup_root(struct btrfs_super_block *super, - struct btrfs_root *root, u64 objectid, int fp) + struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 objectid, int fp) { - INIT_LIST_HEAD(&root->trans); - INIT_LIST_HEAD(&root->cache); - 
root->cache_size = 0; - root->fp = fp; root->node = NULL; root->commit_root = NULL; root->blocksize = btrfs_super_blocksize(super); root->ref_cows = 0; - memset(&root->current_insert, 0, sizeof(root->current_insert)); - memset(&root->last_insert, 0, sizeof(root->last_insert)); + root->fs_info = fs_info; memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); return 0; } static int find_and_setup_root(struct btrfs_super_block *super, - struct btrfs_root *tree_root, u64 objectid, + struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, struct btrfs_root *root, int fp) { int ret; - __setup_root(super, root, objectid, fp); + __setup_root(super, root, fs_info, objectid, fp); ret = btrfs_find_last_root(tree_root, objectid, &root->root_item, &root->root_key); BUG_ON(ret); @@ -263,29 +269,31 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super) struct btrfs_root *root = malloc(sizeof(struct btrfs_root)); struct btrfs_root *extent_root = malloc(sizeof(struct btrfs_root)); struct btrfs_root *tree_root = malloc(sizeof(struct btrfs_root)); + struct btrfs_root *inode_root = malloc(sizeof(struct btrfs_root)); + struct btrfs_fs_info *fs_info = malloc(sizeof(*fs_info)); int fp; int ret; - root->extent_root = extent_root; - root->tree_root = tree_root; - - extent_root->extent_root = extent_root; - extent_root->tree_root = tree_root; - - tree_root->extent_root = extent_root; - tree_root->tree_root = tree_root; - fp = open(filename, O_CREAT | O_RDWR, 0600); if (fp < 0) { free(root); return NULL; } - INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL); - INIT_RADIX_TREE(&root->pinned_radix, GFP_KERNEL); - INIT_RADIX_TREE(&extent_root->pinned_radix, GFP_KERNEL); - INIT_RADIX_TREE(&extent_root->cache_radix, GFP_KERNEL); - INIT_RADIX_TREE(&tree_root->pinned_radix, GFP_KERNEL); - INIT_RADIX_TREE(&tree_root->cache_radix, GFP_KERNEL); + INIT_RADIX_TREE(&fs_info->cache_radix, GFP_KERNEL); + 
INIT_RADIX_TREE(&fs_info->pinned_radix, GFP_KERNEL); + INIT_LIST_HEAD(&fs_info->trans); + INIT_LIST_HEAD(&fs_info->cache); + fs_info->cache_size = 0; + fs_info->fp = fp; + fs_info->running_transaction = NULL; + fs_info->fs_root = root; + fs_info->tree_root = tree_root; + fs_info->extent_root = extent_root; + fs_info->inode_root = inode_root; + fs_info->last_inode_alloc = 0; + fs_info->last_inode_alloc_dirid = 0; + memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert)); + memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert)); ret = pread(fp, super, sizeof(struct btrfs_super_block), BTRFS_SUPER_INFO_OFFSET); @@ -301,16 +309,20 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super) } BUG_ON(ret < 0); - __setup_root(super, tree_root, BTRFS_ROOT_TREE_OBJECTID, fp); + __setup_root(super, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID, fp); tree_root->node = read_tree_block(tree_root, btrfs_super_root(super)); BUG_ON(!tree_root->node); - ret = find_and_setup_root(super, tree_root, BTRFS_EXTENT_TREE_OBJECTID, - extent_root, fp); + ret = find_and_setup_root(super, tree_root, fs_info, + BTRFS_EXTENT_TREE_OBJECTID, extent_root, fp); BUG_ON(ret); - ret = find_and_setup_root(super, tree_root, BTRFS_FS_TREE_OBJECTID, - root, fp); + ret = find_and_setup_root(super, tree_root, fs_info, + BTRFS_INODE_MAP_OBJECTID, inode_root, fp); + BUG_ON(ret); + + ret = find_and_setup_root(super, tree_root, fs_info, + BTRFS_FS_TREE_OBJECTID, root, fp); BUG_ON(ret); root->commit_root = root->node; @@ -323,8 +335,8 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_super_block *s) { int ret; - btrfs_set_super_root(s, root->tree_root->node->blocknr); - ret = pwrite(root->fp, s, sizeof(*s), + btrfs_set_super_root(s, root->fs_info->tree_root->node->blocknr); + ret = pwrite(root->fs_info->fp, s, sizeof(*s), BTRFS_SUPER_INFO_OFFSET); if (ret != sizeof(*s)) { fprintf(stderr, "failed to write new super block err 
%d\n", ret); @@ -335,9 +347,10 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root static int drop_cache(struct btrfs_root *root) { - while(!list_empty(&root->cache)) { - struct btrfs_buffer *b = list_entry(root->cache.next, - struct btrfs_buffer, cache); + while(!list_empty(&root->fs_info->cache)) { + struct btrfs_buffer *b = list_entry(root->fs_info->cache.next, + struct btrfs_buffer, + cache); list_del_init(&b->cache); btrfs_block_release(root, b); } @@ -348,26 +361,28 @@ int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s) int ret; struct btrfs_trans_handle *trans; - trans = root->running_transaction; + trans = root->fs_info->running_transaction; btrfs_commit_transaction(trans, root, s); - ret = commit_extent_and_tree_roots(trans, root->tree_root, - root->extent_root); + ret = commit_tree_roots(trans, root->fs_info); + BUG_ON(ret); + ret = __commit_transaction(trans, root); BUG_ON(ret); write_ctree_super(trans, root, s); - drop_cache(root->extent_root); - drop_cache(root->tree_root); drop_cache(root); - BUG_ON(!list_empty(&root->trans)); - BUG_ON(!list_empty(&root->extent_root->trans)); - BUG_ON(!list_empty(&root->tree_root->trans)); + BUG_ON(!list_empty(&root->fs_info->trans)); - close(root->fp); + close(root->fs_info->fp); if (root->node) btrfs_block_release(root, root->node); - if (root->extent_root->node) - btrfs_block_release(root->extent_root, root->extent_root->node); - if (root->tree_root->node) - btrfs_block_release(root->tree_root, root->tree_root->node); + if (root->fs_info->extent_root->node) + btrfs_block_release(root->fs_info->extent_root, + root->fs_info->extent_root->node); + if (root->fs_info->inode_root->node) + btrfs_block_release(root->fs_info->inode_root, + root->fs_info->inode_root->node); + if (root->fs_info->tree_root->node) + btrfs_block_release(root->fs_info->tree_root, + root->fs_info->tree_root->node); btrfs_block_release(root, root->commit_root); free(root); printf("on close %d blocks are 
allocated\n", allocated_blocks); @@ -382,15 +397,16 @@ void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf) if (buf->count == 0) { BUG_ON(!list_empty(&buf->cache)); BUG_ON(!list_empty(&buf->dirty)); - if (!radix_tree_lookup(&root->cache_radix, buf->blocknr)) + if (!radix_tree_lookup(&root->fs_info->cache_radix, + buf->blocknr)) BUG(); - radix_tree_delete(&root->cache_radix, buf->blocknr); + radix_tree_delete(&root->fs_info->cache_radix, buf->blocknr); memset(buf, 0, sizeof(*buf)); free(buf); BUG_ON(allocated_blocks == 0); allocated_blocks--; - BUG_ON(root->cache_size == 0); - root->cache_size--; + BUG_ON(root->fs_info->cache_size == 0); + root->fs_info->cache_size--; } } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c29b92d440e0..09eeeb4d9d28 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,13 +35,15 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_key ins; u32 refs; - find_free_extent(trans, root->extent_root, 0, 0, (u64)-1, &ins); + find_free_extent(trans, root->fs_info->extent_root, 0, 0, (u64)-1, + &ins); btrfs_init_path(&path); key.objectid = blocknr; key.flags = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); key.offset = 1; - ret = btrfs_search_slot(trans, root->extent_root, &key, &path, 0, 1); + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, &path, + 0, 1); if (ret != 0) BUG(); BUG_ON(ret != 0); @@ -51,9 +53,9 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_extent_refs(item, refs + 1); BUG_ON(list_empty(&path.nodes[0]->dirty)); - btrfs_release_path(root->extent_root, &path); - finish_current_insert(trans, root->extent_root); - run_pending(trans, root->extent_root); + btrfs_release_path(root->fs_info->extent_root, &path); + finish_current_insert(trans, root->fs_info->extent_root); + run_pending(trans, root->fs_info->extent_root); return 0; } @@ -70,13 +72,14 @@ static int 
lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root key.offset = 1; key.flags = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - ret = btrfs_search_slot(trans, root->extent_root, &key, &path, 0, 0); + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, &path, + 0, 0); if (ret != 0) BUG(); l = &path.nodes[0]->leaf; item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item); *refs = btrfs_extent_refs(item); - btrfs_release_path(root->extent_root, &path); + btrfs_release_path(root->fs_info->extent_root, &path); return 0; } @@ -107,19 +110,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct int i; while(1) { - ret = radix_tree_gang_lookup(&root->pinned_radix, - (void **)gang, 0, - ARRAY_SIZE(gang)); + ret = radix_tree_gang_lookup(&root->fs_info->pinned_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); if (!ret) break; if (!first) first = gang[0]; for (i = 0; i < ret; i++) { - radix_tree_delete(&root->pinned_radix, gang[i]); + radix_tree_delete(&root->fs_info->pinned_radix, + gang[i]); } } - root->last_insert.objectid = first; - root->last_insert.offset = 0; + root->fs_info->last_insert.objectid = first; + root->fs_info->last_insert.offset = 0; return 0; } @@ -138,13 +142,14 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct ins.flags = 0; btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY); - for (i = 0; i < extent_root->current_insert.flags; i++) { - ins.objectid = extent_root->current_insert.objectid + i; + for (i = 0; i < extent_root->fs_info->current_insert.flags; i++) { + ins.objectid = extent_root->fs_info->current_insert.objectid + + i; ret = btrfs_insert_item(trans, extent_root, &ins, &extent_item, sizeof(extent_item)); BUG_ON(ret); } - extent_root->current_insert.offset = 0; + extent_root->fs_info->current_insert.offset = 0; return 0; } @@ -156,7 +161,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root { struct btrfs_path path; struct btrfs_key 
key; - struct btrfs_root *extent_root = root->extent_root; + struct btrfs_root *extent_root = root->fs_info->extent_root; int ret; struct btrfs_extent_item *ei; struct btrfs_key ins; @@ -186,14 +191,16 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root if (pin) { int err; radix_tree_preload(GFP_KERNEL); - err = radix_tree_insert(&extent_root->pinned_radix, - blocknr, (void *)blocknr); + err = radix_tree_insert( + &extent_root->fs_info->pinned_radix, + blocknr, (void *)blocknr); BUG_ON(err); radix_tree_preload_end(); } ret = btrfs_del_item(trans, extent_root, &path); - if (!pin && extent_root->last_insert.objectid > blocknr) - extent_root->last_insert.objectid = blocknr; + if (!pin && extent_root->fs_info->last_insert.objectid > + blocknr) + extent_root->fs_info->last_insert.objectid = blocknr; if (ret) BUG(); } @@ -214,18 +221,19 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct int i; while(1) { - ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix, - (void **)gang, 0, - ARRAY_SIZE(gang), - CTREE_EXTENT_PENDING_DEL); + ret = radix_tree_gang_lookup_tag( + &extent_root->fs_info->cache_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + CTREE_EXTENT_PENDING_DEL); if (!ret) break; for (i = 0; i < ret; i++) { ret = __free_extent(trans, extent_root, gang[i]->blocknr, 1, 1); - radix_tree_tag_clear(&extent_root->cache_radix, - gang[i]->blocknr, - CTREE_EXTENT_PENDING_DEL); + radix_tree_tag_clear(&extent_root->fs_info->cache_radix, + gang[i]->blocknr, + CTREE_EXTENT_PENDING_DEL); btrfs_block_release(extent_root, gang[i]); } } @@ -235,8 +243,8 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root) { - while(radix_tree_tagged(&extent_root->cache_radix, - CTREE_EXTENT_PENDING_DEL)) + while(radix_tree_tagged(&extent_root->fs_info->cache_radix, + CTREE_EXTENT_PENDING_DEL)) del_pending_extents(trans, extent_root); 
return 0; } @@ -248,19 +256,19 @@ static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 blocknr, u64 num_blocks, int pin) { - struct btrfs_root *extent_root = root->extent_root; + struct btrfs_root *extent_root = root->fs_info->extent_root; struct btrfs_buffer *t; int pending_ret; int ret; if (root == extent_root) { t = find_tree_block(root, blocknr); - radix_tree_tag_set(&root->cache_radix, blocknr, + radix_tree_tag_set(&root->fs_info->cache_radix, blocknr, CTREE_EXTENT_PENDING_DEL); return 0; } ret = __free_extent(trans, root, blocknr, num_blocks, pin); - pending_ret = run_pending(trans, root->extent_root); + pending_ret = run_pending(trans, root->fs_info->extent_root); return ret ? ret : pending_ret; } @@ -285,12 +293,12 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root u64 test_block; int start_found; struct btrfs_leaf *l; - struct btrfs_root * root = orig_root->extent_root; + struct btrfs_root * root = orig_root->fs_info->extent_root; int total_needed = num_blocks; total_needed += (btrfs_header_level(&root->node->node.header) + 1) * 3; - if (root->last_insert.objectid > search_start) - search_start = root->last_insert.objectid; + if (root->fs_info->last_insert.objectid > search_start) + search_start = root->fs_info->last_insert.objectid; ins->flags = 0; btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -353,16 +361,17 @@ check_pending: BUG_ON(ins->objectid < search_start); for (test_block = ins->objectid; test_block < ins->objectid + total_needed; test_block++) { - if (radix_tree_lookup(&root->pinned_radix, test_block)) { + if (radix_tree_lookup(&root->fs_info->pinned_radix, + test_block)) { search_start = test_block + 1; goto check_failed; } } - BUG_ON(root->current_insert.offset); - root->current_insert.offset = total_needed - num_blocks; - root->current_insert.objectid = ins->objectid + num_blocks; - 
root->current_insert.flags = 0; - root->last_insert.objectid = ins->objectid; + BUG_ON(root->fs_info->current_insert.offset); + root->fs_info->current_insert.offset = total_needed - num_blocks; + root->fs_info->current_insert.objectid = ins->objectid + num_blocks; + root->fs_info->current_insert.flags = 0; + root->fs_info->last_insert.objectid = ins->objectid; ins->offset = num_blocks; return 0; error: @@ -383,20 +392,20 @@ static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root { int ret; int pending_ret; - struct btrfs_root *extent_root = root->extent_root; + struct btrfs_root *extent_root = root->fs_info->extent_root; struct btrfs_extent_item extent_item; btrfs_set_extent_refs(&extent_item, 1); btrfs_set_extent_owner(&extent_item, owner); if (root == extent_root) { - BUG_ON(extent_root->current_insert.offset == 0); + BUG_ON(extent_root->fs_info->current_insert.offset == 0); BUG_ON(num_blocks != 1); - BUG_ON(extent_root->current_insert.flags == - extent_root->current_insert.offset); + BUG_ON(extent_root->fs_info->current_insert.flags == + extent_root->fs_info->current_insert.offset); ins->offset = 1; - ins->objectid = extent_root->current_insert.objectid + - extent_root->current_insert.flags++; + ins->objectid = extent_root->fs_info->current_insert.objectid + + extent_root->fs_info->current_insert.flags++; return 0; } ret = find_free_extent(trans, root, num_blocks, search_start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index f5b36c732c5e..24cfd6d85240 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -4,4 +4,10 @@ #include "radix-tree.h" #include "ctree.h" #include "disk-io.h" +#include "transaction.h" +int btrfs_create_file(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 dirid, u64 *objectid) +{ + return 0; +} diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c new file mode 100644 index 000000000000..f412b3392135 --- /dev/null +++ b/fs/btrfs/inode-map.c @@ -0,0 +1,136 @@ +#include +#include 
+#include "kerncompat.h" +#include "radix-tree.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +/* + * walks the btree of allocated inodes and find a hole. + */ +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *fs_root, + u64 dirid, u64 *objectid) +{ + struct btrfs_path path; + struct btrfs_key key; + int ret; + u64 hole_size = 0; + int slot = 0; + u64 last_ino; + int start_found; + struct btrfs_leaf *l; + struct btrfs_root *root = fs_root->fs_info->inode_root; + struct btrfs_key search_key; + u64 search_start = dirid; + + if (fs_root->fs_info->last_inode_alloc_dirid == dirid) + search_start = fs_root->fs_info->last_inode_alloc; + + search_key.objectid = search_start; + search_key.flags = 0; + btrfs_set_key_type(&search_key, BTRFS_INODE_MAP_ITEM_KEY); + search_key.offset = 0; + + btrfs_init_path(&path); + start_found = 0; + ret = btrfs_search_slot(trans, root, &search_key, &path, 0, 0); + if (ret < 0) + goto error; + + if (path.slots[0] > 0) + path.slots[0]--; + + while (1) { + l = &path.nodes[0]->leaf; + slot = path.slots[0]; + if (slot >= btrfs_header_nritems(&l->header)) { + ret = btrfs_next_leaf(root, &path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + if (!start_found) { + *objectid = search_start; + start_found = 1; + goto found; + } + *objectid = last_ino > search_start ? 
+ last_ino : search_start; + goto found; + } + btrfs_disk_key_to_cpu(&key, &l->items[slot].key); + if (key.objectid >= search_start) { + if (start_found) { + if (last_ino < search_start) + last_ino = search_start; + hole_size = key.objectid - last_ino; + if (hole_size > 0) { + *objectid = last_ino; + goto found; + } + } + } + start_found = 1; + last_ino = key.objectid + 1; + path.slots[0]++; + } + // FIXME -ENOSPC +found: + root->fs_info->last_inode_alloc = *objectid; + root->fs_info->last_inode_alloc_dirid = dirid; + btrfs_release_path(root, &path); + BUG_ON(*objectid < search_start); + return 0; +error: + btrfs_release_path(root, &path); + return ret; +} + +int btrfs_insert_inode_map(struct btrfs_trans_handle *trans, + struct btrfs_root *fs_root, + u64 objectid, struct btrfs_key *location) +{ + int ret = 0; + struct btrfs_path path; + struct btrfs_inode_map_item *inode_item; + struct btrfs_key key; + struct btrfs_root *inode_root = fs_root->fs_info->inode_root; + + key.objectid = objectid; + key.flags = 0; + btrfs_set_key_type(&key, BTRFS_INODE_MAP_ITEM_KEY); + key.offset = 0; + btrfs_init_path(&path); + ret = btrfs_insert_empty_item(trans, inode_root, &path, &key, + sizeof(struct btrfs_inode_map_item)); + if (ret) + goto out; + + inode_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], + struct btrfs_inode_map_item); + btrfs_cpu_key_to_disk(&inode_item->key, location); +out: + btrfs_release_path(inode_root, &path); + return ret; +} + +int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans, + struct btrfs_root *fs_root, struct btrfs_path *path, + u64 objectid, int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + struct btrfs_root *inode_root = fs_root->fs_info->inode_root; + + key.objectid = objectid; + key.flags = 0; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_INODE_MAP_ITEM_KEY); + ret = btrfs_search_slot(trans, inode_root, &key, path, ins_len, cow); + return ret; +} + diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c index 9aa900811c33..1cac5ab114dd 100644 --- a/fs/btrfs/mkfs.c +++ b/fs/btrfs/mkfs.c @@ -42,7 +42,7 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) btrfs_set_header_parentid(&empty_leaf->header, BTRFS_ROOT_TREE_OBJECTID); btrfs_set_header_blocknr(&empty_leaf->header, start_block + 1); - btrfs_set_header_nritems(&empty_leaf->header, 2); + btrfs_set_header_nritems(&empty_leaf->header, 3); /* create the items for the root tree */ btrfs_set_root_blocknr(&root_item, start_block + 2); @@ -61,8 +61,16 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) btrfs_set_root_blocknr(&root_item, start_block + 3); itemoff = itemoff - sizeof(root_item); btrfs_set_item_offset(&item, itemoff); - btrfs_set_disk_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID); + btrfs_set_disk_key_objectid(&item.key, BTRFS_INODE_MAP_OBJECTID); memcpy(empty_leaf->items + 1, &item, sizeof(item)); + memcpy(btrfs_leaf_data(empty_leaf) + itemoff, + &root_item, sizeof(root_item)); + + btrfs_set_root_blocknr(&root_item, start_block + 4); + itemoff = itemoff - sizeof(root_item); + btrfs_set_item_offset(&item, itemoff); + btrfs_set_disk_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID); + memcpy(empty_leaf->items + 2, &item, sizeof(item)); memcpy(btrfs_leaf_data(empty_leaf) + itemoff, &root_item, sizeof(root_item)); ret = pwrite(fd, empty_leaf, blocksize, (start_block + 1) * blocksize); @@ -71,7 +79,7 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) btrfs_set_header_parentid(&empty_leaf->header, BTRFS_EXTENT_TREE_OBJECTID); btrfs_set_header_blocknr(&empty_leaf->header, start_block + 2); - btrfs_set_header_nritems(&empty_leaf->header, 4); + 
btrfs_set_header_nritems(&empty_leaf->header, 5); /* item1, reserve blocks 0-16 */ btrfs_set_disk_key_objectid(&item.key, 0); @@ -108,12 +116,12 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item), &extent_item, btrfs_item_size(&item)); - /* item4, give block 19 to the FS root */ + /* item4, give block 19 to the inode map */ btrfs_set_disk_key_objectid(&item.key, start_block + 3); btrfs_set_disk_key_offset(&item.key, 1); itemoff = itemoff - sizeof(struct btrfs_extent_item); btrfs_set_item_offset(&item, itemoff); - btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID); + btrfs_set_extent_owner(&extent_item, BTRFS_INODE_MAP_OBJECTID); memcpy(empty_leaf->items + 3, &item, sizeof(item)); memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item), &extent_item, btrfs_item_size(&item)); @@ -121,11 +129,33 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) if (ret != blocksize) return -1; - /* finally create the FS root */ - btrfs_set_header_parentid(&empty_leaf->header, BTRFS_FS_TREE_OBJECTID); + /* item5, give block 20 to the FS root */ + btrfs_set_disk_key_objectid(&item.key, start_block + 4); + btrfs_set_disk_key_offset(&item.key, 1); + itemoff = itemoff - sizeof(struct btrfs_extent_item); + btrfs_set_item_offset(&item, itemoff); + btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID); + memcpy(empty_leaf->items + 4, &item, sizeof(item)); + memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item), + &extent_item, btrfs_item_size(&item)); + ret = pwrite(fd, empty_leaf, blocksize, (start_block + 2) * blocksize); + if (ret != blocksize) + return -1; + + /* create the inode map */ + btrfs_set_header_parentid(&empty_leaf->header, + BTRFS_INODE_MAP_OBJECTID); btrfs_set_header_blocknr(&empty_leaf->header, start_block + 3); btrfs_set_header_nritems(&empty_leaf->header, 0); ret = pwrite(fd, empty_leaf, blocksize, (start_block + 3) * blocksize); + if (ret != blocksize) + return -1; + + /* finally 
create the FS root */ + btrfs_set_header_parentid(&empty_leaf->header, BTRFS_FS_TREE_OBJECTID); + btrfs_set_header_blocknr(&empty_leaf->header, start_block + 4); + btrfs_set_header_nritems(&empty_leaf->header, 0); + ret = pwrite(fd, empty_leaf, blocksize, (start_block + 4) * blocksize); if (ret != blocksize) return -1; return 0; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0bb5c38427c9..f250e5fad773 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -13,8 +13,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) struct btrfs_extent_item *ei; struct btrfs_root_item *ri; struct btrfs_dir_item *di; + struct btrfs_inode_map_item *mi; u32 type; - u32 namelen; printf("leaf %Lu total ptrs %d free space %d\n", btrfs_header_blocknr(&l->header), nr, @@ -34,15 +34,13 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) case BTRFS_INODE_ITEM_KEY: break; case BTRFS_DIR_ITEM_KEY: - namelen = btrfs_item_size(l->items + i) - sizeof(*di); di = btrfs_item_ptr(l, i, struct btrfs_dir_item); printf("\t\tdir oid %Lu flags %u type %u\n", btrfs_dir_objectid(di), btrfs_dir_flags(di), btrfs_dir_type(di)); printf("\t\tname %.*s\n", - namelen, (char *)(di + 1)); - + btrfs_dir_name_len(di),(char *)(di + 1)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); @@ -54,6 +52,13 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) printf("\t\textent data refs %u owner %Lu\n", btrfs_extent_refs(ei), btrfs_extent_owner(ei)); break; + case BTRFS_INODE_MAP_ITEM_KEY: + mi = btrfs_item_ptr(l, i, struct btrfs_inode_map_item); + printf("\t\tinode map key %Lu %u %Lu\n", + btrfs_disk_key_objectid(&mi->key), + btrfs_disk_key_flags(&mi->key), + btrfs_disk_key_offset(&mi->key)); + break; case BTRFS_STRING_ITEM_KEY: printf("\t\titem data %.*s\n", btrfs_item_size(item), btrfs_leaf_data(l) + btrfs_item_offset(item)); diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c index 
d676577185d5..ef12cf888783 100644 --- a/fs/btrfs/quick-test.c +++ b/fs/btrfs/quick-test.c @@ -172,7 +172,8 @@ int main(int ac, char **av) { printf("tree size is now %d\n", tree_size); printf("root %p commit root %p\n", root->node, root->commit_root); printf("map tree\n"); - btrfs_print_tree(root->extent_root, root->extent_root->node); + btrfs_print_tree(root->fs_info->extent_root, + root->fs_info->extent_root->node); close_ctree(root, &super); return 0; } -- cgit v1.2.3 From 1261ec42b3d3a3ad878bd172144940e3ac710749 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 20 Mar 2007 20:35:03 -0400 Subject: Btrfs: Better block record keeping, real mkfs Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 7 ++-- fs/btrfs/ctree.h | 1 + fs/btrfs/debug-tree.c | 13 ++++++- fs/btrfs/dir-test.c | 2 -- fs/btrfs/disk-io.c | 11 ++---- fs/btrfs/extent-tree.c | 24 ++++++++++--- fs/btrfs/mkfs.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 134 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 636c63efde6f..0720169b6d66 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -2,7 +2,7 @@ CC=gcc CFLAGS = -g -Wall -Werror headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h \ transaction.h -objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \ +objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ root-tree.o dir-item.o hash.o file-item.o inode-item.o \ inode-map.o \ @@ -16,7 +16,10 @@ check=sparse $(CHECKFLAGS) $(check) $< $(CC) $(CFLAGS) -c $< -all: tester debug-tree quick-test dir-test tags +all: tester debug-tree quick-test dir-test tags mkfs.btrfs + +mkfs.btrfs: $(objects) mkfs.o + gcc $(CFLAGS) -o mkfs.btrfs $(objects) mkfs.o debug-tree: $(objects) debug-tree.o gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b30b2ce72455..1a4d1d6fa401 100644 --- a/fs/btrfs/ctree.h 
+++ b/fs/btrfs/ctree.h @@ -223,6 +223,7 @@ struct btrfs_fs_info { int cache_size; int fp; struct btrfs_trans_handle *running_transaction; + struct btrfs_super_block *disk_super; }; /* diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c index d9f36efc3dd1..fd07969600c6 100644 --- a/fs/btrfs/debug-tree.c +++ b/fs/btrfs/debug-tree.c @@ -10,8 +10,17 @@ int main(int ac, char **av) { struct btrfs_super_block super; struct btrfs_root *root; + + if (ac != 2) { + fprintf(stderr, "usage: %s device\n", av[0]); + exit(1); + } radix_tree_init(); - root = open_ctree("dbfile", &super); + root = open_ctree(av[1], &super); + if (!root) { + fprintf(stderr, "unable to open %s\n", av[1]); + exit(1); + } printf("fs tree\n"); btrfs_print_tree(root, root->node); printf("map tree\n"); @@ -23,5 +32,7 @@ int main(int ac, char **av) { printf("root tree\n"); btrfs_print_tree(root->fs_info->tree_root, root->fs_info->tree_root->node); + printf("total blocks %Lu\n", btrfs_super_total_blocks(&super)); + printf("blocks used %Lu\n", btrfs_super_blocks_used(&super)); return 0; } diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c index 8fc77c83a351..b673982a1f3c 100644 --- a/fs/btrfs/dir-test.c +++ b/fs/btrfs/dir-test.c @@ -425,8 +425,6 @@ int main(int ac, char **av) struct btrfs_trans_handle *trans; radix_tree_init(); - printf("removing old tree\n"); - unlink("dbfile"); root = open_ctree("dbfile", &super); trans = btrfs_start_transaction(root, 1); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bacaa38ea82e..0322c55162cb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -293,20 +293,15 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super) fs_info->inode_root = inode_root; fs_info->last_inode_alloc = 0; fs_info->last_inode_alloc_dirid = 0; + fs_info->disk_super = super; memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert)); memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert)); ret = pread(fp, super, sizeof(struct 
btrfs_super_block), BTRFS_SUPER_INFO_OFFSET); if (ret == 0 || btrfs_super_root(super) == 0) { - printf("making new FS!\n"); - ret = mkfs(fp, 0, 1024); - if (ret) - return NULL; - ret = pread(fp, super, sizeof(struct btrfs_super_block), - BTRFS_SUPER_INFO_OFFSET); - if (ret != sizeof(struct btrfs_super_block)) - return NULL; + BUG(); + return NULL; } BUG_ON(ret < 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 09eeeb4d9d28..9bc4ad38876d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -134,6 +134,8 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct struct btrfs_extent_item extent_item; int i; int ret; + u64 super_blocks_used; + struct btrfs_fs_info *info = extent_root->fs_info; btrfs_set_extent_refs(&extent_item, 1); btrfs_set_extent_owner(&extent_item, @@ -145,6 +147,9 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct for (i = 0; i < extent_root->fs_info->current_insert.flags; i++) { ins.objectid = extent_root->fs_info->current_insert.objectid + i; + super_blocks_used = btrfs_super_blocks_used(info->disk_super); + btrfs_set_super_blocks_used(info->disk_super, + super_blocks_used + 1); ret = btrfs_insert_item(trans, extent_root, &ins, &extent_item, sizeof(extent_item)); BUG_ON(ret); @@ -161,7 +166,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root { struct btrfs_path path; struct btrfs_key key; - struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; int ret; struct btrfs_extent_item *ei; struct btrfs_key ins; @@ -188,15 +194,18 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root refs = btrfs_extent_refs(ei) - 1; btrfs_set_extent_refs(ei, refs); if (refs == 0) { + u64 super_blocks_used; if (pin) { int err; radix_tree_preload(GFP_KERNEL); - err = radix_tree_insert( - &extent_root->fs_info->pinned_radix, - 
blocknr, (void *)blocknr); + err = radix_tree_insert(&info->pinned_radix, + blocknr, (void *)blocknr); BUG_ON(err); radix_tree_preload_end(); } + super_blocks_used = btrfs_super_blocks_used(info->disk_super); + btrfs_set_super_blocks_used(info->disk_super, + super_blocks_used - num_blocks); ret = btrfs_del_item(trans, extent_root, &path); if (!pin && extent_root->fs_info->last_insert.objectid > blocknr) @@ -392,7 +401,9 @@ static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root { int ret; int pending_ret; - struct btrfs_root *extent_root = root->fs_info->extent_root; + u64 super_blocks_used; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; struct btrfs_extent_item extent_item; btrfs_set_extent_refs(&extent_item, 1); @@ -413,6 +424,9 @@ static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root if (ret) return ret; + super_blocks_used = btrfs_super_blocks_used(info->disk_super); + btrfs_set_super_blocks_used(info->disk_super, super_blocks_used + + num_blocks); ret = btrfs_insert_item(trans, extent_root, ins, &extent_item, sizeof(extent_item)); diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c index 1cac5ab114dd..f7efc8a5fb1a 100644 --- a/fs/btrfs/mkfs.c +++ b/fs/btrfs/mkfs.c @@ -1,4 +1,8 @@ #define _XOPEN_SOURCE 500 +#ifndef __CHECKER__ +#include +#include +#endif #include #include #include @@ -10,6 +14,17 @@ #include "ctree.h" #include "disk-io.h" +#ifdef __CHECKER__ +#define BLKGETSIZE64 0 +static inline int ioctl(int fd, int define, u64 *size) { return 0; } +#endif + +#if 0 +#if defined(__linux__) && defined(_IOR) && !defined(BLKGETSIZE64) +# define BLKGETSIZE64 _IOR(0x12, 114, __u64) +#endif +#endif + int mkfs(int fd, u64 num_blocks, u32 blocksize) { struct btrfs_super_block super; @@ -27,7 +42,7 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) strcpy((char *)(&super.magic), BTRFS_MAGIC); btrfs_set_super_blocksize(&super, blocksize); btrfs_set_super_total_blocks(&super, 
num_blocks); - btrfs_set_super_blocks_used(&super, 0); + btrfs_set_super_blocks_used(&super, start_block + 5); block = malloc(blocksize); memset(block, 0, blocksize); @@ -160,3 +175,81 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize) return -1; return 0; } + +u64 device_size(int fd, struct stat *st) +{ + u64 size; + if (S_ISREG(st->st_mode)) { + return st->st_size; + } + if (!S_ISBLK(st->st_mode)) { + return 0; + } + if (ioctl(fd, BLKGETSIZE64, &size) >= 0) { + return size; + } + return 0; +} + +int main(int ac, char **av) +{ + char *file; + u64 block_count = 0; + int fd; + struct stat st; + int ret; + int i; + char *buf = malloc(4096); + if (ac >= 2) { + file = av[1]; + if (ac == 3) { + block_count = atoi(av[2]); + if (!block_count) { + fprintf(stderr, "error finding block count\n"); + exit(1); + } + } + } else { + fprintf(stderr, "usage: mkfs.btrfs file [block count]\n"); + exit(1); + } + fd = open(file, O_RDWR); + if (fd < 0) { + fprintf(stderr, "unable to open %s\n", file); + exit(1); + } + ret = fstat(fd, &st); + if (ret < 0) { + fprintf(stderr, "unable to stat %s\n", file); + exit(1); + } + if (block_count == 0) { + block_count = device_size(fd, &st); + if (block_count == 0) { + fprintf(stderr, "unable to find %s size\n", file); + exit(1); + } + } + block_count /= 4096; + if (block_count < 256) { + fprintf(stderr, "device %s is too small\n", file); + exit(1); + } + memset(buf, 0, 4096); + for(i = 0; i < 6; i++) { + ret = write(fd, buf, 4096); + if (ret != 4096) { + fprintf(stderr, "unable to zero fill device\n"); + exit(1); + } + } + ret = mkfs(fd, block_count, 4096); + if (ret) { + fprintf(stderr, "error during mkfs %d\n", ret); + exit(1); + } + printf("fs created on %s blocksize %d blocks %Lu\n", + file, 4096, block_count); + return 0; +} + -- cgit v1.2.3 From 2e635a278354a1a7951e16cfea4c247d6d0e7c99 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 21 Mar 2007 11:12:56 -0400 Subject: Btrfs: initial move to kernel module land Signed-off-by: Chris 
Mason --- fs/btrfs/Makefile | 50 +-- fs/btrfs/ctree.c | 6 +- fs/btrfs/ctree.h | 15 +- fs/btrfs/debug-tree.c | 38 --- fs/btrfs/dir-item.c | 12 +- fs/btrfs/dir-test.c | 494 ----------------------------- fs/btrfs/disk-io.c | 17 +- fs/btrfs/disk-io.h | 1 + fs/btrfs/extent-tree.c | 10 +- fs/btrfs/file-item.c | 6 +- fs/btrfs/hash.c | 1 - fs/btrfs/hasher.c | 23 -- fs/btrfs/inode-item.c | 5 +- fs/btrfs/inode-map.c | 5 +- fs/btrfs/kerncompat.h | 96 ------ fs/btrfs/list.h | 418 ------------------------- fs/btrfs/mkfs.c | 255 --------------- fs/btrfs/print-tree.c | 30 +- fs/btrfs/quick-test.c | 179 ----------- fs/btrfs/radix-tree.c | 836 ------------------------------------------------- fs/btrfs/radix-tree.h | 73 ----- fs/btrfs/random-test.c | 405 ------------------------ fs/btrfs/root-tree.c | 5 +- fs/btrfs/super.c | 205 ++++++++++++ 24 files changed, 274 insertions(+), 2911 deletions(-) delete mode 100644 fs/btrfs/debug-tree.c delete mode 100644 fs/btrfs/dir-test.c delete mode 100644 fs/btrfs/hasher.c delete mode 100644 fs/btrfs/kerncompat.h delete mode 100644 fs/btrfs/list.h delete mode 100644 fs/btrfs/mkfs.c delete mode 100644 fs/btrfs/quick-test.c delete mode 100644 fs/btrfs/radix-tree.c delete mode 100644 fs/btrfs/radix-tree.h delete mode 100644 fs/btrfs/random-test.c create mode 100644 fs/btrfs/super.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 0720169b6d66..99e45a54ebd6 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,40 +1,20 @@ -CC=gcc -CFLAGS = -g -Wall -Werror -headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h \ - transaction.h -objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ - root-tree.o dir-item.o hash.o file-item.o inode-item.o \ - inode-map.o \ +ifneq ($(KERNELRELEASE),) +# kbuild part of makefile -# if you don't have sparse installed, use ls instead -CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \ - -Wcontext -Wcast-truncate 
-Wuninitialized -Wshadow -Wundef -check=sparse $(CHECKFLAGS) -#check=ls +obj-m := btrfs.o +btrfs-y := super.o -.c.o: - $(check) $< - $(CC) $(CFLAGS) -c $< +#btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ +# root-tree.o dir-item.o hash.o file-item.o inode-item.o \ +# inode-map.o \ -all: tester debug-tree quick-test dir-test tags mkfs.btrfs - -mkfs.btrfs: $(objects) mkfs.o - gcc $(CFLAGS) -o mkfs.btrfs $(objects) mkfs.o - -debug-tree: $(objects) debug-tree.o - gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o - -tester: $(objects) random-test.o - gcc $(CFLAGS) -o tester $(objects) random-test.o - -dir-test: $(objects) dir-test.o - gcc $(CFLAGS) -o dir-test $(objects) dir-test.o -quick-test: $(objects) quick-test.o - gcc $(CFLAGS) -o quick-test $(objects) quick-test.o - -$(objects): $(headers) - -clean : - rm debug-tree tester *.o +else +# Normal Makefile +KERNELDIR := /lib/modules/`uname -r`/build +all:: + $(MAKE) -C $(KERNELDIR) M=`pwd` modules +clean:: + rm *.o btrfs.ko +endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 32922643b5bc..9fbd07c37fde 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1,10 +1,6 @@ -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" +#include #include "ctree.h" #include "disk-io.h" -#include "print-tree.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1a4d1d6fa401..ae8518cb94bf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1,9 +1,6 @@ #ifndef __BTRFS__ #define __BTRFS__ -#include "list.h" -#include "kerncompat.h" - struct btrfs_trans_handle; #define BTRFS_MAGIC "_BtRfS_M" @@ -75,6 +72,7 @@ struct btrfs_super_block { __le64 root; __le64 total_blocks; __le64 blocks_used; + __le64 root_dir_objectid; } __attribute__ ((__packed__)); /* @@ -693,6 +691,17 @@ static inline void btrfs_set_super_blocksize(struct btrfs_super_block *s, s->blocksize = 
cpu_to_le32(val); } +static inline u64 btrfs_super_root_dir(struct btrfs_super_block *s) +{ + return le64_to_cpu(s->root_dir_objectid); +} + +static inline void btrfs_set_super_root_dir(struct btrfs_super_block *s, u64 + val) +{ + s->root_dir_objectid = cpu_to_le64(val); +} + static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l) { return (u8 *)l->items; diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c deleted file mode 100644 index fd07969600c6..000000000000 --- a/fs/btrfs/debug-tree.c +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" -#include "ctree.h" -#include "disk-io.h" -#include "print-tree.h" -#include "transaction.h" - -int main(int ac, char **av) { - struct btrfs_super_block super; - struct btrfs_root *root; - - if (ac != 2) { - fprintf(stderr, "usage: %s device\n", av[0]); - exit(1); - } - radix_tree_init(); - root = open_ctree(av[1], &super); - if (!root) { - fprintf(stderr, "unable to open %s\n", av[1]); - exit(1); - } - printf("fs tree\n"); - btrfs_print_tree(root, root->node); - printf("map tree\n"); - btrfs_print_tree(root->fs_info->extent_root, - root->fs_info->extent_root->node); - printf("inode tree\n"); - btrfs_print_tree(root->fs_info->inode_root, - root->fs_info->inode_root->node); - printf("root tree\n"); - btrfs_print_tree(root->fs_info->tree_root, - root->fs_info->tree_root->node); - printf("total blocks %Lu\n", btrfs_super_total_blocks(&super)); - printf("blocks used %Lu\n", btrfs_super_blocks_used(&super)); - return 0; -} diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 949c4e526798..4d8083d92fa0 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -1,7 +1,4 @@ -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" +#include #include "ctree.h" #include "disk-io.h" #include "hash.h" @@ -21,7 +18,12 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root key.objectid = dir; key.flags = 0; btrfs_set_key_type(&key, 
BTRFS_DIR_ITEM_KEY); - ret = btrfs_name_hash(name, name_len, &key.offset); + if (name_len == 1 && *name == '.') + key.offset = 1; + else if (name_len == 2 && name[0] == '.' && name[1] == '.') + key.offset = 2; + else + ret = btrfs_name_hash(name, name_len, &key.offset); BUG_ON(ret); btrfs_init_path(&path); data_size = sizeof(*dir_item) + name_len; diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c deleted file mode 100644 index b673982a1f3c..000000000000 --- a/fs/btrfs/dir-test.c +++ /dev/null @@ -1,494 +0,0 @@ -#include -#include -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" -#include "ctree.h" -#include "disk-io.h" -#include "print-tree.h" -#include "hash.h" -#include "transaction.h" - -int keep_running = 1; -struct btrfs_super_block super; -static u64 dir_oid = 44556; -static u64 file_oid = 33778; - -static int find_num(struct radix_tree_root *root, unsigned long *num_ret, - int exists) -{ - unsigned long num = rand(); - unsigned long res[2]; - int ret; - -again: - ret = radix_tree_gang_lookup(root, (void **)res, num, 2); - if (exists) { - if (ret == 0) - return -1; - num = res[0]; - } else if (ret != 0 && num == res[0]) { - num++; - if (ret > 1 && num == res[1]) { - num++; - goto again; - } - } - *num_ret = num; - return 0; -} - -static void initial_inode_init(struct btrfs_root *root, - struct btrfs_inode_item *inode_item) -{ - memset(inode_item, 0, sizeof(*inode_item)); - btrfs_set_inode_generation(inode_item, root->fs_info->generation); -} - -static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct radix_tree_root *radix) -{ - int ret; - char buf[128]; - unsigned long oid; - u64 objectid; - struct btrfs_path path; - struct btrfs_key inode_map; - struct btrfs_inode_item inode_item; - - find_num(radix, &oid, 0); - sprintf(buf, "str-%lu", oid); - - ret = btrfs_find_free_objectid(trans, root, dir_oid + 1, &objectid); - if (ret) - goto error; - - inode_map.objectid = objectid; - inode_map.flags = 0; - 
inode_map.offset = 0; - - ret = btrfs_insert_inode_map(trans, root, objectid, &inode_map); - if (ret) - goto error; - - initial_inode_init(root, &inode_item); - ret = btrfs_insert_inode(trans, root, objectid, &inode_item); - if (ret) - goto error; - ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid, - objectid, 1); - if (ret) - goto error; - - radix_tree_preload(GFP_KERNEL); - ret = radix_tree_insert(radix, oid, (void *)oid); - radix_tree_preload_end(); - if (ret) - goto error; - return ret; -error: - if (ret != -EEXIST) - goto fatal; - - /* - * if we got an EEXIST, it may be due to hash collision, double - * check - */ - btrfs_init_path(&path); - ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf, - strlen(buf), 0); - if (ret) - goto fatal_release; - if (!btrfs_match_dir_item_name(root, &path, buf, strlen(buf))) { - struct btrfs_dir_item *di; - char *found; - u32 found_len; - u64 myhash; - u64 foundhash; - - di = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], - struct btrfs_dir_item); - found = (char *)(di + 1); - found_len = btrfs_dir_name_len(di); - btrfs_name_hash(buf, strlen(buf), &myhash); - btrfs_name_hash(found, found_len, &foundhash); - if (myhash != foundhash) - goto fatal_release; - btrfs_release_path(root, &path); - return 0; - } -fatal_release: - btrfs_release_path(root, &path); -fatal: - printf("failed to insert %lu ret %d\n", oid, ret); - return -1; -} - -static int insert_dup(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct radix_tree_root *radix) -{ - int ret; - char buf[128]; - unsigned long oid; - - ret = find_num(radix, &oid, 1); - if (ret < 0) - return 0; - sprintf(buf, "str-%lu", oid); - - ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid, - file_oid, 1); - if (ret != -EEXIST) { - printf("insert on %s gave us %d\n", buf, ret); - return 1; - } - return 0; -} - -static int del_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct radix_tree_root *radix, 
- unsigned long radix_index, - struct btrfs_path *path) -{ - int ret; - unsigned long *ptr; - u64 file_objectid; - struct btrfs_dir_item *di; - - /* find the inode number of the file */ - di = btrfs_item_ptr(&path->nodes[0]->leaf, path->slots[0], - struct btrfs_dir_item); - file_objectid = btrfs_dir_objectid(di); - - /* delete the directory item */ - ret = btrfs_del_item(trans, root, path); - if (ret) - goto out_release; - btrfs_release_path(root, path); - - /* delete the inode */ - btrfs_init_path(path); - ret = btrfs_lookup_inode(trans, root, path, file_objectid, -1); - if (ret) - goto out_release; - ret = btrfs_del_item(trans, root, path); - if (ret) - goto out_release; - btrfs_release_path(root, path); - - /* delete the inode mapping */ - btrfs_init_path(path); - ret = btrfs_lookup_inode_map(trans, root, path, file_objectid, -1); - if (ret) - goto out_release; - ret = btrfs_del_item(trans, root->fs_info->inode_root, path); - if (ret) - goto out_release; - - if (root->fs_info->last_inode_alloc > file_objectid) - root->fs_info->last_inode_alloc = file_objectid; - btrfs_release_path(root, path); - ptr = radix_tree_delete(radix, radix_index); - if (!ptr) { - ret = -5555; - goto out; - } - return 0; -out_release: - btrfs_release_path(root, path); -out: - printf("failed to delete %lu %d\n", radix_index, ret); - return -1; -} - -static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct radix_tree_root *radix) -{ - int ret; - char buf[128]; - unsigned long oid; - struct btrfs_path path; - - ret = find_num(radix, &oid, 1); - if (ret < 0) - return 0; - sprintf(buf, "str-%lu", oid); - btrfs_init_path(&path); - ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf, - strlen(buf), -1); - if (ret) - goto out_release; - - ret = del_dir_item(trans, root, radix, oid, &path); - if (ret) - goto out_release; - return ret; -out_release: - btrfs_release_path(root, &path); - printf("failed to delete %lu %d\n", oid, ret); - return -1; -} - -static 
int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct radix_tree_root *radix) -{ - struct btrfs_path path; - char buf[128]; - int ret; - unsigned long oid; - u64 objectid; - struct btrfs_dir_item *di; - - ret = find_num(radix, &oid, 1); - if (ret < 0) - return 0; - sprintf(buf, "str-%lu", oid); - btrfs_init_path(&path); - ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf, - strlen(buf), 0); - if (!ret) { - di = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], - struct btrfs_dir_item); - objectid = btrfs_dir_objectid(di); - btrfs_release_path(root, &path); - btrfs_init_path(&path); - ret = btrfs_lookup_inode_map(trans, root, &path, objectid, 0); - } - btrfs_release_path(root, &path); - if (ret) { - printf("unable to find key %lu\n", oid); - return -1; - } - return 0; -} - -static int lookup_enoent(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct radix_tree_root *radix) -{ - struct btrfs_path path; - char buf[128]; - int ret; - unsigned long oid; - - ret = find_num(radix, &oid, 0); - if (ret < 0) - return 0; - sprintf(buf, "str-%lu", oid); - btrfs_init_path(&path); - ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf, - strlen(buf), 0); - btrfs_release_path(root, &path); - if (!ret) { - printf("able to find key that should not exist %lu\n", oid); - return -1; - } - return 0; -} - -static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct radix_tree_root *radix, int nr) -{ - struct btrfs_path path; - struct btrfs_key key; - unsigned long found = 0; - u32 found_len; - int ret; - int slot; - int count = 0; - char buf[128]; - struct btrfs_dir_item *di; - - key.offset = (u64)-1; - key.flags = 0; - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); - key.objectid = dir_oid; - while(nr-- >= 0) { - btrfs_init_path(&path); - ret = btrfs_search_slot(trans, root, &key, &path, -1, 1); - if (ret < 0) { - btrfs_release_path(root, &path); - return ret; - } - if (ret != 0) { - if 
(path.slots[0] == 0) { - btrfs_release_path(root, &path); - break; - } - path.slots[0] -= 1; - } - slot = path.slots[0]; - di = btrfs_item_ptr(&path.nodes[0]->leaf, slot, - struct btrfs_dir_item); - found_len = btrfs_dir_name_len(di); - memcpy(buf, (char *)(di + 1), found_len); - BUG_ON(found_len > 128); - buf[found_len] = '\0'; - found = atoi(buf + 4); - ret = del_dir_item(trans, root, radix, found, &path); - count++; - if (ret) { - fprintf(stderr, - "failed to remove %lu from tree\n", - found); - return -1; - } - if (!keep_running) - break; - } - return 0; - fprintf(stderr, "failed to delete from the radix %lu\n", found); - return -1; -} - -static int fill_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct radix_tree_root *radix, int count) -{ - int i; - int ret = 0; - for (i = 0; i < count; i++) { - ret = ins_one(trans, root, radix); - if (ret) { - fprintf(stderr, "fill failed\n"); - goto out; - } - if (i % 1000 == 0) { - ret = btrfs_commit_transaction(trans, root, &super); - if (ret) { - fprintf(stderr, "fill commit failed\n"); - return ret; - } - } - if (i && i % 10000 == 0) { - printf("bigfill %d\n", i); - } - if (!keep_running) - break; - } -out: - return ret; -} - -static int bulk_op(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct radix_tree_root *radix) -{ - int ret; - int nr = rand() % 5000; - static int run_nr = 0; - - /* do the bulk op much less frequently */ - if (run_nr++ % 100) - return 0; - ret = empty_tree(trans, root, radix, nr); - if (ret) - return ret; - ret = fill_tree(trans, root, radix, nr); - if (ret) - return ret; - return 0; -} - - -int (*ops[])(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct - radix_tree_root *radix) = - { ins_one, insert_dup, del_one, lookup_item, - lookup_enoent, bulk_op }; - -void sigstopper(int ignored) -{ - keep_running = 0; - fprintf(stderr, "caught exit signal, stopping\n"); -} - -int print_usage(void) -{ - printf("usage: tester [-ih] [-c count] [-f 
count]\n"); - printf("\t -c count -- iteration count after filling\n"); - printf("\t -f count -- run this many random inserts before starting\n"); - printf("\t -i -- only do initial fill\n"); - printf("\t -h -- this help text\n"); - exit(1); -} -int main(int ac, char **av) -{ - RADIX_TREE(radix, GFP_KERNEL); - struct btrfs_root *root; - int i; - int ret; - int count; - int op; - int iterations = 20000; - int init_fill_count = 800000; - int err = 0; - int initial_only = 0; - struct btrfs_trans_handle *trans; - radix_tree_init(); - - root = open_ctree("dbfile", &super); - trans = btrfs_start_transaction(root, 1); - - signal(SIGTERM, sigstopper); - signal(SIGINT, sigstopper); - - for (i = 1 ; i < ac ; i++) { - if (strcmp(av[i], "-i") == 0) { - initial_only = 1; - } else if (strcmp(av[i], "-c") == 0) { - iterations = atoi(av[i+1]); - i++; - } else if (strcmp(av[i], "-f") == 0) { - init_fill_count = atoi(av[i+1]); - i++; - } else { - print_usage(); - } - } - printf("initial fill\n"); - ret = fill_tree(trans, root, &radix, init_fill_count); - printf("starting run\n"); - if (ret) { - err = ret; - goto out; - } - if (initial_only == 1) { - goto out; - } - for (i = 0; i < iterations; i++) { - op = rand() % ARRAY_SIZE(ops); - count = rand() % 128; - if (i % 2000 == 0) { - printf("%d\n", i); - fflush(stdout); - } - if (i && i % 5000 == 0) { - printf("open & close, root level %d nritems %d\n", - btrfs_header_level(&root->node->node.header), - btrfs_header_nritems(&root->node->node.header)); - close_ctree(root, &super); - root = open_ctree("dbfile", &super); - } - while(count--) { - ret = ops[op](trans, root, &radix); - if (ret) { - fprintf(stderr, "op %d failed %d:%d\n", - op, i, iterations); - btrfs_print_tree(root, root->node); - fprintf(stderr, "op %d failed %d:%d\n", - op, i, iterations); - err = ret; - goto out; - } - if (ops[op] == bulk_op) - break; - if (keep_running == 0) { - err = 0; - goto out; - } - } - } -out: - close_ctree(root, &super); - return err; -} - diff 
--git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0322c55162cb..05637f9fd7c7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -266,20 +266,25 @@ static int find_and_setup_root(struct btrfs_super_block *super, } struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super) +{ + int fp; + + fp = open(filename, O_CREAT | O_RDWR, 0600); + if (fp < 0) { + return NULL; + } + return open_ctree_fd(fp, super); +} + +struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super) { struct btrfs_root *root = malloc(sizeof(struct btrfs_root)); struct btrfs_root *extent_root = malloc(sizeof(struct btrfs_root)); struct btrfs_root *tree_root = malloc(sizeof(struct btrfs_root)); struct btrfs_root *inode_root = malloc(sizeof(struct btrfs_root)); struct btrfs_fs_info *fs_info = malloc(sizeof(*fs_info)); - int fp; int ret; - fp = open(filename, O_CREAT | O_RDWR, 0600); - if (fp < 0) { - free(root); - return NULL; - } INIT_RADIX_TREE(&fs_info->cache_radix, GFP_KERNEL); INIT_RADIX_TREE(&fs_info->pinned_radix, GFP_KERNEL); INIT_LIST_HEAD(&fs_info->trans); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 24a9e77c8311..d888cf5c350b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -24,6 +24,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_super_block *s); struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *s); +struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super); int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s); void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9bc4ad38876d..53a7550b5c1e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1,7 +1,5 @@ -#include 
-#include -#include "kerncompat.h" -#include "radix-tree.h" +#include +#include #include "ctree.h" #include "disk-io.h" #include "print-tree.h" @@ -183,9 +181,9 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_init_path(&path); ret = btrfs_search_slot(trans, extent_root, &key, &path, -1, 1); if (ret) { - printf("failed to find %Lu\n", key.objectid); + printk("failed to find %Lu\n", key.objectid); btrfs_print_tree(extent_root, extent_root->node); - printf("failed to find %Lu\n", key.objectid); + printk("failed to find %Lu\n", key.objectid); BUG(); } ei = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 24cfd6d85240..8e1e5b4ccfaf 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1,9 +1,5 @@ -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" +#include #include "ctree.h" -#include "disk-io.h" #include "transaction.h" int btrfs_create_file(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index 92c37eece1f2..6c2a71a46c7d 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -10,7 +10,6 @@ * License. 
*/ -#include "kerncompat.h" #define DELTA 0x9E3779B9 static void TEA_transform(__u32 buf[2], __u32 const in[]) diff --git a/fs/btrfs/hasher.c b/fs/btrfs/hasher.c deleted file mode 100644 index 96702da4329c..000000000000 --- a/fs/btrfs/hasher.c +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include -#include -#include "kerncompat.h" -#include "hash.h" - -int main() { - u64 result; - int ret; - char line[255]; - char *p; - while(1) { - p = fgets(line, 255, stdin); - if (!p) - break; - if (strlen(line) == 0) - continue; - ret = btrfs_name_hash(line, strlen(line), &result); - BUG_ON(ret); - printf("hash returns %Lu\n", result); - } - return 0; -} diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 7caeb11e8756..8d8c26a6c1a0 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -1,7 +1,4 @@ -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index f412b3392135..c7fda3bf7b21 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -1,7 +1,4 @@ -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h deleted file mode 100644 index 105d3f584089..000000000000 --- a/fs/btrfs/kerncompat.h +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef __KERNCOMPAT -#define __KERNCOMPAT -#define gfp_t int -#define get_cpu_var(p) (p) -#define __get_cpu_var(p) (p) -#define BITS_PER_LONG 64 -#define __GFP_BITS_SHIFT 20 -#define __GFP_BITS_MASK ((int)((1 << __GFP_BITS_SHIFT) - 1)) -#define GFP_KERNEL 0 -#define __read_mostly -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#define PAGE_SHIFT 12 -#define ULONG_MAX (~0UL) -#define BUG() abort() -#ifdef __CHECKER__ -#define __force __attribute__((force)) -#define __bitwise__ __attribute__((bitwise)) -#else -#define __force -#define 
__bitwise__ -#endif - -typedef unsigned int u32; -typedef u32 __u32; -typedef unsigned long long u64; -typedef unsigned char u8; -typedef unsigned short u16; - -typedef unsigned long pgoff_t; - -#include -#include -#include - -struct vma_shared { int prio_tree_node; }; -struct vm_area_struct { - unsigned long vm_pgoff; - unsigned long vm_start; - unsigned long vm_end; - struct vma_shared shared; -}; - -struct page { - unsigned long index; -}; - -static inline void preempt_enable(void) { do {; } while(0);} -static inline void preempt_disable(void) { do {; } while(0);} - -static inline void __set_bit(int bit, unsigned long *map) { - unsigned long *p = map + bit / BITS_PER_LONG; - bit = bit & (BITS_PER_LONG -1); - *p |= 1UL << bit; -} - -static inline int test_bit(int bit, unsigned long *map) { - unsigned long *p = map + bit / BITS_PER_LONG; - bit = bit & (BITS_PER_LONG -1); - return *p & (1UL << bit) ? 1 : 0; -} - -static inline void __clear_bit(int bit, unsigned long *map) { - unsigned long *p = map + bit / BITS_PER_LONG; - bit = bit & (BITS_PER_LONG -1); - *p &= ~(1UL << bit); -} -#define BUG_ON(c) do { if (c) abort(); } while (0) - -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - __builtin_offsetof(type,member) );}) - -#define ENOMEM 5 -#define EEXIST 6 - -#define __CHECK_ENDIAN__ -#ifdef __CHECK_ENDIAN__ -#define __bitwise __bitwise__ -#else -#define __bitwise -#endif - -typedef u16 __bitwise __le16; -typedef u16 __bitwise __be16; -typedef u32 __bitwise __le32; -typedef u32 __bitwise __be32; -typedef u64 __bitwise __le64; -typedef u64 __bitwise __be64; - -#define cpu_to_le64(x) ((__force __le64)(u64)(x)) -#define le64_to_cpu(x) ((__force u64)(__le64)(x)) -#define cpu_to_le32(x) ((__force __le32)(u32)(x)) -#define le32_to_cpu(x) ((__force u32)(__le32)(x)) -#define cpu_to_le16(x) ((__force __le16)(u16)(x)) -#define le16_to_cpu(x) ((__force u16)(__le16)(x)) -#endif diff --git 
a/fs/btrfs/list.h b/fs/btrfs/list.h deleted file mode 100644 index 1aafafb13370..000000000000 --- a/fs/btrfs/list.h +++ /dev/null @@ -1,418 +0,0 @@ -#ifndef _LINUX_LIST_H -#define _LINUX_LIST_H - -#define LIST_POISON1 ((void *) 0x00100100) -#define LIST_POISON2 ((void *) 0x00200200) - -/* - * Simple doubly linked list implementation. - * - * Some of the internal functions ("__xxx") are useful when - * manipulating whole lists rather than single entries, as - * sometimes we already know the next/prev entries and we can - * generate better code by using them directly rather than - * using the generic single-entry routines. - */ - -struct list_head { - struct list_head *next, *prev; -}; - -#define LIST_HEAD_INIT(name) { &(name), &(name) } - -#define LIST_HEAD(name) \ - struct list_head name = LIST_HEAD_INIT(name) - -static inline void INIT_LIST_HEAD(struct list_head *list) -{ - list->next = list; - list->prev = list; -} - -/* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -#ifndef CONFIG_DEBUG_LIST -static inline void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next) -{ - next->prev = new; - new->next = next; - new->prev = prev; - prev->next = new; -} -#else -extern void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next); -#endif - -/** - * list_add - add a new entry - * @new: new entry to be added - * @head: list head to add it after - * - * Insert a new entry after the specified head. - * This is good for implementing stacks. 
- */ -#ifndef CONFIG_DEBUG_LIST -static inline void list_add(struct list_head *new, struct list_head *head) -{ - __list_add(new, head, head->next); -} -#else -extern void list_add(struct list_head *new, struct list_head *head); -#endif - - -/** - * list_add_tail - add a new entry - * @new: new entry to be added - * @head: list head to add it before - * - * Insert a new entry before the specified head. - * This is useful for implementing queues. - */ -static inline void list_add_tail(struct list_head *new, struct list_head *head) -{ - __list_add(new, head->prev, head); -} - -/* - * Delete a list entry by making the prev/next entries - * point to each other. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -static inline void __list_del(struct list_head * prev, struct list_head * next) -{ - next->prev = prev; - prev->next = next; -} - -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is - * in an undefined state. - */ -#ifndef CONFIG_DEBUG_LIST -static inline void list_del(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); - entry->next = LIST_POISON1; - entry->prev = LIST_POISON2; -} -#else -extern void list_del(struct list_head *entry); -#endif - -/** - * list_replace - replace old entry by new one - * @old : the element to be replaced - * @new : the new element to insert - * Note: if 'old' was empty, it will be overwritten. - */ -static inline void list_replace(struct list_head *old, - struct list_head *new) -{ - new->next = old->next; - new->next->prev = new; - new->prev = old->prev; - new->prev->next = new; -} - -static inline void list_replace_init(struct list_head *old, - struct list_head *new) -{ - list_replace(old, new); - INIT_LIST_HEAD(old); -} -/** - * list_del_init - deletes entry from list and reinitialize it. - * @entry: the element to delete from the list. 
- */ -static inline void list_del_init(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); - INIT_LIST_HEAD(entry); -} - -/** - * list_move - delete from one list and add as another's head - * @list: the entry to move - * @head: the head that will precede our entry - */ -static inline void list_move(struct list_head *list, struct list_head *head) -{ - __list_del(list->prev, list->next); - list_add(list, head); -} - -/** - * list_move_tail - delete from one list and add as another's tail - * @list: the entry to move - * @head: the head that will follow our entry - */ -static inline void list_move_tail(struct list_head *list, - struct list_head *head) -{ - __list_del(list->prev, list->next); - list_add_tail(list, head); -} - -/** - * list_is_last - tests whether @list is the last entry in list @head - * @list: the entry to test - * @head: the head of the list - */ -static inline int list_is_last(const struct list_head *list, - const struct list_head *head) -{ - return list->next == head; -} - -/** - * list_empty - tests whether a list is empty - * @head: the list to test. - */ -static inline int list_empty(const struct list_head *head) -{ - return head->next == head; -} - -/** - * list_empty_careful - tests whether a list is empty and not being modified - * @head: the list to test - * - * Description: - * tests whether a list is empty _and_ checks that no other CPU might be - * in the process of modifying either member (next or prev) - * - * NOTE: using list_empty_careful() without synchronization - * can only be safe if the only activity that can happen - * to the list entry is list_del_init(). Eg. it cannot be used - * if another CPU could re-list_add() it. 
- */ -static inline int list_empty_careful(const struct list_head *head) -{ - struct list_head *next = head->next; - return (next == head) && (next == head->prev); -} - -static inline void __list_splice(struct list_head *list, - struct list_head *head) -{ - struct list_head *first = list->next; - struct list_head *last = list->prev; - struct list_head *at = head->next; - - first->prev = head; - head->next = first; - - last->next = at; - at->prev = last; -} - -/** - * list_splice - join two lists - * @list: the new list to add. - * @head: the place to add it in the first list. - */ -static inline void list_splice(struct list_head *list, struct list_head *head) -{ - if (!list_empty(list)) - __list_splice(list, head); -} - -/** - * list_splice_init - join two lists and reinitialise the emptied list. - * @list: the new list to add. - * @head: the place to add it in the first list. - * - * The list at @list is reinitialised - */ -static inline void list_splice_init(struct list_head *list, - struct list_head *head) -{ - if (!list_empty(list)) { - __list_splice(list, head); - INIT_LIST_HEAD(list); - } -} - -/** - * list_entry - get the struct for this entry - * @ptr: the &struct list_head pointer. - * @type: the type of the struct this is embedded in. - * @member: the name of the list_struct within the struct. - */ -#define list_entry(ptr, type, member) \ - container_of(ptr, type, member) - -/** - * list_for_each - iterate over a list - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. - */ -#define list_for_each(pos, head) \ - for (pos = (head)->next; prefetch(pos->next), pos != (head); \ - pos = pos->next) - -/** - * __list_for_each - iterate over a list - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. - * - * This variant differs from list_for_each() in that it's the - * simplest possible list iteration code, no prefetching is done. 
- * Use this for code that knows the list to be very short (empty - * or 1 entry) most of the time. - */ -#define __list_for_each(pos, head) \ - for (pos = (head)->next; pos != (head); pos = pos->next) - -/** - * list_for_each_prev - iterate over a list backwards - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. - */ -#define list_for_each_prev(pos, head) \ - for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \ - pos = pos->prev) - -/** - * list_for_each_safe - iterate over a list safe against removal of list entry - * @pos: the &struct list_head to use as a loop cursor. - * @n: another &struct list_head to use as temporary storage - * @head: the head for your list. - */ -#define list_for_each_safe(pos, n, head) \ - for (pos = (head)->next, n = pos->next; pos != (head); \ - pos = n, n = pos->next) - -/** - * list_for_each_entry - iterate over list of given type - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - */ -#define list_for_each_entry(pos, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member); \ - prefetch(pos->member.next), &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) - -/** - * list_for_each_entry_reverse - iterate backwards over list of given type. - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - */ -#define list_for_each_entry_reverse(pos, head, member) \ - for (pos = list_entry((head)->prev, typeof(*pos), member); \ - prefetch(pos->member.prev), &pos->member != (head); \ - pos = list_entry(pos->member.prev, typeof(*pos), member)) - -/** - * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue - * @pos: the type * to use as a start point - * @head: the head of the list - * @member: the name of the list_struct within the struct. 
- * - * Prepares a pos entry for use as a start point in list_for_each_entry_continue. - */ -#define list_prepare_entry(pos, head, member) \ - ((pos) ? : list_entry(head, typeof(*pos), member)) - -/** - * list_for_each_entry_continue - continue iteration over list of given type - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - * - * Continue to iterate over list of given type, continuing after - * the current position. - */ -#define list_for_each_entry_continue(pos, head, member) \ - for (pos = list_entry(pos->member.next, typeof(*pos), member); \ - prefetch(pos->member.next), &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) - -/** - * list_for_each_entry_from - iterate over list of given type from the current point - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - * - * Iterate over list of given type, continuing from current position. - */ -#define list_for_each_entry_from(pos, head, member) \ - for (; prefetch(pos->member.next), &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) - -/** - * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry - * @pos: the type * to use as a loop cursor. - * @n: another type * to use as temporary storage - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - */ -#define list_for_each_entry_safe(pos, n, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member), \ - n = list_entry(pos->member.next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) - -/** - * list_for_each_entry_safe_continue - * @pos: the type * to use as a loop cursor. 
- * @n: another type * to use as temporary storage - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - * - * Iterate over list of given type, continuing after current point, - * safe against removal of list entry. - */ -#define list_for_each_entry_safe_continue(pos, n, head, member) \ - for (pos = list_entry(pos->member.next, typeof(*pos), member), \ - n = list_entry(pos->member.next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) - -/** - * list_for_each_entry_safe_from - * @pos: the type * to use as a loop cursor. - * @n: another type * to use as temporary storage - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - * - * Iterate over list of given type from current point, safe against - * removal of list entry. - */ -#define list_for_each_entry_safe_from(pos, n, head, member) \ - for (n = list_entry(pos->member.next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) - -/** - * list_for_each_entry_safe_reverse - * @pos: the type * to use as a loop cursor. - * @n: another type * to use as temporary storage - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - * - * Iterate backwards over list of given type, safe against removal - * of list entry. 
- */ -#define list_for_each_entry_safe_reverse(pos, n, head, member) \ - for (pos = list_entry((head)->prev, typeof(*pos), member), \ - n = list_entry(pos->member.prev, typeof(*pos), member); \ - &pos->member != (head); \ - pos = n, n = list_entry(n->member.prev, typeof(*n), member)) - -#endif diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c deleted file mode 100644 index f7efc8a5fb1a..000000000000 --- a/fs/btrfs/mkfs.c +++ /dev/null @@ -1,255 +0,0 @@ -#define _XOPEN_SOURCE 500 -#ifndef __CHECKER__ -#include -#include -#endif -#include -#include -#include -#include -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" -#include "ctree.h" -#include "disk-io.h" - -#ifdef __CHECKER__ -#define BLKGETSIZE64 0 -static inline int ioctl(int fd, int define, u64 *size) { return 0; } -#endif - -#if 0 -#if defined(__linux__) && defined(_IOR) && !defined(BLKGETSIZE64) -# define BLKGETSIZE64 _IOR(0x12, 114, __u64) -#endif -#endif - -int mkfs(int fd, u64 num_blocks, u32 blocksize) -{ - struct btrfs_super_block super; - struct btrfs_leaf *empty_leaf; - struct btrfs_root_item root_item; - struct btrfs_item item; - struct btrfs_extent_item extent_item; - char *block; - int ret; - u32 itemoff; - u32 start_block = BTRFS_SUPER_INFO_OFFSET / blocksize; - - btrfs_set_super_blocknr(&super, start_block); - btrfs_set_super_root(&super, start_block + 1); - strcpy((char *)(&super.magic), BTRFS_MAGIC); - btrfs_set_super_blocksize(&super, blocksize); - btrfs_set_super_total_blocks(&super, num_blocks); - btrfs_set_super_blocks_used(&super, start_block + 5); - - block = malloc(blocksize); - memset(block, 0, blocksize); - BUG_ON(sizeof(super) > blocksize); - memcpy(block, &super, sizeof(super)); - ret = pwrite(fd, block, blocksize, BTRFS_SUPER_INFO_OFFSET); - BUG_ON(ret != blocksize); - - /* create the tree of root objects */ - empty_leaf = malloc(blocksize); - memset(empty_leaf, 0, blocksize); - btrfs_set_header_parentid(&empty_leaf->header, - BTRFS_ROOT_TREE_OBJECTID); - 
btrfs_set_header_blocknr(&empty_leaf->header, start_block + 1); - btrfs_set_header_nritems(&empty_leaf->header, 3); - - /* create the items for the root tree */ - btrfs_set_root_blocknr(&root_item, start_block + 2); - btrfs_set_root_refs(&root_item, 1); - itemoff = __BTRFS_LEAF_DATA_SIZE(blocksize) - sizeof(root_item); - btrfs_set_item_offset(&item, itemoff); - btrfs_set_item_size(&item, sizeof(root_item)); - btrfs_set_disk_key_objectid(&item.key, BTRFS_EXTENT_TREE_OBJECTID); - btrfs_set_disk_key_offset(&item.key, 0); - btrfs_set_disk_key_flags(&item.key, 0); - btrfs_set_disk_key_type(&item.key, BTRFS_ROOT_ITEM_KEY); - memcpy(empty_leaf->items, &item, sizeof(item)); - memcpy(btrfs_leaf_data(empty_leaf) + itemoff, - &root_item, sizeof(root_item)); - - btrfs_set_root_blocknr(&root_item, start_block + 3); - itemoff = itemoff - sizeof(root_item); - btrfs_set_item_offset(&item, itemoff); - btrfs_set_disk_key_objectid(&item.key, BTRFS_INODE_MAP_OBJECTID); - memcpy(empty_leaf->items + 1, &item, sizeof(item)); - memcpy(btrfs_leaf_data(empty_leaf) + itemoff, - &root_item, sizeof(root_item)); - - btrfs_set_root_blocknr(&root_item, start_block + 4); - itemoff = itemoff - sizeof(root_item); - btrfs_set_item_offset(&item, itemoff); - btrfs_set_disk_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID); - memcpy(empty_leaf->items + 2, &item, sizeof(item)); - memcpy(btrfs_leaf_data(empty_leaf) + itemoff, - &root_item, sizeof(root_item)); - ret = pwrite(fd, empty_leaf, blocksize, (start_block + 1) * blocksize); - - /* create the items for the extent tree */ - btrfs_set_header_parentid(&empty_leaf->header, - BTRFS_EXTENT_TREE_OBJECTID); - btrfs_set_header_blocknr(&empty_leaf->header, start_block + 2); - btrfs_set_header_nritems(&empty_leaf->header, 5); - - /* item1, reserve blocks 0-16 */ - btrfs_set_disk_key_objectid(&item.key, 0); - btrfs_set_disk_key_offset(&item.key, start_block + 1); - btrfs_set_disk_key_flags(&item.key, 0); - btrfs_set_disk_key_type(&item.key, 
BTRFS_EXTENT_ITEM_KEY); - itemoff = __BTRFS_LEAF_DATA_SIZE(blocksize) - - sizeof(struct btrfs_extent_item); - btrfs_set_item_offset(&item, itemoff); - btrfs_set_item_size(&item, sizeof(struct btrfs_extent_item)); - btrfs_set_extent_refs(&extent_item, 1); - btrfs_set_extent_owner(&extent_item, 0); - memcpy(empty_leaf->items, &item, sizeof(item)); - memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item), - &extent_item, btrfs_item_size(&item)); - - /* item2, give block 17 to the root */ - btrfs_set_disk_key_objectid(&item.key, start_block + 1); - btrfs_set_disk_key_offset(&item.key, 1); - itemoff = itemoff - sizeof(struct btrfs_extent_item); - btrfs_set_item_offset(&item, itemoff); - btrfs_set_extent_owner(&extent_item, BTRFS_ROOT_TREE_OBJECTID); - memcpy(empty_leaf->items + 1, &item, sizeof(item)); - memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item), - &extent_item, btrfs_item_size(&item)); - - /* item3, give block 18 to the extent root */ - btrfs_set_disk_key_objectid(&item.key, start_block + 2); - btrfs_set_disk_key_offset(&item.key, 1); - itemoff = itemoff - sizeof(struct btrfs_extent_item); - btrfs_set_item_offset(&item, itemoff); - btrfs_set_extent_owner(&extent_item, BTRFS_EXTENT_TREE_OBJECTID); - memcpy(empty_leaf->items + 2, &item, sizeof(item)); - memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item), - &extent_item, btrfs_item_size(&item)); - - /* item4, give block 19 to the inode map */ - btrfs_set_disk_key_objectid(&item.key, start_block + 3); - btrfs_set_disk_key_offset(&item.key, 1); - itemoff = itemoff - sizeof(struct btrfs_extent_item); - btrfs_set_item_offset(&item, itemoff); - btrfs_set_extent_owner(&extent_item, BTRFS_INODE_MAP_OBJECTID); - memcpy(empty_leaf->items + 3, &item, sizeof(item)); - memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item), - &extent_item, btrfs_item_size(&item)); - ret = pwrite(fd, empty_leaf, blocksize, (start_block + 2) * blocksize); - if (ret != blocksize) - return -1; - - /* item5, 
give block 20 to the FS root */ - btrfs_set_disk_key_objectid(&item.key, start_block + 4); - btrfs_set_disk_key_offset(&item.key, 1); - itemoff = itemoff - sizeof(struct btrfs_extent_item); - btrfs_set_item_offset(&item, itemoff); - btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID); - memcpy(empty_leaf->items + 4, &item, sizeof(item)); - memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item), - &extent_item, btrfs_item_size(&item)); - ret = pwrite(fd, empty_leaf, blocksize, (start_block + 2) * blocksize); - if (ret != blocksize) - return -1; - - /* create the inode map */ - btrfs_set_header_parentid(&empty_leaf->header, - BTRFS_INODE_MAP_OBJECTID); - btrfs_set_header_blocknr(&empty_leaf->header, start_block + 3); - btrfs_set_header_nritems(&empty_leaf->header, 0); - ret = pwrite(fd, empty_leaf, blocksize, (start_block + 3) * blocksize); - if (ret != blocksize) - return -1; - - /* finally create the FS root */ - btrfs_set_header_parentid(&empty_leaf->header, BTRFS_FS_TREE_OBJECTID); - btrfs_set_header_blocknr(&empty_leaf->header, start_block + 4); - btrfs_set_header_nritems(&empty_leaf->header, 0); - ret = pwrite(fd, empty_leaf, blocksize, (start_block + 4) * blocksize); - if (ret != blocksize) - return -1; - return 0; -} - -u64 device_size(int fd, struct stat *st) -{ - u64 size; - if (S_ISREG(st->st_mode)) { - return st->st_size; - } - if (!S_ISBLK(st->st_mode)) { - return 0; - } - if (ioctl(fd, BLKGETSIZE64, &size) >= 0) { - return size; - } - return 0; -} - -int main(int ac, char **av) -{ - char *file; - u64 block_count = 0; - int fd; - struct stat st; - int ret; - int i; - char *buf = malloc(4096); - if (ac >= 2) { - file = av[1]; - if (ac == 3) { - block_count = atoi(av[2]); - if (!block_count) { - fprintf(stderr, "error finding block count\n"); - exit(1); - } - } - } else { - fprintf(stderr, "usage: mkfs.btrfs file [block count]\n"); - exit(1); - } - fd = open(file, O_RDWR); - if (fd < 0) { - fprintf(stderr, "unable to open %s\n", file); - 
exit(1); - } - ret = fstat(fd, &st); - if (ret < 0) { - fprintf(stderr, "unable to stat %s\n", file); - exit(1); - } - if (block_count == 0) { - block_count = device_size(fd, &st); - if (block_count == 0) { - fprintf(stderr, "unable to find %s size\n", file); - exit(1); - } - } - block_count /= 4096; - if (block_count < 256) { - fprintf(stderr, "device %s is too small\n", file); - exit(1); - } - memset(buf, 0, 4096); - for(i = 0; i < 6; i++) { - ret = write(fd, buf, 4096); - if (ret != 4096) { - fprintf(stderr, "unable to zero fill device\n"); - exit(1); - } - } - ret = mkfs(fd, block_count, 4096); - if (ret) { - fprintf(stderr, "error during mkfs %d\n", ret); - exit(1); - } - printf("fs created on %s blocksize %d blocks %Lu\n", - file, 4096, block_count); - return 0; -} - diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index f53b99da12f3..aa2d3fac8804 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -1,7 +1,4 @@ -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" +#include #include "ctree.h" #include "disk-io.h" @@ -17,14 +14,14 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) struct btrfs_inode_item *ii; u32 type; - printf("leaf %Lu total ptrs %d free space %d\n", + printk("leaf %Lu total ptrs %d free space %d\n", btrfs_header_blocknr(&l->header), nr, btrfs_leaf_free_space(root, l)); fflush(stdout); for (i = 0 ; i < nr ; i++) { item = l->items + i; type = btrfs_disk_key_type(&item->key); - printf("\titem %d key (%Lu %u %Lu) itemoff %d itemsize %d\n", + printk("\titem %d key (%Lu %u %Lu) itemoff %d itemsize %d\n", i, btrfs_disk_key_objectid(&item->key), btrfs_disk_key_flags(&item->key), @@ -34,38 +31,39 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) switch (type) { case BTRFS_INODE_ITEM_KEY: ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); - printf("\t\tinode generation %Lu size %Lu\n", + printk("\t\tinode generation %Lu size %Lu mode %o\n", btrfs_inode_generation(ii), 
- btrfs_inode_size(ii)); + btrfs_inode_size(ii), + btrfs_inode_mode(ii)); break; case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); - printf("\t\tdir oid %Lu flags %u type %u\n", + printk("\t\tdir oid %Lu flags %u type %u\n", btrfs_dir_objectid(di), btrfs_dir_flags(di), btrfs_dir_type(di)); - printf("\t\tname %.*s\n", + printk("\t\tname %.*s\n", btrfs_dir_name_len(di),(char *)(di + 1)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); - printf("\t\troot data blocknr %Lu refs %u\n", + printk("\t\troot data blocknr %Lu refs %u\n", btrfs_root_blocknr(ri), btrfs_root_refs(ri)); break; case BTRFS_EXTENT_ITEM_KEY: ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); - printf("\t\textent data refs %u owner %Lu\n", + printk("\t\textent data refs %u owner %Lu\n", btrfs_extent_refs(ei), btrfs_extent_owner(ei)); break; case BTRFS_INODE_MAP_ITEM_KEY: mi = btrfs_item_ptr(l, i, struct btrfs_inode_map_item); - printf("\t\tinode map key %Lu %u %Lu\n", + printk("\t\tinode map key %Lu %u %Lu\n", btrfs_disk_key_objectid(&mi->key), btrfs_disk_key_flags(&mi->key), btrfs_disk_key_offset(&mi->key)); break; case BTRFS_STRING_ITEM_KEY: - printf("\t\titem data %.*s\n", btrfs_item_size(item), + printk("\t\titem data %.*s\n", btrfs_item_size(item), btrfs_leaf_data(l) + btrfs_item_offset(item)); break; }; @@ -86,12 +84,12 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t) btrfs_print_leaf(root, (struct btrfs_leaf *)c); return; } - printf("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr, + printk("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr, btrfs_header_level(&c->header), nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); fflush(stdout); for (i = 0; i < nr; i++) { - printf("\tkey %d (%Lu %u %Lu) block %Lu\n", + printk("\tkey %d (%Lu %u %Lu) block %Lu\n", i, c->ptrs[i].key.objectid, c->ptrs[i].key.flags, diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c deleted file mode 100644 index 
ef12cf888783..000000000000 --- a/fs/btrfs/quick-test.c +++ /dev/null @@ -1,179 +0,0 @@ -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" -#include "ctree.h" -#include "disk-io.h" -#include "print-tree.h" -#include "transaction.h" - -/* for testing only */ -int next_key(int i, int max_key) { - return rand() % max_key; - // return i; -} - -int main(int ac, char **av) { - struct btrfs_key ins; - struct btrfs_key last = { (u64)-1, 0, 0}; - char *buf; - int i; - int num; - int ret; - int run_size = 100000; - int max_key = 100000000; - int tree_size = 0; - struct btrfs_path path; - struct btrfs_super_block super; - struct btrfs_root *root; - struct btrfs_trans_handle *trans; - - radix_tree_init(); - - root = open_ctree("dbfile", &super); - trans = btrfs_start_transaction(root, 1); - srand(55); - ins.flags = 0; - btrfs_set_key_type(&ins, BTRFS_STRING_ITEM_KEY); - for (i = 0; i < run_size; i++) { - buf = malloc(64); - num = next_key(i, max_key); - // num = i; - sprintf(buf, "string-%d", num); - if (i % 10000 == 0) - fprintf(stderr, "insert %d:%d\n", num, i); - ins.objectid = num; - ins.offset = 0; - ret = btrfs_insert_item(trans, root, &ins, buf, strlen(buf)); - if (!ret) - tree_size++; - free(buf); - if (i == run_size - 5) { - btrfs_commit_transaction(trans, root, &super); - } - - } - close_ctree(root, &super); - - root = open_ctree("dbfile", &super); - printf("starting search\n"); - srand(55); - for (i = 0; i < run_size; i++) { - num = next_key(i, max_key); - ins.objectid = num; - btrfs_init_path(&path); - if (i % 10000 == 0) - fprintf(stderr, "search %d:%d\n", num, i); - ret = btrfs_search_slot(trans, root, &ins, &path, 0, 0); - if (ret) { - btrfs_print_tree(root, root->node); - printf("unable to find %d\n", num); - exit(1); - } - btrfs_release_path(root, &path); - } - close_ctree(root, &super); - root = open_ctree("dbfile", &super); - printf("node %p level %d total ptrs %d free spc %lu\n", root->node, - btrfs_header_level(&root->node->node.header), 
- btrfs_header_nritems(&root->node->node.header), - BTRFS_NODEPTRS_PER_BLOCK(root) - - btrfs_header_nritems(&root->node->node.header)); - printf("all searches good, deleting some items\n"); - i = 0; - srand(55); - for (i = 0 ; i < run_size/4; i++) { - num = next_key(i, max_key); - ins.objectid = num; - btrfs_init_path(&path); - ret = btrfs_search_slot(trans, root, &ins, &path, -1, 1); - if (!ret) { - if (i % 10000 == 0) - fprintf(stderr, "del %d:%d\n", num, i); - ret = btrfs_del_item(trans, root, &path); - if (ret != 0) - BUG(); - tree_size--; - } - btrfs_release_path(root, &path); - } - close_ctree(root, &super); - root = open_ctree("dbfile", &super); - srand(128); - for (i = 0; i < run_size; i++) { - buf = malloc(64); - num = next_key(i, max_key); - sprintf(buf, "string-%d", num); - ins.objectid = num; - if (i % 10000 == 0) - fprintf(stderr, "insert %d:%d\n", num, i); - ret = btrfs_insert_item(trans, root, &ins, buf, strlen(buf)); - if (!ret) - tree_size++; - free(buf); - } - close_ctree(root, &super); - root = open_ctree("dbfile", &super); - srand(128); - printf("starting search2\n"); - for (i = 0; i < run_size; i++) { - num = next_key(i, max_key); - ins.objectid = num; - btrfs_init_path(&path); - if (i % 10000 == 0) - fprintf(stderr, "search %d:%d\n", num, i); - ret = btrfs_search_slot(trans, root, &ins, &path, 0, 0); - if (ret) { - btrfs_print_tree(root, root->node); - printf("unable to find %d\n", num); - exit(1); - } - btrfs_release_path(root, &path); - } - printf("starting big long delete run\n"); - while(root->node && - btrfs_header_nritems(&root->node->node.header) > 0) { - struct btrfs_leaf *leaf; - int slot; - ins.objectid = (u64)-1; - btrfs_init_path(&path); - ret = btrfs_search_slot(trans, root, &ins, &path, -1, 1); - if (ret == 0) - BUG(); - - leaf = &path.nodes[0]->leaf; - slot = path.slots[0]; - if (slot != btrfs_header_nritems(&leaf->header)) - BUG(); - while(path.slots[0] > 0) { - path.slots[0] -= 1; - slot = path.slots[0]; - leaf = 
&path.nodes[0]->leaf; - - btrfs_disk_key_to_cpu(&last, &leaf->items[slot].key); - if (tree_size % 10000 == 0) - printf("big del %d:%d\n", tree_size, i); - ret = btrfs_del_item(trans, root, &path); - if (ret != 0) { - printf("del_item returned %d\n", ret); - BUG(); - } - tree_size--; - } - btrfs_release_path(root, &path); - } - /* - printf("previous tree:\n"); - btrfs_print_tree(root, root->commit_root); - printf("map before commit\n"); - btrfs_print_tree(root->extent_root, root->extent_root->node); - */ - btrfs_commit_transaction(trans, root, &super); - printf("tree size is now %d\n", tree_size); - printf("root %p commit root %p\n", root->node, root->commit_root); - printf("map tree\n"); - btrfs_print_tree(root->fs_info->extent_root, - root->fs_info->extent_root->node); - close_ctree(root, &super); - return 0; -} diff --git a/fs/btrfs/radix-tree.c b/fs/btrfs/radix-tree.c deleted file mode 100644 index baa25ca1c2ac..000000000000 --- a/fs/btrfs/radix-tree.c +++ /dev/null @@ -1,836 +0,0 @@ -/* - * Copyright (C) 2001 Momchil Velikov - * Portions Copyright (C) 2001 Christoph Hellwig - * Copyright (C) 2005 SGI, Christoph Lameter - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include "kerncompat.h" -#include "radix-tree.h" -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 
4 : 6) -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) - -#define RADIX_TREE_TAG_LONGS \ - ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) - -struct radix_tree_node { - unsigned int count; - void *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; -}; - -struct radix_tree_path { - struct radix_tree_node *node; - int offset; -}; - -#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2) - -static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly; - -/* - * Per-cpu pool of preloaded nodes - */ -struct radix_tree_preload { - int nr; - struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH]; -}; -struct radix_tree_preload radix_tree_preloads = { 0, }; - -static inline gfp_t root_gfp_mask(struct radix_tree_root *root) -{ - return root->gfp_mask & __GFP_BITS_MASK; -} - -static int internal_nodes = 0; -/* - * This assumes that the caller has performed appropriate preallocation, and - * that the caller has pinned this thread of control to the current CPU. - */ -static struct radix_tree_node * -radix_tree_node_alloc(struct radix_tree_root *root) -{ - struct radix_tree_node *ret; - ret = malloc(sizeof(struct radix_tree_node)); - if (ret) { - memset(ret, 0, sizeof(struct radix_tree_node)); - internal_nodes++; - } - return ret; -} - -static inline void -radix_tree_node_free(struct radix_tree_node *node) -{ - internal_nodes--; - free(node); -} - -/* - * Load up this CPU's radix_tree_node buffer with sufficient objects to - * ensure that the addition of a single element in the tree cannot fail. On - * success, return zero, with preemption disabled. On error, return -ENOMEM - * with preemption not disabled. 
- */ -int radix_tree_preload(gfp_t gfp_mask) -{ - struct radix_tree_preload *rtp; - struct radix_tree_node *node; - int ret = -ENOMEM; - - preempt_disable(); - rtp = &__get_cpu_var(radix_tree_preloads); - while (rtp->nr < ARRAY_SIZE(rtp->nodes)) { - preempt_enable(); - node = radix_tree_node_alloc(NULL); - if (node == NULL) - goto out; - preempt_disable(); - rtp = &__get_cpu_var(radix_tree_preloads); - if (rtp->nr < ARRAY_SIZE(rtp->nodes)) - rtp->nodes[rtp->nr++] = node; - else - radix_tree_node_free(node); - } - ret = 0; -out: - return ret; -} - -static inline void tag_set(struct radix_tree_node *node, unsigned int tag, - int offset) -{ - __set_bit(offset, node->tags[tag]); -} - -static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, - int offset) -{ - __clear_bit(offset, node->tags[tag]); -} - -static inline int tag_get(struct radix_tree_node *node, unsigned int tag, - int offset) -{ - return test_bit(offset, node->tags[tag]); -} - -static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) -{ - root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT)); -} - - -static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag) -{ - root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT)); -} - -static inline void root_tag_clear_all(struct radix_tree_root *root) -{ - root->gfp_mask &= __GFP_BITS_MASK; -} - -static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) -{ - return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); -} - -/* - * Returns 1 if any slot in the node has this tag set. - * Otherwise returns 0. - */ -static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) -{ - int idx; - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (node->tags[tag][idx]) - return 1; - } - return 0; -} - -/* - * Return the maximum key which can be store into a - * radix tree with height HEIGHT. 
- */ -static inline unsigned long radix_tree_maxindex(unsigned int height) -{ - return height_to_maxindex[height]; -} - -/* - * Extend a radix tree so it can store key @index. - */ -static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) -{ - struct radix_tree_node *node; - unsigned int height; - int tag; - - /* Figure out what the height should be. */ - height = root->height + 1; - while (index > radix_tree_maxindex(height)) - height++; - - if (root->rnode == NULL) { - root->height = height; - goto out; - } - - do { - if (!(node = radix_tree_node_alloc(root))) - return -ENOMEM; - - /* Increase the height. */ - node->slots[0] = root->rnode; - - /* Propagate the aggregated tag info into the new root */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (root_tag_get(root, tag)) - tag_set(node, tag, 0); - } - - node->count = 1; - root->rnode = node; - root->height++; - } while (height > root->height); -out: - return 0; -} - -/** - * radix_tree_insert - insert into a radix tree - * @root: radix tree root - * @index: index key - * @item: item to insert - * - * Insert an item into the radix tree at position @index. - */ -int radix_tree_insert(struct radix_tree_root *root, - unsigned long index, void *item) -{ - struct radix_tree_node *node = NULL, *slot; - unsigned int height, shift; - int offset; - int error; - - /* Make sure the tree is high enough. */ - if (index > radix_tree_maxindex(root->height)) { - error = radix_tree_extend(root, index); - if (error) - return error; - } - - slot = root->rnode; - height = root->height; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - - offset = 0; /* uninitialised var warning */ - while (height > 0) { - if (slot == NULL) { - /* Have to add a child node. 
*/ - if (!(slot = radix_tree_node_alloc(root))) - return -ENOMEM; - if (node) { - node->slots[offset] = slot; - node->count++; - } else - root->rnode = slot; - } - - /* Go a level down */ - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - node = slot; - slot = node->slots[offset]; - shift -= RADIX_TREE_MAP_SHIFT; - height--; - } - - if (slot != NULL) - return -EEXIST; - - if (node) { - node->count++; - node->slots[offset] = item; - BUG_ON(tag_get(node, 0, offset)); - BUG_ON(tag_get(node, 1, offset)); - } else { - root->rnode = item; - BUG_ON(root_tag_get(root, 0)); - BUG_ON(root_tag_get(root, 1)); - } - - return 0; -} - -static inline void **__lookup_slot(struct radix_tree_root *root, - unsigned long index) -{ - unsigned int height, shift; - struct radix_tree_node **slot; - - height = root->height; - - if (index > radix_tree_maxindex(height)) - return NULL; - - if (height == 0 && root->rnode) - return (void **)&root->rnode; - - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - slot = &root->rnode; - - while (height > 0) { - if (*slot == NULL) - return NULL; - - slot = (struct radix_tree_node **) - ((*slot)->slots + - ((index >> shift) & RADIX_TREE_MAP_MASK)); - shift -= RADIX_TREE_MAP_SHIFT; - height--; - } - - return (void **)slot; -} - -/** - * radix_tree_lookup_slot - lookup a slot in a radix tree - * @root: radix tree root - * @index: index key - * - * Lookup the slot corresponding to the position @index in the radix tree - * @root. This is useful for update-if-exists operations. - */ -void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) -{ - return __lookup_slot(root, index); -} - -/** - * radix_tree_lookup - perform lookup operation on a radix tree - * @root: radix tree root - * @index: index key - * - * Lookup the item at the position @index in the radix tree @root. - */ -void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) -{ - void **slot; - - slot = __lookup_slot(root, index); - return slot != NULL ? 
*slot : NULL; -} - -/** - * radix_tree_tag_set - set a tag on a radix tree node - * @root: radix tree root - * @index: index key - * @tag: tag index - * - * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) - * corresponding to @index in the radix tree. From - * the root all the way down to the leaf node. - * - * Returns the address of the tagged item. Setting a tag on a not-present - * item is a bug. - */ -void *radix_tree_tag_set(struct radix_tree_root *root, - unsigned long index, unsigned int tag) -{ - unsigned int height, shift; - struct radix_tree_node *slot; - - height = root->height; - BUG_ON(index > radix_tree_maxindex(height)); - - slot = root->rnode; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - - while (height > 0) { - int offset; - - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - if (!tag_get(slot, tag, offset)) - tag_set(slot, tag, offset); - slot = slot->slots[offset]; - BUG_ON(slot == NULL); - shift -= RADIX_TREE_MAP_SHIFT; - height--; - } - - /* set the root's tag bit */ - if (slot && !root_tag_get(root, tag)) - root_tag_set(root, tag); - - return slot; -} - -/** - * radix_tree_tag_clear - clear a tag on a radix tree node - * @root: radix tree root - * @index: index key - * @tag: tag index - * - * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) - * corresponding to @index in the radix tree. If - * this causes the leaf node to have no tags set then clear the tag in the - * next-to-leaf node, etc. - * - * Returns the address of the tagged item on success, else NULL. ie: - * has the same return value and semantics as radix_tree_lookup(). 
- */ -void *radix_tree_tag_clear(struct radix_tree_root *root, - unsigned long index, unsigned int tag) -{ - struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; - struct radix_tree_node *slot = NULL; - unsigned int height, shift; - - height = root->height; - if (index > radix_tree_maxindex(height)) - goto out; - - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - pathp->node = NULL; - slot = root->rnode; - - while (height > 0) { - int offset; - - if (slot == NULL) - goto out; - - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - pathp[1].offset = offset; - pathp[1].node = slot; - slot = slot->slots[offset]; - pathp++; - shift -= RADIX_TREE_MAP_SHIFT; - height--; - } - - if (slot == NULL) - goto out; - - while (pathp->node) { - if (!tag_get(pathp->node, tag, pathp->offset)) - goto out; - tag_clear(pathp->node, tag, pathp->offset); - if (any_tag_set(pathp->node, tag)) - goto out; - pathp--; - } - - /* clear the root's tag bit */ - if (root_tag_get(root, tag)) - root_tag_clear(root, tag); - -out: - return slot; -} - -#ifndef __KERNEL__ /* Only the test harness uses this at present */ -/** - * radix_tree_tag_get - get a tag on a radix tree node - * @root: radix tree root - * @index: index key - * @tag: tag index (< RADIX_TREE_MAX_TAGS) - * - * Return values: - * - * 0: tag not present or not set - * 1: tag set - */ -int radix_tree_tag_get(struct radix_tree_root *root, - unsigned long index, unsigned int tag) -{ - unsigned int height, shift; - struct radix_tree_node *slot; - int saw_unset_tag = 0; - - height = root->height; - if (index > radix_tree_maxindex(height)) - return 0; - - /* check the root's tag bit */ - if (!root_tag_get(root, tag)) - return 0; - - if (height == 0) - return 1; - - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; - - for ( ; ; ) { - int offset; - - if (slot == NULL) - return 0; - - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - - /* - * This is just a debug check. 
Later, we can bale as soon as - * we see an unset tag. - */ - if (!tag_get(slot, tag, offset)) - saw_unset_tag = 1; - if (height == 1) { - int ret = tag_get(slot, tag, offset); - - BUG_ON(ret && saw_unset_tag); - return !!ret; - } - slot = slot->slots[offset]; - shift -= RADIX_TREE_MAP_SHIFT; - height--; - } -} -#endif - -static unsigned int -__lookup(struct radix_tree_root *root, void **results, unsigned long index, - unsigned int max_items, unsigned long *next_index) -{ - unsigned int nr_found = 0; - unsigned int shift, height; - struct radix_tree_node *slot; - unsigned long i; - - height = root->height; - if (height == 0) { - if (root->rnode && index == 0) - results[nr_found++] = root->rnode; - goto out; - } - - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; - - for ( ; height > 1; height--) { - - for (i = (index >> shift) & RADIX_TREE_MAP_MASK ; - i < RADIX_TREE_MAP_SIZE; i++) { - if (slot->slots[i] != NULL) - break; - index &= ~((1UL << shift) - 1); - index += 1UL << shift; - if (index == 0) - goto out; /* 32-bit wraparound */ - } - if (i == RADIX_TREE_MAP_SIZE) - goto out; - - shift -= RADIX_TREE_MAP_SHIFT; - slot = slot->slots[i]; - } - - /* Bottom level: grab some items */ - for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { - index++; - if (slot->slots[i]) { - results[nr_found++] = slot->slots[i]; - if (nr_found == max_items) - goto out; - } - } -out: - *next_index = index; - return nr_found; -} - -/** - * radix_tree_gang_lookup - perform multiple lookup on a radix tree - * @root: radix tree root - * @results: where the results of the lookup are placed - * @first_index: start the lookup from this key - * @max_items: place up to this many items at *results - * - * Performs an index-ascending scan of the tree for present items. Places - * them at *@results and returns the number of items which were placed at - * *@results. - * - * The implementation is naive. 
- */ -unsigned int -radix_tree_gang_lookup(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items) -{ - const unsigned long max_index = radix_tree_maxindex(root->height); - unsigned long cur_index = first_index; - unsigned int ret = 0; - - while (ret < max_items) { - unsigned int nr_found; - unsigned long next_index; /* Index of next search */ - - if (cur_index > max_index) - break; - nr_found = __lookup(root, results + ret, cur_index, - max_items - ret, &next_index); - ret += nr_found; - if (next_index == 0) - break; - cur_index = next_index; - } - return ret; -} - -/* - * FIXME: the two tag_get()s here should use find_next_bit() instead of - * open-coding the search. - */ -static unsigned int -__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, - unsigned int max_items, unsigned long *next_index, unsigned int tag) -{ - unsigned int nr_found = 0; - unsigned int shift; - unsigned int height = root->height; - struct radix_tree_node *slot; - - if (height == 0) { - if (root->rnode && index == 0) - results[nr_found++] = root->rnode; - goto out; - } - - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; - - do { - unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; - - for ( ; i < RADIX_TREE_MAP_SIZE; i++) { - if (tag_get(slot, tag, i)) { - BUG_ON(slot->slots[i] == NULL); - break; - } - index &= ~((1UL << shift) - 1); - index += 1UL << shift; - if (index == 0) - goto out; /* 32-bit wraparound */ - } - if (i == RADIX_TREE_MAP_SIZE) - goto out; - height--; - if (height == 0) { /* Bottom level: grab some items */ - unsigned long j = index & RADIX_TREE_MAP_MASK; - - for ( ; j < RADIX_TREE_MAP_SIZE; j++) { - index++; - if (tag_get(slot, tag, j)) { - BUG_ON(slot->slots[j] == NULL); - results[nr_found++] = slot->slots[j]; - if (nr_found == max_items) - goto out; - } - } - } - shift -= RADIX_TREE_MAP_SHIFT; - slot = slot->slots[i]; - } while (height > 0); -out: - *next_index = index; - 
return nr_found; -} - -/** - * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree - * based on a tag - * @root: radix tree root - * @results: where the results of the lookup are placed - * @first_index: start the lookup from this key - * @max_items: place up to this many items at *results - * @tag: the tag index (< RADIX_TREE_MAX_TAGS) - * - * Performs an index-ascending scan of the tree for present items which - * have the tag indexed by @tag set. Places the items at *@results and - * returns the number of items which were placed at *@results. - */ -unsigned int -radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items, - unsigned int tag) -{ - const unsigned long max_index = radix_tree_maxindex(root->height); - unsigned long cur_index = first_index; - unsigned int ret = 0; - - /* check the root's tag bit */ - if (!root_tag_get(root, tag)) - return 0; - - while (ret < max_items) { - unsigned int nr_found; - unsigned long next_index; /* Index of next search */ - - if (cur_index > max_index) - break; - nr_found = __lookup_tag(root, results + ret, cur_index, - max_items - ret, &next_index, tag); - ret += nr_found; - if (next_index == 0) - break; - cur_index = next_index; - } - return ret; -} - -/** - * radix_tree_shrink - shrink height of a radix tree to minimal - * @root radix tree root - */ -static inline void radix_tree_shrink(struct radix_tree_root *root) -{ - /* try to shrink tree height */ - while (root->height > 0 && - root->rnode->count == 1 && - root->rnode->slots[0]) { - struct radix_tree_node *to_free = root->rnode; - - root->rnode = to_free->slots[0]; - root->height--; - /* must only free zeroed nodes into the slab */ - tag_clear(to_free, 0, 0); - tag_clear(to_free, 1, 0); - to_free->slots[0] = NULL; - to_free->count = 0; - radix_tree_node_free(to_free); - } -} - -/** - * radix_tree_delete - delete an item from a radix tree - * @root: radix tree root - * @index: index 
key - * - * Remove the item at @index from the radix tree rooted at @root. - * - * Returns the address of the deleted item, or NULL if it was not present. - */ -void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) -{ - struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; - struct radix_tree_node *slot = NULL; - unsigned int height, shift; - int tag; - int offset; - - height = root->height; - if (index > radix_tree_maxindex(height)) - goto out; - - slot = root->rnode; - if (height == 0 && root->rnode) { - root_tag_clear_all(root); - root->rnode = NULL; - goto out; - } - - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - pathp->node = NULL; - - do { - if (slot == NULL) - goto out; - - pathp++; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - pathp->offset = offset; - pathp->node = slot; - slot = slot->slots[offset]; - shift -= RADIX_TREE_MAP_SHIFT; - height--; - } while (height > 0); - - if (slot == NULL) - goto out; - - /* - * Clear all tags associated with the just-deleted item - */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tag_get(pathp->node, tag, pathp->offset)) - radix_tree_tag_clear(root, index, tag); - } - - /* Now free the nodes we do not need anymore */ - while (pathp->node) { - pathp->node->slots[pathp->offset] = NULL; - pathp->node->count--; - - if (pathp->node->count) { - if (pathp->node == root->rnode) - radix_tree_shrink(root); - goto out; - } - - /* Node with zero slots in use so free it */ - radix_tree_node_free(pathp->node); - - pathp--; - } - root_tag_clear_all(root); - root->height = 0; - root->rnode = NULL; - -out: - return slot; -} - -/** - * radix_tree_tagged - test whether any items in the tree are tagged - * @root: radix tree root - * @tag: tag to test - */ -int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) -{ - return root_tag_get(root, tag); -} - -static unsigned long __maxindex(unsigned int height) -{ - unsigned int tmp = height * RADIX_TREE_MAP_SHIFT; - unsigned long 
index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1; - - if (tmp >= RADIX_TREE_INDEX_BITS) - index = ~0UL; - return index; -} - -static void radix_tree_init_maxindex(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) - height_to_maxindex[i] = __maxindex(i); -} - -void radix_tree_init(void) -{ - radix_tree_init_maxindex(); -} diff --git a/fs/btrfs/radix-tree.h b/fs/btrfs/radix-tree.h deleted file mode 100644 index c3ce88137f77..000000000000 --- a/fs/btrfs/radix-tree.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (C) 2001 Momchil Velikov - * Portions Copyright (C) 2001 Christoph Hellwig - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ -#ifndef _LINUX_RADIX_TREE_H -#define _LINUX_RADIX_TREE_H - -#define RADIX_TREE_MAX_TAGS 2 - -/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ -struct radix_tree_root { - unsigned int height; - gfp_t gfp_mask; - struct radix_tree_node *rnode; -}; - -#define RADIX_TREE_INIT(mask) { \ - .height = 0, \ - .gfp_mask = (mask), \ - .rnode = NULL, \ -} - -#define RADIX_TREE(name, mask) \ - struct radix_tree_root name = RADIX_TREE_INIT(mask) - -#define INIT_RADIX_TREE(root, mask) \ -do { \ - (root)->height = 0; \ - (root)->gfp_mask = (mask); \ - (root)->rnode = NULL; \ -} while (0) - -int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); -void *radix_tree_lookup(struct radix_tree_root *, unsigned long); -void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); -void *radix_tree_delete(struct radix_tree_root *, unsigned long); -unsigned int -radix_tree_gang_lookup(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items); -int radix_tree_preload(gfp_t gfp_mask); -void radix_tree_init(void); -void *radix_tree_tag_set(struct radix_tree_root *root, - unsigned long index, unsigned int tag); -void *radix_tree_tag_clear(struct radix_tree_root *root, - unsigned long index, unsigned int tag); -int radix_tree_tag_get(struct radix_tree_root *root, - unsigned long index, unsigned int tag); -unsigned int -radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items, - unsigned int tag); -int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); - -static inline void radix_tree_preload_end(void) -{ - preempt_enable(); -} - -#endif /* _LINUX_RADIX_TREE_H */ diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c deleted file mode 100644 index 3a38ae7a886d..000000000000 --- a/fs/btrfs/random-test.c +++ /dev/null @@ -1,405 +0,0 @@ -#include -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" 
-#include "ctree.h" -#include "disk-io.h" -#include "print-tree.h" -#include "transaction.h" - -int keep_running = 1; -struct btrfs_super_block super; - -static int setup_key(struct radix_tree_root *root, struct btrfs_key *key, - int exists) -{ - int num = rand(); - unsigned long res[2]; - int ret; - - key->flags = 0; - btrfs_set_key_type(key, BTRFS_STRING_ITEM_KEY); - key->offset = 0; -again: - ret = radix_tree_gang_lookup(root, (void **)res, num, 2); - if (exists) { - if (ret == 0) - return -1; - num = res[0]; - } else if (ret != 0 && num == res[0]) { - num++; - if (ret > 1 && num == res[1]) { - num++; - goto again; - } - } - key->objectid = num; - return 0; -} - -static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct radix_tree_root *radix) -{ - struct btrfs_path path; - struct btrfs_key key; - int ret; - char buf[128]; - unsigned long oid; - btrfs_init_path(&path); - ret = setup_key(radix, &key, 0); - sprintf(buf, "str-%Lu\n", key.objectid); - ret = btrfs_insert_item(trans, root, &key, buf, strlen(buf)); - if (ret) - goto error; - oid = (unsigned long)key.objectid; - radix_tree_preload(GFP_KERNEL); - ret = radix_tree_insert(radix, oid, (void *)oid); - radix_tree_preload_end(); - if (ret) - goto error; - return ret; -error: - printf("failed to insert %Lu\n", key.objectid); - return -1; -} - -static int insert_dup(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct radix_tree_root *radix) -{ - struct btrfs_path path; - struct btrfs_key key; - int ret; - char buf[128]; - btrfs_init_path(&path); - ret = setup_key(radix, &key, 1); - if (ret < 0) - return 0; - sprintf(buf, "str-%Lu\n", key.objectid); - ret = btrfs_insert_item(trans, root, &key, buf, strlen(buf)); - if (ret != -EEXIST) { - printf("insert on %Lu gave us %d\n", key.objectid, ret); - return 1; - } - return 0; -} - -static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct radix_tree_root *radix) -{ - struct btrfs_path path; - 
struct btrfs_key key; - int ret; - unsigned long *ptr; - btrfs_init_path(&path); - ret = setup_key(radix, &key, 1); - if (ret < 0) - return 0; - ret = btrfs_search_slot(trans, root, &key, &path, -1, 1); - if (ret) - goto error; - ret = btrfs_del_item(trans, root, &path); - btrfs_release_path(root, &path); - if (ret != 0) - goto error; - ptr = radix_tree_delete(radix, key.objectid); - if (!ptr) - goto error; - return 0; -error: - printf("failed to delete %Lu\n", key.objectid); - return -1; -} - -static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct radix_tree_root *radix) -{ - struct btrfs_path path; - struct btrfs_key key; - int ret; - btrfs_init_path(&path); - ret = setup_key(radix, &key, 1); - if (ret < 0) - return 0; - ret = btrfs_search_slot(trans, root, &key, &path, 0, 1); - btrfs_release_path(root, &path); - if (ret) - goto error; - return 0; -error: - printf("unable to find key %Lu\n", key.objectid); - return -1; -} - -static int lookup_enoent(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct radix_tree_root *radix) -{ - struct btrfs_path path; - struct btrfs_key key; - int ret; - btrfs_init_path(&path); - ret = setup_key(radix, &key, 0); - if (ret < 0) - return ret; - ret = btrfs_search_slot(trans, root, &key, &path, 0, 0); - btrfs_release_path(root, &path); - if (ret <= 0) - goto error; - return 0; -error: - printf("able to find key that should not exist %Lu\n", key.objectid); - return -1; -} - -static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct radix_tree_root *radix, int nr) -{ - struct btrfs_path path; - struct btrfs_key key; - unsigned long found = 0; - int ret; - int slot; - int *ptr; - int count = 0; - - key.offset = 0; - key.flags = 0; - btrfs_set_key_type(&key, BTRFS_STRING_ITEM_KEY); - key.objectid = (unsigned long)-1; - while(nr-- >= 0) { - btrfs_init_path(&path); - ret = btrfs_search_slot(trans, root, &key, &path, -1, 1); - if (ret < 0) { - 
btrfs_release_path(root, &path); - return ret; - } - if (ret != 0) { - if (path.slots[0] == 0) { - btrfs_release_path(root, &path); - break; - } - path.slots[0] -= 1; - } - slot = path.slots[0]; - found = btrfs_disk_key_objectid( - &path.nodes[0]->leaf.items[slot].key); - ret = btrfs_del_item(trans, root, &path); - count++; - if (ret) { - fprintf(stderr, - "failed to remove %lu from tree\n", - found); - return -1; - } - btrfs_release_path(root, &path); - ptr = radix_tree_delete(radix, found); - if (!ptr) - goto error; - if (!keep_running) - break; - } - return 0; -error: - fprintf(stderr, "failed to delete from the radix %lu\n", found); - return -1; -} - -static int fill_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct radix_tree_root *radix, int count) -{ - int i; - int ret = 0; - for (i = 0; i < count; i++) { - ret = ins_one(trans, root, radix); - if (ret) { - fprintf(stderr, "fill failed\n"); - goto out; - } - if (i % 1000 == 0) { - ret = btrfs_commit_transaction(trans, root, &super); - if (ret) { - fprintf(stderr, "fill commit failed\n"); - return ret; - } - } - if (i && i % 10000 == 0) { - printf("bigfill %d\n", i); - } - if (!keep_running) - break; - } -out: - return ret; -} - -static int bulk_op(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct radix_tree_root *radix) -{ - int ret; - int nr = rand() % 5000; - static int run_nr = 0; - - /* do the bulk op much less frequently */ - if (run_nr++ % 100) - return 0; - ret = empty_tree(trans, root, radix, nr); - if (ret) - return ret; - ret = fill_tree(trans, root, radix, nr); - if (ret) - return ret; - return 0; -} - - -int (*ops[])(struct btrfs_trans_handle *, - struct btrfs_root *root, struct radix_tree_root *radix) = - { ins_one, insert_dup, del_one, lookup_item, - lookup_enoent, bulk_op }; - -static int fill_radix(struct btrfs_root *root, struct radix_tree_root *radix) -{ - struct btrfs_path path; - struct btrfs_key key; - unsigned long found; - int ret; - int slot; - 
int i; - - key.offset = 0; - key.flags = 0; - btrfs_set_key_type(&key, BTRFS_STRING_ITEM_KEY); - key.objectid = (unsigned long)-1; - while(1) { - btrfs_init_path(&path); - ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); - if (ret < 0) { - btrfs_release_path(root, &path); - return ret; - } - slot = path.slots[0]; - if (ret != 0) { - if (slot == 0) { - btrfs_release_path(root, &path); - break; - } - slot -= 1; - } - for (i = slot; i >= 0; i--) { - found = btrfs_disk_key_objectid(&path.nodes[0]-> - leaf.items[i].key); - radix_tree_preload(GFP_KERNEL); - ret = radix_tree_insert(radix, found, (void *)found); - if (ret) { - fprintf(stderr, - "failed to insert %lu into radix\n", - found); - exit(1); - } - - radix_tree_preload_end(); - } - btrfs_release_path(root, &path); - key.objectid = found - 1; - if (key.objectid > found) - break; - } - return 0; -} -void sigstopper(int ignored) -{ - keep_running = 0; - fprintf(stderr, "caught exit signal, stopping\n"); -} - -int print_usage(void) -{ - printf("usage: tester [-ih] [-c count] [-f count]\n"); - printf("\t -c count -- iteration count after filling\n"); - printf("\t -f count -- run this many random inserts before starting\n"); - printf("\t -i -- only do initial fill\n"); - printf("\t -h -- this help text\n"); - exit(1); -} -int main(int ac, char **av) -{ - RADIX_TREE(radix, GFP_KERNEL); - struct btrfs_root *root; - int i; - int ret; - int count; - int op; - int iterations = 20000; - int init_fill_count = 800000; - int err = 0; - int initial_only = 0; - struct btrfs_trans_handle *trans; - radix_tree_init(); - root = open_ctree("dbfile", &super); - fill_radix(root, &radix); - - signal(SIGTERM, sigstopper); - signal(SIGINT, sigstopper); - - for (i = 1 ; i < ac ; i++) { - if (strcmp(av[i], "-i") == 0) { - initial_only = 1; - } else if (strcmp(av[i], "-c") == 0) { - iterations = atoi(av[i+1]); - i++; - } else if (strcmp(av[i], "-f") == 0) { - init_fill_count = atoi(av[i+1]); - i++; - } else { - print_usage(); - } - } - 
printf("initial fill\n"); - trans = btrfs_start_transaction(root, 1); - ret = fill_tree(trans, root, &radix, init_fill_count); - printf("starting run\n"); - if (ret) { - err = ret; - goto out; - } - if (initial_only == 1) { - goto out; - } - for (i = 0; i < iterations; i++) { - op = rand() % ARRAY_SIZE(ops); - count = rand() % 128; - if (i % 2000 == 0) { - printf("%d\n", i); - fflush(stdout); - } - if (i && i % 5000 == 0) { - printf("open & close, root level %d nritems %d\n", - btrfs_header_level(&root->node->node.header), - btrfs_header_nritems(&root->node->node.header)); - close_ctree(root, &super); - root = open_ctree("dbfile", &super); - } - while(count--) { - ret = ops[op](trans, root, &radix); - if (ret) { - fprintf(stderr, "op %d failed %d:%d\n", - op, i, iterations); - btrfs_print_tree(root, root->node); - fprintf(stderr, "op %d failed %d:%d\n", - op, i, iterations); - err = ret; - goto out; - } - if (ops[op] == bulk_op) - break; - if (keep_running == 0) { - err = 0; - goto out; - } - } - } -out: - close_ctree(root, &super); - return err; -} - diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 9cccecc0f431..52c83be4b307 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -1,7 +1,4 @@ -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" +#include #include "ctree.h" #include "disk-io.h" #include "print-tree.h" diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c new file mode 100644 index 000000000000..4ae76044aeab --- /dev/null +++ b/fs/btrfs/super.c @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" + +#define BTRFS_SUPER_MAGIC 0x9123682E +#if 0 +/* some random number */ + +static struct super_operations ramfs_ops; +static struct inode_operations ramfs_dir_inode_operations; + +static struct backing_dev_info ramfs_backing_dev_info = { + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | + 
BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | + BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, +}; + +struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) +{ + struct inode * inode = new_inode(sb); + + if (inode) { + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blocks = 0; + inode->i_mapping->a_ops = &ramfs_aops; + inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_op = &ramfs_file_inode_operations; + inode->i_fop = &ramfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &ramfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + break; + } + } + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. 
+ */ +/* SMP-safe */ +static int +ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); + int error = -ENOSPC; + + if (inode) { + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + error = 0; + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + } + return error; +} + +static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0); + if (!retval) + inc_nlink(dir); + return retval; +} + +static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +{ + return ramfs_mknod(dir, dentry, mode | S_IFREG, 0); +} + +static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname) +{ + struct inode *inode; + int error = -ENOSPC; + + inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); + if (inode) { + int l = strlen(symname)+1; + error = page_symlink(inode, symname, l); + if (!error) { + if (dir->i_mode & S_ISGID) + inode->i_gid = dir->i_gid; + d_instantiate(dentry, inode); + dget(dentry); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + } else + iput(inode); + } + return error; +} + +static struct inode_operations ramfs_dir_inode_operations = { + .create = ramfs_create, + .lookup = simple_lookup, + .link = simple_link, + .unlink = simple_unlink, + .symlink = ramfs_symlink, + .mkdir = ramfs_mkdir, + .rmdir = simple_rmdir, + .mknod = ramfs_mknod, + .rename = simple_rename, +}; +#endif + +struct inode *btrfs_get_inode(struct super_block *sb, int mode, dev_t dev) +{ + struct inode * inode = new_inode(sb); + + if (inode) { + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + } + 
return inode; +} + +static struct super_operations btrfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, +}; + +static int btrfs_fill_super(struct super_block * sb, void * data, int silent) +{ + struct inode * inode; + struct dentry * root; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_ops; + sb->s_time_gran = 1; + inode = btrfs_get_inode(sb, S_IFDIR | 0755, 0); + if (!inode) + return -ENOMEM; + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + sb->s_root = root; + return 0; +} + +static int btrfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, + btrfs_fill_super, mnt); +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .get_sb = btrfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_btrfs_fs(void) +{ + printk("btrfs loaded!\n"); + return register_filesystem(&btrfs_fs_type); +} + +static void __exit exit_btrfs_fs(void) +{ + unregister_filesystem(&btrfs_fs_type); + printk("btrfs unloaded\n"); +} + +module_init(init_btrfs_fs) +module_exit(exit_btrfs_fs) + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From e20d96d64f9cf9288ffecc9ad4714e91c3b97ca8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 22 Mar 2007 12:13:20 -0400 Subject: Mountable btrfs, with readdir Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 5 +- fs/btrfs/ctree.c | 273 ++++++++++++++++++++++++----------------------- fs/btrfs/ctree.h | 62 ++++++++--- fs/btrfs/dir-item.c | 17 ++- fs/btrfs/disk-io.c | 281 +++++++++++++++---------------------------------- fs/btrfs/disk-io.h | 51 +++++---- fs/btrfs/extent-tree.c | 144 +++++++++++++------------ fs/btrfs/hash.c | 9 ++ fs/btrfs/inode-map.c | 8 +- 
fs/btrfs/print-tree.c | 20 ++-- fs/btrfs/print-tree.h | 2 +- fs/btrfs/root-tree.c | 4 +- fs/btrfs/super.c | 261 +++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/transaction.h | 4 +- 14 files changed, 651 insertions(+), 490 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 99e45a54ebd6..1a0fb7422393 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -2,7 +2,8 @@ ifneq ($(KERNELRELEASE),) # kbuild part of makefile obj-m := btrfs.o -btrfs-y := super.o +btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + hash.o file-item.o inode-item.o inode-map.o disk-io.o #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o dir-item.o hash.o file-item.o inode-item.o \ @@ -14,7 +15,7 @@ else KERNELDIR := /lib/modules/`uname -r`/build all:: - $(MAKE) -C $(KERNELDIR) M=`pwd` modules + $(MAKE) C=1 -C $(KERNELDIR) M=`pwd` modules clean:: rm *.o btrfs.ko endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9fbd07c37fde..e690e2bb47d2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -7,11 +7,11 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size); static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_buffer *dst, struct btrfs_buffer + *root, struct buffer_head *dst, struct buffer_head *src); static int balance_node_right(struct btrfs_trans_handle *trans, struct - btrfs_root *root, struct btrfs_buffer *dst_buf, - struct btrfs_buffer *src_buf); + btrfs_root *root, struct buffer_head *dst_buf, + struct buffer_head *src_buf); static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); @@ -32,32 +32,34 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) } static int 
btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_buffer *buf, struct btrfs_buffer - *parent, int parent_slot, struct btrfs_buffer + *root, struct buffer_head *buf, struct buffer_head + *parent, int parent_slot, struct buffer_head **cow_ret) { - struct btrfs_buffer *cow; + struct buffer_head *cow; + struct btrfs_node *cow_node; - if (!list_empty(&buf->dirty)) { + if (!buffer_dirty(buf)) { *cow_ret = buf; return 0; } cow = btrfs_alloc_free_block(trans, root); - memcpy(&cow->node, &buf->node, root->blocksize); - btrfs_set_header_blocknr(&cow->node.header, cow->blocknr); + cow_node = btrfs_buffer_node(cow); + memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize); + btrfs_set_header_blocknr(&cow_node->header, cow->b_blocknr); *cow_ret = cow; btrfs_inc_ref(trans, root, buf); if (buf == root->node) { root->node = cow; - cow->count++; + get_bh(cow); if (buf != root->commit_root) - btrfs_free_extent(trans, root, buf->blocknr, 1, 1); + btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1); btrfs_block_release(root, buf); } else { - btrfs_set_node_blockptr(&parent->node, parent_slot, - cow->blocknr); - BUG_ON(list_empty(&parent->dirty)); - btrfs_free_extent(trans, root, buf->blocknr, 1, 1); + btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot, + cow->b_blocknr); + BUG_ON(!buffer_dirty(parent)); + btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1); } btrfs_block_release(root, buf); return 0; @@ -119,12 +121,12 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path, { int i; struct btrfs_node *parent = NULL; - struct btrfs_node *node = &path->nodes[level]->node; + struct btrfs_node *node = btrfs_buffer_node(path->nodes[level]); int parent_slot; u32 nritems = btrfs_header_nritems(&node->header); if (path->nodes[level + 1]) - parent = &path->nodes[level + 1]->node; + parent = btrfs_buffer_node(path->nodes[level + 1]); parent_slot = path->slots[level + 1]; BUG_ON(nritems == 0); if (parent) { @@ -148,13 
+150,13 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, int level) { int i; - struct btrfs_leaf *leaf = &path->nodes[level]->leaf; + struct btrfs_leaf *leaf = btrfs_buffer_leaf(path->nodes[level]); struct btrfs_node *parent = NULL; int parent_slot; u32 nritems = btrfs_header_nritems(&leaf->header); if (path->nodes[level + 1]) - parent = &path->nodes[level + 1]->node; + parent = btrfs_buffer_node(path->nodes[level + 1]); parent_slot = path->slots[level + 1]; BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); @@ -250,11 +252,11 @@ static int bin_search(struct btrfs_node *c, struct btrfs_key *key, int *slot) return -1; } -static struct btrfs_buffer *read_node_slot(struct btrfs_root *root, - struct btrfs_buffer *parent_buf, +static struct buffer_head *read_node_slot(struct btrfs_root *root, + struct buffer_head *parent_buf, int slot) { - struct btrfs_node *node = &parent_buf->node; + struct btrfs_node *node = btrfs_buffer_node(parent_buf); if (slot < 0) return NULL; if (slot >= btrfs_header_nritems(&node->header)) @@ -265,10 +267,10 @@ static struct btrfs_buffer *read_node_slot(struct btrfs_root *root, static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { - struct btrfs_buffer *right_buf; - struct btrfs_buffer *mid_buf; - struct btrfs_buffer *left_buf; - struct btrfs_buffer *parent_buf = NULL; + struct buffer_head *right_buf; + struct buffer_head *mid_buf; + struct buffer_head *left_buf; + struct buffer_head *parent_buf = NULL; struct btrfs_node *right = NULL; struct btrfs_node *mid; struct btrfs_node *left = NULL; @@ -283,7 +285,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root return 0; mid_buf = path->nodes[level]; - mid = &mid_buf->node; + mid = btrfs_buffer_node(mid_buf); orig_ptr = btrfs_node_blockptr(mid, orig_slot); if (level < BTRFS_MAX_LEVEL - 1) @@ -295,8 +297,8 @@ static int balance_level(struct btrfs_trans_handle *trans, struct 
btrfs_root * by promoting the node below to a root */ if (!parent_buf) { - struct btrfs_buffer *child; - u64 blocknr = mid_buf->blocknr; + struct buffer_head *child; + u64 blocknr = mid_buf->b_blocknr; if (btrfs_header_nritems(&mid->header) != 1) return 0; @@ -313,7 +315,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root clean_tree_block(trans, root, mid_buf); return btrfs_free_extent(trans, root, blocknr, 1, 1); } - parent = &parent_buf->node; + parent = btrfs_buffer_node(parent_buf); if (btrfs_header_nritems(&mid->header) > BTRFS_NODEPTRS_PER_BLOCK(root) / 4) @@ -326,7 +328,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root if (left_buf) { btrfs_cow_block(trans, root, left_buf, parent_buf, pslot - 1, &left_buf); - left = &left_buf->node; + left = btrfs_buffer_node(left_buf); orig_slot += btrfs_header_nritems(&left->header); wret = push_node_left(trans, root, left_buf, mid_buf); if (wret < 0) @@ -339,12 +341,12 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root if (right_buf) { btrfs_cow_block(trans, root, right_buf, parent_buf, pslot + 1, &right_buf); - right = &right_buf->node; + right = btrfs_buffer_node(right_buf); wret = push_node_left(trans, root, mid_buf, right_buf); if (wret < 0) ret = wret; if (btrfs_header_nritems(&right->header) == 0) { - u64 blocknr = right_buf->blocknr; + u64 blocknr = right_buf->b_blocknr; btrfs_block_release(root, right_buf); clean_tree_block(trans, root, right_buf); right_buf = NULL; @@ -360,7 +362,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root memcpy(&parent->ptrs[pslot + 1].key, &right->ptrs[0].key, sizeof(struct btrfs_disk_key)); - BUG_ON(list_empty(&parent_buf->dirty)); + BUG_ON(!buffer_dirty(parent_buf)); } } if (btrfs_header_nritems(&mid->header) == 1) { @@ -381,7 +383,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root } if (btrfs_header_nritems(&mid->header) == 0) { /* 
we've managed to empty the middle node, drop it */ - u64 blocknr = mid_buf->blocknr; + u64 blocknr = mid_buf->b_blocknr; btrfs_block_release(root, mid_buf); clean_tree_block(trans, root, mid_buf); mid_buf = NULL; @@ -396,13 +398,13 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root /* update the parent key to reflect our changes */ memcpy(&parent->ptrs[pslot].key, &mid->ptrs[0].key, sizeof(struct btrfs_disk_key)); - BUG_ON(list_empty(&parent_buf->dirty)); + BUG_ON(!buffer_dirty(parent_buf)); } /* update the path */ if (left_buf) { if (btrfs_header_nritems(&left->header) > orig_slot) { - left_buf->count++; // released below + get_bh(left_buf); path->nodes[level] = left_buf; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; @@ -415,8 +417,9 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root } /* double check we haven't messed things up */ check_block(root, path, level); - if (orig_ptr != btrfs_node_blockptr(&path->nodes[level]->node, - path->slots[level])) + if (orig_ptr != + btrfs_node_blockptr(btrfs_buffer_node(path->nodes[level]), + path->slots[level])) BUG(); if (right_buf) @@ -443,8 +446,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { - struct btrfs_buffer *b; - struct btrfs_buffer *cow_buf; + struct buffer_head *b; + struct buffer_head *cow_buf; struct btrfs_node *c; int slot; int ret; @@ -452,18 +455,20 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root again: b = root->node; - b->count++; + get_bh(b); while (b) { - level = btrfs_header_level(&b->node.header); + c = btrfs_buffer_node(b); + level = btrfs_header_level(&c->header); if (cow) { int wret; - wret = btrfs_cow_block(trans, root, b, p->nodes[level + - 1], p->slots[level + 1], + wret = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &cow_buf); b = cow_buf; } BUG_ON(!cow && 
ins_len); - c = &b->node; + c = btrfs_buffer_node(b); p->nodes[level] = b; ret = check_block(root, p, level); if (ret) @@ -480,7 +485,7 @@ again: if (sret) return sret; b = p->nodes[level]; - c = &b->node; + c = btrfs_buffer_node(b); slot = p->slots[level]; } else if (ins_len < 0) { int sret = balance_level(trans, root, p, @@ -490,7 +495,7 @@ again: b = p->nodes[level]; if (!b) goto again; - c = &b->node; + c = btrfs_buffer_node(b); slot = p->slots[level]; BUG_ON(btrfs_header_nritems(&c->header) == 1); } @@ -505,11 +510,9 @@ again: if (sret) return sret; } - BUG_ON(root->node->count == 1); return ret; } } - BUG_ON(root->node->count == 1); return 1; } @@ -534,9 +537,9 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root int tslot = path->slots[i]; if (!path->nodes[i]) break; - t = &path->nodes[i]->node; + t = btrfs_buffer_node(path->nodes[i]); memcpy(&t->ptrs[tslot].key, key, sizeof(*key)); - BUG_ON(list_empty(&path->nodes[i]->dirty)); + BUG_ON(!buffer_dirty(path->nodes[i])); if (tslot != 0) break; } @@ -551,11 +554,11 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root * error, and > 0 if there was no room in the left hand block. 
*/ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_buffer *dst_buf, struct - btrfs_buffer *src_buf) + *root, struct buffer_head *dst_buf, struct + buffer_head *src_buf) { - struct btrfs_node *src = &src_buf->node; - struct btrfs_node *dst = &dst_buf->node; + struct btrfs_node *src = btrfs_buffer_node(src_buf); + struct btrfs_node *dst = btrfs_buffer_node(dst_buf); int push_items = 0; int src_nritems; int dst_nritems; @@ -580,8 +583,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root } btrfs_set_header_nritems(&src->header, src_nritems - push_items); btrfs_set_header_nritems(&dst->header, dst_nritems + push_items); - BUG_ON(list_empty(&src_buf->dirty)); - BUG_ON(list_empty(&dst_buf->dirty)); + BUG_ON(!buffer_dirty(src_buf)); + BUG_ON(!buffer_dirty(dst_buf)); return ret; } @@ -595,11 +598,11 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root * this will only push up to 1/2 the contents of the left node over */ static int balance_node_right(struct btrfs_trans_handle *trans, struct - btrfs_root *root, struct btrfs_buffer *dst_buf, - struct btrfs_buffer *src_buf) + btrfs_root *root, struct buffer_head *dst_buf, + struct buffer_head *src_buf) { - struct btrfs_node *src = &src_buf->node; - struct btrfs_node *dst = &dst_buf->node; + struct btrfs_node *src = btrfs_buffer_node(src_buf); + struct btrfs_node *dst = btrfs_buffer_node(dst_buf); int push_items = 0; int max_push; int src_nritems; @@ -628,8 +631,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct btrfs_set_header_nritems(&src->header, src_nritems - push_items); btrfs_set_header_nritems(&dst->header, dst_nritems + push_items); - BUG_ON(list_empty(&src_buf->dirty)); - BUG_ON(list_empty(&dst_buf->dirty)); + BUG_ON(!buffer_dirty(src_buf)); + BUG_ON(!buffer_dirty(dst_buf)); return ret; } @@ -643,7 +646,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct static 
int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { - struct btrfs_buffer *t; + struct buffer_head *t; struct btrfs_node *lower; struct btrfs_node *c; struct btrfs_disk_key *lower_key; @@ -652,24 +655,24 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root BUG_ON(path->nodes[level-1] != root->node); t = btrfs_alloc_free_block(trans, root); - c = &t->node; + c = btrfs_buffer_node(t); memset(c, 0, root->blocksize); btrfs_set_header_nritems(&c->header, 1); btrfs_set_header_level(&c->header, level); - btrfs_set_header_blocknr(&c->header, t->blocknr); + btrfs_set_header_blocknr(&c->header, t->b_blocknr); btrfs_set_header_parentid(&c->header, - btrfs_header_parentid(&root->node->node.header)); - lower = &path->nodes[level-1]->node; + btrfs_header_parentid(btrfs_buffer_header(root->node))); + lower = btrfs_buffer_node(path->nodes[level-1]); if (btrfs_is_leaf(lower)) lower_key = &((struct btrfs_leaf *)lower)->items[0].key; else lower_key = &lower->ptrs[0].key; memcpy(&c->ptrs[0].key, lower_key, sizeof(struct btrfs_disk_key)); - btrfs_set_node_blockptr(c, 0, path->nodes[level - 1]->blocknr); + btrfs_set_node_blockptr(c, 0, path->nodes[level - 1]->b_blocknr); /* the super has an extra ref to root->node */ btrfs_block_release(root, root->node); root->node = t; - t->count++; + get_bh(t); path->nodes[level] = t; path->slots[level] = 0; return 0; @@ -692,7 +695,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root int nritems; BUG_ON(!path->nodes[level]); - lower = &path->nodes[level]->node; + lower = btrfs_buffer_node(path->nodes[level]); nritems = btrfs_header_nritems(&lower->header); if (slot > nritems) BUG(); @@ -705,7 +708,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root memcpy(&lower->ptrs[slot].key, key, sizeof(struct btrfs_disk_key)); btrfs_set_node_blockptr(lower, slot, blocknr); btrfs_set_header_nritems(&lower->header, 
nritems + 1); - BUG_ON(list_empty(&path->nodes[level]->dirty)); + BUG_ON(!buffer_dirty(path->nodes[level])); return 0; } @@ -721,9 +724,9 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { - struct btrfs_buffer *t; + struct buffer_head *t; struct btrfs_node *c; - struct btrfs_buffer *split_buffer; + struct buffer_head *split_buffer; struct btrfs_node *split; int mid; int ret; @@ -731,7 +734,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root u32 c_nritems; t = path->nodes[level]; - c = &t->node; + c = btrfs_buffer_node(t); if (t == root->node) { /* trying to split the root, lets make a new one */ ret = insert_new_root(trans, root, path, level + 1); @@ -740,11 +743,11 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root } c_nritems = btrfs_header_nritems(&c->header); split_buffer = btrfs_alloc_free_block(trans, root); - split = &split_buffer->node; + split = btrfs_buffer_node(split_buffer); btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header)); - btrfs_set_header_blocknr(&split->header, split_buffer->blocknr); + btrfs_set_header_blocknr(&split->header, split_buffer->b_blocknr); btrfs_set_header_parentid(&split->header, - btrfs_header_parentid(&root->node->node.header)); + btrfs_header_parentid(btrfs_buffer_header(root->node))); mid = (c_nritems + 1) / 2; memcpy(split->ptrs, c->ptrs + mid, (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); @@ -752,9 +755,9 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_header_nritems(&c->header, mid); ret = 0; - BUG_ON(list_empty(&t->dirty)); + BUG_ON(!buffer_dirty(t)); wret = insert_ptr(trans, root, path, &split->ptrs[0].key, - split_buffer->blocknr, path->slots[level + 1] + 1, + split_buffer->b_blocknr, path->slots[level + 1] + 1, level + 1); if (wret) ret = wret; @@ -798,11 +801,12 
@@ static int leaf_space_used(struct btrfs_leaf *l, int start, int nr) static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size) { - struct btrfs_buffer *left_buf = path->nodes[0]; - struct btrfs_leaf *left = &left_buf->leaf; + struct buffer_head *left_buf = path->nodes[0]; + struct btrfs_leaf *left = btrfs_buffer_leaf(left_buf); struct btrfs_leaf *right; - struct btrfs_buffer *right_buf; - struct btrfs_buffer *upper; + struct buffer_head *right_buf; + struct buffer_head *upper; + struct btrfs_node *upper_node; int slot; int i; int free_space; @@ -817,12 +821,13 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 1; } upper = path->nodes[1]; - if (slot >= btrfs_header_nritems(&upper->node.header) - 1) { + upper_node = btrfs_buffer_node(upper); + if (slot >= btrfs_header_nritems(&upper_node->header) - 1) { return 1; } - right_buf = read_tree_block(root, btrfs_node_blockptr(&upper->node, - slot + 1)); - right = &right_buf->leaf; + right_buf = read_tree_block(root, + btrfs_node_blockptr(btrfs_buffer_node(upper), slot + 1)); + right = btrfs_buffer_leaf(right_buf); free_space = btrfs_leaf_free_space(root, right); if (free_space < data_size + sizeof(struct btrfs_item)) { btrfs_block_release(root, right_buf); @@ -830,7 +835,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root } /* cow and double check */ btrfs_cow_block(trans, root, right_buf, upper, slot + 1, &right_buf); - right = &right_buf->leaf; + right = btrfs_buffer_leaf(right_buf); free_space = btrfs_leaf_free_space(root, right); if (free_space < data_size + sizeof(struct btrfs_item)) { btrfs_block_release(root, right_buf); @@ -881,11 +886,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root left_nritems -= push_items; btrfs_set_header_nritems(&left->header, left_nritems); - BUG_ON(list_empty(&left_buf->dirty)); - 
BUG_ON(list_empty(&right_buf->dirty)); - memcpy(&upper->node.ptrs[slot + 1].key, + BUG_ON(!buffer_dirty(left_buf)); + BUG_ON(!buffer_dirty(right_buf)); + memcpy(&upper_node->ptrs[slot + 1].key, &right->items[0].key, sizeof(struct btrfs_disk_key)); - BUG_ON(list_empty(&upper->dirty)); + BUG_ON(!buffer_dirty(upper)); /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { @@ -905,9 +910,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size) { - struct btrfs_buffer *right_buf = path->nodes[0]; - struct btrfs_leaf *right = &right_buf->leaf; - struct btrfs_buffer *t; + struct buffer_head *right_buf = path->nodes[0]; + struct btrfs_leaf *right = btrfs_buffer_leaf(right_buf); + struct buffer_head *t; struct btrfs_leaf *left; int slot; int i; @@ -926,9 +931,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (!path->nodes[1]) { return 1; } - t = read_tree_block(root, btrfs_node_blockptr(&path->nodes[1]->node, - slot - 1)); - left = &t->leaf; + t = read_tree_block(root, + btrfs_node_blockptr(btrfs_buffer_node(path->nodes[1]), slot - 1)); + left = btrfs_buffer_leaf(t); free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size + sizeof(struct btrfs_item)) { btrfs_block_release(root, t); @@ -937,7 +942,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ btrfs_cow_block(trans, root, t, path->nodes[1], slot - 1, &t); - left = &t->leaf; + left = btrfs_buffer_leaf(t); free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size + sizeof(struct btrfs_item)) { btrfs_block_release(root, t); @@ -999,8 +1004,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root push_space = btrfs_item_offset(right->items + i); } - BUG_ON(list_empty(&t->dirty)); - 
BUG_ON(list_empty(&right_buf->dirty)); + BUG_ON(!buffer_dirty(t)); + BUG_ON(!buffer_dirty(right_buf)); wret = fixup_low_keys(trans, root, path, &right->items[0].key, 1); if (wret) @@ -1029,13 +1034,13 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size) { - struct btrfs_buffer *l_buf; + struct buffer_head *l_buf; struct btrfs_leaf *l; u32 nritems; int mid; int slot; struct btrfs_leaf *right; - struct btrfs_buffer *right_buffer; + struct buffer_head *right_buffer; int space_needed = data_size + sizeof(struct btrfs_item); int data_copy_size; int rt_data_off; @@ -1053,7 +1058,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root return wret; } l_buf = path->nodes[0]; - l = &l_buf->leaf; + l = btrfs_buffer_leaf(l_buf); /* did the pushes work? */ if (btrfs_leaf_free_space(root, l) >= @@ -1071,7 +1076,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root right_buffer = btrfs_alloc_free_block(trans, root); BUG_ON(!right_buffer); BUG_ON(mid == nritems); - right = &right_buffer->leaf; + right = btrfs_buffer_leaf(right_buffer); memset(&right->header, 0, sizeof(right->header)); if (mid <= slot) { /* FIXME, just alloc a new leaf here */ @@ -1085,10 +1090,10 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root BUG(); } btrfs_set_header_nritems(&right->header, nritems - mid); - btrfs_set_header_blocknr(&right->header, right_buffer->blocknr); + btrfs_set_header_blocknr(&right->header, right_buffer->b_blocknr); btrfs_set_header_level(&right->header, 0); btrfs_set_header_parentid(&right->header, - btrfs_header_parentid(&root->node->node.header)); + btrfs_header_parentid(btrfs_buffer_header(root->node))); data_copy_size = btrfs_item_end(l->items + mid) - leaf_data_end(root, l); memcpy(right->items, l->items + mid, @@ -1107,11 +1112,11 @@ static int 
split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_header_nritems(&l->header, mid); ret = 0; wret = insert_ptr(trans, root, path, &right->items[0].key, - right_buffer->blocknr, path->slots[1] + 1, 1); + right_buffer->b_blocknr, path->slots[1] + 1, 1); if (wret) ret = wret; - BUG_ON(list_empty(&right_buffer->dirty)); - BUG_ON(list_empty(&l_buf->dirty)); + BUG_ON(!buffer_dirty(right_buffer)); + BUG_ON(!buffer_dirty(l_buf)); BUG_ON(path->slots[0] != slot); if (mid <= slot) { btrfs_block_release(root, path->nodes[0]); @@ -1136,7 +1141,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root int slot; int slot_orig; struct btrfs_leaf *leaf; - struct btrfs_buffer *leaf_buf; + struct buffer_head *leaf_buf; u32 nritems; unsigned int data_end; struct btrfs_disk_key disk_key; @@ -1156,7 +1161,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root slot_orig = path->slots[0]; leaf_buf = path->nodes[0]; - leaf = &leaf_buf->leaf; + leaf = btrfs_buffer_leaf(leaf_buf); nritems = btrfs_header_nritems(&leaf->header); data_end = leaf_data_end(root, leaf); @@ -1202,7 +1207,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root if (slot == 0) ret = fixup_low_keys(trans, root, path, &disk_key, 1); - BUG_ON(list_empty(&leaf_buf->dirty)); + BUG_ON(!buffer_dirty(leaf_buf)); if (btrfs_leaf_free_space(root, leaf) < 0) BUG(); check_leaf(root, path, 0); @@ -1225,7 +1230,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_init_path(&path); ret = btrfs_insert_empty_item(trans, root, &path, cpu_key, data_size); if (!ret) { - ptr = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], u8); + ptr = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), + path.slots[0], u8); memcpy(ptr, data, data_size); } btrfs_release_path(root, &path); @@ -1243,12 +1249,12 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int 
level, int slot) { struct btrfs_node *node; - struct btrfs_buffer *parent = path->nodes[level]; + struct buffer_head *parent = path->nodes[level]; u32 nritems; int ret = 0; int wret; - node = &parent->node; + node = btrfs_buffer_node(parent); nritems = btrfs_header_nritems(&node->header); if (slot != nritems -1) { memmove(node->ptrs + slot, node->ptrs + slot + 1, @@ -1257,16 +1263,17 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, nritems--; btrfs_set_header_nritems(&node->header, nritems); if (nritems == 0 && parent == root->node) { - BUG_ON(btrfs_header_level(&root->node->node.header) != 1); + struct btrfs_header *header = btrfs_buffer_header(root->node); + BUG_ON(btrfs_header_level(header) != 1); /* just turn the root into a leaf and break */ - btrfs_set_header_level(&root->node->node.header, 0); + btrfs_set_header_level(header, 0); } else if (slot == 0) { wret = fixup_low_keys(trans, root, path, &node->ptrs[0].key, level + 1); if (wret) ret = wret; } - BUG_ON(list_empty(&parent->dirty)); + BUG_ON(!buffer_dirty(parent)); return ret; } @@ -1279,7 +1286,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, { int slot; struct btrfs_leaf *leaf; - struct btrfs_buffer *leaf_buf; + struct buffer_head *leaf_buf; int doff; int dsize; int ret = 0; @@ -1287,7 +1294,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 nritems; leaf_buf = path->nodes[0]; - leaf = &leaf_buf->leaf; + leaf = btrfs_buffer_leaf(leaf_buf); slot = path->slots[0]; doff = btrfs_item_offset(leaf->items + slot); dsize = btrfs_item_size(leaf->items + slot); @@ -1313,14 +1320,13 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (nritems == 0) { if (leaf_buf == root->node) { btrfs_set_header_level(&leaf->header, 0); - BUG_ON(list_empty(&leaf_buf->dirty)); } else { clean_tree_block(trans, root, leaf_buf); wret = del_ptr(trans, root, path, 1, path->slots[1]); if (wret) ret = 
wret; wret = btrfs_free_extent(trans, root, - leaf_buf->blocknr, 1, 1); + leaf_buf->b_blocknr, 1, 1); if (wret) ret = wret; } @@ -1332,7 +1338,6 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (wret) ret = wret; } - BUG_ON(list_empty(&leaf_buf->dirty)); /* delete the leaf if it is mostly empty */ if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { @@ -1341,7 +1346,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, * for possible call to del_ptr below */ slot = path->slots[1]; - leaf_buf->count++; + get_bh(leaf_buf); wret = push_leaf_left(trans, root, path, 1); if (wret < 0) ret = wret; @@ -1352,7 +1357,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = wret; } if (btrfs_header_nritems(&leaf->header) == 0) { - u64 blocknr = leaf_buf->blocknr; + u64 blocknr = leaf_buf->b_blocknr; clean_tree_block(trans, root, leaf_buf); wret = del_ptr(trans, root, path, 1, slot); if (wret) @@ -1380,19 +1385,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) int slot; int level = 1; u64 blocknr; - struct btrfs_buffer *c; - struct btrfs_buffer *next = NULL; + struct buffer_head *c; + struct btrfs_node *c_node; + struct buffer_head *next = NULL; while(level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) return 1; slot = path->slots[level] + 1; c = path->nodes[level]; - if (slot >= btrfs_header_nritems(&c->node.header)) { + c_node = btrfs_buffer_node(c); + if (slot >= btrfs_header_nritems(&c_node->header)) { level++; continue; } - blocknr = btrfs_node_blockptr(&c->node, slot); + blocknr = btrfs_node_blockptr(c_node, slot); if (next) btrfs_block_release(root, next); next = read_tree_block(root, blocknr); @@ -1408,7 +1415,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) if (!level) break; next = read_tree_block(root, - btrfs_node_blockptr(&next->node, 0)); + btrfs_node_blockptr(btrfs_buffer_node(next), 0)); } return 0; } diff --git 
a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ae8518cb94bf..7748eecd9304 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1,6 +1,9 @@ #ifndef __BTRFS__ #define __BTRFS__ +#include +#include + struct btrfs_trans_handle; #define BTRFS_MAGIC "_BtRfS_M" @@ -10,6 +13,12 @@ struct btrfs_trans_handle; #define BTRFS_INODE_MAP_OBJECTID 3 #define BTRFS_FS_TREE_OBJECTID 4 +/* + * we can actually store much bigger names, but lets not confuse the rest + * of linux + */ +#define BTRFS_NAME_LEN 255 + /* * the key defines the order in the tree, and so it also defines (optimal) * block layout. objectid corresonds to the inode number. The flags @@ -57,7 +66,7 @@ struct btrfs_header { #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) #define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->blocksize)) -struct btrfs_buffer; +struct buffer_head; /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc @@ -120,7 +129,7 @@ struct btrfs_node { * used while walking the tree. */ struct btrfs_path { - struct btrfs_buffer *nodes[BTRFS_MAX_LEVEL]; + struct buffer_head *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; }; @@ -211,17 +220,14 @@ struct btrfs_fs_info { struct btrfs_root *inode_root; struct btrfs_key current_insert; struct btrfs_key last_insert; - struct radix_tree_root cache_radix; struct radix_tree_root pinned_radix; - struct list_head trans; - struct list_head cache; u64 last_inode_alloc; u64 last_inode_alloc_dirid; u64 generation; - int cache_size; - int fp; struct btrfs_trans_handle *running_transaction; struct btrfs_super_block *disk_super; + struct buffer_head *sb_buffer; + struct super_block *sb; }; /* @@ -230,8 +236,8 @@ struct btrfs_fs_info { * only for the extent tree. 
*/ struct btrfs_root { - struct btrfs_buffer *node; - struct btrfs_buffer *commit_root; + struct buffer_head *node; + struct buffer_head *commit_root; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; @@ -389,6 +395,29 @@ static inline void btrfs_set_inode_compat_flags(struct btrfs_inode_item *i, i->compat_flags = cpu_to_le16(val); } +static inline u32 btrfs_timespec_sec(struct btrfs_inode_timespec *ts) +{ + return le32_to_cpu(ts->sec); +} + +static inline void btrfs_set_timespec_sec(struct btrfs_inode_timespec *ts, + u32 val) +{ + ts->sec = cpu_to_le32(val); +} + +static inline u32 btrfs_timespec_nsec(struct btrfs_inode_timespec *ts) +{ + return le32_to_cpu(ts->nsec); +} + +static inline void btrfs_set_timespec_nsec(struct btrfs_inode_timespec *ts, + u32 val) +{ + ts->nsec = cpu_to_le32(val); +} + + static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei) { @@ -757,15 +786,20 @@ static inline void btrfs_set_file_extent_num_blocks(struct e->num_blocks = cpu_to_le64(val); } +static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + /* helper function to cast into the data area of the leaf. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_leaf_data(leaf) + \ btrfs_item_offset((leaf)->items + (slot)))) -struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, +struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_buffer *buf); + struct buffer_head *buf); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 blocknr, u64 num_blocks, int pin); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root @@ -783,7 +817,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf); int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_buffer *snap); + *root, struct buffer_head *snap); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -800,8 +834,8 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, char *name, int name_len, u64 dir, u64 objectid, u8 type); int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, u64 dir, char *name, - int name_len, int mod); + *root, struct btrfs_path *path, u64 dir, + const char *name, int name_len, int mod); int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, char *name, int name_len); int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 4d8083d92fa0..75d6e373e98d 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -18,12 +18,7 @@ int btrfs_insert_dir_item(struct 
btrfs_trans_handle *trans, struct btrfs_root key.objectid = dir; key.flags = 0; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); - if (name_len == 1 && *name == '.') - key.offset = 1; - else if (name_len == 2 && name[0] == '.' && name[1] == '.') - key.offset = 2; - else - ret = btrfs_name_hash(name, name_len, &key.offset); + ret = btrfs_name_hash(name, name_len, &key.offset); BUG_ON(ret); btrfs_init_path(&path); data_size = sizeof(*dir_item) + name_len; @@ -31,7 +26,8 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root if (ret) goto out; - dir_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], + dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), + path.slots[0], struct btrfs_dir_item); btrfs_set_dir_objectid(dir_item, objectid); btrfs_set_dir_type(dir_item, type); @@ -45,8 +41,8 @@ out: } int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, u64 dir, char *name, - int name_len, int mod) + *root, struct btrfs_path *path, u64 dir, + const char *name, int name_len, int mod) { int ret; struct btrfs_key key; @@ -69,7 +65,8 @@ int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_dir_item *dir_item; char *name_ptr; - dir_item = btrfs_item_ptr(&path->nodes[0]->leaf, path->slots[0], + dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), + path->slots[0], struct btrfs_dir_item); if (btrfs_dir_name_len(dir_item) != name_len) return 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 05637f9fd7c7..df2061a735cd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1,165 +1,67 @@ -#define _XOPEN_SOURCE 500 -#include -#include -#include -#include -#include -#include -#include "kerncompat.h" -#include "radix-tree.h" +#include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" -static int allocated_blocks = 0; -int cache_max = 10000; - -static int check_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf) 
+static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf) { - if (buf->blocknr != btrfs_header_blocknr(&buf->node.header)) + struct btrfs_node *node = btrfs_buffer_node(buf); + if (buf->b_blocknr != btrfs_header_blocknr(&node->header)) BUG(); - if (root->node && btrfs_header_parentid(&buf->node.header) != - btrfs_header_parentid(&root->node->node.header)) + if (root->node && btrfs_header_parentid(&node->header) != + btrfs_header_parentid(btrfs_buffer_header(root->node))) BUG(); return 0; } -static int free_some_buffers(struct btrfs_root *root) +struct buffer_head *alloc_tree_block(struct btrfs_root *root, u64 blocknr) { - struct list_head *node, *next; - struct btrfs_buffer *b; - if (root->fs_info->cache_size < cache_max) - return 0; - list_for_each_safe(node, next, &root->fs_info->cache) { - b = list_entry(node, struct btrfs_buffer, cache); - if (b->count == 1) { - BUG_ON(!list_empty(&b->dirty)); - list_del_init(&b->cache); - btrfs_block_release(root, b); - if (root->fs_info->cache_size < cache_max) - break; - } - } - return 0; + return sb_getblk(root->fs_info->sb, blocknr); } -struct btrfs_buffer *alloc_tree_block(struct btrfs_root *root, u64 blocknr) +struct buffer_head *find_tree_block(struct btrfs_root *root, u64 blocknr) { - struct btrfs_buffer *buf; - int ret; - - buf = malloc(sizeof(struct btrfs_buffer) + root->blocksize); - if (!buf) - return buf; - allocated_blocks++; - buf->blocknr = blocknr; - buf->count = 2; - INIT_LIST_HEAD(&buf->dirty); - free_some_buffers(root); - radix_tree_preload(GFP_KERNEL); - ret = radix_tree_insert(&root->fs_info->cache_radix, blocknr, buf); - radix_tree_preload_end(); - list_add_tail(&buf->cache, &root->fs_info->cache); - root->fs_info->cache_size++; - if (ret) { - free(buf); - return NULL; - } - return buf; + return sb_getblk(root->fs_info->sb, blocknr); } -struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr) +struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr) 
{ - struct btrfs_buffer *buf; - buf = radix_tree_lookup(&root->fs_info->cache_radix, blocknr); - if (buf) { - buf->count++; - } else { - buf = alloc_tree_block(root, blocknr); - if (!buf) { - BUG(); - return NULL; - } - } - return buf; -} - -struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr) -{ - loff_t offset = blocknr * root->blocksize; - struct btrfs_buffer *buf; - int ret; + struct buffer_head *buf = sb_bread(root->fs_info->sb, blocknr); - buf = radix_tree_lookup(&root->fs_info->cache_radix, blocknr); - if (buf) { - buf->count++; - } else { - buf = alloc_tree_block(root, blocknr); - if (!buf) - return NULL; - ret = pread(root->fs_info->fp, &buf->node, root->blocksize, - offset); - if (ret != root->blocksize) { - free(buf); - return NULL; - } - } + if (!buf) + return buf; if (check_tree_block(root, buf)) BUG(); return buf; } int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_buffer *buf) + struct buffer_head *buf) { - if (!list_empty(&buf->dirty)) - return 0; - list_add_tail(&buf->dirty, &root->fs_info->trans); - buf->count++; + mark_buffer_dirty(buf); return 0; } int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_buffer *buf) + struct buffer_head *buf) { - if (!list_empty(&buf->dirty)) { - list_del_init(&buf->dirty); - btrfs_block_release(root, buf); - } + clear_buffer_dirty(buf); return 0; } int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_buffer *buf) + struct buffer_head *buf) { - u64 blocknr = buf->blocknr; - loff_t offset = blocknr * root->blocksize; - int ret; - - if (buf->blocknr != btrfs_header_blocknr(&buf->node.header)) - BUG(); - ret = pwrite(root->fs_info->fp, &buf->node, root->blocksize, offset); - if (ret != root->blocksize) - return ret; + mark_buffer_dirty(buf); return 0; } static int __commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_buffer *b; 
- int ret = 0; - int wret; - while(!list_empty(&root->fs_info->trans)) { - b = list_entry(root->fs_info->trans.next, struct btrfs_buffer, - dirty); - list_del_init(&b->dirty); - wret = write_tree_block(trans, root, b); - if (wret) - ret = wret; - btrfs_block_release(root, b); - } - return ret; + filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping); + return 0; } static int commit_tree_roots(struct btrfs_trans_handle *trans, @@ -172,17 +74,17 @@ static int commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_root *inode_root = fs_info->inode_root; btrfs_set_root_blocknr(&inode_root->root_item, - inode_root->node->blocknr); + inode_root->node->b_blocknr); ret = btrfs_update_root(trans, tree_root, &inode_root->root_key, &inode_root->root_item); BUG_ON(ret); while(1) { old_extent_block = btrfs_root_blocknr(&extent_root->root_item); - if (old_extent_block == extent_root->node->blocknr) + if (old_extent_block == extent_root->node->b_blocknr) break; btrfs_set_root_blocknr(&extent_root->root_item, - extent_root->node->blocknr); + extent_root->node->b_blocknr); ret = btrfs_update_root(trans, tree_root, &extent_root->root_key, &extent_root->root_item); @@ -195,7 +97,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_super_block *s) { int ret = 0; - struct btrfs_buffer *snap = root->commit_root; + struct buffer_head *snap = root->commit_root; struct btrfs_key snap_key; if (root->commit_root == root->node) @@ -204,7 +106,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct memcpy(&snap_key, &root->root_key, sizeof(snap_key)); root->root_key.offset++; - btrfs_set_root_blocknr(&root->root_item, root->node->blocknr); + btrfs_set_root_blocknr(&root->root_item, root->node->b_blocknr); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); BUG_ON(ret); @@ -220,7 +122,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct 
btrfs_finish_extent_commit(trans, root->fs_info->tree_root); root->commit_root = root->node; - root->node->count++; + get_bh(root->node); ret = btrfs_drop_snapshot(trans, root, snap); BUG_ON(ret); @@ -234,7 +136,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct static int __setup_root(struct btrfs_super_block *super, struct btrfs_root *root, struct btrfs_fs_info *fs_info, - u64 objectid, int fp) + u64 objectid) { root->node = NULL; root->commit_root = NULL; @@ -250,11 +152,11 @@ static int find_and_setup_root(struct btrfs_super_block *super, struct btrfs_root *tree_root, struct btrfs_fs_info *fs_info, u64 objectid, - struct btrfs_root *root, int fp) + struct btrfs_root *root) { int ret; - __setup_root(super, root, fs_info, objectid, fp); + __setup_root(super, root, fs_info, objectid); ret = btrfs_find_last_root(tree_root, objectid, &root->root_item, &root->root_key); BUG_ON(ret); @@ -265,32 +167,26 @@ static int find_and_setup_root(struct btrfs_super_block *super, return 0; } -struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super) +struct btrfs_root *open_ctree(struct super_block *sb, + struct buffer_head *sb_buffer, + struct btrfs_super_block *disk_super) { - int fp; - - fp = open(filename, O_CREAT | O_RDWR, 0600); - if (fp < 0) { - return NULL; - } - return open_ctree_fd(fp, super); -} - -struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super) -{ - struct btrfs_root *root = malloc(sizeof(struct btrfs_root)); - struct btrfs_root *extent_root = malloc(sizeof(struct btrfs_root)); - struct btrfs_root *tree_root = malloc(sizeof(struct btrfs_root)); - struct btrfs_root *inode_root = malloc(sizeof(struct btrfs_root)); - struct btrfs_fs_info *fs_info = malloc(sizeof(*fs_info)); + struct btrfs_root *root = kmalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root), + 
GFP_NOFS); + struct btrfs_root *inode_root = kmalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info), + GFP_NOFS); int ret; - INIT_RADIX_TREE(&fs_info->cache_radix, GFP_KERNEL); + /* FIXME: don't be stupid */ + if (!btrfs_super_root(disk_super)) + return NULL; INIT_RADIX_TREE(&fs_info->pinned_radix, GFP_KERNEL); - INIT_LIST_HEAD(&fs_info->trans); - INIT_LIST_HEAD(&fs_info->cache); - fs_info->cache_size = 0; - fs_info->fp = fp; fs_info->running_transaction = NULL; fs_info->fs_root = root; fs_info->tree_root = tree_root; @@ -298,36 +194,31 @@ struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super) fs_info->inode_root = inode_root; fs_info->last_inode_alloc = 0; fs_info->last_inode_alloc_dirid = 0; - fs_info->disk_super = super; + fs_info->disk_super = disk_super; + fs_info->sb_buffer = sb_buffer; + fs_info->sb = sb; memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert)); memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert)); - ret = pread(fp, super, sizeof(struct btrfs_super_block), - BTRFS_SUPER_INFO_OFFSET); - if (ret == 0 || btrfs_super_root(super) == 0) { - BUG(); - return NULL; - } - BUG_ON(ret < 0); - - __setup_root(super, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID, fp); - tree_root->node = read_tree_block(tree_root, btrfs_super_root(super)); + __setup_root(disk_super, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); + tree_root->node = read_tree_block(tree_root, + btrfs_super_root(disk_super)); BUG_ON(!tree_root->node); - ret = find_and_setup_root(super, tree_root, fs_info, - BTRFS_EXTENT_TREE_OBJECTID, extent_root, fp); + ret = find_and_setup_root(disk_super, tree_root, fs_info, + BTRFS_EXTENT_TREE_OBJECTID, extent_root); BUG_ON(ret); - ret = find_and_setup_root(super, tree_root, fs_info, - BTRFS_INODE_MAP_OBJECTID, inode_root, fp); + ret = find_and_setup_root(disk_super, tree_root, fs_info, + BTRFS_INODE_MAP_OBJECTID, inode_root); BUG_ON(ret); - ret = 
find_and_setup_root(super, tree_root, fs_info, - BTRFS_FS_TREE_OBJECTID, root, fp); + ret = find_and_setup_root(disk_super, tree_root, fs_info, + BTRFS_FS_TREE_OBJECTID, root); BUG_ON(ret); root->commit_root = root->node; - root->node->count++; + get_bh(root->node); root->ref_cows = 1; root->fs_info->generation = root->root_key.offset + 1; return root; @@ -336,8 +227,11 @@ struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super) int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_super_block *s) { + return 0; +#if 0 int ret; - btrfs_set_super_root(s, root->fs_info->tree_root->node->blocknr); + btrfs_set_super_root(s, root->fs_info->tree_root->node->b_blocknr); + ret = pwrite(root->fs_info->fp, s, sizeof(*s), BTRFS_SUPER_INFO_OFFSET); if (ret != sizeof(*s)) { @@ -345,35 +239,38 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } return 0; +#endif } static int drop_cache(struct btrfs_root *root) { + return 0; +#if 0 while(!list_empty(&root->fs_info->cache)) { - struct btrfs_buffer *b = list_entry(root->fs_info->cache.next, - struct btrfs_buffer, + struct buffer_head *b = list_entry(root->fs_info->cache.next, + struct buffer_head, cache); list_del_init(&b->cache); btrfs_block_release(root, b); } return 0; +#endif } -int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s) + +int close_ctree(struct btrfs_root *root) { int ret; struct btrfs_trans_handle *trans; trans = root->fs_info->running_transaction; - btrfs_commit_transaction(trans, root, s); + btrfs_commit_transaction(trans, root, root->fs_info->disk_super); ret = commit_tree_roots(trans, root->fs_info); BUG_ON(ret); ret = __commit_transaction(trans, root); BUG_ON(ret); - write_ctree_super(trans, root, s); + write_ctree_super(trans, root, root->fs_info->disk_super); drop_cache(root); - BUG_ON(!list_empty(&root->fs_info->trans)); - close(root->fs_info->fp); if (root->node) btrfs_block_release(root, 
root->node); if (root->fs_info->extent_root->node) @@ -386,29 +283,17 @@ int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s) btrfs_block_release(root->fs_info->tree_root, root->fs_info->tree_root->node); btrfs_block_release(root, root->commit_root); - free(root); - printf("on close %d blocks are allocated\n", allocated_blocks); + btrfs_block_release(root, root->fs_info->sb_buffer); + kfree(root->fs_info->extent_root); + kfree(root->fs_info->inode_root); + kfree(root->fs_info->tree_root); + kfree(root->fs_info); + kfree(root); return 0; } -void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf) +void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf) { - buf->count--; - if (buf->count < 0) - BUG(); - if (buf->count == 0) { - BUG_ON(!list_empty(&buf->cache)); - BUG_ON(!list_empty(&buf->dirty)); - if (!radix_tree_lookup(&root->fs_info->cache_radix, - buf->blocknr)) - BUG(); - radix_tree_delete(&root->fs_info->cache_radix, buf->blocknr); - memset(buf, 0, sizeof(*buf)); - free(buf); - BUG_ON(allocated_blocks == 0); - allocated_blocks--; - BUG_ON(root->fs_info->cache_size == 0); - root->fs_info->cache_size--; - } + brelse(buf); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index d888cf5c350b..7f4bb729b734 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -1,36 +1,41 @@ #ifndef __DISKIO__ #define __DISKIO__ -#include "list.h" -struct btrfs_buffer { - u64 blocknr; - int count; - struct list_head dirty; - struct list_head cache; - union { - struct btrfs_node node; - struct btrfs_leaf leaf; - }; -}; +#include -struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr); -struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr); +#define BTRFS_SUPER_INFO_OFFSET (16 * 1024) + +static inline struct btrfs_node *btrfs_buffer_node(struct buffer_head *bh) +{ + return (struct btrfs_node *)bh->b_data; +} + +static inline struct btrfs_leaf *btrfs_buffer_leaf(struct buffer_head 
*bh) +{ + return (struct btrfs_leaf *)bh->b_data; +} + +static inline struct btrfs_header *btrfs_buffer_header(struct buffer_head *bh) +{ + return &((struct btrfs_node *)bh->b_data)->header; +} + +struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr); +struct buffer_head *find_tree_block(struct btrfs_root *root, u64 blocknr); int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_buffer *buf); + struct buffer_head *buf); int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_buffer *buf); + struct buffer_head *buf); int clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_buffer *buf); + struct btrfs_root *root, struct buffer_head *buf); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_super_block *s); -struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *s); -struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super); -int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s); -void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf); +struct btrfs_root *open_ctree(struct super_block *sb, + struct buffer_head *sb_buffer, + struct btrfs_super_block *disk_super); +int close_ctree(struct btrfs_root *root); +void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_super_block *s); int mkfs(int fd, u64 num_blocks, u32 blocksize); - -#define BTRFS_SUPER_INFO_OFFSET (16 * 1024) - #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53a7550b5c1e..e3af2c035687 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10,9 +10,8 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root search_end, struct btrfs_key *ins); static int 
finish_current_insert(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); -static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root - *extent_root); - +static int del_pending_extents(struct btrfs_trans_handle *trans, struct + btrfs_root *extent_root); /* * pending extents are blocks that we're trying to allocate in the extent * map while trying to grow the map because of other allocations. To avoid @@ -21,6 +20,7 @@ static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root * manner for deletes. */ #define CTREE_EXTENT_PENDING_DEL 0 +#define CTREE_EXTENT_PINNED 1 static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 blocknr) @@ -45,15 +45,14 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root if (ret != 0) BUG(); BUG_ON(ret != 0); - l = &path.nodes[0]->leaf; + l = btrfs_buffer_leaf(path.nodes[0]); item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(item); btrfs_set_extent_refs(item, refs + 1); - BUG_ON(list_empty(&path.nodes[0]->dirty)); btrfs_release_path(root->fs_info->extent_root, &path); finish_current_insert(trans, root->fs_info->extent_root); - run_pending(trans, root->fs_info->extent_root); + del_pending_extents(trans, root->fs_info->extent_root); return 0; } @@ -74,7 +73,7 @@ static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root 0, 0); if (ret != 0) BUG(); - l = &path.nodes[0]->leaf; + l = btrfs_buffer_leaf(path.nodes[0]); item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item); *refs = btrfs_extent_refs(item); btrfs_release_path(root->fs_info->extent_root, &path); @@ -82,18 +81,20 @@ static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_buffer *buf) + struct buffer_head *buf) { u64 blocknr; + struct btrfs_node *buf_node; int i; if (!root->ref_cows) return 
0; - if (btrfs_is_leaf(&buf->node)) + buf_node = btrfs_buffer_node(buf); + if (btrfs_is_leaf(buf_node)) return 0; - for (i = 0; i < btrfs_header_nritems(&buf->node.header); i++) { - blocknr = btrfs_node_blockptr(&buf->node, i); + for (i = 0; i < btrfs_header_nritems(&buf_node->header); i++) { + blocknr = btrfs_node_blockptr(buf_node, i); inc_block_ref(trans, root, blocknr); } return 0; @@ -108,9 +109,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct int i; while(1) { - ret = radix_tree_gang_lookup(&root->fs_info->pinned_radix, + ret = radix_tree_gang_lookup_tag(&root->fs_info->pinned_radix, (void **)gang, 0, - ARRAY_SIZE(gang)); + ARRAY_SIZE(gang), + CTREE_EXTENT_PINNED); if (!ret) break; if (!first) @@ -137,7 +139,7 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct btrfs_set_extent_refs(&extent_item, 1); btrfs_set_extent_owner(&extent_item, - btrfs_header_parentid(&extent_root->node->node.header)); + btrfs_header_parentid(btrfs_buffer_header(extent_root->node))); ins.offset = 1; ins.flags = 0; btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY); @@ -156,11 +158,24 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct return 0; } +static int pin_down_block(struct btrfs_root *root, u64 blocknr, int tag) +{ + int err; + err = radix_tree_insert(&root->fs_info->pinned_radix, + blocknr, (void *)blocknr); + BUG_ON(err); + if (err) + return err; + radix_tree_tag_set(&root->fs_info->pinned_radix, blocknr, + tag); + return 0; +} + /* * remove an extent from the root, returns 0 on success */ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root - *root, u64 blocknr, u64 num_blocks, int pin) + *root, u64 blocknr, u64 num_blocks) { struct btrfs_path path; struct btrfs_key key; @@ -171,7 +186,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_key ins; u32 refs; - BUG_ON(pin && num_blocks != 1); key.objectid = blocknr; key.flags = 0; 
btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); @@ -186,26 +200,18 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root printk("failed to find %Lu\n", key.objectid); BUG(); } - ei = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], + ei = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0], struct btrfs_extent_item); BUG_ON(ei->refs == 0); refs = btrfs_extent_refs(ei) - 1; btrfs_set_extent_refs(ei, refs); if (refs == 0) { u64 super_blocks_used; - if (pin) { - int err; - radix_tree_preload(GFP_KERNEL); - err = radix_tree_insert(&info->pinned_radix, - blocknr, (void *)blocknr); - BUG_ON(err); - radix_tree_preload_end(); - } super_blocks_used = btrfs_super_blocks_used(info->disk_super); btrfs_set_super_blocks_used(info->disk_super, super_blocks_used - num_blocks); ret = btrfs_del_item(trans, extent_root, &path); - if (!pin && extent_root->fs_info->last_insert.objectid > + if (extent_root->fs_info->last_insert.objectid > blocknr) extent_root->fs_info->last_insert.objectid = blocknr; if (ret) @@ -224,39 +230,32 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root) { int ret; - struct btrfs_buffer *gang[4]; + int wret; + int err = 0; + unsigned long gang[4]; int i; + struct radix_tree_root *radix = &extent_root->fs_info->pinned_radix; while(1) { ret = radix_tree_gang_lookup_tag( - &extent_root->fs_info->cache_radix, + &extent_root->fs_info->pinned_radix, (void **)gang, 0, ARRAY_SIZE(gang), CTREE_EXTENT_PENDING_DEL); if (!ret) break; for (i = 0; i < ret; i++) { - ret = __free_extent(trans, extent_root, - gang[i]->blocknr, 1, 1); - radix_tree_tag_clear(&extent_root->fs_info->cache_radix, - gang[i]->blocknr, + radix_tree_tag_set(radix, gang[i], CTREE_EXTENT_PINNED); + radix_tree_tag_clear(radix, gang[i], CTREE_EXTENT_PENDING_DEL); - btrfs_block_release(extent_root, gang[i]); + wret = __free_extent(trans, extent_root, gang[i], 1); + if (wret) + err = wret; } } - return 0; + return err; } 
-static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root - *extent_root) -{ - while(radix_tree_tagged(&extent_root->fs_info->cache_radix, - CTREE_EXTENT_PENDING_DEL)) - del_pending_extents(trans, extent_root); - return 0; -} - - /* * remove an extent from the root, returns 0 on success */ @@ -264,18 +263,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 blocknr, u64 num_blocks, int pin) { struct btrfs_root *extent_root = root->fs_info->extent_root; - struct btrfs_buffer *t; + struct buffer_head *t; int pending_ret; int ret; if (root == extent_root) { t = find_tree_block(root, blocknr); - radix_tree_tag_set(&root->fs_info->cache_radix, blocknr, - CTREE_EXTENT_PENDING_DEL); + pin_down_block(root, blocknr, CTREE_EXTENT_PENDING_DEL); return 0; } - ret = __free_extent(trans, root, blocknr, num_blocks, pin); - pending_ret = run_pending(trans, root->fs_info->extent_root); + if (pin) { + ret = pin_down_block(root, blocknr, CTREE_EXTENT_PINNED); + BUG_ON(ret); + } + ret = __free_extent(trans, root, blocknr, num_blocks); + pending_ret = del_pending_extents(trans, root->fs_info->extent_root); return ret ? 
ret : pending_ret; } @@ -296,14 +298,16 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root int ret; u64 hole_size = 0; int slot = 0; - u64 last_block; + u64 last_block = 0; u64 test_block; int start_found; struct btrfs_leaf *l; struct btrfs_root * root = orig_root->fs_info->extent_root; int total_needed = num_blocks; + int level; - total_needed += (btrfs_header_level(&root->node->node.header) + 1) * 3; + level = btrfs_header_level(btrfs_buffer_header(root->node)); + total_needed += (level + 1) * 3; if (root->fs_info->last_insert.objectid > search_start) search_start = root->fs_info->last_insert.objectid; @@ -323,7 +327,7 @@ check_failed: path.slots[0]--; while (1) { - l = &path.nodes[0]->leaf; + l = btrfs_buffer_leaf(path.nodes[0]); slot = path.slots[0]; if (slot >= btrfs_header_nritems(&l->header)) { ret = btrfs_next_leaf(root, &path); @@ -429,7 +433,7 @@ static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root sizeof(extent_item)); finish_current_insert(trans, extent_root); - pending_ret = run_pending(trans, extent_root); + pending_ret = del_pending_extents(trans, extent_root); if (ret) return ret; if (pending_ret) @@ -441,16 +445,15 @@ static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root * helper function to allocate a block for a given tree * returns the tree buffer or NULL. 
*/ -struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, +struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_key ins; int ret; - struct btrfs_buffer *buf; + struct buffer_head *buf; ret = alloc_extent(trans, root, 1, 0, (unsigned long)-1, - btrfs_header_parentid(&root->node->node.header), - &ins); + btrfs_header_parentid(btrfs_buffer_header(root->node)), &ins); if (ret) { BUG(); return NULL; @@ -467,13 +470,13 @@ struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level) { - struct btrfs_buffer *next; - struct btrfs_buffer *cur; + struct buffer_head *next; + struct buffer_head *cur; u64 blocknr; int ret; u32 refs; - ret = lookup_block_ref(trans, root, path->nodes[*level]->blocknr, + ret = lookup_block_ref(trans, root, path->nodes[*level]->b_blocknr, &refs); BUG_ON(ret); if (refs > 1) @@ -484,9 +487,10 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root while(*level > 0) { cur = path->nodes[*level]; if (path->slots[*level] >= - btrfs_header_nritems(&cur->node.header)) + btrfs_header_nritems(btrfs_buffer_header(cur))) break; - blocknr = btrfs_node_blockptr(&cur->node, path->slots[*level]); + blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur), + path->slots[*level]); ret = lookup_block_ref(trans, root, blocknr, &refs); if (refs != 1 || *level == 1) { path->slots[*level]++; @@ -499,12 +503,12 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root if (path->nodes[*level-1]) btrfs_block_release(root, path->nodes[*level-1]); path->nodes[*level-1] = next; - *level = btrfs_header_level(&next->node.header); + *level = btrfs_header_level(btrfs_buffer_header(next)); path->slots[*level] = 0; } out: - ret = btrfs_free_extent(trans, root, path->nodes[*level]->blocknr, 1, - 1); + ret = 
btrfs_free_extent(trans, root, path->nodes[*level]->b_blocknr, + 1, 1); btrfs_block_release(root, path->nodes[*level]); path->nodes[*level] = NULL; *level += 1; @@ -525,14 +529,14 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root int ret; for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot < - btrfs_header_nritems(&path->nodes[i]->node.header)- 1) { + if (slot < btrfs_header_nritems( + btrfs_buffer_header(path->nodes[i])) - 1) { path->slots[i]++; *level = i; return 0; } else { ret = btrfs_free_extent(trans, root, - path->nodes[*level]->blocknr, + path->nodes[*level]->b_blocknr, 1, 1); btrfs_block_release(root, path->nodes[*level]); path->nodes[*level] = NULL; @@ -549,7 +553,7 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root * decremented. */ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_buffer *snap) + *root, struct buffer_head *snap) { int ret = 0; int wret; @@ -560,7 +564,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_init_path(&path); - level = btrfs_header_level(&snap->node.header); + level = btrfs_header_level(btrfs_buffer_header(snap)); orig_level = level; path.nodes[level] = snap; path.slots[level] = 0; diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index 6c2a71a46c7d..22519b8e0cf2 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -10,6 +10,7 @@ * License. */ +#include #define DELTA 0x9E3779B9 static void TEA_transform(__u32 buf[2], __u32 const in[]) @@ -63,6 +64,14 @@ int btrfs_name_hash(const char *name, int len, u64 *hash_result) const char *p; __u32 in[8], buf[2]; + if (len == 1 && *name == '.') { + *hash_result = 1; + return 0; + } else if (len == 2 && name[0] == '.' 
&& name[1] == '.') { + *hash_result = 2; + return 0; + } + /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; buf[1] = 0xefcdab89; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index c7fda3bf7b21..c45aec258bd5 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -15,7 +15,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, int ret; u64 hole_size = 0; int slot = 0; - u64 last_ino; + u64 last_ino = 0; int start_found; struct btrfs_leaf *l; struct btrfs_root *root = fs_root->fs_info->inode_root; @@ -40,7 +40,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, path.slots[0]--; while (1) { - l = &path.nodes[0]->leaf; + l = btrfs_buffer_leaf(path.nodes[0]); slot = path.slots[0]; if (slot >= btrfs_header_nritems(&l->header)) { ret = btrfs_next_leaf(root, &path); @@ -105,8 +105,8 @@ int btrfs_insert_inode_map(struct btrfs_trans_handle *trans, if (ret) goto out; - inode_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], - struct btrfs_inode_map_item); + inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), + path.slots[0], struct btrfs_inode_map_item); btrfs_cpu_key_to_disk(&inode_item->key, location); out: btrfs_release_path(inode_root, &path); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index aa2d3fac8804..c8ee938c1251 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -17,7 +17,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) printk("leaf %Lu total ptrs %d free space %d\n", btrfs_header_blocknr(&l->header), nr, btrfs_leaf_free_space(root, l)); - fflush(stdout); for (i = 0 ; i < nr ; i++) { item = l->items + i; type = btrfs_disk_key_type(&item->key); @@ -67,10 +66,10 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) btrfs_leaf_data(l) + btrfs_item_offset(item)); break; }; - fflush(stdout); } } -void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t) + +void 
btrfs_print_tree(struct btrfs_root *root, struct buffer_head *t) { int i; u32 nr; @@ -78,16 +77,16 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t) if (!t) return; - c = &t->node; + c = btrfs_buffer_node(t); nr = btrfs_header_nritems(&c->header); if (btrfs_is_leaf(c)) { btrfs_print_leaf(root, (struct btrfs_leaf *)c); return; } - printk("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr, - btrfs_header_level(&c->header), nr, - (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); - fflush(stdout); + printk("node %Lu level %d total ptrs %d free spc %u\n", + btrfs_header_blocknr(&c->header), + btrfs_header_level(&c->header), nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { printk("\tkey %d (%Lu %u %Lu) block %Lu\n", i, @@ -95,12 +94,11 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t) c->ptrs[i].key.flags, c->ptrs[i].key.offset, btrfs_node_blockptr(c, i)); - fflush(stdout); } for (i = 0; i < nr; i++) { - struct btrfs_buffer *next_buf = read_tree_block(root, + struct buffer_head *next_buf = read_tree_block(root, btrfs_node_blockptr(c, i)); - struct btrfs_node *next = &next_buf->node; + struct btrfs_node *next = btrfs_buffer_node(next_buf); if (btrfs_is_leaf(next) && btrfs_header_level(&c->header) != 1) BUG(); diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 0882ca904eca..396041a05cfa 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -1,5 +1,5 @@ #ifndef __PRINT_TREE_ #define __PRINT_TREE_ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l); -void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t); +void btrfs_print_tree(struct btrfs_root *root, struct buffer_head *t); #endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 52c83be4b307..a4554c007ef7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -21,7 +21,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, if (ret < 0) goto out; 
BUG_ON(ret == 0); - l = &path.nodes[0]->leaf; + l = btrfs_buffer_leaf(path.nodes[0]); BUG_ON(path.slots[0] == 0); slot = path.slots[0] - 1; if (btrfs_disk_key_objectid(&l->items[slot].key) != objectid) { @@ -51,7 +51,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret < 0) goto out; BUG_ON(ret != 0); - l = &path.nodes[0]->leaf; + l = btrfs_buffer_leaf(path.nodes[0]); slot = path.slots[0]; memcpy(btrfs_item_ptr(l, slot, struct btrfs_root_item), item, sizeof(*item)); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4ae76044aeab..ccc056aad692 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -8,13 +9,18 @@ #include #include #include "ctree.h" +#include "disk-io.h" #define BTRFS_SUPER_MAGIC 0x9123682E + +static struct inode_operations btrfs_dir_inode_operations; +static struct super_operations btrfs_super_ops; +static struct file_operations btrfs_dir_file_operations; + #if 0 /* some random number */ static struct super_operations ramfs_ops; -static struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ @@ -129,46 +135,243 @@ static struct inode_operations ramfs_dir_inode_operations = { }; #endif -struct inode *btrfs_get_inode(struct super_block *sb, int mode, dev_t dev) +static void btrfs_read_locked_inode(struct inode *inode) { - struct inode * inode = new_inode(sb); + struct btrfs_path path; + struct btrfs_inode_item *inode_item; + struct btrfs_root *root = btrfs_sb(inode->i_sb); + int ret; +printk("read locked inode %lu\n", inode->i_ino); + btrfs_init_path(&path); + ret = btrfs_lookup_inode(NULL, root, &path, inode->i_ino, 0); + if (ret) { + make_bad_inode(inode); + return; + } + inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), + path.slots[0], + struct btrfs_inode_item); - if (inode) { - inode->i_mode = mode; - inode->i_uid = current->fsuid; - inode->i_gid = 
current->fsgid; - inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +printk("found locked inode %lu\n", inode->i_ino); + inode->i_mode = btrfs_inode_mode(inode_item); + inode->i_nlink = btrfs_inode_nlink(inode_item); + inode->i_uid = btrfs_inode_uid(inode_item); + inode->i_gid = btrfs_inode_gid(inode_item); + inode->i_size = btrfs_inode_size(inode_item); + inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime); + inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime); + inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime); + inode->i_blocks = btrfs_inode_nblocks(inode_item); + inode->i_generation = btrfs_inode_generation(inode_item); +printk("about to release\n"); + btrfs_release_path(root, &path); + switch (inode->i_mode & S_IFMT) { +#if 0 + default: + init_special_inode(inode, inode->i_mode, + btrfs_inode_rdev(inode_item)); + break; +#endif + case S_IFREG: +printk("inode %lu now a file\n", inode->i_ino); + break; + case S_IFDIR: +printk("inode %lu now a directory\n", inode->i_ino); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + break; + case S_IFLNK: +printk("inode %lu now a link\n", inode->i_ino); + // inode->i_op = &page_symlink_inode_operations; + break; } - return inode; +printk("returning!\n"); + return; } -static struct super_operations btrfs_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, -}; +static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, + ino_t *ino) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct btrfs_dir_item *di; + struct btrfs_path path; + struct btrfs_root *root = btrfs_sb(dir->i_sb); + int ret; + + btrfs_init_path(&path); + ret = 
btrfs_lookup_dir_item(NULL, root, &path, dir->i_ino, name, + namelen, 0); + if (ret) { + *ino = 0; + goto out; + } + di = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0], + struct btrfs_dir_item); + *ino = btrfs_dir_objectid(di); +out: + btrfs_release_path(root, &path); + return ret; +} + +static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode * inode; + ino_t ino; + int ret; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ret = btrfs_inode_by_name(dir, dentry, &ino); + if (ret < 0) + return ERR_PTR(ret); + inode = NULL; + if (ino) { +printk("lookup on %.*s returns %lu\n", dentry->d_name.len, dentry->d_name.name, ino); + inode = iget(dir->i_sb, ino); + if (!inode) + return ERR_PTR(-EACCES); + } + return d_splice_alias(inode, dentry); +} + +static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct btrfs_root *root = btrfs_sb(inode->i_sb); + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_path path; + int ret; + u32 nritems; + struct btrfs_leaf *leaf; + int slot; + int advance; + unsigned char d_type = DT_UNKNOWN; + int over; + + key.objectid = inode->i_ino; +printk("readdir on dir %Lu pos %Lu\n", key.objectid, filp->f_pos); + key.flags = 0; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = filp->f_pos; + btrfs_init_path(&path); + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) { + goto err; + } +printk("first ret %d\n", ret); + advance = filp->f_pos > 0 && ret != 0; + while(1) { + leaf = btrfs_buffer_leaf(path.nodes[0]); + nritems = btrfs_header_nritems(&leaf->header); + slot = path.slots[0]; +printk("leaf %Lu nritems %lu slot %d\n", path.nodes[0]->b_blocknr, nritems, slot); + if (advance) { +printk("advancing!\n"); + if (slot == nritems -1) { + ret = btrfs_next_leaf(root, &path); + if (ret) 
+ break; + leaf = btrfs_buffer_leaf(path.nodes[0]); + nritems = btrfs_header_nritems(&leaf->header); + slot = path.slots[0]; +printk("2leaf %Lu nritems %lu slot %d\n", path.nodes[0]->b_blocknr, nritems, slot); + } else { + slot++; + path.slots[0]++; + } + } + advance = 1; + item = leaf->items + slot; +printk("item key %Lu %u %Lu\n", btrfs_disk_key_objectid(&item->key), + btrfs_disk_key_flags(&item->key), btrfs_disk_key_offset(&item->key)); + if (btrfs_disk_key_objectid(&item->key) != key.objectid) + break; + if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_ITEM_KEY) + continue; + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); +printk("filldir name %.*s, objectid %Lu\n", btrfs_dir_name_len(di), + (const char *)(di + 1), btrfs_dir_objectid(di)); + over = filldir(dirent, (const char *)(di + 1), + btrfs_dir_name_len(di), + btrfs_disk_key_offset(&item->key), + btrfs_dir_objectid(di), d_type); + if (over) + break; + filp->f_pos = btrfs_disk_key_offset(&item->key) + 1; + } +printk("filldir all done\n"); + ret = 0; +err: + btrfs_release_path(root, &path); + return ret; +} + +static void btrfs_put_super (struct super_block * sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = close_ctree(root); + if (ret) { + printk("close ctree returns %d\n", ret); + } + sb->s_fs_info = NULL; +} static int btrfs_fill_super(struct super_block * sb, void * data, int silent) { struct inode * inode; - struct dentry * root; + struct dentry * root_dentry; + struct btrfs_super_block *disk_super; + struct buffer_head *bh; + struct btrfs_root *root; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = BTRFS_SUPER_MAGIC; - sb->s_op = &btrfs_ops; + sb->s_op = &btrfs_super_ops; sb->s_time_gran = 1; - inode = btrfs_get_inode(sb, S_IFDIR | 0755, 0); + + bh = sb_bread(sb, BTRFS_SUPER_INFO_OFFSET / sb->s_blocksize); + if (!bh) { + printk("btrfs: unable to read on disk super\n"); + return -EIO; + } + 
disk_super = (struct btrfs_super_block *)bh->b_data; + root = open_ctree(sb, bh, disk_super); + sb->s_fs_info = root; + if (!root) { + printk("btrfs: open_ctree failed\n"); + return -EIO; + } + printk("read in super total blocks %Lu root %Lu\n", + btrfs_super_total_blocks(disk_super), + btrfs_super_root_dir(disk_super)); + + inode = iget_locked(sb, btrfs_super_root_dir(disk_super)); if (!inode) return -ENOMEM; + if (inode->i_state & I_NEW) { + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } - root = d_alloc_root(inode); - if (!root) { + root_dentry = d_alloc_root(inode); + if (!root_dentry) { iput(inode); return -ENOMEM; } - sb->s_root = root; + sb->s_root = root_dentry; + return 0; } @@ -187,6 +390,24 @@ static struct file_system_type btrfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; +static struct super_operations btrfs_super_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .put_super = btrfs_put_super, + .read_inode = btrfs_read_locked_inode, +}; + +static struct inode_operations btrfs_dir_inode_operations = { + .lookup = btrfs_lookup, +}; + +static struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = btrfs_readdir, +}; + + static int __init init_btrfs_fs(void) { printk("btrfs loaded!\n"); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 3adb6e69fe43..9ab27b7491c5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -10,7 +10,7 @@ struct btrfs_trans_handle { static inline struct btrfs_trans_handle * btrfs_start_transaction(struct btrfs_root *root, int num_blocks) { - struct btrfs_trans_handle *h = malloc(sizeof(*h)); + struct btrfs_trans_handle *h = kmalloc(sizeof(*h), GFP_NOFS); h->transid = root->root_key.offset; h->blocks_reserved = num_blocks; h->blocks_used = 0; @@ -21,7 +21,7 @@ static inline void btrfs_free_transaction(struct btrfs_root *root, struct btrfs_trans_handle *handle) { memset(handle, 0, 
sizeof(*handle)); - free(handle); + kfree(handle); } #endif -- cgit v1.2.3 From 79154b1b5bcf87903db7ff16a30b360b78d6fe4f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 22 Mar 2007 15:59:16 -0400 Subject: Btrfs: transaction rework Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 4 +- fs/btrfs/disk-io.c | 115 +++------------------------- fs/btrfs/disk-io.h | 8 +- fs/btrfs/super.c | 18 ----- fs/btrfs/transaction.c | 199 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/transaction.h | 35 +++++---- 7 files changed, 235 insertions(+), 146 deletions(-) create mode 100644 fs/btrfs/transaction.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 1a0fb7422393..be7d74cdca04 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -3,7 +3,7 @@ ifneq ($(KERNELRELEASE),) obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ - hash.o file-item.o inode-item.o inode-map.o disk-io.o + hash.o file-item.o inode-item.o inode-map.o disk-io.o transaction.o #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o dir-item.o hash.o file-item.o inode-item.o \ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7748eecd9304..4c19a3f12afd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -5,6 +5,7 @@ #include struct btrfs_trans_handle; +struct btrfs_transaction; #define BTRFS_MAGIC "_BtRfS_M" @@ -224,10 +225,11 @@ struct btrfs_fs_info { u64 last_inode_alloc; u64 last_inode_alloc_dirid; u64 generation; - struct btrfs_trans_handle *running_transaction; + struct btrfs_transaction *running_transaction; struct btrfs_super_block *disk_super; struct buffer_head *sb_buffer; struct super_block *sb; + struct mutex trans_mutex; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index df2061a735cd..9cacca0c525c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,89 +50,6 @@ int clean_tree_block(struct btrfs_trans_handle 
*trans, struct btrfs_root *root, return 0; } -int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct buffer_head *buf) -{ - mark_buffer_dirty(buf); - return 0; -} - -static int __commit_transaction(struct btrfs_trans_handle *trans, struct - btrfs_root *root) -{ - filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping); - return 0; -} - -static int commit_tree_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - int ret; - u64 old_extent_block; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *extent_root = fs_info->extent_root; - struct btrfs_root *inode_root = fs_info->inode_root; - - btrfs_set_root_blocknr(&inode_root->root_item, - inode_root->node->b_blocknr); - ret = btrfs_update_root(trans, tree_root, - &inode_root->root_key, - &inode_root->root_item); - BUG_ON(ret); - while(1) { - old_extent_block = btrfs_root_blocknr(&extent_root->root_item); - if (old_extent_block == extent_root->node->b_blocknr) - break; - btrfs_set_root_blocknr(&extent_root->root_item, - extent_root->node->b_blocknr); - ret = btrfs_update_root(trans, tree_root, - &extent_root->root_key, - &extent_root->root_item); - BUG_ON(ret); - } - return 0; -} - -int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct - btrfs_root *root, struct btrfs_super_block *s) -{ - int ret = 0; - struct buffer_head *snap = root->commit_root; - struct btrfs_key snap_key; - - if (root->commit_root == root->node) - return 0; - - memcpy(&snap_key, &root->root_key, sizeof(snap_key)); - root->root_key.offset++; - - btrfs_set_root_blocknr(&root->root_item, root->node->b_blocknr); - ret = btrfs_insert_root(trans, root->fs_info->tree_root, - &root->root_key, &root->root_item); - BUG_ON(ret); - - ret = commit_tree_roots(trans, root->fs_info); - BUG_ON(ret); - - ret = __commit_transaction(trans, root); - BUG_ON(ret); - - write_ctree_super(trans, root, s); - btrfs_finish_extent_commit(trans, 
root->fs_info->extent_root); - btrfs_finish_extent_commit(trans, root->fs_info->tree_root); - - root->commit_root = root->node; - get_bh(root->node); - ret = btrfs_drop_snapshot(trans, root, snap); - BUG_ON(ret); - - ret = btrfs_del_root(trans, root->fs_info->tree_root, &snap_key); - BUG_ON(ret); - root->fs_info->generation = root->root_key.offset + 1; - - return ret; -} - static int __setup_root(struct btrfs_super_block *super, struct btrfs_root *root, struct btrfs_fs_info *fs_info, @@ -197,6 +114,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->disk_super = disk_super; fs_info->sb_buffer = sb_buffer; fs_info->sb = sb; + mutex_init(&fs_info->trans_mutex); memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert)); memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert)); @@ -225,7 +143,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, } int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_super_block *s) + *root) { return 0; #if 0 @@ -242,34 +160,19 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root #endif } -static int drop_cache(struct btrfs_root *root) -{ - return 0; -#if 0 - while(!list_empty(&root->fs_info->cache)) { - struct buffer_head *b = list_entry(root->fs_info->cache.next, - struct buffer_head, - cache); - list_del_init(&b->cache); - btrfs_block_release(root, b); - } - return 0; -#endif -} - int close_ctree(struct btrfs_root *root) { int ret; struct btrfs_trans_handle *trans; - trans = root->fs_info->running_transaction; - btrfs_commit_transaction(trans, root, root->fs_info->disk_super); - ret = commit_tree_roots(trans, root->fs_info); - BUG_ON(ret); - ret = __commit_transaction(trans, root); + trans = btrfs_start_transaction(root, 1); + btrfs_commit_transaction(trans, root); + /* run commit again to drop the original snapshot */ + trans = btrfs_start_transaction(root, 1); + btrfs_commit_transaction(trans, root); + ret = 
btrfs_write_and_wait_transaction(NULL, root); BUG_ON(ret); - write_ctree_super(trans, root, root->fs_info->disk_super); - drop_cache(root); + write_ctree_super(NULL, root); if (root->node) btrfs_block_release(root, root->node); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 7f4bb729b734..099f7eea0ec7 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -28,14 +28,14 @@ int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct buffer_head *buf); int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct buffer_head *buf); -int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_super_block *s); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); struct btrfs_root *open_ctree(struct super_block *sb, struct buffer_head *sb_buffer, struct btrfs_super_block *disk_super); int close_ctree(struct btrfs_root *root); void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf); -int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_super_block *s); +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int mkfs(int fd, u64 num_blocks, u32 blocksize); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ccc056aad692..620430825840 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -141,7 +141,6 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_inode_item *inode_item; struct btrfs_root *root = btrfs_sb(inode->i_sb); int ret; -printk("read locked inode %lu\n", inode->i_ino); btrfs_init_path(&path); ret = btrfs_lookup_inode(NULL, root, &path, inode->i_ino, 0); if (ret) { @@ -152,7 +151,6 @@ printk("read locked inode %lu\n", inode->i_ino); path.slots[0], struct btrfs_inode_item); -printk("found locked inode %lu\n", inode->i_ino); inode->i_mode = btrfs_inode_mode(inode_item); inode->i_nlink = 
btrfs_inode_nlink(inode_item); inode->i_uid = btrfs_inode_uid(inode_item); @@ -166,7 +164,6 @@ printk("found locked inode %lu\n", inode->i_ino); inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime); inode->i_blocks = btrfs_inode_nblocks(inode_item); inode->i_generation = btrfs_inode_generation(inode_item); -printk("about to release\n"); btrfs_release_path(root, &path); switch (inode->i_mode & S_IFMT) { #if 0 @@ -176,19 +173,15 @@ printk("about to release\n"); break; #endif case S_IFREG: -printk("inode %lu now a file\n", inode->i_ino); break; case S_IFDIR: -printk("inode %lu now a directory\n", inode->i_ino); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; break; case S_IFLNK: -printk("inode %lu now a link\n", inode->i_ino); // inode->i_op = &page_symlink_inode_operations; break; } -printk("returning!\n"); return; } @@ -232,7 +225,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(ret); inode = NULL; if (ino) { -printk("lookup on %.*s returns %lu\n", dentry->d_name.len, dentry->d_name.name, ino); inode = iget(dir->i_sb, ino); if (!inode) return ERR_PTR(-EACCES); @@ -257,7 +249,6 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir) int over; key.objectid = inode->i_ino; -printk("readdir on dir %Lu pos %Lu\n", key.objectid, filp->f_pos); key.flags = 0; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); key.offset = filp->f_pos; @@ -266,15 +257,12 @@ printk("readdir on dir %Lu pos %Lu\n", key.objectid, filp->f_pos); if (ret < 0) { goto err; } -printk("first ret %d\n", ret); advance = filp->f_pos > 0 && ret != 0; while(1) { leaf = btrfs_buffer_leaf(path.nodes[0]); nritems = btrfs_header_nritems(&leaf->header); slot = path.slots[0]; -printk("leaf %Lu nritems %lu slot %d\n", path.nodes[0]->b_blocknr, nritems, slot); if (advance) { -printk("advancing!\n"); if (slot == nritems -1) { ret = btrfs_next_leaf(root, &path); if (ret) @@ -282,7 +270,6 @@ 
printk("advancing!\n"); leaf = btrfs_buffer_leaf(path.nodes[0]); nritems = btrfs_header_nritems(&leaf->header); slot = path.slots[0]; -printk("2leaf %Lu nritems %lu slot %d\n", path.nodes[0]->b_blocknr, nritems, slot); } else { slot++; path.slots[0]++; @@ -290,15 +277,11 @@ printk("2leaf %Lu nritems %lu slot %d\n", path.nodes[0]->b_blocknr, nritems, slo } advance = 1; item = leaf->items + slot; -printk("item key %Lu %u %Lu\n", btrfs_disk_key_objectid(&item->key), - btrfs_disk_key_flags(&item->key), btrfs_disk_key_offset(&item->key)); if (btrfs_disk_key_objectid(&item->key) != key.objectid) break; if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_ITEM_KEY) continue; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); -printk("filldir name %.*s, objectid %Lu\n", btrfs_dir_name_len(di), - (const char *)(di + 1), btrfs_dir_objectid(di)); over = filldir(dirent, (const char *)(di + 1), btrfs_dir_name_len(di), btrfs_disk_key_offset(&item->key), @@ -307,7 +290,6 @@ printk("filldir name %.*s, objectid %Lu\n", btrfs_dir_name_len(di), break; filp->f_pos = btrfs_disk_key_offset(&item->key) + 1; } -printk("filldir all done\n"); ret = 0; err: btrfs_release_path(root, &path); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c new file mode 100644 index 000000000000..8dc1c170f10f --- /dev/null +++ b/fs/btrfs/transaction.c @@ -0,0 +1,199 @@ +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + + +static void put_transaction(struct btrfs_transaction *transaction) +{ + transaction->use_count--; + if (transaction->use_count == 0) + kfree(transaction); +} + +static int join_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + cur_trans = root->fs_info->running_transaction; + if (!cur_trans) { + cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS); + BUG_ON(!cur_trans); + root->fs_info->running_transaction = cur_trans; + cur_trans->num_writers = 0; + cur_trans->transid = root->root_key.offset + 1; + 
init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->in_commit = 0; + cur_trans->use_count = 0; + cur_trans->commit_done = 0; + } + cur_trans->num_writers++; + return 0; +} + +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks) +{ + struct btrfs_trans_handle *h = kmalloc(sizeof(*h), GFP_NOFS); + int ret; + + mutex_lock(&root->fs_info->trans_mutex); + ret = join_transaction(root); + BUG_ON(ret); + h->transid = root->fs_info->running_transaction->transid; + h->transaction = root->fs_info->running_transaction; + h->blocks_reserved = num_blocks; + h->blocks_used = 0; + root->fs_info->running_transaction->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + return h; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + mutex_lock(&root->fs_info->trans_mutex); + cur_trans = root->fs_info->running_transaction; + WARN_ON(cur_trans->num_writers <= 1); + if (waitqueue_active(&cur_trans->writer_wait)) + wake_up(&cur_trans->writer_wait); + cur_trans->num_writers--; + put_transaction(cur_trans); + mutex_unlock(&root->fs_info->trans_mutex); + kfree(trans); + return 0; +} + + +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping); + return 0; +} + +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + u64 old_extent_block; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *inode_root = fs_info->inode_root; + + btrfs_set_root_blocknr(&inode_root->root_item, + inode_root->node->b_blocknr); + ret = btrfs_update_root(trans, tree_root, + &inode_root->root_key, + &inode_root->root_item); + BUG_ON(ret); + 
while(1) { + old_extent_block = btrfs_root_blocknr(&extent_root->root_item); + if (old_extent_block == extent_root->node->b_blocknr) + break; + btrfs_set_root_blocknr(&extent_root->root_item, + extent_root->node->b_blocknr); + ret = btrfs_update_root(trans, tree_root, + &extent_root->root_key, + &extent_root->root_item); + BUG_ON(ret); + } + return 0; +} + +static int wait_for_commit(struct btrfs_root *root, + struct btrfs_transaction *commit) +{ + DEFINE_WAIT(wait); + commit->use_count++; + while(!commit->commit_done) { + prepare_to_wait(&commit->commit_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (commit->commit_done) + break; + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + } + finish_wait(&commit->commit_wait, &wait); + return 0; +} + +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret = 0; + struct buffer_head *snap = root->commit_root; + struct btrfs_key snap_key; + struct btrfs_transaction *cur_trans; + DEFINE_WAIT(wait); + + mutex_lock(&root->fs_info->trans_mutex); + if (trans->transaction->in_commit) { + cur_trans = trans->transaction; + trans->transaction->use_count++; + btrfs_end_transaction(trans, root); + ret = wait_for_commit(root, cur_trans); + BUG_ON(ret); + put_transaction(cur_trans); + mutex_unlock(&root->fs_info->trans_mutex); + return 0; + } + while (trans->transaction->num_writers > 1) { + prepare_to_wait(&trans->transaction->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (trans->transaction->num_writers <= 1) + break; + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + } + finish_wait(&trans->transaction->writer_wait, &wait); + + cur_trans = root->fs_info->running_transaction; + root->fs_info->running_transaction = NULL; + mutex_unlock(&root->fs_info->trans_mutex); + + memcpy(&snap_key, &root->root_key, sizeof(snap_key)); + root->root_key.offset++; + + if 
(btrfs_root_blocknr(&root->root_item) != root->node->b_blocknr) { + btrfs_set_root_blocknr(&root->root_item, root->node->b_blocknr); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + BUG_ON(ret); + } + + ret = btrfs_commit_tree_roots(trans, root); + BUG_ON(ret); + + ret = btrfs_write_and_wait_transaction(trans, root); + BUG_ON(ret); + + write_ctree_super(trans, root); + btrfs_finish_extent_commit(trans, root->fs_info->extent_root); + btrfs_finish_extent_commit(trans, root->fs_info->tree_root); + put_transaction(cur_trans); + kfree(trans); + + if (root->node != root->commit_root) { + trans = btrfs_start_transaction(root, 1); + root->commit_root = root->node; + get_bh(root->node); + ret = btrfs_drop_snapshot(trans, root, snap); + BUG_ON(ret); + + ret = btrfs_del_root(trans, root->fs_info->tree_root, + &snap_key); + BUG_ON(ret); + root->fs_info->generation = root->root_key.offset + 1; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + } + + return ret; +} + diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 9ab27b7491c5..7e6c08a0accd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -1,27 +1,30 @@ #ifndef __TRANSACTION__ #define __TRANSACTION__ +struct btrfs_transaction { + u64 transid; + unsigned long num_writers; + int in_commit; + int use_count; + int commit_done; + wait_queue_head_t writer_wait; + wait_queue_head_t commit_wait; +}; + struct btrfs_trans_handle { u64 transid; unsigned long blocks_reserved; unsigned long blocks_used; + struct btrfs_transaction *transaction; }; -static inline struct btrfs_trans_handle * -btrfs_start_transaction(struct btrfs_root *root, int num_blocks) -{ - struct btrfs_trans_handle *h = kmalloc(sizeof(*h), GFP_NOFS); - h->transid = root->root_key.offset; - h->blocks_reserved = num_blocks; - h->blocks_used = 0; - return h; -} - -static inline void btrfs_free_transaction(struct btrfs_root *root, - struct btrfs_trans_handle *handle) -{ - 
memset(handle, 0, sizeof(*handle)); - kfree(handle); -} +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks); +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root); #endif -- cgit v1.2.3 From 8ef97622caa2d5f78d1dc58ab918e2fbfa9b357a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 26 Mar 2007 10:15:30 -0400 Subject: Btrfs: add a radix back bit tree Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 3 +- fs/btrfs/bit-radix.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/bit-radix.h | 15 +++++++ fs/btrfs/ctree.h | 3 +- fs/btrfs/disk-io.c | 3 +- fs/btrfs/extent-tree.c | 93 ++++++++++++++++++------------------------ 6 files changed, 167 insertions(+), 57 deletions(-) create mode 100644 fs/btrfs/bit-radix.c create mode 100644 fs/btrfs/bit-radix.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index be7d74cdca04..5346f706b2c5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -3,7 +3,8 @@ ifneq ($(KERNELRELEASE),) obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ - hash.o file-item.o inode-item.o inode-map.o disk-io.o transaction.o + hash.o file-item.o inode-item.o inode-map.o disk-io.o \ + transaction.o bit-radix.o #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o dir-item.o hash.o file-item.o inode-item.o \ diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c new file mode 100644 index 000000000000..43807cc3cce7 --- /dev/null +++ b/fs/btrfs/bit-radix.c @@ -0,0 +1,107 @@ +#include +#include "bit-radix.h" + +#define BIT_ARRAY_BYTES 256 +#define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8) + +int set_radix_bit(struct 
radix_tree_root *radix, unsigned long bit) +{ + unsigned long *bits; + unsigned long slot; + int bit_slot; + int ret; + + slot = bit / BIT_RADIX_BITS_PER_ARRAY; + bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; + + bits = radix_tree_lookup(radix, slot); + if (!bits) { + bits = kmalloc(BIT_ARRAY_BYTES, GFP_NOIO); + if (!bits) + return -ENOMEM; + memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long)); + bits[0] = slot; + ret = radix_tree_insert(radix, slot, bits); + if (ret) + return ret; + } + set_bit(bit_slot, bits + 1); + return 0; +} + +int test_radix_bit(struct radix_tree_root *radix, unsigned long bit) +{ + unsigned long *bits; + unsigned long slot; + int bit_slot; + + slot = bit / BIT_RADIX_BITS_PER_ARRAY; + bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; + + bits = radix_tree_lookup(radix, slot); + if (!bits) + return 0; + return test_bit(bit_slot, bits + 1); +} + +int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit) +{ + unsigned long *bits; + unsigned long slot; + int bit_slot; + int i; + int empty = 1; + + slot = bit / BIT_RADIX_BITS_PER_ARRAY; + bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; + + bits = radix_tree_lookup(radix, slot); + if (!bits) + return 0; + clear_bit(bit_slot, bits + 1); + + for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) { + if (bits[i]) { + empty = 0; + break; + } + } + + if (empty) { + bits = radix_tree_delete(radix, slot); + BUG_ON(!bits); + } + return 0; +} + +int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits, + int nr) +{ + unsigned long *bits; + unsigned long *gang[4]; + int found; + int ret; + int i; + int total_found = 0; + + ret = radix_tree_gang_lookup(radix, (void *)&gang, 0, ARRAY_SIZE(gang)); + for (i = 0; i < ret && nr > 0; i++) { + found = 0; + bits = gang[i]; + while(nr > 0) { + found = find_next_bit(bits + 1, + BIT_RADIX_BITS_PER_ARRAY, + found); + if (found < BIT_RADIX_BITS_PER_ARRAY) { + *retbits = bits[0] * + BIT_RADIX_BITS_PER_ARRAY + found; + retbits++; + nr--; 
+ total_found++; + found++; + } else + break; + } + } + return total_found; +} diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h new file mode 100644 index 000000000000..56aad4c7d7f7 --- /dev/null +++ b/fs/btrfs/bit-radix.h @@ -0,0 +1,15 @@ +#ifndef __BIT_RADIX__ +#define __BIT_RADIX__ +#include + +int set_radix_bit(struct radix_tree_root *radix, unsigned long bit); +int test_radix_bit(struct radix_tree_root *radix, unsigned long bit); +int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit); +int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits, + int nr); + +static inline void init_bit_radix(struct radix_tree_root *radix) +{ + INIT_RADIX_TREE(radix, GFP_NOFS); +} +#endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 68cafae6a850..0aa1052d9f67 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1,8 +1,8 @@ #ifndef __BTRFS__ #define __BTRFS__ -#include #include +#include "bit-radix.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -222,6 +222,7 @@ struct btrfs_fs_info { struct btrfs_root *inode_root; struct btrfs_key current_insert; struct btrfs_key last_insert; + struct radix_tree_root pending_del_radix; struct radix_tree_root pinned_radix; u64 last_inode_alloc; u64 last_inode_alloc_dirid; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e32ddff55b0e..758a62aba063 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -104,7 +104,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, /* FIXME: don't be stupid */ if (!btrfs_super_root(disk_super)) return NULL; - INIT_RADIX_TREE(&fs_info->pinned_radix, GFP_KERNEL); + init_bit_radix(&fs_info->pinned_radix); + init_bit_radix(&fs_info->pending_del_radix); fs_info->running_transaction = NULL; fs_info->fs_root = root; fs_info->tree_root = tree_root; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 369b960fce45..b14104276eea 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1,5 +1,4 @@ #include 
-#include #include "ctree.h" #include "disk-io.h" #include "print-tree.h" @@ -12,15 +11,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); static int del_pending_extents(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); -/* - * pending extents are blocks that we're trying to allocate in the extent - * map while trying to grow the map because of other allocations. To avoid - * recursing, they are tagged in the radix tree and cleaned up after - * other allocations are done. The pending tag is also used in the same - * manner for deletes. - */ -#define CTREE_EXTENT_PENDING_DEL 0 -#define CTREE_EXTENT_PINNED 1 static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 blocknr) @@ -104,24 +94,21 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct buffer_head *gang[8]; + unsigned long gang[8]; u64 first = 0; int ret; int i; + struct radix_tree_root *pinned_radix = &root->fs_info->pinned_radix; while(1) { - ret = radix_tree_gang_lookup_tag(&root->fs_info->pinned_radix, - (void **)gang, 0, - ARRAY_SIZE(gang), - CTREE_EXTENT_PINNED); + ret = find_first_radix_bit(pinned_radix, gang, + ARRAY_SIZE(gang)); if (!ret) break; if (!first) - first = gang[0]->b_blocknr; + first = gang[0]; for (i = 0; i < ret; i++) { - radix_tree_delete(&root->fs_info->pinned_radix, - gang[i]->b_blocknr); - brelse(gang[i]); + clear_radix_bit(pinned_radix, gang[i]); } } if (root->fs_info->last_insert.objectid > first) @@ -161,29 +148,27 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct return 0; } -static int pin_down_block(struct btrfs_root *root, u64 blocknr, int tag) +static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending) { int err; - struct buffer_head *bh = sb_getblk(root->fs_info->sb, blocknr); struct btrfs_header *header; 
- BUG_ON(!bh); - - header = btrfs_buffer_header(bh); - if (btrfs_header_generation(header) == - root->fs_info->running_transaction->transid) { - return 0; - } - - err = radix_tree_insert(&root->fs_info->pinned_radix, - blocknr, bh); - if (err && err != -EEXIST) { - BUG(); - return err; - } - if (err == -EEXIST) + struct buffer_head *bh; + + bh = sb_find_get_block(root->fs_info->sb, blocknr); + if (bh) { + header = btrfs_buffer_header(bh); + if (btrfs_header_generation(header) == + root->fs_info->running_transaction->transid) { + brelse(bh); + return 0; + } brelse(bh); - radix_tree_tag_set(&root->fs_info->pinned_radix, blocknr, - tag); + } + if (pending) + err = set_radix_bit(&root->fs_info->pending_del_radix, blocknr); + else + err = set_radix_bit(&root->fs_info->pinned_radix, blocknr); + BUG_ON(err); return 0; } @@ -225,8 +210,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root u64 super_blocks_used; if (pin) { - ret = pin_down_block(root, blocknr, - CTREE_EXTENT_PINNED); + ret = pin_down_block(root, blocknr, 0); BUG_ON(ret); } @@ -255,25 +239,26 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct int ret; int wret; int err = 0; - struct buffer_head *gang[4]; + unsigned long gang[4]; int i; - struct radix_tree_root *radix = &extent_root->fs_info->pinned_radix; + struct radix_tree_root *pending_radix; + struct radix_tree_root *pinned_radix; + + pending_radix = &extent_root->fs_info->pending_del_radix; + pinned_radix = &extent_root->fs_info->pinned_radix; while(1) { - ret = radix_tree_gang_lookup_tag( - &extent_root->fs_info->pinned_radix, - (void **)gang, 0, - ARRAY_SIZE(gang), - CTREE_EXTENT_PENDING_DEL); + ret = find_first_radix_bit(pending_radix, gang, + ARRAY_SIZE(gang)); if (!ret) break; for (i = 0; i < ret; i++) { - radix_tree_tag_set(radix, gang[i]->b_blocknr, - CTREE_EXTENT_PINNED); - radix_tree_tag_clear(radix, gang[i]->b_blocknr, - CTREE_EXTENT_PENDING_DEL); + wret = set_radix_bit(pinned_radix, 
gang[i]); + BUG_ON(wret); + wret = clear_radix_bit(pending_radix, gang[i]); + BUG_ON(wret); wret = __free_extent(trans, extent_root, - gang[i]->b_blocknr, 1, 0); + gang[i], 1, 0); if (wret) err = wret; } @@ -294,7 +279,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root if (root == extent_root) { t = find_tree_block(root, blocknr); - pin_down_block(root, blocknr, CTREE_EXTENT_PENDING_DEL); + pin_down_block(root, blocknr, 1); return 0; } ret = __free_extent(trans, root, blocknr, num_blocks, pin); @@ -393,7 +378,7 @@ check_pending: BUG_ON(ins->objectid < search_start); for (test_block = ins->objectid; test_block < ins->objectid + total_needed; test_block++) { - if (radix_tree_lookup(&root->fs_info->pinned_radix, + if (test_radix_bit(&root->fs_info->pinned_radix, test_block)) { search_start = test_block + 1; goto check_failed; -- cgit v1.2.3 From 39279cc3d2704cfbf9c35dcb5bdd392159ae4625 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 12 Jun 2007 06:35:45 -0400 Subject: Btrfs: split up super.c Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 4 +- fs/btrfs/TODO | 1 + fs/btrfs/ctree.h | 25 +- fs/btrfs/disk-io.c | 46 +- fs/btrfs/file.c | 783 ++++++++++++++ fs/btrfs/inode.c | 2210 ++++++++++++++++++++++++++++++++++++++ fs/btrfs/super.c | 2979 ++-------------------------------------------------- 7 files changed, 3120 insertions(+), 2928 deletions(-) create mode 100644 fs/btrfs/file.c create mode 100644 fs/btrfs/inode.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 5346f706b2c5..5af2426116d3 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),) obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ - transaction.o bit-radix.o + transaction.o bit-radix.o inode.o file.o #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o 
dir-item.o hash.o file-item.o inode-item.o \ @@ -16,7 +16,7 @@ else KERNELDIR := /lib/modules/`uname -r`/build all:: - $(MAKE) C=1 -C $(KERNELDIR) M=`pwd` modules + $(MAKE) -C $(KERNELDIR) M=`pwd` modules clean:: rm *.o btrfs.ko endif diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO index d9b6d38c603a..4a48a2365d57 100644 --- a/fs/btrfs/TODO +++ b/fs/btrfs/TODO @@ -1,3 +1,4 @@ +* fix printk warnings * cleanup, add more error checking, get rid of BUG_ONs * Fix ENOSPC handling * Make allocator smarter diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4e136b7b03ae..24410d42e3ce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1123,5 +1123,28 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, int btrfs_csum_truncate(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 isize); -/* super.c */ +/* inode.c */ +void btrfs_delete_inode(struct inode *inode); +void btrfs_read_locked_inode(struct inode *inode); +int btrfs_write_inode(struct inode *inode, int wait); +void btrfs_dirty_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +int btrfs_init_cachep(void); +void btrfs_destroy_cachep(void); +int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, + unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root); +int btrfs_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to); +int btrfs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *result, int create); +/* file.c */ +extern struct file_operations btrfs_file_operations; +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 *hint_block); #endif diff --git a/fs/btrfs/disk-io.c 
b/fs/btrfs/disk-io.c index 8a88404525e8..96bf3ef3a798 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -22,7 +22,7 @@ static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf) if (bh_blocknr(buf) != btrfs_header_blocknr(&node->header)) { printk(KERN_CRIT "bh_blocknr(buf) is %Lu, header is %Lu\n", bh_blocknr(buf), btrfs_header_blocknr(&node->header)); - BUG(); + return 1; } return 0; } @@ -253,7 +253,7 @@ uptodate: set_buffer_checked(bh); } if (check_tree_block(root, bh)) - BUG(); + goto fail; return bh; fail: brelse(bh); @@ -398,8 +398,13 @@ struct btrfs_root *open_ctree(struct super_block *sb) struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info), GFP_NOFS); int ret; + int err = -EIO; struct btrfs_super_block *disk_super; + if (!extent_root || !tree_root || !fs_info) { + err = -ENOMEM; + goto fail; + } init_bit_radix(&fs_info->pinned_radix); init_bit_radix(&fs_info->pending_del_radix); init_bit_radix(&fs_info->extent_map_radix); @@ -431,9 +436,11 @@ struct btrfs_root *open_ctree(struct super_block *sb) mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->hash_tfm = crypto_alloc_hash("crc32c", 0, CRYPTO_ALG_ASYNC); spin_lock_init(&fs_info->hash_lock); + if (!fs_info->hash_tfm || IS_ERR(fs_info->hash_tfm)) { - printk("failed to allocate digest hash\n"); - return NULL; + printk("btrfs: failed hash setup, modprobe cryptomgr?\n"); + err = -ENOMEM; + goto fail_iput; } mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->fs_mutex); @@ -446,30 +453,53 @@ struct btrfs_root *open_ctree(struct super_block *sb) sb->s_blocksize); if (!fs_info->sb_buffer) - return NULL; + goto fail_iput; disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data; + if (!btrfs_super_root(disk_super)) - return NULL; + goto fail_sb_buffer; i_size_write(fs_info->btree_inode, btrfs_super_total_blocks(disk_super) << fs_info->btree_inode->i_blkbits); fs_info->disk_super = disk_super; + + if (strncmp((char *)(&disk_super->magic), 
BTRFS_MAGIC, + sizeof(disk_super->magic))) { + printk("btrfs: valid FS not found on %s\n", sb->s_id); + goto fail_sb_buffer; + } tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super)); - BUG_ON(!tree_root->node); + if (!tree_root->node) + goto fail_sb_buffer; mutex_lock(&fs_info->fs_mutex); ret = find_and_setup_root(sb->s_blocksize, tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); - BUG_ON(ret); + if (ret) { + mutex_unlock(&fs_info->fs_mutex); + goto fail_tree_root; + } btrfs_read_block_groups(extent_root); fs_info->generation = btrfs_super_generation(disk_super) + 1; mutex_unlock(&fs_info->fs_mutex); return tree_root; + +fail_tree_root: + btrfs_block_release(tree_root, tree_root->node); +fail_sb_buffer: + btrfs_block_release(tree_root, fs_info->sb_buffer); +fail_iput: + iput(fs_info->btree_inode); +fail: + kfree(extent_root); + kfree(tree_root); + kfree(fs_info); + return ERR_PTR(err); } int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c new file mode 100644 index 000000000000..7f8e3035d99e --- /dev/null +++ b/fs/btrfs/file.c @@ -0,0 +1,783 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" + + +static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, + struct page **prepared_pages, + const char __user * buf) +{ + long page_fault = 0; + int i; + int offset = pos & (PAGE_CACHE_SIZE - 1); + + for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + size_t count = min_t(size_t, + PAGE_CACHE_SIZE - offset, write_bytes); + struct page *page = prepared_pages[i]; + fault_in_pages_readable(buf, count); + + /* Copy data from userspace to the current page */ + kmap(page); + page_fault = 
__copy_from_user(page_address(page) + offset, + buf, count); + /* Flush processor's dcache for this page */ + flush_dcache_page(page); + kunmap(page); + buf += count; + write_bytes -= count; + + if (page_fault) + break; + } + return page_fault ? -EFAULT : 0; +} + +static void btrfs_drop_pages(struct page **pages, size_t num_pages) +{ + size_t i; + for (i = 0; i < num_pages; i++) { + if (!pages[i]) + break; + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } +} + +static int dirty_and_release_pages(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct file *file, + struct page **pages, + size_t num_pages, + loff_t pos, + size_t write_bytes) +{ + int i; + int offset; + int err = 0; + int ret; + int this_write; + struct inode *inode = file->f_path.dentry->d_inode; + struct buffer_head *bh; + struct btrfs_file_extent_item *ei; + + for (i = 0; i < num_pages; i++) { + offset = pos & (PAGE_CACHE_SIZE -1); + this_write = min(PAGE_CACHE_SIZE - offset, write_bytes); + /* FIXME, one block at a time */ + + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + bh = page_buffers(pages[i]); + + if (buffer_mapped(bh) && bh->b_blocknr == 0) { + struct btrfs_key key; + struct btrfs_path *path; + char *ptr; + u32 datasize; + + /* create an inline extent, and copy the data in */ + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = inode->i_ino; + key.offset = pages[i]->index << PAGE_CACHE_SHIFT; + key.flags = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + BUG_ON(write_bytes >= PAGE_CACHE_SIZE); + datasize = offset + + btrfs_file_extent_calc_inline_size(write_bytes); + + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(ret); + ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), + path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(ei, trans->transid); + 
btrfs_set_file_extent_type(ei, + BTRFS_FILE_EXTENT_INLINE); + ptr = btrfs_file_extent_inline_start(ei); + btrfs_memcpy(root, path->nodes[0]->b_data, + ptr, bh->b_data, offset + write_bytes); + mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + } else if (buffer_mapped(bh)) { + /* csum the file data */ + btrfs_csum_file_block(trans, root, inode->i_ino, + pages[i]->index << PAGE_CACHE_SHIFT, + kmap(pages[i]), PAGE_CACHE_SIZE); + kunmap(pages[i]); + } + SetPageChecked(pages[i]); + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + mutex_unlock(&root->fs_info->fs_mutex); + + ret = btrfs_commit_write(file, pages[i], offset, + offset + this_write); + pos += this_write; + if (ret) { + err = ret; + goto failed; + } + WARN_ON(this_write > write_bytes); + write_bytes -= this_write; + } +failed: + return err; +} + +/* + * this is very complex, but the basic idea is to drop all extents + * in the range start - end. hint_block is filled in with a block number + * that would be a good hint to the block allocator for this file. + * + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. 
+ */ +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 *hint_block) +{ + int ret; + struct btrfs_key key; + struct btrfs_leaf *leaf; + int slot; + struct btrfs_file_extent_item *extent; + u64 extent_end = 0; + int keep; + struct btrfs_file_extent_item old; + struct btrfs_path *path; + u64 search_start = start; + int bookend; + int found_type; + int found_extent; + int found_inline; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + while(1) { + btrfs_release_path(root, path); + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + search_start, -1); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) { + ret = 0; + goto out; + } + path->slots[0]--; + } + keep = 0; + bookend = 0; + found_extent = 0; + found_inline = 0; + extent = NULL; + leaf = btrfs_buffer_leaf(path->nodes[0]); + slot = path->slots[0]; + btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key); + if (key.offset >= end || key.objectid != inode->i_ino) { + ret = 0; + goto out; + } + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) { + ret = 0; + goto out; + } + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + extent_end = key.offset + + (btrfs_file_extent_num_blocks(extent) << + inode->i_blkbits); + found_extent = 1; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + found_inline = 1; + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf->items + slot); + } + + /* we found nothing we can drop */ + if (!found_extent && !found_inline) { + ret = 0; + goto out; + } + + /* we found nothing inside the range */ + if (search_start >= extent_end) { + ret = 0; + goto out; + } + + /* FIXME, there's only one inline extent allowed right now */ + if (found_inline) { + u64 mask = root->blocksize - 1; + search_start = (extent_end + mask) & ~mask; + } else + 
search_start = extent_end; + + if (end < extent_end && end >= key.offset) { + if (found_extent) { + u64 disk_blocknr = + btrfs_file_extent_disk_blocknr(extent); + u64 disk_num_blocks = + btrfs_file_extent_disk_num_blocks(extent); + memcpy(&old, extent, sizeof(old)); + if (disk_blocknr != 0) { + ret = btrfs_inc_extent_ref(trans, root, + disk_blocknr, disk_num_blocks); + BUG_ON(ret); + } + } + WARN_ON(found_inline); + bookend = 1; + } + + /* truncate existing extent */ + if (start > key.offset) { + u64 new_num; + u64 old_num; + keep = 1; + WARN_ON(start & (root->blocksize - 1)); + if (found_extent) { + new_num = (start - key.offset) >> + inode->i_blkbits; + old_num = btrfs_file_extent_num_blocks(extent); + *hint_block = + btrfs_file_extent_disk_blocknr(extent); + if (btrfs_file_extent_disk_blocknr(extent)) { + inode->i_blocks -= + (old_num - new_num) << 3; + } + btrfs_set_file_extent_num_blocks(extent, + new_num); + mark_buffer_dirty(path->nodes[0]); + } else { + WARN_ON(1); + } + } + /* delete the entire extent */ + if (!keep) { + u64 disk_blocknr = 0; + u64 disk_num_blocks = 0; + u64 extent_num_blocks = 0; + if (found_extent) { + disk_blocknr = + btrfs_file_extent_disk_blocknr(extent); + disk_num_blocks = + btrfs_file_extent_disk_num_blocks(extent); + extent_num_blocks = + btrfs_file_extent_num_blocks(extent); + *hint_block = + btrfs_file_extent_disk_blocknr(extent); + } + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + btrfs_release_path(root, path); + extent = NULL; + if (found_extent && disk_blocknr != 0) { + inode->i_blocks -= extent_num_blocks << 3; + ret = btrfs_free_extent(trans, root, + disk_blocknr, + disk_num_blocks, 0); + } + + BUG_ON(ret); + if (!bookend && search_start >= end) { + ret = 0; + goto out; + } + if (!bookend) + continue; + } + /* create bookend, splitting the extent in two */ + if (bookend && found_extent) { + struct btrfs_key ins; + ins.objectid = inode->i_ino; + ins.offset = end; + ins.flags = 0; + btrfs_set_key_type(&ins, 
BTRFS_EXTENT_DATA_KEY); + + btrfs_release_path(root, path); + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*extent)); + BUG_ON(ret); + extent = btrfs_item_ptr( + btrfs_buffer_leaf(path->nodes[0]), + path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_blocknr(extent, + btrfs_file_extent_disk_blocknr(&old)); + btrfs_set_file_extent_disk_num_blocks(extent, + btrfs_file_extent_disk_num_blocks(&old)); + + btrfs_set_file_extent_offset(extent, + btrfs_file_extent_offset(&old) + + ((end - key.offset) >> inode->i_blkbits)); + WARN_ON(btrfs_file_extent_num_blocks(&old) < + (extent_end - end) >> inode->i_blkbits); + btrfs_set_file_extent_num_blocks(extent, + (extent_end - end) >> inode->i_blkbits); + + btrfs_set_file_extent_type(extent, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(extent, + btrfs_file_extent_generation(&old)); + btrfs_mark_buffer_dirty(path->nodes[0]); + if (btrfs_file_extent_disk_blocknr(&old) != 0) { + inode->i_blocks += + btrfs_file_extent_num_blocks(extent) << 3; + } + ret = 0; + goto out; + } + } +out: + btrfs_free_path(path); + return ret; +} + +/* + * this gets pages into the page cache and locks them down + */ +static int prepare_pages(struct btrfs_root *root, + struct file *file, + struct page **pages, + size_t num_pages, + loff_t pos, + unsigned long first_index, + unsigned long last_index, + size_t write_bytes, + u64 alloc_extent_start) +{ + int i; + unsigned long index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = file->f_path.dentry->d_inode; + int offset; + int err = 0; + int this_write; + struct buffer_head *bh; + struct buffer_head *head; + loff_t isize = i_size_read(inode); + + memset(pages, 0, num_pages * sizeof(struct page *)); + + for (i = 0; i < num_pages; i++) { + pages[i] = grab_cache_page(inode->i_mapping, index + i); + if (!pages[i]) { + err = -ENOMEM; + goto failed_release; + } + cancel_dirty_page(pages[i], PAGE_CACHE_SIZE); + wait_on_page_writeback(pages[i]); + 
offset = pos & (PAGE_CACHE_SIZE -1); + this_write = min(PAGE_CACHE_SIZE - offset, write_bytes); + if (!page_has_buffers(pages[i])) { + create_empty_buffers(pages[i], + root->fs_info->sb->s_blocksize, + (1 << BH_Uptodate)); + } + head = page_buffers(pages[i]); + bh = head; + do { + err = btrfs_map_bh_to_logical(root, bh, + alloc_extent_start); + BUG_ON(err); + if (err) + goto failed_truncate; + bh = bh->b_this_page; + if (alloc_extent_start) + alloc_extent_start++; + } while (bh != head); + pos += this_write; + WARN_ON(this_write > write_bytes); + write_bytes -= this_write; + } + return 0; + +failed_release: + btrfs_drop_pages(pages, num_pages); + return err; + +failed_truncate: + btrfs_drop_pages(pages, num_pages); + if (pos > isize) + vmtruncate(inode, isize); + return err; +} + +static ssize_t btrfs_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos; + size_t num_written = 0; + int err = 0; + int ret = 0; + struct inode *inode = file->f_path.dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *pages[8]; + struct page *pinned[2]; + unsigned long first_index; + unsigned long last_index; + u64 start_pos; + u64 num_blocks; + u64 alloc_extent_start; + u64 hint_block; + struct btrfs_trans_handle *trans; + struct btrfs_key ins; + pinned[0] = NULL; + pinned[1] = NULL; + if (file->f_flags & O_DIRECT) + return -EINVAL; + pos = *ppos; + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + if (count == 0) + goto out; + err = remove_suid(file->f_path.dentry); + if (err) + goto out; + file_update_time(file); + + start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); + num_blocks = (count + pos - start_pos + root->blocksize - 1) >> + inode->i_blkbits; + + mutex_lock(&inode->i_mutex); + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + count) 
>> PAGE_CACHE_SHIFT; + + /* + * there are lots of better ways to do this, but this code + * makes sure the first and last page in the file range are + * up to date and ready for cow + */ + if ((pos & (PAGE_CACHE_SIZE - 1))) { + pinned[0] = grab_cache_page(inode->i_mapping, first_index); + /* + * grab_cache_page() can return NULL (prepare_pages() treats that + * as -ENOMEM); bail out before PageUptodate() dereferences it. + * Only i_mutex is held at this point, so out_unlock is the + * matching exit. + */ + if (!pinned[0]) { + err = -ENOMEM; + goto out_unlock; + } + if (!PageUptodate(pinned[0])) { + ret = mpage_readpage(pinned[0], btrfs_get_block); + BUG_ON(ret); + wait_on_page_locked(pinned[0]); + } else { + unlock_page(pinned[0]); + } + } + if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + pinned[1] = grab_cache_page(inode->i_mapping, last_index); + /* same NULL check; a pinned[0] taken above is released at out */ + if (!pinned[1]) { + err = -ENOMEM; + goto out_unlock; + } + if (!PageUptodate(pinned[1])) { + ret = mpage_readpage(pinned[1], btrfs_get_block); + BUG_ON(ret); + wait_on_page_locked(pinned[1]); + } else { + unlock_page(pinned[1]); + } + } + + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + mutex_unlock(&root->fs_info->fs_mutex); + goto out_unlock; + } + btrfs_set_trans_block_group(trans, inode); + /* FIXME blocksize != 4096 */ + inode->i_blocks += num_blocks << 3; + hint_block = 0; + + /* FIXME...EIEIO, ENOSPC and more */ + + /* step one, delete the existing extents in this range */ + if (start_pos < inode->i_size) { + /* FIXME blocksize != pagesize */ + ret = btrfs_drop_extents(trans, root, inode, + start_pos, + (pos + count + root->blocksize -1) & + ~((u64)root->blocksize - 1), + &hint_block); + BUG_ON(ret); + } + + /* insert any holes we need to create */ + if (inode->i_size < start_pos) { + u64 last_pos_in_file; + u64 hole_size; + u64 mask = root->blocksize - 1; + last_pos_in_file = (inode->i_size + mask) & ~mask; + hole_size = (start_pos - last_pos_in_file + mask) & ~mask; + hole_size >>= inode->i_blkbits; + if (last_pos_in_file < start_pos) { + ret = btrfs_insert_file_extent(trans, root, + inode->i_ino, + last_pos_in_file, + 0, 0, hole_size); + } + BUG_ON(ret); + } + + /* + * either allocate an extent for the new bytes or setup the key + * to show we are doing inline data in the 
extent + */ + if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size || + pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) { + ret = btrfs_alloc_extent(trans, root, inode->i_ino, + num_blocks, hint_block, (u64)-1, + &ins, 1); + BUG_ON(ret); + ret = btrfs_insert_file_extent(trans, root, inode->i_ino, + start_pos, ins.objectid, ins.offset, + ins.offset); + BUG_ON(ret); + } else { + ins.offset = 0; + ins.objectid = 0; + } + BUG_ON(ret); + alloc_extent_start = ins.objectid; + ret = btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + + while(count > 0) { + size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset); + size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + memset(pages, 0, sizeof(pages)); + ret = prepare_pages(root, file, pages, num_pages, + pos, first_index, last_index, + write_bytes, alloc_extent_start); + BUG_ON(ret); + + /* FIXME blocks != pagesize */ + if (alloc_extent_start) + alloc_extent_start += num_pages; + ret = btrfs_copy_from_user(pos, num_pages, + write_bytes, pages, buf); + BUG_ON(ret); + + ret = dirty_and_release_pages(NULL, root, file, pages, + num_pages, pos, write_bytes); + BUG_ON(ret); + btrfs_drop_pages(pages, num_pages); + + buf += write_bytes; + count -= write_bytes; + pos += write_bytes; + num_written += write_bytes; + + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_btree_balance_dirty(root); + cond_resched(); + } +out_unlock: + mutex_unlock(&inode->i_mutex); +out: + if (pinned[0]) + page_cache_release(pinned[0]); + if (pinned[1]) + page_cache_release(pinned[1]); + *ppos = pos; + current->backing_dev_info = NULL; + mark_inode_dirty(inode); + return num_written ? num_written : err; +} + +/* + * FIXME, do this by stuffing the csum we want in the info hanging off + * page->private. 
For now, verify file csums on read + */ +static int btrfs_read_actor(read_descriptor_t *desc, struct page *page, + unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long left, count = desc->count; + struct inode *inode = page->mapping->host; + + if (size > count) + size = count; + + if (!PageChecked(page)) { + /* FIXME, do it per block */ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + struct buffer_head *bh; + + if (page_has_buffers(page)) { + bh = page_buffers(page); + if (!buffer_mapped(bh)) { + SetPageChecked(page); + goto checked; + } + } + + ret = btrfs_csum_verify_file_block(root, + page->mapping->host->i_ino, + page->index << PAGE_CACHE_SHIFT, + kmap(page), PAGE_CACHE_SIZE); + if (ret) { + if (ret != -ENOENT) { + printk("failed to verify ino %lu page %lu ret %d\n", + page->mapping->host->i_ino, + page->index, ret); + memset(page_address(page), 1, PAGE_CACHE_SIZE); + flush_dcache_page(page); + } + } + SetPageChecked(page); + kunmap(page); + } +checked: + /* + * Faults on the destination of a read are common, so do it before + * taking the kmap. 
+ */ + if (!fault_in_pages_writeable(desc->arg.buf, size)) { + kaddr = kmap_atomic(page, KM_USER0); + left = __copy_to_user_inatomic(desc->arg.buf, + kaddr + offset, size); + kunmap_atomic(kaddr, KM_USER0); + if (left == 0) + goto success; + } + + /* Do it the slow way */ + kaddr = kmap(page); + left = __copy_to_user(desc->arg.buf, kaddr + offset, size); + kunmap(page); + + if (left) { + size -= left; + desc->error = -EFAULT; + } +success: + desc->count = count - size; + desc->written += size; + desc->arg.buf += size; + return size; +} + +/** + * btrfs_file_aio_read - filesystem read routine, with a mod to csum verify + * @iocb: kernel I/O control block + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @pos: current file position + */ +static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + loff_t *ppos = &iocb->ki_pos; + + count = 0; + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
+ */ + count += iv->iov_len; + if (unlikely((ssize_t)(count|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + nr_segs = seg; + count -= iv->iov_len; /* This segment is no good */ + break; + } + retval = 0; + if (count) { + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; + + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, + btrfs_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; + } + } + } + return retval; +} + +static int btrfs_sync_file(struct file *file, + struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + struct btrfs_trans_handle *trans; + + /* + * FIXME, use inode generation number to check if we can skip the + * commit + */ + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_commit_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); +out: + return ret > 0 ? 
EIO : ret; +} + +struct file_operations btrfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = btrfs_file_aio_read, + .write = btrfs_file_write, + .mmap = generic_file_mmap, + .open = generic_file_open, + .ioctl = btrfs_ioctl, + .fsync = btrfs_sync_file, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_compat_ioctl, +#endif +}; + diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c new file mode 100644 index 000000000000..62a3a778d37d --- /dev/null +++ b/fs/btrfs/inode.c @@ -0,0 +1,2210 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" + +struct btrfs_iget_args { + u64 ino; + struct btrfs_root *root; +}; + +static struct inode_operations btrfs_dir_inode_operations; +static struct inode_operations btrfs_symlink_inode_operations; +static struct inode_operations btrfs_dir_ro_inode_operations; +static struct inode_operations btrfs_file_inode_operations; +static struct address_space_operations btrfs_aops; +static struct address_space_operations btrfs_symlink_aops; +static struct file_operations btrfs_dir_file_operations; + +static struct kmem_cache *btrfs_inode_cachep; +struct kmem_cache *btrfs_trans_handle_cachep; +struct kmem_cache *btrfs_transaction_cachep; +struct kmem_cache *btrfs_bit_radix_cachep; +struct kmem_cache *btrfs_path_cachep; + +#define S_SHIFT 12 +static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, +}; + +void btrfs_read_locked_inode(struct inode *inode) +{ + struct btrfs_path *path; + 
struct btrfs_inode_item *inode_item; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key location; + u64 alloc_group_block; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + btrfs_init_path(path); + mutex_lock(&root->fs_info->fs_mutex); + + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); + if (ret) { + /* + * make_bad releases and frees the path itself; freeing it here + * as well made that a use-after-free followed by a double free + */ + goto make_bad; + } + inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), + path->slots[0], + struct btrfs_inode_item); + + /* copy the on-disk inode fields into the VFS inode */ + inode->i_mode = btrfs_inode_mode(inode_item); + inode->i_nlink = btrfs_inode_nlink(inode_item); + inode->i_uid = btrfs_inode_uid(inode_item); + inode->i_gid = btrfs_inode_gid(inode_item); + inode->i_size = btrfs_inode_size(inode_item); + inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime); + inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime); + inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime); + inode->i_blocks = btrfs_inode_nblocks(inode_item); + inode->i_generation = btrfs_inode_generation(inode_item); + alloc_group_block = btrfs_inode_block_group(inode_item); + BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info, + alloc_group_block); + + btrfs_free_path(path); + inode_item = NULL; + + mutex_unlock(&root->fs_info->fs_mutex); + + /* select the operation tables that match the file type */ + switch (inode->i_mode & S_IFMT) { +#if 0 + default: + init_special_inode(inode, inode->i_mode, + btrfs_inode_rdev(inode_item)); + break; +#endif + case S_IFREG: + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + break; + case S_IFDIR: + inode->i_fop = &btrfs_dir_file_operations; + if (root == root->fs_info->tree_root) + inode->i_op = 
&btrfs_dir_ro_inode_operations; + else + inode->i_op = &btrfs_dir_inode_operations; + break; + case S_IFLNK: + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + break; + } + return; + +make_bad: + btrfs_release_path(root, path); + btrfs_free_path(path); + mutex_unlock(&root->fs_info->fs_mutex); + make_bad_inode(inode); +} + +static void fill_inode_item(struct btrfs_inode_item *item, + struct inode *inode) +{ + btrfs_set_inode_uid(item, inode->i_uid); + btrfs_set_inode_gid(item, inode->i_gid); + btrfs_set_inode_size(item, inode->i_size); + btrfs_set_inode_mode(item, inode->i_mode); + btrfs_set_inode_nlink(item, inode->i_nlink); + btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec); + btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec); + btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec); + btrfs_set_inode_nblocks(item, inode->i_blocks); + btrfs_set_inode_generation(item, inode->i_generation); + btrfs_set_inode_block_group(item, + BTRFS_I(inode)->block_group->key.objectid); +} + +static int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_path *path; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + btrfs_init_path(path); + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto failed; + } + + inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), + path->slots[0], + struct btrfs_inode_item); + + fill_inode_item(inode_item, inode); + btrfs_mark_buffer_dirty(path->nodes[0]); + ret = 0; +failed: + btrfs_release_path(root, path); + btrfs_free_path(path); + return ret; +} + + +static int 
btrfs_unlink_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_path *path; + const char *name = dentry->d_name.name; + int name_len = dentry->d_name.len; + int ret = 0; + u64 objectid; + struct btrfs_dir_item *di; + + path = btrfs_alloc_path(); + BUG_ON(!path); + btrfs_init_path(path); + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + objectid = btrfs_disk_key_objectid(&di->location); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + objectid, name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + + dentry->d_inode->i_ctime = dir->i_ctime; +err: + btrfs_free_path(path); + if (!ret) { + dir->i_size -= name_len * 2; + btrfs_update_inode(trans, root, dir); + drop_nlink(dentry->d_inode); + btrfs_update_inode(trans, root, dentry->d_inode); + dir->i_sb->s_dirt = 1; + } + return ret; +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root; + struct btrfs_trans_handle *trans; + int ret; + + root = BTRFS_I(dir)->root; + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + ret = btrfs_unlink_trans(trans, root, dir, dentry); + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + btrfs_btree_balance_dirty(root); + return ret; +} + +static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err; + int ret; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key 
key; + struct btrfs_trans_handle *trans; + struct btrfs_key found_key; + int found_type; + struct btrfs_leaf *leaf; + char *goodnames = ".."; + + path = btrfs_alloc_path(); + BUG_ON(!path); + btrfs_init_path(path); + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + key.objectid = inode->i_ino; + key.offset = (u64)-1; + key.flags = (u32)-1; + /* + * walk the directory's items from the highest key downwards, deleting + * entries whose name matches "." or ".."; anything else means the + * directory is not empty + */ + while(1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret == 0); + if (path->slots[0] == 0) { + err = -ENOENT; + goto out; + } + path->slots[0]--; + leaf = btrfs_buffer_leaf(path->nodes[0]); + btrfs_disk_key_to_cpu(&found_key, + &leaf->items[path->slots[0]].key); + found_type = btrfs_key_type(&found_key); + if (found_key.objectid != inode->i_ino) { + err = -ENOENT; + goto out; + } + if ((found_type != BTRFS_DIR_ITEM_KEY && + found_type != BTRFS_DIR_INDEX_KEY) || + (!btrfs_match_dir_item_name(root, path, goodnames, 2) && + !btrfs_match_dir_item_name(root, path, goodnames, 1))) { + err = -ENOTEMPTY; + goto out; + } + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + if (found_type == BTRFS_DIR_ITEM_KEY && found_key.offset == 1) + break; + btrfs_release_path(root, path); + } + ret = 0; + btrfs_release_path(root, path); + + /* now the directory is empty */ + err = btrfs_unlink_trans(trans, root, dir, dentry); + if (!err) { + inode->i_size = 0; + } +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + /* + * end the transaction before dropping fs_mutex, the same order + * btrfs_unlink() uses + */ + ret = btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + btrfs_btree_balance_dirty(root); + if (ret && !err) + err = ret; + return err; +} + +/* + * clear the in-memory inode, then look up its inode item at + * BTRFS_I(inode)->location and delete that item from the tree + */ +static int btrfs_free_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + + clear_inode(inode); + + path = btrfs_alloc_path(); + BUG_ON(!path); + btrfs_init_path(path); + ret = btrfs_lookup_inode(trans, root, path, + 
&BTRFS_I(inode)->location, -1); + BUG_ON(ret); + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + btrfs_free_path(path); + return ret; +} + +/* + * truncates go from a high offset to a low offset. So, walk + * from hi to lo in the level 1 node (path->nodes[1]), starting just + * below path->slots[1], and issue a readahead for each block pointer. + * Stop at the first key from a different objectid or the first + * readahead that fails. Does nothing when there is no level 1 node or + * the slot is already 0. + */ +static void reada_truncate(struct btrfs_root *root, struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_node *node; + int i; + u64 item_objectid; + u64 blocknr; + int slot; + int ret; + + if (!path->nodes[1]) + return; + node = btrfs_buffer_node(path->nodes[1]); + slot = path->slots[1]; + if (slot == 0) + return; + for (i = slot - 1; i >= 0; i--) { + item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key); + if (item_objectid != objectid) + break; + blocknr = btrfs_node_blockptr(node, i); + ret = readahead_tree_block(root, blocknr); + if (ret) + break; + } +} + +/* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find + * any higher than i_size. + * + * csum items that cross the new i_size are truncated to the new size + * as well. 
+ */ +static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_disk_key *found_key; + u32 found_type; + struct btrfs_leaf *leaf; + struct btrfs_file_extent_item *fi; + u64 extent_start = 0; + u64 extent_num_blocks = 0; + u64 item_end = 0; + int found_extent; + int del_item; + + path = btrfs_alloc_path(); + BUG_ON(!path); + /* FIXME, add redo link to tree so we don't leak on crash */ + key.objectid = inode->i_ino; + key.offset = (u64)-1; + key.flags = (u32)-1; + while(1) { + btrfs_init_path(path); + fi = NULL; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + goto error; + } + if (ret > 0) { + BUG_ON(path->slots[0] == 0); + path->slots[0]--; + } + reada_truncate(root, path, inode->i_ino); + leaf = btrfs_buffer_leaf(path->nodes[0]); + found_key = &leaf->items[path->slots[0]].key; + found_type = btrfs_disk_key_type(found_key); + + if (btrfs_disk_key_objectid(found_key) != inode->i_ino) + break; + if (found_type != BTRFS_CSUM_ITEM_KEY && + found_type != BTRFS_DIR_ITEM_KEY && + found_type != BTRFS_DIR_INDEX_KEY && + found_type != BTRFS_EXTENT_DATA_KEY) + break; + + item_end = btrfs_disk_key_offset(found_key); + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), + path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(fi) != + BTRFS_FILE_EXTENT_INLINE) { + item_end += btrfs_file_extent_num_blocks(fi) << + inode->i_blkbits; + } + } + if (found_type == BTRFS_CSUM_ITEM_KEY) { + ret = btrfs_csum_truncate(trans, root, path, + inode->i_size); + BUG_ON(ret); + } + if (item_end < inode->i_size) { + if (found_type) { + btrfs_set_key_type(&key, found_type - 1); + continue; + } + break; + } + if (btrfs_disk_key_offset(found_key) >= inode->i_size) + del_item = 1; + else + del_item = 0; + found_extent = 0; + + /* FIXME, shrink the extent if the 
ref count is only 1 */ + if (found_type == BTRFS_EXTENT_DATA_KEY && + btrfs_file_extent_type(fi) != + BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + if (!del_item) { + u64 orig_num_blocks = + btrfs_file_extent_num_blocks(fi); + extent_num_blocks = inode->i_size - + btrfs_disk_key_offset(found_key) + + root->blocksize - 1; + extent_num_blocks >>= inode->i_blkbits; + btrfs_set_file_extent_num_blocks(fi, + extent_num_blocks); + inode->i_blocks -= (orig_num_blocks - + extent_num_blocks) << 3; + mark_buffer_dirty(path->nodes[0]); + } else { + extent_start = + btrfs_file_extent_disk_blocknr(fi); + extent_num_blocks = + btrfs_file_extent_disk_num_blocks(fi); + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_blocks(fi) << 3; + if (extent_start != 0) { + found_extent = 1; + inode->i_blocks -= num_dec; + } + } + } + if (del_item) { + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + } else { + break; + } + btrfs_release_path(root, path); + if (found_extent) { + ret = btrfs_free_extent(trans, root, extent_start, + extent_num_blocks, 0); + BUG_ON(ret); + } + } + ret = 0; +error: + btrfs_release_path(root, path); + btrfs_free_path(path); + inode->i_sb->s_dirt = 1; + return ret; +} + +/* + * taken from block_truncate_page, but does cow as it zeros out + * any bytes left in the last page in the file. 
+ */ +static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + struct page *page; + char *kaddr; + int ret = 0; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 alloc_hint = 0; + struct btrfs_key ins; + struct btrfs_trans_handle *trans; + + /* nothing to zero when "from" is already block aligned */ + if ((offset & (blocksize - 1)) == 0) + goto out; + + ret = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + goto out; + + if (!PageUptodate(page)) { + ret = mpage_readpage(page, btrfs_get_block); + lock_page(page); + if (!PageUptodate(page)) { + ret = -EIO; + /* + * the page is locked and referenced here; drop both + * before bailing out, as the success path does + */ + unlock_page(page); + page_cache_release(page); + goto out; + } + } + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + /* replace whatever backs this page with a freshly allocated block */ + ret = btrfs_drop_extents(trans, root, inode, + page->index << PAGE_CACHE_SHIFT, + (page->index + 1) << PAGE_CACHE_SHIFT, + &alloc_hint); + BUG_ON(ret); + ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, + alloc_hint, (u64)-1, &ins, 1); + BUG_ON(ret); + ret = btrfs_insert_file_extent(trans, root, inode->i_ino, + page->index << PAGE_CACHE_SHIFT, + ins.objectid, 1, 1); + BUG_ON(ret); + SetPageChecked(page); + kaddr = kmap(page); + /* zero from "offset" to the end of the page, then csum the whole page */ + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + btrfs_csum_file_block(trans, root, inode->i_ino, + page->index << PAGE_CACHE_SHIFT, + kaddr, PAGE_CACHE_SIZE); + kunmap(page); + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +/* + * setattr handler: when a regular file grows past the end of its last + * block, zero the tail of the old last page and insert a hole extent for + * the new range, then apply the attributes with inode_setattr() + */ +static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { + struct 
btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 mask = root->blocksize - 1; + u64 pos = (inode->i_size + mask) & ~mask; + u64 hole_size; + + if (attr->ia_size <= pos) + goto out; + + btrfs_truncate_page(inode->i_mapping, inode->i_size); + + hole_size = (attr->ia_size - pos + mask) & ~mask; + hole_size >>= inode->i_blkbits; + + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + err = btrfs_insert_file_extent(trans, root, inode->i_ino, + pos, 0, 0, hole_size); + BUG_ON(err); + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + } +out: + err = inode_setattr(inode, attr); + + return err; +} +void btrfs_delete_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) { + goto no_delete; + } + inode->i_size = 0; + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_truncate_in_trans(trans, root, inode); + BUG_ON(ret); + btrfs_free_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + btrfs_btree_balance_dirty(root); + return; +no_delete: + clear_inode(inode); +} + +/* + * this returns the key found in the dir entry in the location pointer. + * If no dir entries were found, location->objectid is 0. 
+ */
+static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+			       struct btrfs_key *location)
+{
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	/* must start at 0: the found-entry path never assigns it */
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
+				    namelen, 0);
+	if (!di || IS_ERR(di)) {
+		/* a lookup error is reported the same way as "not found" */
+		location->objectid = 0;
+		goto out;
+	}
+	btrfs_disk_key_to_cpu(location, &di->location);
+out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * when we hit a tree root in a directory, the btrfs part of the inode
+ * needs to be changed to reflect the root directory of the tree root.  This
+ * is kind of like crossing a mount point.
+ *
+ * On success *sub_root is the root that was read and 'location' is
+ * rewritten to the inode item key of that root's directory.  Keys that
+ * are not root items, or that name the root tree itself, are left alone.
+ * Returns 0 or a negative errno from btrfs_read_fs_root().
+ */
+static int fixup_tree_root_location(struct btrfs_root *root,
+			     struct btrfs_key *location,
+			     struct btrfs_root **sub_root)
+{
+	struct btrfs_path *path;
+	struct btrfs_root_item *ri;
+	int ret = 0;
+
+	if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
+		return 0;
+	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	mutex_lock(&root->fs_info->fs_mutex);
+
+	*sub_root = btrfs_read_fs_root(root->fs_info, location);
+	if (IS_ERR(*sub_root)) {
+		/* leave through the common exit so the lock and path go away */
+		ret = PTR_ERR(*sub_root);
+		goto out;
+	}
+
+	ri = &(*sub_root)->root_item;
+	location->objectid = btrfs_root_dirid(ri);
+	location->flags = 0;
+	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+	location->offset = 0;
+
+out:
+	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+/*
+ * iget5_locked() 'set' callback: stamp a new inode with the inode number
+ * and root carried in the btrfs_iget_args.
+ */
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+	struct btrfs_iget_args *args = p;
+	inode->i_ino = args->ino;
+	BTRFS_I(inode)->root = args->root;
+	return 0;
+}
+
+/*
+ * iget5_locked() 'test' callback: an inode matches only when both the
+ * inode number and the root agree.
+ */
+static int btrfs_find_actor(struct inode *inode, void *opaque)
+{
+	struct btrfs_iget_args *args = opaque;
+	return (args->ino == inode->i_ino &&
+		args->root == BTRFS_I(inode)->root);
+}
+
+/*
+ * Look up (or create, locked and I_NEW) the in-memory inode for
+ * 'objectid' in 'root'.  Returns whatever iget5_locked() returns.
+ */
+struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+				struct btrfs_root *root)
+{
+	struct inode *inode;
+	struct btrfs_iget_args args;
+	args.ino = objectid;
+	args.root = root;
+
+	inode = iget5_locked(s, objectid, btrfs_find_actor,
+			     btrfs_init_locked_inode,
+			     (void *)&args);
+	return inode;
+}
+
+/*
+ * lookup hook: resolve 'dentry' inside 'dir'.  A missing name yields a
+ * negative dentry (inode == NULL is passed to d_splice_alias()).
+ */
+static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode * inode;
+	struct btrfs_inode *bi = BTRFS_I(dir);
+	struct btrfs_root *root = bi->root;
+	struct btrfs_root *sub_root = root;
+	struct btrfs_key location;
+	int ret;
+
+	if (dentry->d_name.len > BTRFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_inode_by_name(dir, dentry, &location);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	inode = NULL;
+	if (location.objectid) {
+		ret = fixup_tree_root_location(root, &location, &sub_root);
+		if (ret < 0)
+			return ERR_PTR(ret);
+		if (ret > 0)
+			return ERR_PTR(-ENOENT);
+		inode = btrfs_iget_locked(dir->i_sb, location.objectid,
+					  sub_root);
+		if (!inode)
+			return ERR_PTR(-EACCES);
+		if (inode->i_state & I_NEW) {
+			/* the inode and parent dir are two different roots */
+			if (sub_root != root) {
+				igrab(inode);
+				sub_root->inode = inode;
+			}
+			BTRFS_I(inode)->root = sub_root;
+			memcpy(&BTRFS_I(inode)->location, &location,
+			       sizeof(location));
+			btrfs_read_locked_inode(inode);
+			unlock_new_inode(inode);
+		}
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+/*
+ * readahead one full node of leaves as long as their keys include
+ * the objectid supplied
+ */
+static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path,
+			 u64 objectid)
+{
+	struct btrfs_node *node;
+	int i;
+	u32 nritems;
+	u64 item_objectid;
+	u64 blocknr;
+	int slot;
+	int ret;
+
+	/* the path has no level 1 node, so there are no sibling leaves */
+	if (!path->nodes[1])
+		return;
+	node = btrfs_buffer_node(path->nodes[1]);
+	slot = path->slots[1];
+	nritems = btrfs_header_nritems(&node->header);
+	/* start after the leaf the path currently points at */
+	for (i = slot + 1; i < nritems; i++) {
+		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
+		if (item_objectid != objectid)
+			break;
+		blocknr = btrfs_node_blockptr(node, i);
+		ret = readahead_tree_block(root, blocknr);
+		if (ret)
+			break;
+	}
+}
+
+/* maps the on-disk directory entry type to the DT_* value given to filldir */
+static unsigned char btrfs_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+/*
+ * readdir hook: walk the directory's items starting at filp->f_pos and
+ * hand every name packed into each item to filldir().  f_pos is kept at
+ * the key offset of the item being emitted.
+ *
+ * Returns 0, -ENOMEM, or a negative errno from the initial tree search.
+ */
+static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_item *item;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	int ret;
+	u32 nritems;
+	struct btrfs_leaf *leaf;
+	int slot;
+	int advance;
+	unsigned char d_type;
+	int over = 0;
+	u32 di_cur;
+	u32 di_total;
+	u32 di_len;
+	int key_type = BTRFS_DIR_INDEX_KEY;
+
+	/* FIXME, use a real flag for deciding about the key type */
+	if (root->fs_info->tree_root == root)
+		key_type = BTRFS_DIR_ITEM_KEY;
+
+	/* allocate before taking the lock so a failure needs no unwinding */
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	btrfs_init_path(path);
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	key.objectid = inode->i_ino;
+	key.flags = 0;
+	btrfs_set_key_type(&key, key_type);
+	key.offset = filp->f_pos;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto err;
+	advance = 0;
+	reada_leaves(root, path, inode->i_ino);
+	while(1) {
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		nritems = btrfs_header_nritems(&leaf->header);
+		slot = path->slots[0];
+		if (advance || slot >= nritems) {
+			/*
+			 * written as slot + 1 >= nritems so that an empty
+			 * leaf (nritems == 0) does not wrap the unsigned
+			 * subtraction and step into a bogus slot
+			 */
+			if (slot + 1 >= nritems) {
+				reada_leaves(root, path, inode->i_ino);
+				ret = btrfs_next_leaf(root, path);
+				if (ret)
+					break;
+				leaf = btrfs_buffer_leaf(path->nodes[0]);
+				nritems = btrfs_header_nritems(&leaf->header);
+				slot = path->slots[0];
+			} else {
+				slot++;
+				path->slots[0]++;
+			}
+		}
+		advance = 1;
+		item = leaf->items + slot;
+		/* stop once we leave this directory or this key type */
+		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
+			break;
+		if (btrfs_disk_key_type(&item->key) != key_type)
+			break;
+		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
+			continue;
+		filp->f_pos = btrfs_disk_key_offset(&item->key);
+		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+		di_cur = 0;
+		di_total = btrfs_item_size(leaf->items + slot);
+		/* one item may pack several entries; the name follows each */
+		while(di_cur < di_total) {
+			d_type = btrfs_filetype_table[btrfs_dir_type(di)];
+			over = filldir(dirent, (const char *)(di + 1),
+				       btrfs_dir_name_len(di),
+				       btrfs_disk_key_offset(&item->key),
+				       btrfs_disk_key_objectid(&di->location),
+				       d_type);
+			if (over)
+				goto nopos;
+			di_len = btrfs_dir_name_len(di) + sizeof(*di);
+			di_cur += di_len;
+			di = (struct btrfs_dir_item *)((char *)di + di_len);
+		}
+	}
+	/* ran off the end: step past the last item that was emitted */
+	filp->f_pos++;
+nopos:
+	ret = 0;
+err:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+/*
+ * write_inode hook: a synchronous request (wait != 0) commits the running
+ * transaction; anything else is a no-op.
+ */
+int btrfs_write_inode(struct inode *inode, int wait)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	if (wait) {
+		mutex_lock(&root->fs_info->fs_mutex);
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
+		ret = btrfs_commit_transaction(trans, root);
+		mutex_unlock(&root->fs_info->fs_mutex);
+	}
+	return ret;
+}
+
+/*
+ * This is somewhat expensive, updating the tree every time the
+ * inode changes.  But, it is most likely to find the inode in cache.
+ * FIXME, needs more benchmarking...there are no reasons other than performance
+ * to keep or drop this code.
+ */
+void btrfs_dirty_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+	btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
+}
+
+/*
+ * Allocate a VFS inode for 'objectid', fill in ownership, mode and times,
+ * insert the matching inode item into 'root' and hash the inode.
+ *
+ * 'group' is only a hint handed to btrfs_find_block_group().  Returns the
+ * new inode or ERR_PTR(-ENOMEM); a failed item insert is a BUG.
+ */
+static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 objectid,
+				     struct btrfs_block_group_cache *group,
+				     int mode)
+{
+	struct inode *inode;
+	struct btrfs_inode_item inode_item;
+	struct btrfs_key *location;
+	int ret;
+	int owner;
+
+	inode = new_inode(root->fs_info->sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	BTRFS_I(inode)->root = root;
+	/*
+	 * S_ISDIR() compares the whole file type field; a bare
+	 * 'mode & S_IFDIR' is also true for block devices.
+	 */
+	if (S_ISDIR(mode))
+		owner = 0;
+	else
+		owner = 1;
+	group = btrfs_find_block_group(root, group, 0, 0, owner);
+	BTRFS_I(inode)->block_group = group;
+
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_mode = mode;
+	inode->i_ino = objectid;
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	fill_inode_item(&inode_item, inode);
+	location = &BTRFS_I(inode)->location;
+	location->objectid = objectid;
+	location->flags = 0;
+	location->offset = 0;
+	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+
+	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
+	BUG_ON(ret);
+
+	insert_inode_hash(inode);
+	return inode;
+}
+
+/* translate the file type bits of i_mode into a btrfs directory type */
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
+}
+
+/*
+ * Insert a directory item naming 'inode' into the parent directory of
+ * 'dentry', then grow the parent's i_size by twice the name length and
+ * write the parent inode back.
+ *
+ * Returns the result of the insert, or of the parent update.
+ */
+static int btrfs_add_link(struct btrfs_trans_handle *trans,
+			    struct dentry *dentry, struct inode *inode)
+{
+	int ret;
+	struct btrfs_key key;
+	struct inode *parent = dentry->d_parent->d_inode;
+	struct btrfs_root *root = BTRFS_I(parent)->root;
+
+	key.objectid = inode->i_ino;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.offset = 0;
+
+	ret = btrfs_insert_dir_item(trans, root,
+				    dentry->d_name.name, dentry->d_name.len,
+				    parent->i_ino,
+				    &key, btrfs_inode_type(inode));
+	if (ret == 0) {
+		parent->i_size += dentry->d_name.len * 2;
+		ret = btrfs_update_inode(trans, root, parent);
+	}
+	return ret;
+}
+
+/*
+ * Link a non-directory inode into its parent and instantiate the dentry.
+ * A positive return from btrfs_add_link() is reported as -EEXIST.
+ */
+static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
+			    struct dentry *dentry, struct inode *inode)
+{
+	int err = btrfs_add_link(trans, dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	if (err > 0)
+		err = -EEXIST;
+	return err;
+}
+
+/*
+ * create hook: allocate an objectid and inode for a new file and link it
+ * into 'dir'.  If the link fails the half-built inode is dropped after
+ * the transaction has ended.
+ */
+static int btrfs_create(struct inode *dir, struct dentry *dentry,
+			int mode, struct nameidata *nd)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct inode *inode;
+	int err;
+	int drop_inode = 0;
+	u64 objectid;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
+
+	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+	if (err) {
+		err = -ENOSPC;
+		goto out_unlock;
+	}
+
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_unlock;
+
+	btrfs_set_trans_block_group(trans, inode);
+	err = btrfs_add_nondir(trans, dentry, inode);
+	if (err)
+		drop_inode = 1;
+	else {
+		inode->i_mapping->a_ops = &btrfs_aops;
+		inode->i_fop = &btrfs_file_operations;
+		inode->i_op = &btrfs_file_inode_operations;
+	}
+	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
+out_unlock:
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
+/*
+ * link hook: add another name for an existing inode.  The link count and
+ * the reference count are raised up front and both are dropped again if
+ * adding the directory entry fails.
+ */
+static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct inode *inode = old_dentry->d_inode;
+	int err;
+	int drop_inode = 0;
+
+	if (inode->i_nlink == 0)
+		return -ENOENT;
+
+	inc_nlink(inode);
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
+	atomic_inc(&inode->i_count);
+	err = btrfs_add_nondir(trans, dentry, inode);
+	if (err)
+		drop_inode = 1;
+	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, dir);
+	/* NOTE(review): the result of this update is not checked */
+	btrfs_update_inode(trans, root, inode);
+
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
+/*
+ * Insert the "." and ".." entries of a new directory.  'objectid' is the
+ * directory itself, 'dirid' the directory ".." refers to.
+ */
+static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 objectid, u64 dirid)
+{
+	int ret;
+	char buf[2];
+	struct btrfs_key key;
+
+	buf[0] = '.';
+	buf[1] = '.';
+
+	key.objectid = objectid;
+	key.offset = 0;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+
+	/* "." is the first byte of buf and points back at the directory */
+	ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid,
+				    &key, BTRFS_FT_DIR);
+	if (ret)
+		return ret;
+
+	/* ".." is both bytes of buf and points at dirid */
+	key.objectid = dirid;
+	return btrfs_insert_dir_item(trans, root, buf, 2, objectid,
+				     &key, BTRFS_FT_DIR);
+}
+
+/*
+ * mkdir hook: create the directory inode, populate "." and "..", then
+ * link it into 'dir'.  Returns 0 or a negative errno.
+ */
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	int err = 0;
+	int drop_on_err = 0;
+	u64 objectid;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	/* check the handle before anything touches it */
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_unlock;
+	}
+	btrfs_set_trans_block_group(trans, dir);
+
+	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+	if (err) {
+		err = -ENOSPC;
+		/* the transaction is open by now and has to be ended */
+		goto out_fail;
+	}
+
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, S_IFDIR | mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_fail;
+	}
+	drop_on_err = 1;
+	inode->i_op = &btrfs_dir_inode_operations;
+	inode->i_fop = &btrfs_dir_file_operations;
+	btrfs_set_trans_block_group(trans, inode);
+
+	err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino);
+	if (err)
+		goto out_fail;
+
+	/* twice the name lengths of "." and "..", as btrfs_add_link counts */
+	inode->i_size = 6;
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
+		goto out_fail;
+	err = btrfs_add_link(trans, dentry, inode);
+	if (err)
+		goto out_fail;
+	d_instantiate(dentry, inode);
+	drop_on_err = 0;
+	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
+
+out_fail:
+	btrfs_end_transaction(trans, root);
+out_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (drop_on_err)
+		iput(inode);
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
+/*
+ * FIBMAP and others want to pass in a fake buffer head.  They need to
+ * use BTRFS_GET_BLOCK_NO_DIRECT to make sure we don't try to memcpy
+ * any packed file data into the fake bh
+ */
+#define BTRFS_GET_BLOCK_NO_CREATE 0
+#define BTRFS_GET_BLOCK_CREATE 1
+#define BTRFS_GET_BLOCK_NO_DIRECT 2
+
+/*
+ * FIXME create==1 does not work.
+ */
+static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
+			   struct buffer_head *result, int create)
+{
+	int ret;
+	int err = 0;
+	u64 blocknr;
+	u64 extent_start = 0;
+	u64 extent_end = 0;
+	u64 objectid = inode->i_ino;
+	u32 found_type;
+	u64 alloc_hint = 0;
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_leaf *leaf;
+	struct btrfs_disk_key *found_key;
+	struct btrfs_trans_handle *trans = NULL;
+
+	/*
+	 * Map file block 'iblock' of 'inode' into 'result'.  'create' is a
+	 * mask of the BTRFS_GET_BLOCK_* flags.  The caller holds fs_mutex
+	 * (see btrfs_get_block()).  Returns 0 or a negative errno; a block
+	 * with no extent behind it leaves 'result' unmapped.
+	 */
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	if (create & BTRFS_GET_BLOCK_CREATE) {
+		WARN_ON(1);
+		/* this almost but not quite works */
+		trans = btrfs_start_transaction(root, 1);
+		if (!trans) {
+			err = -ENOMEM;
+			goto out;
+		}
+		ret = btrfs_drop_extents(trans, root, inode,
+					 iblock << inode->i_blkbits,
+					 (iblock + 1) << inode->i_blkbits,
+					 &alloc_hint);
+		BUG_ON(ret);
+	}
+
+	ret = btrfs_lookup_file_extent(NULL, root, path,
+				       inode->i_ino,
+				       iblock << inode->i_blkbits, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	/* no exact match: look at the item in front of the returned slot */
+	if (ret != 0) {
+		if (path->slots[0] == 0) {
+			btrfs_release_path(root, path);
+			goto not_found;
+		}
+		path->slots[0]--;
+	}
+
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
+			      struct btrfs_file_extent_item);
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	blocknr = btrfs_file_extent_disk_blocknr(item);
+	blocknr += btrfs_file_extent_offset(item);
+
+	/* are we inside the extent that was found? */
+	found_key = &leaf->items[path->slots[0]].key;
+	found_type = btrfs_disk_key_type(found_key);
+	if (btrfs_disk_key_objectid(found_key) != objectid ||
+	    found_type != BTRFS_EXTENT_DATA_KEY) {
+		extent_end = 0;
+		extent_start = 0;
+		goto not_found;
+	}
+	found_type = btrfs_file_extent_type(item);
+	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		extent_start = extent_start >> inode->i_blkbits;
+		extent_end = extent_start + btrfs_file_extent_num_blocks(item);
+		err = 0;
+		/* disk block 0 marks a hole: leave the buffer unmapped */
+		if (btrfs_file_extent_disk_blocknr(item) == 0)
+			goto out;
+		if (iblock >= extent_start && iblock < extent_end) {
+			btrfs_map_bh_to_logical(root, result, blocknr +
+						iblock - extent_start);
+			goto out;
+		}
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		char *ptr;
+		char *map;
+		u32 size;
+
+		/* fake buffer heads have no page to copy inline data into */
+		if (create & BTRFS_GET_BLOCK_NO_DIRECT) {
+			err = -EINVAL;
+			goto out;
+		}
+		size = btrfs_file_extent_inline_len(leaf->items +
+						    path->slots[0]);
+		extent_end = (extent_start + size) >> inode->i_blkbits;
+		extent_start >>= inode->i_blkbits;
+		if (iblock < extent_start || iblock > extent_end) {
+			goto not_found;
+		}
+		/* copy the packed data into the page and zero the rest */
+		ptr = btrfs_file_extent_inline_start(item);
+		map = kmap(result->b_page);
+		memcpy(map, ptr, size);
+		memset(map + size, 0, PAGE_CACHE_SIZE - size);
+		flush_dcache_page(result->b_page);
+		kunmap(result->b_page);
+		set_buffer_uptodate(result);
+		SetPageChecked(result->b_page);
+		btrfs_map_bh_to_logical(root, result, 0);
+	}
+not_found:
+	if (create & BTRFS_GET_BLOCK_CREATE) {
+		struct btrfs_key ins;
+		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
+					 1, alloc_hint, (u64)-1,
+					 &ins, 1);
+		BUG_ON(ret);
+		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+					       iblock << inode->i_blkbits,
+					       ins.objectid, ins.offset,
+					       ins.offset);
+		BUG_ON(ret);
+		SetPageChecked(result->b_page);
+		btrfs_map_bh_to_logical(root, result, ins.objectid);
+	}
+out:
+	if (trans) {
+		/* never let a clean end_transaction hide an earlier error */
+		ret = btrfs_end_transaction(trans, root);
+		if (!err)
+			err = ret;
+	}
+	btrfs_free_path(path);
+	return err;
+}
+
+/* get_block callback: btrfs_get_block_lock() run under fs_mutex */
+int btrfs_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *result, int create)
+{
+	int err;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	mutex_lock(&root->fs_info->fs_mutex);
+	err = btrfs_get_block_lock(inode, iblock, result, create);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return err;
+}
+
+/*
+ * get_block callback used for bmap.  The caller's 'create' is ignored and
+ * BTRFS_GET_BLOCK_NO_DIRECT is always passed, so inline data is never
+ * copied into the buffer head.  Lookup errors are dropped and 0 is
+ * always returned.
+ */
+static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
+			   struct buffer_head *result, int create)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	mutex_lock(&root->fs_info->fs_mutex);
+	btrfs_get_block_lock(inode, iblock, result, BTRFS_GET_BLOCK_NO_DIRECT);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return 0;
+}
+
+/* bmap hook: generic_block_bmap() with btrfs_get_block_bmap() */
+static sector_t btrfs_bmap(struct address_space *as, sector_t block)
+{
+	return generic_block_bmap(as, block, btrfs_get_block_bmap);
+}
+
+/* prepare_write hook: block_prepare_write() with btrfs_get_block() */
+static int btrfs_prepare_write(struct file *file, struct page *page,
+			       unsigned from, unsigned to)
+{
+	return block_prepare_write(page, from, to, btrfs_get_block);
+}
+
+/* readpage hook: mpage_readpage() with btrfs_get_block() */
+static int btrfs_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, btrfs_get_block);
+}
+
+/*
+ * Aside from a tiny bit of packed file data handling, this is the
+ * same as the generic code.
+ *
+ * While block_write_full_page is writing back the dirty buffers under
+ * the page lock, whoever dirtied the buffers may decide to clean them
+ * again at any time.  We handle that by only looking at the buffer
+ * state inside lock_buffer().
+ *
+ * If block_write_full_page() is called for regular writeback
+ * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
+ * locked buffer.   This only can happen if someone has written the buffer
+ * directly, with submit_bh().  At the address_space level PageWriteback
+ * prevents this contention from occurring.
+ */
+static int __btrfs_write_full_page(struct inode *inode, struct page *page,
+				   struct writeback_control *wbc)
+{
+	int err;
+	sector_t block;
+	sector_t last_block;
+	struct buffer_head *bh, *head;
+	const unsigned blocksize = 1 << inode->i_blkbits;
+	int nr_underway = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, blocksize,
+					(1 << BH_Dirty)|(1 << BH_Uptodate));
+	}
+
+	/*
+	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
+	 * here, and the (potentially unmapped) buffers may become dirty at
+	 * any time.  If a buffer becomes dirty here after we've inspected it
+	 * then we just miss that fact, and the page stays dirty.
+	 *
+	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+	 * handle that here by just cleaning them.
+	 */
+
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	head = page_buffers(page);
+	bh = head;
+
+	/*
+	 * Get all the dirty buffers mapped to disk addresses and
+	 * handle any aliases from the underlying blockdev's mapping.
+	 */
+	do {
+		if (block > last_block) {
+			/*
+			 * mapped buffers outside i_size will occur, because
+			 * this page can be outside i_size when there is a
+			 * truncate in progress.
+			 */
+			/*
+			 * The buffer was zeroed by block_write_full_page()
+			 */
+			clear_buffer_dirty(bh);
+			set_buffer_uptodate(bh);
+		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+			WARN_ON(bh->b_size != blocksize);
+			err = btrfs_get_block(inode, block, bh, 0);
+			if (err) {
+				goto recover;
+			}
+			if (buffer_new(bh)) {
+				/* blockdev mappings never come here */
+				clear_buffer_new(bh);
+			}
+		}
+		bh = bh->b_this_page;
+		block++;
+	} while (bh != head);
+
+	do {
+		if (!buffer_mapped(bh))
+			continue;
+		/*
+		 * If it's a fully non-blocking write attempt and we cannot
+		 * lock the buffer then redirty the page.  Note that this can
+		 * potentially cause a busy-wait loop from pdflush and kswapd
+		 * activity, but those code paths have their own higher-level
+		 * throttling.
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+			lock_buffer(bh);
+		} else if (test_set_buffer_locked(bh)) {
+			redirty_page_for_writepage(wbc, page);
+			continue;
+		}
+		/* buffers mapped to block 0 are never submitted */
+		if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
+			mark_buffer_async_write(bh);
+		} else {
+			unlock_buffer(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+	/*
+	 * The page and its buffers are protected by PageWriteback(), so we can
+	 * drop the bh refcounts early.
+	 */
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			submit_bh(WRITE, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
+
+	err = 0;
+done:
+	if (nr_underway == 0) {
+		/*
+		 * The page was marked dirty, but the buffers were
+		 * clean.  Someone wrote them back by hand with
+		 * ll_rw_block/submit_bh.  A rare case.
+		 */
+		int uptodate = 1;
+		do {
+			if (!buffer_uptodate(bh)) {
+				uptodate = 0;
+				break;
+			}
+			bh = bh->b_this_page;
+		} while (bh != head);
+		if (uptodate)
+			SetPageUptodate(page);
+		end_page_writeback(page);
+	}
+	return err;
+
+recover:
+	/*
+	 * ENOSPC, or some other error.  We may already have added some
+	 * blocks to the file, so we need to write these out to avoid
+	 * exposing stale data.
+	 * The page is currently locked and not marked for writeback
+	 */
+	bh = head;
+	/* Recovery: lock and submit the mapped buffers */
+	do {
+		if (buffer_mapped(bh) && buffer_dirty(bh)) {
+			lock_buffer(bh);
+			mark_buffer_async_write(bh);
+		} else {
+			/*
+			 * The buffer may have been set dirty during
+			 * attachment to a dirty page.
+			 */
+			clear_buffer_dirty(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+	SetPageError(page);
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			clear_buffer_dirty(bh);
+			submit_bh(WRITE, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
+	goto done;
+}
+
+/*
+ * writepage hook.  Pages fully inside i_size are written as they are,
+ * pages fully outside are invalidated, and the page straddling i_size has
+ * its tail zeroed before it is written.
+ */
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode * const inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	void *kaddr;
+
+	/* Is the page fully inside i_size? */
+	if (page->index < end_index)
+		return __btrfs_write_full_page(inode, page, wbc);
+
+	/* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (page->index >= end_index+1 || !offset) {
+		/*
+		 * The page may have dirty, unmapped buffers.  For example,
+		 * they may have been added in ext3_writepage().  Make them
+		 * freeable here, so the page does not leak.
+		 */
+		block_invalidatepage(page, 0);
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+	return __btrfs_write_full_page(inode, page, wbc);
+}
+
+/*
+ * truncate hook for regular files: zero the tail of the last page, then
+ * drop everything past i_size inside one transaction.
+ */
+static void btrfs_truncate(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+	struct btrfs_trans_handle *trans;
+
+	if (!S_ISREG(inode->i_mode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+
+	/* NOTE(review): a failure here is not reported (void hook) */
+	btrfs_truncate_page(inode->i_mapping, inode->i_size);
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+
+	/* FIXME, add redo link to tree so we don't leak on crash */
+	ret = btrfs_truncate_in_trans(trans, root, inode);
+	BUG_ON(ret);
+	btrfs_update_inode(trans, root, inode);
+	ret = btrfs_end_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
+}
+
+/*
+ * commit_write hook: mark the page and its first buffer up to date, dirty
+ * the page when that buffer is mapped to a real block, and extend i_size
+ * when the write went past it.  Always returns 0.
+ */
+int btrfs_commit_write(struct file *file, struct page *page,
+		       unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *bh;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	SetPageUptodate(page);
+	bh = page_buffers(page);
+	set_buffer_uptodate(bh);
+	if (buffer_mapped(bh) && bh->b_blocknr != 0) {
+		set_page_dirty(page);
+	}
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	return 0;
+}
+
+/*
+ * Create a new, empty subvolume called 'name': allocate an empty leaf as
+ * its tree root, insert a root item and a directory item for it in the
+ * tree of tree roots, commit, then create the subvolume's top directory
+ * in a second transaction.
+ *
+ * Returns 0 or -ENOSPC; every other failure is a BUG.
+ */
+static int create_subvol(struct btrfs_root *root, char *name, int namelen)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_root_item root_item;
+	struct btrfs_inode_item *inode_item;
+	struct buffer_head *subvol;
+	struct btrfs_leaf *leaf;
+	struct btrfs_root *new_root;
+	struct inode *inode;
+	struct inode *dir;
+	int ret;
+	u64 objectid;
+	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	subvol = btrfs_alloc_free_block(trans, root, 0);
+	if (subvol == NULL) {
+		/* do not leave with the transaction open and the lock held */
+		btrfs_end_transaction(trans, root);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		return -ENOSPC;
+	}
+	leaf = btrfs_buffer_leaf(subvol);
+	btrfs_set_header_nritems(&leaf->header, 0);
+	btrfs_set_header_level(&leaf->header, 0);
+	btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol));
+	btrfs_set_header_generation(&leaf->header, trans->transid);
+	btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
+	memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
+	       sizeof(leaf->header.fsid));
+	mark_buffer_dirty(subvol);
+
+	/*
+	 * root_item lives on the stack and is written to the tree below:
+	 * clear all of it so no uninitialised bytes are inserted.
+	 */
+	memset(&root_item, 0, sizeof(root_item));
+	inode_item = &root_item.inode;
+	btrfs_set_inode_generation(inode_item, 1);
+	btrfs_set_inode_size(inode_item, 3);
+	btrfs_set_inode_nlink(inode_item, 1);
+	btrfs_set_inode_nblocks(inode_item, 1);
+	btrfs_set_inode_mode(inode_item, S_IFDIR | 0755);
+
+	btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
+	btrfs_set_root_refs(&root_item, 1);
+	brelse(subvol);
+	subvol = NULL;
+
+	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+				       0, &objectid);
+	BUG_ON(ret);
+
+	btrfs_set_root_dirid(&root_item, new_dirid);
+
+	key.objectid = objectid;
+	key.offset = 1;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				&root_item);
+	BUG_ON(ret);
+
+	/*
+	 * insert the directory item
+	 */
+	key.offset = (u64)-1;
+	dir = root->fs_info->sb->s_root->d_inode;
+	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
+				    name, namelen, dir->i_ino, &key,
+				    BTRFS_FT_DIR);
+	BUG_ON(ret);
+
+	ret = btrfs_commit_transaction(trans, root);
+	BUG_ON(ret);
+
+	/* failures come back as ERR_PTR (see fixup_tree_root_location) */
+	new_root = btrfs_read_fs_root(root->fs_info, &key);
+	BUG_ON(!new_root || IS_ERR(new_root));
+
+	trans = btrfs_start_transaction(new_root, 1);
+	BUG_ON(!trans);
+
+	inode = btrfs_new_inode(trans, new_root, new_dirid,
+				BTRFS_I(dir)->block_group, S_IFDIR | 0700);
+	/* btrfs_new_inode() reports allocation failure as an ERR_PTR */
+	BUG_ON(IS_ERR(inode));
+	inode->i_op = &btrfs_dir_inode_operations;
+	inode->i_fop = &btrfs_dir_file_operations;
+
+	ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
+	BUG_ON(ret);
+
+	inode->i_nlink = 1;
+	inode->i_size = 6;
+	ret = btrfs_update_inode(trans, new_root, inode);
+	BUG_ON(ret);
+
+	ret = btrfs_commit_transaction(trans, new_root);
+	BUG_ON(ret);
+
+	iput(inode);
+
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
+	return 0;
+}
+
+/*
+ * Snapshot 'root' under 'name': copy its root item under a new objectid,
+ * pointing at the current root node, add a directory item for it in the
+ * tree of tree roots, take a reference on the shared blocks and commit.
+ *
+ * Returns 0, or -EINVAL when the root does not have ref_cows set.
+ */
+static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_root_item new_root_item;
+	int ret;
+	u64 objectid;
+
+	if (!root->ref_cows)
+		return -EINVAL;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	ret = btrfs_update_inode(trans, root, root->inode);
+	BUG_ON(ret);
+
+	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+				       0, &objectid);
+	BUG_ON(ret);
+
+	memcpy(&new_root_item, &root->root_item,
+	       sizeof(new_root_item));
+
+	key.objectid = objectid;
+	key.offset = 1;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node));
+
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				&new_root_item);
+	BUG_ON(ret);
+
+	/*
+	 * insert the directory item
+	 */
+	key.offset = (u64)-1;
+	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
+				    name, namelen,
+				    root->fs_info->sb->s_root->d_inode->i_ino,
+				    &key, BTRFS_FT_DIR);
+
+	BUG_ON(ret);
+
+	ret = btrfs_inc_root_ref(trans, root);
+	BUG_ON(ret);
+
+	ret = btrfs_commit_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
+	return 0;
+}
+
+/*
+ * ioctl entry point.  BTRFS_IOC_SNAP_CREATE creates a subvolume when it
+ * is issued on the tree of tree roots and a snapshot of the inode's root
+ * otherwise.  The name must not already exist in the top directory.
+ *
+ * Returns 0, -EFAULT, -EINVAL, -ENOMEM, -EEXIST, -ENOTTY for unknown
+ * commands, or the error from the create helper.
+ */
+int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
+		cmd, unsigned long arg)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ioctl_vol_args vol_args;
+	int ret = 0;
+	struct btrfs_dir_item *di;
+	int namelen;
+	struct btrfs_path *path;
+	u64 root_dirid;
+
+	switch (cmd) {
+	case BTRFS_IOC_SNAP_CREATE:
+		if (copy_from_user(&vol_args,
+				   (struct btrfs_ioctl_vol_args __user *)arg,
+				   sizeof(vol_args)))
+			return -EFAULT;
+		/*
+		 * The name comes from user space and may lack a NUL, so
+		 * bound the scan; an unterminated name then fails the
+		 * length check below.
+		 * NOTE(review): assumes vol_args.name holds at least
+		 * BTRFS_VOL_NAME_MAX + 1 bytes - confirm in the header.
+		 */
+		namelen = strnlen(vol_args.name, BTRFS_VOL_NAME_MAX + 1);
+		if (namelen > BTRFS_VOL_NAME_MAX)
+			return -EINVAL;
+		path = btrfs_alloc_path();
+		if (!path)
+			return -ENOMEM;
+		root_dirid = root->fs_info->sb->s_root->d_inode->i_ino;
+		mutex_lock(&root->fs_info->fs_mutex);
+		di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
+				    path, root_dirid,
+				    vol_args.name, namelen, 0);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		btrfs_free_path(path);
+		if (di && !IS_ERR(di))
+			return -EEXIST;
+
+		if (root == root->fs_info->tree_root)
+			ret = create_subvol(root, vol_args.name, namelen);
+		else
+			ret = create_snapshot(root, vol_args.name, namelen);
+		WARN_ON(ret);
+		break;
+	default:
+		return -ENOTTY;
+	}
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/* compat ioctl: convert the user pointer and call btrfs_ioctl() under the BKL */
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int ret;
+	lock_kernel();
+	ret = btrfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
+	unlock_kernel();
+	return ret;
+
+}
+#endif
+
+/*
+ * Called inside transaction, so use GFP_NOFS
+ */
+struct inode *btrfs_alloc_inode(struct super_block *sb)
+{
+	struct btrfs_inode *ei;
+
+	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+/* give the btrfs_inode back to its slab; warns if it is still in use */
+void btrfs_destroy_inode(struct inode *inode)
+{
+	WARN_ON(!list_empty(&inode->i_dentry));
+	WARN_ON(inode->i_data.nrpages);
+
+	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+
+/* slab constructor for btrfs_inode_cachep */
+static void init_once(void * foo, struct kmem_cache * cachep,
+		      unsigned long flags)
+{
+	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+
+/*
+ * Destroy every slab cache that was created.  Caches that were never
+ * created are skipped, so this also serves as the error path of
+ * btrfs_init_cachep().
+ */
+void btrfs_destroy_cachep(void)
+{
+	if (btrfs_inode_cachep)
+		kmem_cache_destroy(btrfs_inode_cachep);
+	if (btrfs_trans_handle_cachep)
+		kmem_cache_destroy(btrfs_trans_handle_cachep);
+	if (btrfs_transaction_cachep)
+		kmem_cache_destroy(btrfs_transaction_cachep);
+	if (btrfs_bit_radix_cachep)
+		kmem_cache_destroy(btrfs_bit_radix_cachep);
+	if (btrfs_path_cachep)
+		kmem_cache_destroy(btrfs_path_cachep);
+}
+
+/*
+ * Create the slab caches used by btrfs.  Returns 0, or -ENOMEM after
+ * tearing down whatever had already been created.
+ */
+int btrfs_init_cachep(void)
+{
+	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+					     sizeof(struct btrfs_inode),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once, NULL);
+	if (!btrfs_inode_cachep)
+		goto fail;
+	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+					     sizeof(struct btrfs_trans_handle),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     NULL, NULL);
+	if (!btrfs_trans_handle_cachep)
+		goto fail;
+	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+					     sizeof(struct btrfs_transaction),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     NULL, NULL);
+	if (!btrfs_transaction_cachep)
+		goto fail;
+	/* objects in this cache are paths, so size them as struct btrfs_path */
+	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+					     sizeof(struct btrfs_path),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     NULL, NULL);
+	if (!btrfs_path_cachep)
+		goto fail;
+	btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix",
+					     256,
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD |
+						SLAB_DESTROY_BY_RCU),
+					     NULL, NULL);
+	if (!btrfs_bit_radix_cachep)
+		goto fail;
+	return 0;
+fail:
+	btrfs_destroy_cachep();
+	return -ENOMEM;
+}
+
+/* getattr hook: generic attributes, with blksize reported as 256KB */
+static int btrfs_getattr(struct vfsmount *mnt,
+			 struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	generic_fillattr(inode, stat);
+	stat->blksize = 256 * 1024;
+	return 0;
+}
+
+static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
+			struct inode * new_dir,struct dentry *new_dentry)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(old_dir)->root;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct timespec ctime = CURRENT_TIME;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *di;
+	int ret;
+
+	if 
(S_ISDIR(old_inode->i_mode) && new_inode && + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { + return -ENOTEMPTY; + } + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, new_dir); + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out_fail; + } + + old_dentry->d_inode->i_nlink++; + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) { + struct btrfs_key *location = &BTRFS_I(new_dir)->location; + u64 old_parent_oid; + di = btrfs_lookup_dir_item(trans, root, path, old_inode->i_ino, + "..", 2, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out_fail; + } + if (!di) { + ret = -ENOENT; + goto out_fail; + } + old_parent_oid = btrfs_disk_key_objectid(&di->location); + ret = btrfs_del_item(trans, root, path); + if (ret) { + ret = -EIO; + goto out_fail; + } + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_index_item(trans, root, path, + old_inode->i_ino, + old_parent_oid, + "..", 2, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out_fail; + } + if (!di) { + ret = -ENOENT; + goto out_fail; + } + ret = btrfs_del_item(trans, root, path); + if (ret) { + ret = -EIO; + goto out_fail; + } + btrfs_release_path(root, path); + + ret = btrfs_insert_dir_item(trans, root, "..", 2, + old_inode->i_ino, location, + BTRFS_FT_DIR); + if (ret) + goto out_fail; + } + + + ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry); + if (ret) + goto out_fail; + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; + ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry); + if (ret) + goto out_fail; + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else + drop_nlink(new_inode); + btrfs_update_inode(trans, root, new_inode); + } + ret = btrfs_add_link(trans, new_dentry, old_inode); + if (ret) + goto out_fail; + +out_fail: + btrfs_free_path(path); + 
btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + return ret; +} + +static int btrfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct inode *inode; + int err; + int drop_inode = 0; + u64 objectid; + int name_len; + int datasize; + char *ptr; + struct btrfs_file_extent_item *ei; + + name_len = strlen(symname) + 1; + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, objectid, + BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + if (drop_inode) + goto out_unlock; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = inode->i_ino; + key.offset = 0; + key.flags = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(name_len); + err = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(err); + ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), + path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(ei, trans->transid); + btrfs_set_file_extent_type(ei, + BTRFS_FILE_EXTENT_INLINE); + ptr = 
btrfs_file_extent_inline_start(ei); + btrfs_memcpy(root, path->nodes[0]->b_data, + ptr, symname, name_len); + mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_size = name_len - 1; + btrfs_update_inode(trans, root, inode); + err = 0; + +out_unlock: + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root); + return err; +} + +static struct inode_operations btrfs_dir_inode_operations = { + .lookup = btrfs_lookup, + .create = btrfs_create, + .unlink = btrfs_unlink, + .link = btrfs_link, + .mkdir = btrfs_mkdir, + .rmdir = btrfs_rmdir, + .rename = btrfs_rename, + .symlink = btrfs_symlink, + .setattr = btrfs_setattr, +}; + +static struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, +}; + +static struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = btrfs_readdir, + .ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_compat_ioctl, +#endif +}; + +static struct address_space_operations btrfs_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .sync_page = block_sync_page, + .prepare_write = btrfs_prepare_write, + .commit_write = btrfs_commit_write, + .bmap = btrfs_bmap, +}; + +static struct address_space_operations btrfs_symlink_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, +}; + +static struct inode_operations btrfs_file_inode_operations = { + .truncate = btrfs_truncate, + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, +}; + +static struct inode_operations btrfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 
9749090cfbb8..94b3535a2a11 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -20,2686 +20,101 @@ #include "ioctl.h" #include "print-tree.h" -struct btrfs_iget_args { - u64 ino; - struct btrfs_root *root; -}; - -#define BTRFS_SUPER_MAGIC 0x9123682E - -static struct inode_operations btrfs_dir_inode_operations; -static struct inode_operations btrfs_symlink_inode_operations; -static struct inode_operations btrfs_dir_ro_inode_operations; -static struct super_operations btrfs_super_ops; -static struct file_operations btrfs_dir_file_operations; -static struct inode_operations btrfs_file_inode_operations; -static struct address_space_operations btrfs_aops; -static struct address_space_operations btrfs_symlink_aops; -static struct file_operations btrfs_file_operations; - -static int drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, - u64 start, u64 end, u64 *hint_block); -static int btrfs_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *result, int create); - - -#define S_SHIFT 12 -static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, - [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, - [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, -}; - -static void btrfs_read_locked_inode(struct inode *inode) -{ - struct btrfs_path *path; - struct btrfs_inode_item *inode_item; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_key location; - u64 alloc_group_block; - int ret; - - path = btrfs_alloc_path(); - BUG_ON(!path); - btrfs_init_path(path); - mutex_lock(&root->fs_info->fs_mutex); - - memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); - ret = btrfs_lookup_inode(NULL, root, path, &location, 0); - if (ret) { - btrfs_free_path(path); - goto make_bad; - } - inode_item = 
btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], - struct btrfs_inode_item); - - inode->i_mode = btrfs_inode_mode(inode_item); - inode->i_nlink = btrfs_inode_nlink(inode_item); - inode->i_uid = btrfs_inode_uid(inode_item); - inode->i_gid = btrfs_inode_gid(inode_item); - inode->i_size = btrfs_inode_size(inode_item); - inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime); - inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime); - inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime); - inode->i_blocks = btrfs_inode_nblocks(inode_item); - inode->i_generation = btrfs_inode_generation(inode_item); - alloc_group_block = btrfs_inode_block_group(inode_item); - BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info, - alloc_group_block); - - btrfs_free_path(path); - inode_item = NULL; - - mutex_unlock(&root->fs_info->fs_mutex); - - switch (inode->i_mode & S_IFMT) { -#if 0 - default: - init_special_inode(inode, inode->i_mode, - btrfs_inode_rdev(inode_item)); - break; -#endif - case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - break; - case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - if (root == root->fs_info->tree_root) - inode->i_op = &btrfs_dir_ro_inode_operations; - else - inode->i_op = &btrfs_dir_inode_operations; - break; - case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode->i_mapping->a_ops = &btrfs_symlink_aops; - break; - } - return; - -make_bad: - btrfs_release_path(root, path); - btrfs_free_path(path); - mutex_unlock(&root->fs_info->fs_mutex); - make_bad_inode(inode); -} - -static void fill_inode_item(struct btrfs_inode_item *item, - struct inode *inode) -{ - 
btrfs_set_inode_uid(item, inode->i_uid); - btrfs_set_inode_gid(item, inode->i_gid); - btrfs_set_inode_size(item, inode->i_size); - btrfs_set_inode_mode(item, inode->i_mode); - btrfs_set_inode_nlink(item, inode->i_nlink); - btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec); - btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec); - btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec); - btrfs_set_inode_nblocks(item, inode->i_blocks); - btrfs_set_inode_generation(item, inode->i_generation); - btrfs_set_inode_block_group(item, - BTRFS_I(inode)->block_group->key.objectid); -} - -static int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) -{ - struct btrfs_inode_item *inode_item; - struct btrfs_path *path; - int ret; - - path = btrfs_alloc_path(); - BUG_ON(!path); - btrfs_init_path(path); - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(inode)->location, 1); - if (ret) { - if (ret > 0) - ret = -ENOENT; - goto failed; - } - - inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], - struct btrfs_inode_item); - - fill_inode_item(inode_item, inode); - btrfs_mark_buffer_dirty(path->nodes[0]); - ret = 0; -failed: - btrfs_release_path(root, path); - btrfs_free_path(path); - return ret; -} - - -static int btrfs_unlink_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, - struct dentry *dentry) -{ - struct btrfs_path *path; - const char *name = dentry->d_name.name; - int name_len = dentry->d_name.len; - int ret = 0; - u64 objectid; - struct btrfs_dir_item *di; - - path = btrfs_alloc_path(); - BUG_ON(!path); - btrfs_init_path(path); - di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, - name, name_len, -1); - if (IS_ERR(di)) { - ret = 
PTR_ERR(di); - goto err; - } - if (!di) { - ret = -ENOENT; - goto err; - } - objectid = btrfs_disk_key_objectid(&di->location); - ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); - btrfs_release_path(root, path); - - di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, - objectid, name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto err; - } - if (!di) { - ret = -ENOENT; - goto err; - } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); - - dentry->d_inode->i_ctime = dir->i_ctime; -err: - btrfs_free_path(path); - if (!ret) { - dir->i_size -= name_len * 2; - btrfs_update_inode(trans, root, dir); - drop_nlink(dentry->d_inode); - btrfs_update_inode(trans, root, dentry->d_inode); - dir->i_sb->s_dirt = 1; - } - return ret; -} - -static int btrfs_unlink(struct inode *dir, struct dentry *dentry) -{ - struct btrfs_root *root; - struct btrfs_trans_handle *trans; - int ret; - - root = BTRFS_I(dir)->root; - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, dir); - ret = btrfs_unlink_trans(trans, root, dir, dentry); - btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - btrfs_btree_balance_dirty(root); - return ret; -} - -static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - int err; - int ret; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_trans_handle *trans; - struct btrfs_key found_key; - int found_type; - struct btrfs_leaf *leaf; - char *goodnames = ".."; - - path = btrfs_alloc_path(); - BUG_ON(!path); - btrfs_init_path(path); - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, dir); - key.objectid = inode->i_ino; - key.offset = (u64)-1; - key.flags = (u32)-1; - while(1) { - ret = btrfs_search_slot(trans, root, &key, 
path, -1, 1); - if (ret < 0) { - err = ret; - goto out; - } - BUG_ON(ret == 0); - if (path->slots[0] == 0) { - err = -ENOENT; - goto out; - } - path->slots[0]--; - leaf = btrfs_buffer_leaf(path->nodes[0]); - btrfs_disk_key_to_cpu(&found_key, - &leaf->items[path->slots[0]].key); - found_type = btrfs_key_type(&found_key); - if (found_key.objectid != inode->i_ino) { - err = -ENOENT; - goto out; - } - if ((found_type != BTRFS_DIR_ITEM_KEY && - found_type != BTRFS_DIR_INDEX_KEY) || - (!btrfs_match_dir_item_name(root, path, goodnames, 2) && - !btrfs_match_dir_item_name(root, path, goodnames, 1))) { - err = -ENOTEMPTY; - goto out; - } - ret = btrfs_del_item(trans, root, path); - BUG_ON(ret); - - if (found_type == BTRFS_DIR_ITEM_KEY && found_key.offset == 1) - break; - btrfs_release_path(root, path); - } - ret = 0; - btrfs_release_path(root, path); - - /* now the directory is empty */ - err = btrfs_unlink_trans(trans, root, dir, dentry); - if (!err) { - inode->i_size = 0; - } -out: - btrfs_release_path(root, path); - btrfs_free_path(path); - mutex_unlock(&root->fs_info->fs_mutex); - ret = btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root); - if (ret && !err) - err = ret; - return err; -} - -static int btrfs_free_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) -{ - struct btrfs_path *path; - int ret; - - clear_inode(inode); - - path = btrfs_alloc_path(); - BUG_ON(!path); - btrfs_init_path(path); - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(inode)->location, -1); - BUG_ON(ret); - ret = btrfs_del_item(trans, root, path); - BUG_ON(ret); - btrfs_free_path(path); - return ret; -} - -static void reada_truncate(struct btrfs_root *root, struct btrfs_path *path, - u64 objectid) -{ - struct btrfs_node *node; - int i; - int nritems; - u64 item_objectid; - u64 blocknr; - int slot; - int ret; - - if (!path->nodes[1]) - return; - node = btrfs_buffer_node(path->nodes[1]); - slot = path->slots[1]; - if (slot == 0) - 
return; - nritems = btrfs_header_nritems(&node->header); - for (i = slot - 1; i >= 0; i--) { - item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key); - if (item_objectid != objectid) - break; - blocknr = btrfs_node_blockptr(node, i); - ret = readahead_tree_block(root, blocknr); - if (ret) - break; - } -} - -static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) -{ - int ret; - struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_disk_key *found_key; - u32 found_type; - struct btrfs_leaf *leaf; - struct btrfs_file_extent_item *fi; - u64 extent_start = 0; - u64 extent_num_blocks = 0; - u64 item_end = 0; - int found_extent; - int del_item; - - path = btrfs_alloc_path(); - BUG_ON(!path); - /* FIXME, add redo link to tree so we don't leak on crash */ - key.objectid = inode->i_ino; - key.offset = (u64)-1; - key.flags = (u32)-1; - while(1) { - btrfs_init_path(path); - fi = NULL; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - goto error; - } - if (ret > 0) { - BUG_ON(path->slots[0] == 0); - path->slots[0]--; - } - reada_truncate(root, path, inode->i_ino); - leaf = btrfs_buffer_leaf(path->nodes[0]); - found_key = &leaf->items[path->slots[0]].key; - found_type = btrfs_disk_key_type(found_key); - if (btrfs_disk_key_objectid(found_key) != inode->i_ino) - break; - if (found_type != BTRFS_CSUM_ITEM_KEY && - found_type != BTRFS_DIR_ITEM_KEY && - found_type != BTRFS_DIR_INDEX_KEY && - found_type != BTRFS_EXTENT_DATA_KEY) - break; - item_end = btrfs_disk_key_offset(found_key); - if (found_type == BTRFS_EXTENT_DATA_KEY) { - fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(fi) != - BTRFS_FILE_EXTENT_INLINE) { - item_end += btrfs_file_extent_num_blocks(fi) << - inode->i_blkbits; - } - } - if (found_type == BTRFS_CSUM_ITEM_KEY) { - ret = btrfs_csum_truncate(trans, root, path, - 
inode->i_size); - BUG_ON(ret); - } - if (item_end < inode->i_size) { - if (found_type) { - btrfs_set_key_type(&key, found_type - 1); - continue; - } - break; - } - if (btrfs_disk_key_offset(found_key) >= inode->i_size) - del_item = 1; - else - del_item = 0; - found_extent = 0; - - if (found_type == BTRFS_EXTENT_DATA_KEY && - btrfs_file_extent_type(fi) != - BTRFS_FILE_EXTENT_INLINE) { - u64 num_dec; - if (!del_item) { - u64 orig_num_blocks = - btrfs_file_extent_num_blocks(fi); - extent_num_blocks = inode->i_size - - btrfs_disk_key_offset(found_key) + - root->blocksize - 1; - extent_num_blocks >>= inode->i_blkbits; - btrfs_set_file_extent_num_blocks(fi, - extent_num_blocks); - inode->i_blocks -= (orig_num_blocks - - extent_num_blocks) << 3; - mark_buffer_dirty(path->nodes[0]); - } else { - extent_start = - btrfs_file_extent_disk_blocknr(fi); - extent_num_blocks = - btrfs_file_extent_disk_num_blocks(fi); - /* FIXME blocksize != 4096 */ - num_dec = btrfs_file_extent_num_blocks(fi) << 3; - if (extent_start != 0) { - found_extent = 1; - inode->i_blocks -= num_dec; - } - } - } - if (del_item) { - ret = btrfs_del_item(trans, root, path); - BUG_ON(ret); - } else { - break; - } - btrfs_release_path(root, path); - if (found_extent) { - ret = btrfs_free_extent(trans, root, extent_start, - extent_num_blocks, 0); - BUG_ON(ret); - } - } - ret = 0; -error: - btrfs_release_path(root, path); - btrfs_free_path(path); - inode->i_sb->s_dirt = 1; - return ret; -} - -static int btrfs_truncate_page(struct address_space *mapping, loff_t from) -{ - struct inode *inode = mapping->host; - unsigned blocksize = 1 << inode->i_blkbits; - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - struct page *page; - char *kaddr; - int ret = 0; - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 alloc_hint; - struct btrfs_key ins; - struct btrfs_trans_handle *trans; - - if ((offset & (blocksize - 1)) == 0) - goto out; - - ret = -ENOMEM; - page = 
grab_cache_page(mapping, index); - if (!page) - goto out; - - if (!PageUptodate(page)) { - ret = mpage_readpage(page, btrfs_get_block); - lock_page(page); - if (!PageUptodate(page)) { - ret = -EIO; - goto out; - } - } - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - - ret = drop_extents(trans, root, inode, page->index << PAGE_CACHE_SHIFT, - (page->index + 1) << PAGE_CACHE_SHIFT, &alloc_hint); - BUG_ON(ret); - ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, - alloc_hint, (u64)-1, &ins, 1); - BUG_ON(ret); - ret = btrfs_insert_file_extent(trans, root, inode->i_ino, - page->index << PAGE_CACHE_SHIFT, - ins.objectid, 1, 1); - BUG_ON(ret); - SetPageChecked(page); - kaddr = kmap(page); - memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); - flush_dcache_page(page); - btrfs_csum_file_block(trans, root, inode->i_ino, - page->index << PAGE_CACHE_SHIFT, - kaddr, PAGE_CACHE_SIZE); - kunmap(page); - btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); -out: - return ret; -} - -static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = dentry->d_inode; - int err; - - err = inode_change_ok(inode, attr); - if (err) - return err; - - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 mask = root->blocksize - 1; - u64 pos = (inode->i_size + mask) & ~mask; - u64 hole_size; - - if (attr->ia_size <= pos) - goto out; - - btrfs_truncate_page(inode->i_mapping, inode->i_size); - - hole_size = (attr->ia_size - pos + mask) & ~mask; - hole_size >>= inode->i_blkbits; - - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - err = btrfs_insert_file_extent(trans, 
root, inode->i_ino, - pos, 0, 0, hole_size); - BUG_ON(err); - btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - } -out: - err = inode_setattr(inode, attr); - - return err; -} -static void btrfs_delete_inode(struct inode *inode) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - - truncate_inode_pages(&inode->i_data, 0); - if (is_bad_inode(inode)) { - goto no_delete; - } - inode->i_size = 0; - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - ret = btrfs_truncate_in_trans(trans, root, inode); - BUG_ON(ret); - btrfs_free_inode(trans, root, inode); - btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - btrfs_btree_balance_dirty(root); - return; -no_delete: - clear_inode(inode); -} - -static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, - struct btrfs_key *location) -{ - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct btrfs_dir_item *di; - struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; - int ret; - - path = btrfs_alloc_path(); - BUG_ON(!path); - btrfs_init_path(path); - di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, - namelen, 0); - if (!di || IS_ERR(di)) { - location->objectid = 0; - ret = 0; - goto out; - } - btrfs_disk_key_to_cpu(location, &di->location); -out: - btrfs_release_path(root, path); - btrfs_free_path(path); - return ret; -} - -static int fixup_tree_root_location(struct btrfs_root *root, - struct btrfs_key *location, - struct btrfs_root **sub_root) -{ - struct btrfs_path *path; - struct btrfs_root_item *ri; - - if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) - return 0; - if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) - return 0; - - path = btrfs_alloc_path(); - BUG_ON(!path); - mutex_lock(&root->fs_info->fs_mutex); - - *sub_root = btrfs_read_fs_root(root->fs_info, 
location); - if (IS_ERR(*sub_root)) - return PTR_ERR(*sub_root); - - ri = &(*sub_root)->root_item; - location->objectid = btrfs_root_dirid(ri); - location->flags = 0; - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - location->offset = 0; - - btrfs_free_path(path); - mutex_unlock(&root->fs_info->fs_mutex); - return 0; -} - -static int btrfs_init_locked_inode(struct inode *inode, void *p) -{ - struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; - BTRFS_I(inode)->root = args->root; - return 0; -} - -static int btrfs_find_actor(struct inode *inode, void *opaque) -{ - struct btrfs_iget_args *args = opaque; - return (args->ino == inode->i_ino && - args->root == BTRFS_I(inode)->root); -} - -static struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, - struct btrfs_root *root) -{ - struct inode *inode; - struct btrfs_iget_args args; - args.ino = objectid; - args.root = root; - - inode = iget5_locked(s, objectid, btrfs_find_actor, - btrfs_init_locked_inode, - (void *)&args); - return inode; -} - -static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) -{ - struct inode * inode; - struct btrfs_inode *bi = BTRFS_I(dir); - struct btrfs_root *root = bi->root; - struct btrfs_root *sub_root = root; - struct btrfs_key location; - int ret; - - if (dentry->d_name.len > BTRFS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - mutex_lock(&root->fs_info->fs_mutex); - ret = btrfs_inode_by_name(dir, dentry, &location); - mutex_unlock(&root->fs_info->fs_mutex); - if (ret < 0) - return ERR_PTR(ret); - inode = NULL; - if (location.objectid) { - ret = fixup_tree_root_location(root, &location, &sub_root); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return ERR_PTR(-ENOENT); - inode = btrfs_iget_locked(dir->i_sb, location.objectid, - sub_root); - if (!inode) - return ERR_PTR(-EACCES); - if (inode->i_state & I_NEW) { - if (sub_root != root) { -printk("adding new root for inode %lu root %p (found %p)\n", 
inode->i_ino, sub_root, BTRFS_I(inode)->root); - igrab(inode); - sub_root->inode = inode; - } - BTRFS_I(inode)->root = sub_root; - memcpy(&BTRFS_I(inode)->location, &location, - sizeof(location)); - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - } - return d_splice_alias(inode, dentry); -} - -static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path, - u64 objectid) -{ - struct btrfs_node *node; - int i; - u32 nritems; - u64 item_objectid; - u64 blocknr; - int slot; - int ret; - - if (!path->nodes[1]) - return; - node = btrfs_buffer_node(path->nodes[1]); - slot = path->slots[1]; - nritems = btrfs_header_nritems(&node->header); - for (i = slot + 1; i < nritems; i++) { - item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key); - if (item_objectid != objectid) - break; - blocknr = btrfs_node_blockptr(node, i); - ret = readahead_tree_block(root, blocknr); - if (ret) - break; - } -} -static unsigned char btrfs_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - -static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_item *item; - struct btrfs_dir_item *di; - struct btrfs_key key; - struct btrfs_path *path; - int ret; - u32 nritems; - struct btrfs_leaf *leaf; - int slot; - int advance; - unsigned char d_type; - int over = 0; - u32 di_cur; - u32 di_total; - u32 di_len; - int key_type = BTRFS_DIR_INDEX_KEY; - - /* FIXME, use a real flag for deciding about the key type */ - if (root->fs_info->tree_root == root) - key_type = BTRFS_DIR_ITEM_KEY; - mutex_lock(&root->fs_info->fs_mutex); - key.objectid = inode->i_ino; - key.flags = 0; - btrfs_set_key_type(&key, key_type); - key.offset = filp->f_pos; - path = btrfs_alloc_path(); - btrfs_init_path(path); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - advance = 0; - 
reada_leaves(root, path, inode->i_ino); - while(1) { - leaf = btrfs_buffer_leaf(path->nodes[0]); - nritems = btrfs_header_nritems(&leaf->header); - slot = path->slots[0]; - if (advance || slot >= nritems) { - if (slot >= nritems -1) { - reada_leaves(root, path, inode->i_ino); - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = btrfs_buffer_leaf(path->nodes[0]); - nritems = btrfs_header_nritems(&leaf->header); - slot = path->slots[0]; - } else { - slot++; - path->slots[0]++; - } - } - advance = 1; - item = leaf->items + slot; - if (btrfs_disk_key_objectid(&item->key) != key.objectid) - break; - if (btrfs_disk_key_type(&item->key) != key_type) - break; - if (btrfs_disk_key_offset(&item->key) < filp->f_pos) - continue; - filp->f_pos = btrfs_disk_key_offset(&item->key); - advance = 1; - di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - di_cur = 0; - di_total = btrfs_item_size(leaf->items + slot); - while(di_cur < di_total) { - d_type = btrfs_filetype_table[btrfs_dir_type(di)]; - over = filldir(dirent, (const char *)(di + 1), - btrfs_dir_name_len(di), - btrfs_disk_key_offset(&item->key), - btrfs_disk_key_objectid(&di->location), - d_type); - if (over) - goto nopos; - di_len = btrfs_dir_name_len(di) + sizeof(*di); - di_cur += di_len; - di = (struct btrfs_dir_item *)((char *)di + di_len); - } - } - filp->f_pos++; -nopos: - ret = 0; -err: - btrfs_release_path(root, path); - btrfs_free_path(path); - mutex_unlock(&root->fs_info->fs_mutex); - return ret; -} - -static void btrfs_put_super (struct super_block * sb) -{ - struct btrfs_root *root = btrfs_sb(sb); - int ret; - - ret = close_ctree(root); - if (ret) { - printk("close ctree returns %d\n", ret); - } - sb->s_fs_info = NULL; -} - -static int btrfs_fill_super(struct super_block * sb, void * data, int silent) -{ - struct inode * inode; - struct dentry * root_dentry; - struct btrfs_super_block *disk_super; - struct btrfs_root *tree_root; - struct btrfs_inode *bi; - - sb->s_maxbytes = MAX_LFS_FILESIZE; - 
sb->s_magic = BTRFS_SUPER_MAGIC; - sb->s_op = &btrfs_super_ops; - sb->s_time_gran = 1; - - tree_root = open_ctree(sb); - - if (!tree_root) { - printk("btrfs: open_ctree failed\n"); - return -EIO; - } - sb->s_fs_info = tree_root; - disk_super = tree_root->fs_info->disk_super; - printk("read in super total blocks %Lu root %Lu\n", - btrfs_super_total_blocks(disk_super), - btrfs_super_root_dir(disk_super)); - - inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super), - tree_root); - bi = BTRFS_I(inode); - bi->location.objectid = inode->i_ino; - bi->location.offset = 0; - bi->location.flags = 0; - bi->root = tree_root; - btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); - - if (!inode) - return -ENOMEM; - if (inode->i_state & I_NEW) { - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - - root_dentry = d_alloc_root(inode); - if (!root_dentry) { - iput(inode); - return -ENOMEM; - } - sb->s_root = root_dentry; - btrfs_transaction_queue_work(tree_root, HZ * 30); - return 0; -} - -static int btrfs_write_inode(struct inode *inode, int wait) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - int ret = 0; - - if (wait) { - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - ret = btrfs_commit_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - } - return ret; -} - -static void btrfs_dirty_inode(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - btrfs_btree_balance_dirty(root); -} - -static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, - struct 
btrfs_block_group_cache *group, - int mode) -{ - struct inode *inode; - struct btrfs_inode_item inode_item; - struct btrfs_key *location; - int ret; - int owner; - - inode = new_inode(root->fs_info->sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - BTRFS_I(inode)->root = root; - if (mode & S_IFDIR) - owner = 0; - else - owner = 1; - group = btrfs_find_block_group(root, group, 0, 0, owner); - BTRFS_I(inode)->block_group = group; - - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - inode->i_mode = mode; - inode->i_ino = objectid; - inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - fill_inode_item(&inode_item, inode); - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->flags = 0; - location->offset = 0; - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - - ret = btrfs_insert_inode(trans, root, objectid, &inode_item); - BUG_ON(ret); - - insert_inode_hash(inode); - return inode; -} - -static inline u8 btrfs_inode_type(struct inode *inode) -{ - return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; -} - -static int btrfs_add_link(struct btrfs_trans_handle *trans, - struct dentry *dentry, struct inode *inode) -{ - int ret; - struct btrfs_key key; - struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root; - key.objectid = inode->i_ino; - key.flags = 0; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; - - ret = btrfs_insert_dir_item(trans, root, - dentry->d_name.name, dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, - &key, btrfs_inode_type(inode)); - if (ret == 0) { - dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2; - ret = btrfs_update_inode(trans, root, - dentry->d_parent->d_inode); - } - return ret; -} - -static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct dentry *dentry, struct inode *inode) -{ - int err = btrfs_add_link(trans, dentry, inode); - if (!err) { - d_instantiate(dentry, inode); - return 0; 
- } - if (err > 0) - err = -EEXIST; - return err; -} - -static int btrfs_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode; - int err; - int drop_inode = 0; - u64 objectid; - - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, dir); - - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - - inode = btrfs_new_inode(trans, root, objectid, - BTRFS_I(dir)->block_group, mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_unlock; - - btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode); - if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - } - dir->i_sb->s_dirt = 1; - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); -out_unlock: - btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - - if (drop_inode) { - inode_dec_link_count(inode); - iput(inode); - } - btrfs_btree_balance_dirty(root); - return err; -} - -static int btrfs_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = old_dentry->d_inode; - int err; - int drop_inode = 0; - - if (inode->i_nlink == 0) - return -ENOENT; - - inc_nlink(inode); - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, dir); - atomic_inc(&inode->i_count); - err = btrfs_add_nondir(trans, dentry, inode); - if (err) - drop_inode = 1; - dir->i_sb->s_dirt = 1; - btrfs_update_inode_block_group(trans, dir); - btrfs_update_inode(trans, root, inode); 
- - btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - - if (drop_inode) { - inode_dec_link_count(inode); - iput(inode); - } - btrfs_btree_balance_dirty(root); - return err; -} - -static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 dirid) -{ - int ret; - char buf[2]; - struct btrfs_key key; - - buf[0] = '.'; - buf[1] = '.'; - - key.objectid = objectid; - key.offset = 0; - key.flags = 0; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - - ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid, - &key, BTRFS_FT_DIR); - if (ret) - goto error; - key.objectid = dirid; - ret = btrfs_insert_dir_item(trans, root, buf, 2, objectid, - &key, BTRFS_FT_DIR); - if (ret) - goto error; -error: - return ret; -} - -static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct inode *inode; - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - int err = 0; - int drop_on_err = 0; - u64 objectid; - - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, dir); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_unlock; - } - - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - - inode = btrfs_new_inode(trans, root, objectid, - BTRFS_I(dir)->block_group, S_IFDIR | mode); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto out_fail; - } - drop_on_err = 1; - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; - btrfs_set_trans_block_group(trans, inode); - - err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino); - if (err) - goto out_fail; - - inode->i_size = 6; - err = btrfs_update_inode(trans, root, inode); - if (err) - goto out_fail; - err = btrfs_add_link(trans, dentry, inode); - if (err) - goto out_fail; - d_instantiate(dentry, inode); - 
drop_on_err = 0; - dir->i_sb->s_dirt = 1; - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); - -out_fail: - btrfs_end_transaction(trans, root); -out_unlock: - mutex_unlock(&root->fs_info->fs_mutex); - if (drop_on_err) - iput(inode); - btrfs_btree_balance_dirty(root); - return err; -} - -static int btrfs_sync_file(struct file *file, - struct dentry *dentry, int datasync) -{ - struct inode *inode = dentry->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - struct btrfs_trans_handle *trans; - - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; - } - ret = btrfs_commit_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); -out: - return ret > 0 ? EIO : ret; -} - -static int btrfs_sync_fs(struct super_block *sb, int wait) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - int ret; - root = btrfs_sb(sb); - - sb->s_dirt = 0; - if (!wait) { - filemap_flush(root->fs_info->btree_inode->i_mapping); - return 0; - } - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - ret = btrfs_commit_transaction(trans, root); - sb->s_dirt = 0; - BUG_ON(ret); -printk("btrfs sync_fs\n"); - mutex_unlock(&root->fs_info->fs_mutex); - return 0; -} - -#define BTRFS_GET_BLOCK_NO_CREATE 0 -#define BTRFS_GET_BLOCK_CREATE 1 -#define BTRFS_GET_BLOCK_NO_DIRECT 2 - -static int btrfs_get_block_lock(struct inode *inode, sector_t iblock, - struct buffer_head *result, int create) -{ - int ret; - int err = 0; - u64 blocknr; - u64 extent_start = 0; - u64 extent_end = 0; - u64 objectid = inode->i_ino; - u32 found_type; - u64 alloc_hint = 0; - struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_file_extent_item *item; - struct btrfs_leaf *leaf; - struct btrfs_disk_key *found_key; - struct btrfs_trans_handle *trans = NULL; - - path = btrfs_alloc_path(); - BUG_ON(!path); - 
btrfs_init_path(path); - if (create & BTRFS_GET_BLOCK_CREATE) { - WARN_ON(1); - /* this almost but not quite works */ - trans = btrfs_start_transaction(root, 1); - if (!trans) { - err = -ENOMEM; - goto out; - } - ret = drop_extents(trans, root, inode, - iblock << inode->i_blkbits, - (iblock + 1) << inode->i_blkbits, - &alloc_hint); - BUG_ON(ret); - } - - ret = btrfs_lookup_file_extent(NULL, root, path, - inode->i_ino, - iblock << inode->i_blkbits, 0); - if (ret < 0) { - err = ret; - goto out; - } - - if (ret != 0) { - if (path->slots[0] == 0) { - btrfs_release_path(root, path); - goto not_found; - } - path->slots[0]--; - } - - item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0], - struct btrfs_file_extent_item); - leaf = btrfs_buffer_leaf(path->nodes[0]); - blocknr = btrfs_file_extent_disk_blocknr(item); - blocknr += btrfs_file_extent_offset(item); - - /* are we inside the extent that was found? */ - found_key = &leaf->items[path->slots[0]].key; - found_type = btrfs_disk_key_type(found_key); - if (btrfs_disk_key_objectid(found_key) != objectid || - found_type != BTRFS_EXTENT_DATA_KEY) { - extent_end = 0; - extent_start = 0; - goto not_found; - } - found_type = btrfs_file_extent_type(item); - extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key); - if (found_type == BTRFS_FILE_EXTENT_REG) { - extent_start = extent_start >> inode->i_blkbits; - extent_end = extent_start + btrfs_file_extent_num_blocks(item); - err = 0; - if (btrfs_file_extent_disk_blocknr(item) == 0) - goto out; - if (iblock >= extent_start && iblock < extent_end) { - btrfs_map_bh_to_logical(root, result, blocknr + - iblock - extent_start); - goto out; - } - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - char *ptr; - char *map; - u32 size; - - if (create & BTRFS_GET_BLOCK_NO_DIRECT) { - err = -EINVAL; - goto out; - } - size = btrfs_file_extent_inline_len(leaf->items + - path->slots[0]); - extent_end = (extent_start + size) >> inode->i_blkbits; - extent_start 
>>= inode->i_blkbits; - if (iblock < extent_start || iblock > extent_end) { - goto not_found; - } - ptr = btrfs_file_extent_inline_start(item); - map = kmap(result->b_page); - memcpy(map, ptr, size); - memset(map + size, 0, PAGE_CACHE_SIZE - size); - flush_dcache_page(result->b_page); - kunmap(result->b_page); - set_buffer_uptodate(result); - SetPageChecked(result->b_page); - btrfs_map_bh_to_logical(root, result, 0); - } -not_found: - if (create & BTRFS_GET_BLOCK_CREATE) { - struct btrfs_key ins; - ret = btrfs_alloc_extent(trans, root, inode->i_ino, - 1, alloc_hint, (u64)-1, - &ins, 1); - BUG_ON(ret); - ret = btrfs_insert_file_extent(trans, root, inode->i_ino, - iblock << inode->i_blkbits, - ins.objectid, ins.offset, - ins.offset); - BUG_ON(ret); - SetPageChecked(result->b_page); - btrfs_map_bh_to_logical(root, result, ins.objectid); - } -out: - if (trans) - err = btrfs_end_transaction(trans, root); - btrfs_free_path(path); - return err; -} - -static int btrfs_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *result, int create) -{ - int err; - struct btrfs_root *root = BTRFS_I(inode)->root; - mutex_lock(&root->fs_info->fs_mutex); - err = btrfs_get_block_lock(inode, iblock, result, create); - mutex_unlock(&root->fs_info->fs_mutex); - return err; -} - -static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock, - struct buffer_head *result, int create) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - mutex_lock(&root->fs_info->fs_mutex); - btrfs_get_block_lock(inode, iblock, result, BTRFS_GET_BLOCK_NO_DIRECT); - mutex_unlock(&root->fs_info->fs_mutex); - return 0; -} - -static sector_t btrfs_bmap(struct address_space *as, sector_t block) -{ - return generic_block_bmap(as, block, btrfs_get_block_bmap); -} - -static int btrfs_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - return block_prepare_write(page, from, to, btrfs_get_block); -} - -static void btrfs_write_super(struct super_block *sb) -{ 
- sb->s_dirt = 0; -} - -static int btrfs_readpage(struct file *file, struct page *page) -{ - return mpage_readpage(page, btrfs_get_block); -} - -/* - * While block_write_full_page is writing back the dirty buffers under - * the page lock, whoever dirtied the buffers may decide to clean them - * again at any time. We handle that by only looking at the buffer - * state inside lock_buffer(). - * - * If block_write_full_page() is called for regular writeback - * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a - * locked buffer. This only can happen if someone has written the buffer - * directly, with submit_bh(). At the address_space level PageWriteback - * prevents this contention from occurring. - */ -static int __btrfs_write_full_page(struct inode *inode, struct page *page, - struct writeback_control *wbc) -{ - int err; - sector_t block; - sector_t last_block; - struct buffer_head *bh, *head; - const unsigned blocksize = 1 << inode->i_blkbits; - int nr_underway = 0; - - BUG_ON(!PageLocked(page)); - - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; - - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - - /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers - * here, and the (potentially unmapped) buffers may become dirty at - * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the page stays dirty. - * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; - * handle that here by just cleaning them. - */ - - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - head = page_buffers(page); - bh = head; - - /* - * Get all the dirty buffers mapped to disk addresses and - * handle any aliases from the underlying blockdev's mapping. 
- */ - do { - if (block > last_block) { - /* - * mapped buffers outside i_size will occur, because - * this page can be outside i_size when there is a - * truncate in progress. - */ - /* - * The buffer was zeroed by block_write_full_page() - */ - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { - WARN_ON(bh->b_size != blocksize); - err = btrfs_get_block(inode, block, bh, 0); - if (err) { -printk("writepage going to recovery err %d\n", err); - goto recover; - } - if (buffer_new(bh)) { - /* blockdev mappings never come here */ - clear_buffer_new(bh); - } - } - bh = bh->b_this_page; - block++; - } while (bh != head); - - do { - if (!buffer_mapped(bh)) - continue; - /* - * If it's a fully non-blocking write attempt and we cannot - * lock the buffer then redirty the page. Note that this can - * potentially cause a busy-wait loop from pdflush and kswapd - * activity, but those code paths have their own higher-level - * throttling. - */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { - lock_buffer(bh); - } else if (test_set_buffer_locked(bh)) { - redirty_page_for_writepage(wbc, page); - continue; - } - if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) { - mark_buffer_async_write(bh); - } else { - unlock_buffer(bh); - } - } while ((bh = bh->b_this_page) != head); - - /* - * The page and its buffers are protected by PageWriteback(), so we can - * drop the bh refcounts early. - */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - submit_bh(WRITE, bh); - nr_underway++; - } - bh = next; - } while (bh != head); - unlock_page(page); - - err = 0; -done: - if (nr_underway == 0) { - /* - * The page was marked dirty, but the buffers were - * clean. Someone wrote them back by hand with - * ll_rw_block/submit_bh. A rare case. 
- */ - int uptodate = 1; - do { - if (!buffer_uptodate(bh)) { - uptodate = 0; - break; - } - bh = bh->b_this_page; - } while (bh != head); - if (uptodate) - SetPageUptodate(page); - end_page_writeback(page); - } - return err; - -recover: - /* - * ENOSPC, or some other error. We may already have added some - * blocks to the file, so we need to write these out to avoid - * exposing stale data. - * The page is currently locked and not marked for writeback - */ - bh = head; - /* Recovery: lock and submit the mapped buffers */ - do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { - lock_buffer(bh); - mark_buffer_async_write(bh); - } else { - /* - * The buffer may have been set dirty during - * attachment to a dirty page. - */ - clear_buffer_dirty(bh); - } - } while ((bh = bh->b_this_page) != head); - SetPageError(page); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - clear_buffer_dirty(bh); - submit_bh(WRITE, bh); - nr_underway++; - } - bh = next; - } while (bh != head); - unlock_page(page); - goto done; -} - -/* - * The generic ->writepage function for buffer-backed address_spaces - */ -static int btrfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode * const inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset; - void *kaddr; - - /* Is the page fully inside i_size? */ - if (page->index < end_index) - return __btrfs_write_full_page(inode, page, wbc); - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); - if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. 
- */ - block_invalidatepage(page, 0); - unlock_page(page); - return 0; /* don't care */ - } - - /* - * The page straddles i_size. It must be zeroed out on each and every - * writepage invokation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - return __btrfs_write_full_page(inode, page, wbc); -} - -static void btrfs_truncate(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - struct btrfs_trans_handle *trans; - - if (!S_ISREG(inode->i_mode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - - btrfs_truncate_page(inode->i_mapping, inode->i_size); - - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - - /* FIXME, add redo link to tree so we don't leak on crash */ - ret = btrfs_truncate_in_trans(trans, root, inode); - BUG_ON(ret); - btrfs_update_inode(trans, root, inode); - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); - mutex_unlock(&root->fs_info->fs_mutex); - btrfs_btree_balance_dirty(root); -} - -static int btrfs_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - struct inode *inode = page->mapping->host; - struct buffer_head *bh; - loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - - SetPageUptodate(page); - bh = page_buffers(page); - set_buffer_uptodate(bh); - if (buffer_mapped(bh) && bh->b_blocknr != 0) { - set_page_dirty(page); - } - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - return 0; -} - -static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, - struct page 
**prepared_pages, - const char __user * buf) -{ - long page_fault = 0; - int i; - int offset = pos & (PAGE_CACHE_SIZE - 1); - - for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { - size_t count = min_t(size_t, - PAGE_CACHE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[i]; - fault_in_pages_readable(buf, count); - - /* Copy data from userspace to the current page */ - kmap(page); - page_fault = __copy_from_user(page_address(page) + offset, - buf, count); - /* Flush processor's dcache for this page */ - flush_dcache_page(page); - kunmap(page); - buf += count; - write_bytes -= count; - - if (page_fault) - break; - } - return page_fault ? -EFAULT : 0; -} - -static void btrfs_drop_pages(struct page **pages, size_t num_pages) -{ - size_t i; - for (i = 0; i < num_pages; i++) { - if (!pages[i]) - break; - unlock_page(pages[i]); - mark_page_accessed(pages[i]); - page_cache_release(pages[i]); - } -} -static int dirty_and_release_pages(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct file *file, - struct page **pages, - size_t num_pages, - loff_t pos, - size_t write_bytes) -{ - int i; - int offset; - int err = 0; - int ret; - int this_write; - struct inode *inode = file->f_path.dentry->d_inode; - struct buffer_head *bh; - struct btrfs_file_extent_item *ei; - - for (i = 0; i < num_pages; i++) { - offset = pos & (PAGE_CACHE_SIZE -1); - this_write = min(PAGE_CACHE_SIZE - offset, write_bytes); - /* FIXME, one block at a time */ - - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - - bh = page_buffers(pages[i]); - if (buffer_mapped(bh) && bh->b_blocknr == 0) { - struct btrfs_key key; - struct btrfs_path *path; - char *ptr; - u32 datasize; - - path = btrfs_alloc_path(); - BUG_ON(!path); - key.objectid = inode->i_ino; - key.offset = pages[i]->index << PAGE_CACHE_SHIFT; - key.flags = 0; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - 
BUG_ON(write_bytes >= PAGE_CACHE_SIZE); - datasize = offset + - btrfs_file_extent_calc_inline_size(write_bytes); - ret = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - BUG_ON(ret); - ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(ei, trans->transid); - btrfs_set_file_extent_type(ei, - BTRFS_FILE_EXTENT_INLINE); - ptr = btrfs_file_extent_inline_start(ei); - btrfs_memcpy(root, path->nodes[0]->b_data, - ptr, bh->b_data, offset + write_bytes); - mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); - } else if (buffer_mapped(bh)) { - btrfs_csum_file_block(trans, root, inode->i_ino, - pages[i]->index << PAGE_CACHE_SHIFT, - kmap(pages[i]), PAGE_CACHE_SIZE); - kunmap(pages[i]); - } - SetPageChecked(pages[i]); - // btrfs_update_inode_block_group(trans, inode); - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); - mutex_unlock(&root->fs_info->fs_mutex); +#define BTRFS_SUPER_MAGIC 0x9123682E - ret = btrfs_commit_write(file, pages[i], offset, - offset + this_write); - pos += this_write; - if (ret) { - err = ret; - goto failed; - } - WARN_ON(this_write > write_bytes); - write_bytes -= this_write; - } -failed: - return err; -} +static struct super_operations btrfs_super_ops; -static int drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, - u64 start, u64 end, u64 *hint_block) +static void btrfs_put_super (struct super_block * sb) { + struct btrfs_root *root = btrfs_sb(sb); int ret; - struct btrfs_key key; - struct btrfs_leaf *leaf; - int slot; - struct btrfs_file_extent_item *extent; - u64 extent_end = 0; - int keep; - struct btrfs_file_extent_item old; - struct btrfs_path *path; - u64 search_start = start; - int bookend; - int found_type; - int found_extent; - int found_inline; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - while(1) { - btrfs_release_path(root, path); - ret = 
btrfs_lookup_file_extent(trans, root, path, inode->i_ino, - search_start, -1); - if (ret < 0) - goto out; - if (ret > 0) { - if (path->slots[0] == 0) { - ret = 0; - goto out; - } - path->slots[0]--; - } - keep = 0; - bookend = 0; - found_extent = 0; - found_inline = 0; - extent = NULL; - leaf = btrfs_buffer_leaf(path->nodes[0]); - slot = path->slots[0]; - btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key); - if (key.offset >= end || key.objectid != inode->i_ino) { - ret = 0; - goto out; - } - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) { - ret = 0; - goto out; - } - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - extent_end = key.offset + - (btrfs_file_extent_num_blocks(extent) << - inode->i_blkbits); - found_extent = 1; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - found_inline = 1; - extent_end = key.offset + - btrfs_file_extent_inline_len(leaf->items + slot); - } - - if (!found_extent && !found_inline) { - ret = 0; - goto out; - } - - if (search_start >= extent_end) { - ret = 0; - goto out; - } - - if (found_inline) { - u64 mask = root->blocksize - 1; - search_start = (extent_end + mask) & ~mask; - } else - search_start = extent_end; - if (end < extent_end && end >= key.offset) { - if (found_extent) { - u64 disk_blocknr = - btrfs_file_extent_disk_blocknr(extent); - u64 disk_num_blocks = - btrfs_file_extent_disk_num_blocks(extent); - memcpy(&old, extent, sizeof(old)); - if (disk_blocknr != 0) { - ret = btrfs_inc_extent_ref(trans, root, - disk_blocknr, disk_num_blocks); - BUG_ON(ret); - } - } - WARN_ON(found_inline); - bookend = 1; - } - - if (start > key.offset) { - u64 new_num; - u64 old_num; - /* truncate existing extent */ - keep = 1; - WARN_ON(start & (root->blocksize - 1)); - if (found_extent) { - new_num = (start - key.offset) >> - inode->i_blkbits; - old_num = btrfs_file_extent_num_blocks(extent); - *hint_block = - 
btrfs_file_extent_disk_blocknr(extent); - if (btrfs_file_extent_disk_blocknr(extent)) { - inode->i_blocks -= - (old_num - new_num) << 3; - } - btrfs_set_file_extent_num_blocks(extent, - new_num); - mark_buffer_dirty(path->nodes[0]); - } else { - WARN_ON(1); - } - } - if (!keep) { - u64 disk_blocknr = 0; - u64 disk_num_blocks = 0; - u64 extent_num_blocks = 0; - if (found_extent) { - disk_blocknr = - btrfs_file_extent_disk_blocknr(extent); - disk_num_blocks = - btrfs_file_extent_disk_num_blocks(extent); - extent_num_blocks = - btrfs_file_extent_num_blocks(extent); - *hint_block = - btrfs_file_extent_disk_blocknr(extent); - } - ret = btrfs_del_item(trans, root, path); - BUG_ON(ret); - btrfs_release_path(root, path); - extent = NULL; - if (found_extent && disk_blocknr != 0) { - inode->i_blocks -= extent_num_blocks << 3; - ret = btrfs_free_extent(trans, root, - disk_blocknr, - disk_num_blocks, 0); - } - - BUG_ON(ret); - if (!bookend && search_start >= end) { - ret = 0; - goto out; - } - if (!bookend) - continue; - } - if (bookend && found_extent) { - /* create bookend */ - struct btrfs_key ins; - ins.objectid = inode->i_ino; - ins.offset = end; - ins.flags = 0; - btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); - - btrfs_release_path(root, path); - ret = btrfs_insert_empty_item(trans, root, path, &ins, - sizeof(*extent)); - BUG_ON(ret); - extent = btrfs_item_ptr( - btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_blocknr(extent, - btrfs_file_extent_disk_blocknr(&old)); - btrfs_set_file_extent_disk_num_blocks(extent, - btrfs_file_extent_disk_num_blocks(&old)); - - btrfs_set_file_extent_offset(extent, - btrfs_file_extent_offset(&old) + - ((end - key.offset) >> inode->i_blkbits)); - WARN_ON(btrfs_file_extent_num_blocks(&old) < - (extent_end - end) >> inode->i_blkbits); - btrfs_set_file_extent_num_blocks(extent, - (extent_end - end) >> inode->i_blkbits); - - btrfs_set_file_extent_type(extent, - 
BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_generation(extent, - btrfs_file_extent_generation(&old)); - btrfs_mark_buffer_dirty(path->nodes[0]); - if (btrfs_file_extent_disk_blocknr(&old) != 0) { - inode->i_blocks += - btrfs_file_extent_num_blocks(extent) << 3; - } - ret = 0; - goto out; - } - } -out: - btrfs_free_path(path); - return ret; -} - -static int prepare_pages(struct btrfs_root *root, - struct file *file, - struct page **pages, - size_t num_pages, - loff_t pos, - unsigned long first_index, - unsigned long last_index, - size_t write_bytes, - u64 alloc_extent_start) -{ - int i; - unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = file->f_path.dentry->d_inode; - int offset; - int err = 0; - int this_write; - struct buffer_head *bh; - struct buffer_head *head; - loff_t isize = i_size_read(inode); - - memset(pages, 0, num_pages * sizeof(struct page *)); - - for (i = 0; i < num_pages; i++) { - pages[i] = grab_cache_page(inode->i_mapping, index + i); - if (!pages[i]) { - err = -ENOMEM; - goto failed_release; - } - cancel_dirty_page(pages[i], PAGE_CACHE_SIZE); - wait_on_page_writeback(pages[i]); - offset = pos & (PAGE_CACHE_SIZE -1); - this_write = min(PAGE_CACHE_SIZE - offset, write_bytes); - if (!page_has_buffers(pages[i])) { - create_empty_buffers(pages[i], - root->fs_info->sb->s_blocksize, - (1 << BH_Uptodate)); - } - head = page_buffers(pages[i]); - bh = head; - do { - err = btrfs_map_bh_to_logical(root, bh, - alloc_extent_start); - BUG_ON(err); - if (err) - goto failed_truncate; - bh = bh->b_this_page; - if (alloc_extent_start) - alloc_extent_start++; - } while (bh != head); - pos += this_write; - WARN_ON(this_write > write_bytes); - write_bytes -= this_write; + ret = close_ctree(root); + if (ret) { + printk("close ctree returns %d\n", ret); } - return 0; - -failed_release: - btrfs_drop_pages(pages, num_pages); - return err; - -failed_truncate: - btrfs_drop_pages(pages, num_pages); - if (pos > isize) - vmtruncate(inode, isize); - 
return err; + sb->s_fs_info = NULL; } -static ssize_t btrfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static int btrfs_fill_super(struct super_block * sb, void * data, int silent) { - loff_t pos; - size_t num_written = 0; - int err = 0; - int ret = 0; - struct inode *inode = file->f_path.dentry->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct page *pages[8]; - struct page *pinned[2]; - unsigned long first_index; - unsigned long last_index; - u64 start_pos; - u64 num_blocks; - u64 alloc_extent_start; - u64 hint_block; - struct btrfs_trans_handle *trans; - struct btrfs_key ins; - pinned[0] = NULL; - pinned[1] = NULL; - if (file->f_flags & O_DIRECT) - return -EINVAL; - pos = *ppos; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - current->backing_dev_info = inode->i_mapping->backing_dev_info; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) - goto out; - if (count == 0) - goto out; - err = remove_suid(file->f_path.dentry); - if (err) - goto out; - file_update_time(file); + struct inode * inode; + struct dentry * root_dentry; + struct btrfs_super_block *disk_super; + struct btrfs_root *tree_root; + struct btrfs_inode *bi; + int err; - start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); - num_blocks = (count + pos - start_pos + root->blocksize - 1) >> - inode->i_blkbits; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_super_ops; + sb->s_time_gran = 1; - mutex_lock(&inode->i_mutex); - first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + count) >> PAGE_CACHE_SHIFT; + tree_root = open_ctree(sb); - if ((pos & (PAGE_CACHE_SIZE - 1))) { - pinned[0] = grab_cache_page(inode->i_mapping, first_index); - if (!PageUptodate(pinned[0])) { - ret = mpage_readpage(pinned[0], btrfs_get_block); - BUG_ON(ret); - wait_on_page_locked(pinned[0]); - } else { - unlock_page(pinned[0]); - } - } - if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { - 
pinned[1] = grab_cache_page(inode->i_mapping, last_index); - if (!PageUptodate(pinned[1])) { - ret = mpage_readpage(pinned[1], btrfs_get_block); - BUG_ON(ret); - wait_on_page_locked(pinned[1]); - } else { - unlock_page(pinned[1]); - } + if (!tree_root || IS_ERR(tree_root)) { + printk("btrfs: open_ctree failed\n"); + return -EIO; } + sb->s_fs_info = tree_root; + disk_super = tree_root->fs_info->disk_super; + inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super), + tree_root); + bi = BTRFS_I(inode); + bi->location.objectid = inode->i_ino; + bi->location.offset = 0; + bi->location.flags = 0; + bi->root = tree_root; + btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - if (!trans) { + if (!inode) { err = -ENOMEM; - mutex_unlock(&root->fs_info->fs_mutex); - goto out_unlock; - } - btrfs_set_trans_block_group(trans, inode); - /* FIXME blocksize != 4096 */ - inode->i_blocks += num_blocks << 3; - hint_block = 0; - if (start_pos < inode->i_size) { - /* FIXME blocksize != pagesize */ - ret = drop_extents(trans, root, inode, - start_pos, - (pos + count + root->blocksize -1) & - ~((u64)root->blocksize - 1), &hint_block); - BUG_ON(ret); - } - if (inode->i_size < start_pos) { - u64 last_pos_in_file; - u64 hole_size; - u64 mask = root->blocksize - 1; - last_pos_in_file = (inode->i_size + mask) & ~mask; - hole_size = (start_pos - last_pos_in_file + mask) & ~mask; - hole_size >>= inode->i_blkbits; - if (last_pos_in_file < start_pos) { - ret = btrfs_insert_file_extent(trans, root, - inode->i_ino, - last_pos_in_file, - 0, 0, hole_size); - } - BUG_ON(ret); - } - if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size || - pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) { - ret = btrfs_alloc_extent(trans, root, inode->i_ino, - num_blocks, hint_block, (u64)-1, - &ins, 1); - BUG_ON(ret); - ret = btrfs_insert_file_extent(trans, root, inode->i_ino, - start_pos, 
ins.objectid, ins.offset, - ins.offset); - BUG_ON(ret); - } else { - ins.offset = 0; - ins.objectid = 0; - } - BUG_ON(ret); - alloc_extent_start = ins.objectid; - // btrfs_update_inode_block_group(trans, inode); - ret = btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - - while(count > 0) { - size_t offset = pos & (PAGE_CACHE_SIZE - 1); - size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset); - size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - - memset(pages, 0, sizeof(pages)); - ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, last_index, - write_bytes, alloc_extent_start); - BUG_ON(ret); - - /* FIXME blocks != pagesize */ - if (alloc_extent_start) - alloc_extent_start += num_pages; - ret = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, buf); - BUG_ON(ret); - - ret = dirty_and_release_pages(NULL, root, file, pages, - num_pages, pos, write_bytes); - BUG_ON(ret); - btrfs_drop_pages(pages, num_pages); - - buf += write_bytes; - count -= write_bytes; - pos += write_bytes; - num_written += write_bytes; - - balance_dirty_pages_ratelimited(inode->i_mapping); - btrfs_btree_balance_dirty(root); - cond_resched(); - } -out_unlock: - mutex_unlock(&inode->i_mutex); -out: - if (pinned[0]) - page_cache_release(pinned[0]); - if (pinned[1]) - page_cache_release(pinned[1]); - *ppos = pos; - current->backing_dev_info = NULL; - mark_inode_dirty(inode); - return num_written ? 
num_written : err; -} - -static int btrfs_read_actor(read_descriptor_t *desc, struct page *page, - unsigned long offset, unsigned long size) -{ - char *kaddr; - unsigned long left, count = desc->count; - struct inode *inode = page->mapping->host; - - if (size > count) - size = count; - - if (!PageChecked(page)) { - /* FIXME, do it per block */ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - struct buffer_head *bh; - - if (page_has_buffers(page)) { - bh = page_buffers(page); - if (!buffer_mapped(bh)) { - SetPageChecked(page); - goto checked; - } - } - - ret = btrfs_csum_verify_file_block(root, - page->mapping->host->i_ino, - page->index << PAGE_CACHE_SHIFT, - kmap(page), PAGE_CACHE_SIZE); - if (ret) { - if (ret != -ENOENT) { - printk("failed to verify ino %lu page %lu ret %d\n", - page->mapping->host->i_ino, - page->index, ret); - memset(page_address(page), 1, PAGE_CACHE_SIZE); - flush_dcache_page(page); - } - } - SetPageChecked(page); - kunmap(page); - } -checked: - /* - * Faults on the destination of a read are common, so do it before - * taking the kmap. 
- */ - if (!fault_in_pages_writeable(desc->arg.buf, size)) { - kaddr = kmap_atomic(page, KM_USER0); - left = __copy_to_user_inatomic(desc->arg.buf, - kaddr + offset, size); - kunmap_atomic(kaddr, KM_USER0); - if (left == 0) - goto success; - } - - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_to_user(desc->arg.buf, kaddr + offset, size); - kunmap(page); - - if (left) { - size -= left; - desc->error = -EFAULT; + goto fail_close; } -success: - desc->count = count - size; - desc->written += size; - desc->arg.buf += size; - return size; -} - -/** - * btrfs_file_aio_read - filesystem read routine - * @iocb: kernel I/O control block - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @pos: current file position - */ -static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg; - size_t count; - loff_t *ppos = &iocb->ki_pos; - - count = 0; - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. 
- */ - count += iv->iov_len; - if (unlikely((ssize_t)(count|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - nr_segs = seg; - count -= iv->iov_len; /* This segment is no good */ - break; + if (inode->i_state & I_NEW) { + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); } - retval = 0; - if (count) { - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp, ppos, &desc, - btrfs_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - } + root_dentry = d_alloc_root(inode); + if (!root_dentry) { + iput(inode); + err = -ENOMEM; + goto fail_close; } - return retval; -} - -static int create_subvol(struct btrfs_root *root, char *name, int namelen) -{ - struct btrfs_trans_handle *trans; - struct btrfs_key key; - struct btrfs_root_item root_item; - struct btrfs_inode_item *inode_item; - struct buffer_head *subvol; - struct btrfs_leaf *leaf; - struct btrfs_root *new_root; - struct inode *inode; - struct inode *dir; - int ret; - u64 objectid; - u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; - - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - subvol = btrfs_alloc_free_block(trans, root, 0); - if (subvol == NULL) - return -ENOSPC; - leaf = btrfs_buffer_leaf(subvol); - btrfs_set_header_nritems(&leaf->header, 0); - btrfs_set_header_level(&leaf->header, 0); - btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol)); - btrfs_set_header_generation(&leaf->header, trans->transid); - btrfs_set_header_owner(&leaf->header, root->root_key.objectid); - memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid, - sizeof(leaf->header.fsid)); - mark_buffer_dirty(subvol); - - inode_item = &root_item.inode; - 
memset(inode_item, 0, sizeof(*inode_item)); - btrfs_set_inode_generation(inode_item, 1); - btrfs_set_inode_size(inode_item, 3); - btrfs_set_inode_nlink(inode_item, 1); - btrfs_set_inode_nblocks(inode_item, 1); - btrfs_set_inode_mode(inode_item, S_IFDIR | 0755); - - btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol)); - btrfs_set_root_refs(&root_item, 1); - brelse(subvol); - subvol = NULL; - - ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, - 0, &objectid); - BUG_ON(ret); - - btrfs_set_root_dirid(&root_item, new_dirid); - - key.objectid = objectid; - key.offset = 1; - key.flags = 0; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - &root_item); - BUG_ON(ret); - - /* - * insert the directory item - */ - key.offset = (u64)-1; - dir = root->fs_info->sb->s_root->d_inode; - ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root, - name, namelen, dir->i_ino, &key, - BTRFS_FT_DIR); - BUG_ON(ret); - - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - - new_root = btrfs_read_fs_root(root->fs_info, &key); - BUG_ON(!new_root); - - trans = btrfs_start_transaction(new_root, 1); - BUG_ON(!trans); - - inode = btrfs_new_inode(trans, new_root, new_dirid, - BTRFS_I(dir)->block_group, S_IFDIR | 0700); - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; - - ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid); - BUG_ON(ret); - - inode->i_nlink = 1; - inode->i_size = 6; - ret = btrfs_update_inode(trans, new_root, inode); - BUG_ON(ret); - - ret = btrfs_commit_transaction(trans, new_root); - BUG_ON(ret); - - iput(inode); - - mutex_unlock(&root->fs_info->fs_mutex); - btrfs_btree_balance_dirty(root); + sb->s_root = root_dentry; + btrfs_transaction_queue_work(tree_root, HZ * 30); return 0; + +fail_close: + close_ctree(tree_root); + return err; } -static int create_snapshot(struct btrfs_root *root, char *name, int namelen) +static int 
btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_key key; - struct btrfs_root_item new_root_item; + struct btrfs_root *root; int ret; - u64 objectid; - - if (!root->ref_cows) - return -EINVAL; + root = btrfs_sb(sb); + sb->s_dirt = 0; + if (!wait) { + filemap_flush(root->fs_info->btree_inode->i_mapping); + return 0; + } mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - ret = btrfs_update_inode(trans, root, root->inode); - BUG_ON(ret); - - ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, - 0, &objectid); - BUG_ON(ret); - - memcpy(&new_root_item, &root->root_item, - sizeof(new_root_item)); - - key.objectid = objectid; - key.offset = 1; - key.flags = 0; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node)); - - ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - &new_root_item); - BUG_ON(ret); - - /* - * insert the directory item - */ - key.offset = (u64)-1; - ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root, - name, namelen, - root->fs_info->sb->s_root->d_inode->i_ino, - &key, BTRFS_FT_DIR); - - BUG_ON(ret); - - ret = btrfs_inc_root_ref(trans, root); - BUG_ON(ret); - ret = btrfs_commit_transaction(trans, root); + sb->s_dirt = 0; BUG_ON(ret); mutex_unlock(&root->fs_info->fs_mutex); - btrfs_btree_balance_dirty(root); - return 0; -} - -static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int - cmd, unsigned long arg) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_ioctl_vol_args vol_args; - int ret = 0; - struct btrfs_dir_item *di; - int namelen; - struct btrfs_path *path; - u64 root_dirid; - - switch (cmd) { - case BTRFS_IOC_SNAP_CREATE: - if (copy_from_user(&vol_args, - (struct btrfs_ioctl_vol_args __user *)arg, - sizeof(vol_args))) - return -EFAULT; - namelen = strlen(vol_args.name); - if (namelen > BTRFS_VOL_NAME_MAX) - return 
-EINVAL; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, - mutex_lock(&root->fs_info->fs_mutex); - di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, - path, root_dirid, - vol_args.name, namelen, 0); - mutex_unlock(&root->fs_info->fs_mutex); - btrfs_free_path(path); - if (di && !IS_ERR(di)) - return -EEXIST; - - if (root == root->fs_info->tree_root) - ret = create_subvol(root, vol_args.name, namelen); - else - ret = create_snapshot(root, vol_args.name, namelen); - WARN_ON(ret); - break; - default: - return -ENOTTY; - } - return ret; -} - -#ifdef CONFIG_COMPAT -static long btrfs_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct inode *inode = file->f_path.dentry->d_inode; - int ret; - lock_kernel(); - ret = btrfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); - return ret; - -} -#endif - -static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_trans_handle_cachep; -struct kmem_cache *btrfs_transaction_cachep; -struct kmem_cache *btrfs_bit_radix_cachep; -struct kmem_cache *btrfs_path_cachep; - -/* - * Called inside transaction, so use GFP_NOFS - */ -static struct inode *btrfs_alloc_inode(struct super_block *sb) -{ - struct btrfs_inode *ei; - - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); - if (!ei) - return NULL; - return &ei->vfs_inode; -} - -static void btrfs_destroy_inode(struct inode *inode) -{ - WARN_ON(!list_empty(&inode->i_dentry)); - WARN_ON(inode->i_data.nrpages); - - kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); -} - -static void init_once(void * foo, struct kmem_cache * cachep, - unsigned long flags) -{ - struct btrfs_inode *ei = (struct btrfs_inode *) foo; - - if ((flags & (SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { - inode_init_once(&ei->vfs_inode); - } -} - -static int init_inodecache(void) -{ - btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", - sizeof(struct 
btrfs_inode), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once, NULL); - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", - sizeof(struct btrfs_trans_handle), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - NULL, NULL); - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", - sizeof(struct btrfs_transaction), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - NULL, NULL); - btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", - sizeof(struct btrfs_transaction), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - NULL, NULL); - btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix", - 256, - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD | - SLAB_DESTROY_BY_RCU), - NULL, NULL); - if (btrfs_inode_cachep == NULL || btrfs_trans_handle_cachep == NULL || - btrfs_transaction_cachep == NULL || btrfs_bit_radix_cachep == NULL) - return -ENOMEM; return 0; } -static void destroy_inodecache(void) +static void btrfs_write_super(struct super_block *sb) { - kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_trans_handle_cachep); - kmem_cache_destroy(btrfs_transaction_cachep); - kmem_cache_destroy(btrfs_bit_radix_cachep); - kmem_cache_destroy(btrfs_path_cachep); + sb->s_dirt = 0; } static int btrfs_get_sb(struct file_system_type *fs_type, @@ -2709,15 +124,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, btrfs_fill_super, mnt); } -static int btrfs_getattr(struct vfsmount *mnt, - struct dentry *dentry, struct kstat *stat) -{ - struct inode *inode = dentry->d_inode; - generic_fillattr(inode, stat); - stat->blksize = 256 * 1024; - return 0; -} - static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); @@ -2732,197 +138,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry, - struct inode * new_dir,struct dentry *new_dentry) -{ - 
struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(old_dir)->root; - struct inode *new_inode = new_dentry->d_inode; - struct inode *old_inode = old_dentry->d_inode; - struct timespec ctime = CURRENT_TIME; - struct btrfs_path *path; - struct btrfs_dir_item *di; - int ret; - - if (S_ISDIR(old_inode->i_mode) && new_inode && - new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { - return -ENOTEMPTY; - } - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, new_dir); - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out_fail; - } - - old_dentry->d_inode->i_nlink++; - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; - old_inode->i_ctime = ctime; - if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) { - struct btrfs_key *location = &BTRFS_I(new_dir)->location; - u64 old_parent_oid; - di = btrfs_lookup_dir_item(trans, root, path, old_inode->i_ino, - "..", 2, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out_fail; - } - if (!di) { - ret = -ENOENT; - goto out_fail; - } - old_parent_oid = btrfs_disk_key_objectid(&di->location); - ret = btrfs_del_item(trans, root, path); - if (ret) { - ret = -EIO; - goto out_fail; - } - btrfs_release_path(root, path); - - di = btrfs_lookup_dir_index_item(trans, root, path, - old_inode->i_ino, - old_parent_oid, - "..", 2, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out_fail; - } - if (!di) { - ret = -ENOENT; - goto out_fail; - } - ret = btrfs_del_item(trans, root, path); - if (ret) { - ret = -EIO; - goto out_fail; - } - btrfs_release_path(root, path); - - ret = btrfs_insert_dir_item(trans, root, "..", 2, - old_inode->i_ino, location, - BTRFS_FT_DIR); - if (ret) - goto out_fail; - } - - - ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry); - if (ret) - goto out_fail; - - if (new_inode) { - new_inode->i_ctime = CURRENT_TIME; - ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry); 
- if (ret) - goto out_fail; - if (S_ISDIR(new_inode->i_mode)) - clear_nlink(new_inode); - else - drop_nlink(new_inode); - btrfs_update_inode(trans, root, new_inode); - } - ret = btrfs_add_link(trans, new_dentry, old_inode); - if (ret) - goto out_fail; - -out_fail: - btrfs_free_path(path); - btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - return ret; -} - -static int btrfs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_path *path; - struct btrfs_key key; - struct inode *inode; - int err; - int drop_inode = 0; - u64 objectid; - int name_len; - int datasize; - char *ptr; - struct btrfs_file_extent_item *ei; - - name_len = strlen(symname) + 1; - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) - return -ENAMETOOLONG; - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, dir); - - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - - inode = btrfs_new_inode(trans, root, objectid, - BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_unlock; - - btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode); - if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - } - dir->i_sb->s_dirt = 1; - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); - if (drop_inode) - goto out_unlock; - - path = btrfs_alloc_path(); - BUG_ON(!path); - key.objectid = inode->i_ino; - key.offset = 0; - key.flags = 0; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - datasize = btrfs_file_extent_calc_inline_size(name_len); - err = btrfs_insert_empty_item(trans, root, path, 
&key, - datasize); - BUG_ON(err); - ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(ei, trans->transid); - btrfs_set_file_extent_type(ei, - BTRFS_FILE_EXTENT_INLINE); - ptr = btrfs_file_extent_inline_start(ei); - btrfs_memcpy(root, path->nodes[0]->b_data, - ptr, symname, name_len); - mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); - inode->i_op = &btrfs_symlink_inode_operations; - inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_size = name_len - 1; - btrfs_update_inode(trans, root, inode); - err = 0; - -out_unlock: - btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - - if (drop_inode) { - inode_dec_link_count(inode); - iput(inode); - } - btrfs_btree_balance_dirty(root); - return err; -} - static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", @@ -2944,91 +159,21 @@ static struct super_operations btrfs_super_ops = { .statfs = btrfs_statfs, }; -static struct inode_operations btrfs_dir_inode_operations = { - .lookup = btrfs_lookup, - .create = btrfs_create, - .unlink = btrfs_unlink, - .link = btrfs_link, - .mkdir = btrfs_mkdir, - .rmdir = btrfs_rmdir, - .rename = btrfs_rename, - .symlink = btrfs_symlink, - .setattr = btrfs_setattr, -}; - -static struct inode_operations btrfs_dir_ro_inode_operations = { - .lookup = btrfs_lookup, -}; - -static struct file_operations btrfs_dir_file_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = btrfs_readdir, - .ioctl = btrfs_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_compat_ioctl, -#endif -}; - -static struct address_space_operations btrfs_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, - .sync_page = block_sync_page, - .prepare_write = btrfs_prepare_write, - .commit_write = btrfs_commit_write, - .bmap = btrfs_bmap, -}; - -static struct address_space_operations btrfs_symlink_aops = { - 
.readpage = btrfs_readpage, - .writepage = btrfs_writepage, -}; - -static struct inode_operations btrfs_file_inode_operations = { - .truncate = btrfs_truncate, - .getattr = btrfs_getattr, - .setattr = btrfs_setattr, -}; - -static struct file_operations btrfs_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = btrfs_file_aio_read, - .write = btrfs_file_write, - .mmap = generic_file_mmap, - .open = generic_file_open, - .ioctl = btrfs_ioctl, - .fsync = btrfs_sync_file, -#ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_compat_ioctl, -#endif -}; - -static struct inode_operations btrfs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -}; - static int __init init_btrfs_fs(void) { int err; - printk("btrfs loaded!\n"); btrfs_init_transaction_sys(); - err = init_inodecache(); + err = btrfs_init_cachep(); if (err) return err; return register_filesystem(&btrfs_fs_type); - destroy_inodecache(); - return err; } static void __exit exit_btrfs_fs(void) { btrfs_exit_transaction_sys(); - destroy_inodecache(); + btrfs_destroy_cachep(); unregister_filesystem(&btrfs_fs_type); - printk("btrfs unloaded\n"); } module_init(init_btrfs_fs) -- cgit v1.2.3 From 8578f0f1fda41f8429a1037022b720275102ca65 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 25 Jul 2007 14:04:51 -0400 Subject: Btrfs: cleaner make clean Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 5af2426116d3..a4e2df66bdbb 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -18,5 +18,5 @@ KERNELDIR := /lib/modules/`uname -r`/build all:: $(MAKE) -C $(KERNELDIR) M=`pwd` modules clean:: - rm *.o btrfs.ko + $(MAKE) -C $(KERNELDIR) M=`pwd` clean endif -- cgit v1.2.3 From 6702ed490ca0bb44e17131818a5a18b773957c5a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 7 Aug 
2007 16:15:09 -0400 Subject: Btrfs: Add run time btree defrag, and an ioctl to force btree defrag This adds two types of btree defrag, a run time form that tries to defrag recently allocated blocks in the btree when they are still in ram, and an ioctl that forces defrag of all btree blocks. File data blocks are not defragged yet, but this can make a huge difference in sequential btree reads. Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.c | 195 +++++++++++++++++++++++++++++++++---------- fs/btrfs/ctree.h | 19 ++++- fs/btrfs/disk-io.c | 8 ++ fs/btrfs/extent-tree.c | 53 +++++++----- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 33 +++++++- fs/btrfs/ioctl.h | 2 +- fs/btrfs/transaction.c | 105 +++++++++++++++++++---- fs/btrfs/transaction.h | 1 + fs/btrfs/tree-defrag.c | 222 +++++++++++++++++++++++++++++++++++++++++++++++++ 11 files changed, 553 insertions(+), 89 deletions(-) create mode 100644 fs/btrfs/tree-defrag.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a4e2df66bdbb..9321438c5eef 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),) obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ - transaction.o bit-radix.o inode.o file.o + transaction.o bit-radix.o inode.o file.o tree-defrag.o #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o dir-item.o hash.o file-item.o inode-item.o \ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7a08491e208e..c7e47e77723f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -65,44 +65,44 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) memset(p, 0, sizeof(*p)); } -static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root +static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct buffer_head *buf, struct 
buffer_head *parent, int parent_slot, struct buffer_head - **cow_ret) + **cow_ret, u64 search_start, u64 empty_size) { struct buffer_head *cow; struct btrfs_node *cow_node; - int ret; + int ret = 0; + int different_trans = 0; + WARN_ON(root->ref_cows && trans->transid != root->last_trans); WARN_ON(!buffer_uptodate(buf)); - if (trans->transaction != root->fs_info->running_transaction) { - printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid, - root->fs_info->running_transaction->transid); - WARN_ON(1); - } - if (trans->transid != root->fs_info->generation) { - printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid, - root->fs_info->generation); - WARN_ON(1); - } - if (btrfs_header_generation(btrfs_buffer_header(buf)) == - trans->transid) { - *cow_ret = buf; - return 0; - } - cow = btrfs_alloc_free_block(trans, root, buf->b_blocknr); + cow = btrfs_alloc_free_block(trans, root, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); + cow_node = btrfs_buffer_node(cow); if (buf->b_size != root->blocksize || cow->b_size != root->blocksize) WARN_ON(1); + memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize); btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow)); btrfs_set_header_generation(&cow_node->header, trans->transid); btrfs_set_header_owner(&cow_node->header, root->root_key.objectid); - ret = btrfs_inc_ref(trans, root, buf); - if (ret) - return ret; + + WARN_ON(btrfs_header_generation(btrfs_buffer_header(buf)) > + trans->transid); + if (btrfs_header_generation(btrfs_buffer_header(buf)) != + trans->transid) { + different_trans = 1; + ret = btrfs_inc_ref(trans, root, buf); + if (ret) + return ret; + } else { + WARN_ON(!root->ref_cows); + clean_tree_block(trans, root, buf); + } + if (buf == root->node) { root->node = cow; get_bh(cow); @@ -114,6 +114,8 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot, bh_blocknr(cow)); 
btrfs_mark_buffer_dirty(parent); + WARN_ON(btrfs_header_generation(btrfs_buffer_header(parent)) != + trans->transid); btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1); } btrfs_block_release(root, buf); @@ -122,6 +124,115 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } +int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct buffer_head *buf, struct buffer_head + *parent, int parent_slot, struct buffer_head + **cow_ret) +{ + u64 search_start; + if (trans->transaction != root->fs_info->running_transaction) { + printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid, + root->fs_info->running_transaction->transid); + WARN_ON(1); + } + if (trans->transid != root->fs_info->generation) { + printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid, + root->fs_info->generation); + WARN_ON(1); + } + if (btrfs_header_generation(btrfs_buffer_header(buf)) == + trans->transid) { + *cow_ret = buf; + return 0; + } + + search_start = bh_blocknr(buf) & ~((u64)65535); + return __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0); +} + +static int close_blocks(u64 blocknr, u64 other) +{ + if (blocknr < other && other - blocknr < 8) + return 1; + if (blocknr > other && blocknr - other < 8) + return 1; + return 0; +} + +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct buffer_head *parent, + int cache_only) +{ + struct btrfs_node *parent_node; + struct buffer_head *cur_bh; + struct buffer_head *tmp_bh; + u64 blocknr; + u64 search_start = 0; + u64 other; + u32 parent_nritems; + int start_slot; + int end_slot; + int i; + int err = 0; + + if (trans->transaction != root->fs_info->running_transaction) { + printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid, + root->fs_info->running_transaction->transid); + WARN_ON(1); + } + if (trans->transid != root->fs_info->generation) { + printk(KERN_CRIT "trans %Lu running %Lu\n", 
trans->transid, + root->fs_info->generation); + WARN_ON(1); + } + parent_node = btrfs_buffer_node(parent); + parent_nritems = btrfs_header_nritems(&parent_node->header); + + start_slot = 0; + end_slot = parent_nritems; + + if (parent_nritems == 1) + return 0; + + for (i = start_slot; i < end_slot; i++) { + int close = 1; + blocknr = btrfs_node_blockptr(parent_node, i); + if (i > 0) { + other = btrfs_node_blockptr(parent_node, i - 1); + close = close_blocks(blocknr, other); + } + if (close && i < end_slot - 1) { + other = btrfs_node_blockptr(parent_node, i + 1); + close = close_blocks(blocknr, other); + } + if (close) + continue; + + cur_bh = btrfs_find_tree_block(root, blocknr); + if (!cur_bh || !buffer_uptodate(cur_bh) || + buffer_locked(cur_bh)) { + if (cache_only) { + brelse(cur_bh); + continue; + } + brelse(cur_bh); + cur_bh = read_tree_block(root, blocknr); + } + if (search_start == 0) { + search_start = bh_blocknr(cur_bh) & ~((u64)65535); + } + err = __btrfs_cow_block(trans, root, cur_bh, parent, i, + &tmp_bh, search_start, + min(8, end_slot - i)); + if (err) + break; + search_start = bh_blocknr(tmp_bh); + brelse(tmp_bh); + } + return err; +} + /* * The leaf data grows from end-to-front in the node. 
* this returns the address of the start of the last item, @@ -221,6 +332,7 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, parent_slot = path->slots[level + 1]; parent_key = &parent->ptrs[parent_slot].key; + BUG_ON(memcmp(parent_key, &leaf->items[0].key, sizeof(struct btrfs_disk_key))); BUG_ON(btrfs_node_blockptr(parent, parent_slot) != @@ -643,7 +755,7 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans, * readahead one full node of leaves */ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, - int slot) + int level, int slot) { struct btrfs_node *node; int i; @@ -659,10 +771,13 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, unsigned long gang[8]; struct buffer_head *bh; - if (!path->nodes[1]) + if (level == 0) + return; + + if (!path->nodes[level]) return; - node = btrfs_buffer_node(path->nodes[1]); + node = btrfs_buffer_node(path->nodes[level]); search = btrfs_node_blockptr(node, slot); bh = btrfs_find_tree_block(root, search); if (bh) { @@ -690,7 +805,7 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, for (i = 0; i < ret; i++) { blocknr = gang[i]; clear_radix_bit(&found, blocknr); - if (nread > 64) + if (nread > 32) continue; if (direction > 0 && cluster_start <= blocknr && cluster_start + 8 > blocknr) { @@ -726,7 +841,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root struct buffer_head *b; struct buffer_head *cow_buf; struct btrfs_node *c; - struct btrfs_root_item *root_item = &root->root_item; u64 blocknr; int slot; int ret; @@ -734,11 +848,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root int should_reada = p->reada; u8 lowest_level = 0; - if (btrfs_root_refs(root_item) == 0 && root->ref_cows) { - lowest_level = root_item->drop_level; - WARN_ON(ins_len || cow); - } - + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len); WARN_ON(p->nodes[0] != 
NULL); WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex)); again: @@ -798,8 +909,8 @@ again: if (level == lowest_level) break; blocknr = btrfs_node_blockptr(c, slot); - if (level == 1 && should_reada) - reada_for_search(root, p, slot); + if (should_reada) + reada_for_search(root, p, level, slot); b = read_tree_block(root, btrfs_node_blockptr(c, slot)); } else { @@ -960,7 +1071,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); - t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr); + t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr, 0); if (IS_ERR(t)) return PTR_ERR(t); c = btrfs_buffer_node(t); @@ -1070,7 +1181,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root } c_nritems = btrfs_header_nritems(&c->header); - split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr); + split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr, 0); if (IS_ERR(split_buffer)) return PTR_ERR(split_buffer); @@ -1461,7 +1572,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root nritems = btrfs_header_nritems(&l->header); mid = (nritems + 1)/ 2; - right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr); + right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0); if (IS_ERR(right_buffer)) return PTR_ERR(right_buffer); @@ -1560,7 +1671,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root if (!double_split) return ret; - right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr); + right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0); if (IS_ERR(right_buffer)) return PTR_ERR(right_buffer); @@ -1988,8 +2099,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) blocknr = btrfs_node_blockptr(c_node, slot); if (next) btrfs_block_release(root, next); - if (level == 1 && path->reada) - 
reada_for_search(root, path, slot); + if (path->reada) + reada_for_search(root, path, level, slot); next = read_tree_block(root, blocknr); break; } @@ -2002,8 +2113,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) path->slots[level] = 0; if (!level) break; - if (level == 1 && path->reada) - reada_for_search(root, path, slot); + if (path->reada) + reada_for_search(root, path, level, slot); next = read_tree_block(root, btrfs_node_blockptr(btrfs_buffer_node(next), 0)); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c5a18d5d7f7c..42aa20301bc9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -178,6 +178,7 @@ struct btrfs_path { struct buffer_head *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; int reada; + int lowest_level; }; /* @@ -338,6 +339,9 @@ struct btrfs_root { u64 highest_inode; u64 last_inode_alloc; int ref_cows; + struct btrfs_key defrag_progress; + int defrag_running; + int defrag_level; }; /* the lower bits in the key flags defines the item type */ @@ -1031,10 +1035,11 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, int btrfs_inc_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 hint); + struct btrfs_root *root, u64 hint, + u64 empty_size); int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 owner, - u64 num_blocks, u64 search_start, + u64 num_blocks, u64 empty_size, u64 search_start, u64 search_end, struct btrfs_key *ins, int data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct buffer_head *buf); @@ -1051,6 +1056,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_root *root); /* ctree.c */ +int btrfs_cow_block(struct btrfs_trans_handle *trans, struct 
btrfs_root + *root, struct buffer_head *buf, struct buffer_head + *parent, int parent_slot, struct buffer_head + **cow_ret); int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u32 data_size); int btrfs_truncate_item(struct btrfs_trans_handle *trans, @@ -1060,6 +1069,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct buffer_head *parent, + int cache_only); void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); @@ -1171,4 +1183,7 @@ extern struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 *hint_block); +/* tree-defrag.c */ +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 60db85bc4ba8..c948416fea3a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -273,7 +273,9 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct buffer_head *buf) { WARN_ON(atomic_read(&buf->b_count) == 0); + lock_buffer(buf); clear_buffer_dirty(buf); + unlock_buffer(buf); return 0; } @@ -294,6 +296,9 @@ static int __setup_root(int blocksize, root->last_inode_alloc = 0; memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + root->defrag_running = 0; + root->defrag_level = 0; root->root_key.objectid = objectid; return 0; } @@ -585,6 +590,7 @@ int close_ctree(struct btrfs_root *root) 
fs_info->closing = 1; btrfs_transaction_flush_work(root); mutex_lock(&fs_info->fs_mutex); + btrfs_defrag_dirty_roots(root->fs_info); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); /* run commit again to drop the original snapshot */ @@ -616,7 +622,9 @@ void btrfs_mark_buffer_dirty(struct buffer_head *bh) { struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root; u64 transid = btrfs_header_generation(btrfs_buffer_header(bh)); + WARN_ON(!atomic_read(&bh->b_count)); + if (transid != root->fs_info->generation) { printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n", (unsigned long long)bh->b_blocknr, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5d4d5d8db8ef..26b8d3406491 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,7 +23,8 @@ #include "transaction.h" static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root - *orig_root, u64 num_blocks, u64 search_start, + *orig_root, u64 num_blocks, u64 empty_size, + u64 search_start, u64 search_end, u64 hint_block, struct btrfs_key *ins, u64 exclude_start, u64 exclude_nr, int data); @@ -379,7 +380,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0, + ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0, 0, (u64)-1, 0, &ins, 0, 0, 0); if (ret) { btrfs_free_path(path); @@ -533,7 +534,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, struct btrfs_block_group_item *bi; struct btrfs_key ins; - ret = find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins, + ret = find_free_extent(trans, extent_root, 0, 0, 0, (u64)-1, 0, &ins, 0, 0, 0); /* FIXME, set bit to recalc cache groups on next mount */ if (ret) @@ -708,6 +709,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, static int try_remove_page(struct address_space 
*mapping, unsigned long index) { int ret; + return 0; ret = invalidate_mapping_pages(mapping, index, index); return ret; } @@ -866,7 +868,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root if (!path) return -ENOMEM; - ret = find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0, 0, 0); + ret = find_free_extent(trans, root, 0, 0, 0, (u64)-1, 0, &ins, 0, 0, 0); if (ret) { btrfs_free_path(path); return ret; @@ -983,8 +985,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root * Any available blocks before search_start are skipped. */ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root - *orig_root, u64 num_blocks, u64 search_start, u64 - search_end, u64 hint_block, + *orig_root, u64 num_blocks, u64 empty_size, + u64 search_start, u64 search_end, u64 hint_block, struct btrfs_key *ins, u64 exclude_start, u64 exclude_nr, int data) { @@ -1042,6 +1044,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root data, 1); } + total_needed += empty_size; path = btrfs_alloc_path(); check_failed: @@ -1157,9 +1160,11 @@ check_pending: goto error; } search_start = orig_search_start; - if (wrapped) + if (wrapped) { + if (!full_scan) + total_needed -= empty_size; full_scan = 1; - else + } else wrapped = 1; goto new_group; } @@ -1238,9 +1243,11 @@ new_group: ret = -ENOSPC; goto error; } - if (wrapped) + if (wrapped) { + if (!full_scan) + total_needed -= empty_size; full_scan = 1; - else + } else wrapped = 1; } block_group = btrfs_lookup_block_group(info, search_start); @@ -1264,7 +1271,7 @@ error: */ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 owner, - u64 num_blocks, u64 hint_block, + u64 num_blocks, u64 empty_size, u64 hint_block, u64 search_end, struct btrfs_key *ins, int data) { int ret; @@ -1303,7 +1310,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, * in the correct block group. 
*/ if (data) { - ret = find_free_extent(trans, root, 0, 0, + ret = find_free_extent(trans, root, 0, 0, 0, search_end, 0, &prealloc_key, 0, 0, 0); BUG_ON(ret); if (ret) @@ -1313,8 +1320,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, } /* do the real allocation */ - ret = find_free_extent(trans, root, num_blocks, search_start, - search_end, hint_block, ins, + ret = find_free_extent(trans, root, num_blocks, empty_size, + search_start, search_end, hint_block, ins, exclude_start, exclude_nr, data); BUG_ON(ret); if (ret) @@ -1333,7 +1340,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, exclude_start = ins->objectid; exclude_nr = ins->offset; hint_block = exclude_start + exclude_nr; - ret = find_free_extent(trans, root, 0, search_start, + ret = find_free_extent(trans, root, 0, 0, search_start, search_end, hint_block, &prealloc_key, exclude_start, exclude_nr, 0); @@ -1368,14 +1375,16 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, * returns the tree buffer or NULL. 
*/ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 hint) + struct btrfs_root *root, u64 hint, + u64 empty_size) { struct btrfs_key ins; int ret; struct buffer_head *buf; ret = btrfs_alloc_extent(trans, root, root->root_key.objectid, - 1, hint, (unsigned long)-1, &ins, 0); + 1, empty_size, hint, + (unsigned long)-1, &ins, 0); if (ret) { BUG_ON(ret > 0); return ERR_PTR(ret); @@ -1385,6 +1394,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, btrfs_free_extent(trans, root, ins.objectid, 1, 0); return ERR_PTR(-ENOMEM); } + WARN_ON(buffer_dirty(buf)); set_buffer_uptodate(buf); set_buffer_checked(buf); set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index); @@ -1591,13 +1601,15 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_key key; struct btrfs_disk_key *found_key; struct btrfs_node *node; + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + level = root_item->drop_level; + path->lowest_level = level; wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { + if (wret < 0) { ret = wret; goto out; } - level = root_item->drop_level; node = btrfs_buffer_node(path->nodes[level]); found_key = &node->ptrs[path->slots[level]].key; WARN_ON(memcmp(found_key, &root_item->drop_progress, @@ -1617,8 +1629,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root ret = wret; num_walks++; if (num_walks > 10) { - struct btrfs_key key; - btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); ret = -EAGAIN; get_bh(root->node); break; @@ -1627,6 +1637,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { btrfs_block_release(root, path->nodes[i]); + path->nodes[i] = 0; } } out: diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1fe38fe84150..00b118a2db69 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ 
-512,7 +512,7 @@ static int prepare_pages(struct btrfs_root *root, if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size || pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) { err = btrfs_alloc_extent(trans, root, inode->i_ino, - num_blocks, hint_block, (u64)-1, + num_blocks, 0, hint_block, (u64)-1, &ins, 1); if (err) goto failed_truncate; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3889032fc449..12aa043b9f6f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -554,7 +554,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) &alloc_hint); if (ret) goto out; - ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, + ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0, alloc_hint, (u64)-1, &ins, 1); if (ret) goto out; @@ -1360,7 +1360,7 @@ not_found: if (create & BTRFS_GET_BLOCK_CREATE) { struct btrfs_key ins; ret = btrfs_alloc_extent(trans, root, inode->i_ino, - 1, alloc_hint, (u64)-1, + 1, 0, alloc_hint, (u64)-1, &ins, 1); if (ret) { err = ret; @@ -1998,7 +1998,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen) trans = btrfs_start_transaction(root, 1); BUG_ON(!trans); - subvol = btrfs_alloc_free_block(trans, root, 0); + subvol = btrfs_alloc_free_block(trans, root, 0, 0); if (IS_ERR(subvol)) return PTR_ERR(subvol); leaf = btrfs_buffer_leaf(subvol); @@ -2159,7 +2159,9 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_vol_args vol_args; + struct btrfs_trans_handle *trans; int ret = 0; + int err; struct btrfs_dir_item *di; int namelen; struct btrfs_path *path; @@ -2196,6 +2198,31 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int else ret = create_snapshot(root, vol_args.name, namelen); break; + + case BTRFS_IOC_DEFRAG: + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + memset(&root->defrag_progress, 0, + 
sizeof(root->defrag_progress)); + while (1) { + root->defrag_running = 1; + err = btrfs_defrag_leaves(trans, root, 0); + + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + + btrfs_btree_balance_dirty(root); + + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + if (err != -EAGAIN) + break; + } + root->defrag_running = 0; + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + ret = 0; + break; default: return -ENOTTY; } diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 23bed48c51a7..8bc47dec2865 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -28,6 +28,6 @@ struct btrfs_ioctl_vol_args { #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_ADD_DISK _IOW(BTRFS_IOCTL_MAGIC, 2, \ +#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ struct btrfs_ioctl_vol_args) #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 498626470a04..338a7199363b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -29,6 +29,7 @@ extern struct kmem_cache *btrfs_transaction_cachep; static struct workqueue_struct *trans_wq; #define BTRFS_ROOT_TRANS_TAG 0 +#define BTRFS_ROOT_DEFRAG_TAG 1 static void put_transaction(struct btrfs_transaction *transaction) { @@ -69,35 +70,41 @@ static int join_transaction(struct btrfs_root *root) return 0; } +static int record_root_in_trans(struct btrfs_root *root) +{ + u64 running_trans_id = root->fs_info->running_transaction->transid; + if (root->ref_cows && root->last_trans < running_trans_id) { + WARN_ON(root == root->fs_info->extent_root); + if (root->root_item.refs != 0) { + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_DEFRAG_TAG); + root->commit_root = root->node; + get_bh(root->node); + } 
else { + WARN_ON(1); + } + root->last_trans = running_trans_id; + } + return 0; +} + struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_blocks) { struct btrfs_trans_handle *h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); int ret; - u64 running_trans_id; mutex_lock(&root->fs_info->trans_mutex); ret = join_transaction(root); BUG_ON(ret); - running_trans_id = root->fs_info->running_transaction->transid; - if (root != root->fs_info->tree_root && root->last_trans < - running_trans_id) { - WARN_ON(root == root->fs_info->extent_root); - WARN_ON(root->ref_cows != 1); - if (root->root_item.refs != 0) { - radix_tree_tag_set(&root->fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - root->commit_root = root->node; - get_bh(root->node); - } else { - WARN_ON(1); - } - } - root->last_trans = running_trans_id; - h->transid = running_trans_id; + record_root_in_trans(root); + h->transid = root->fs_info->running_transaction->transid; h->transaction = root->fs_info->running_transaction; h->blocks_reserved = num_blocks; h->blocks_used = 0; @@ -155,6 +162,15 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, gang[i]); if (!page) continue; + if (PageWriteback(page)) { + if (PageDirty(page)) + wait_on_page_writeback(page); + else { + unlock_page(page); + page_cache_release(page); + continue; + } + } err = write_one_page(page, 0); if (err) werr = err; @@ -299,6 +315,58 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans, return err; } +int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info) +{ + struct btrfs_root *gang[1]; + struct btrfs_root *root; + struct btrfs_root *tree_root = info->tree_root; + struct btrfs_trans_handle *trans; + int i; + int ret; + int err = 0; + u64 last = 0; + + trans = btrfs_start_transaction(tree_root, 1); + while(1) { + ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix, + (void **)gang, last, + ARRAY_SIZE(gang), + 
BTRFS_ROOT_DEFRAG_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + root = gang[i]; + last = root->root_key.objectid + 1; + radix_tree_tag_clear(&info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_DEFRAG_TAG); + if (root->defrag_running) + continue; + + while (1) { + mutex_lock(&root->fs_info->trans_mutex); + record_root_in_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); + + root->defrag_running = 1; + err = btrfs_defrag_leaves(trans, root, 1); + btrfs_end_transaction(trans, tree_root); + mutex_unlock(&info->fs_mutex); + + btrfs_btree_balance_dirty(root); + + mutex_lock(&info->fs_mutex); + trans = btrfs_start_transaction(tree_root, 1); + if (err != -EAGAIN) + break; + } + root->defrag_running = 0; + } + } + btrfs_end_transaction(trans, tree_root); + return err; +} + static int drop_dirty_roots(struct btrfs_root *tree_root, struct list_head *list) { @@ -475,6 +543,7 @@ void btrfs_transaction_cleaner(struct work_struct *work) goto out; } mutex_unlock(&root->fs_info->trans_mutex); + btrfs_defrag_dirty_roots(root->fs_info); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); out: diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index ebf44f3e1118..8b2714e65562 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -70,5 +70,6 @@ void btrfs_transaction_queue_work(struct btrfs_root *root, int delay); void btrfs_init_transaction_sys(void); void btrfs_exit_transaction_sys(void); int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list); +int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c new file mode 100644 index 000000000000..15d0a486fb59 --- /dev/null +++ b/fs/btrfs/tree-defrag.c @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" + +static void reada_defrag(struct btrfs_root *root, + struct btrfs_node *node) +{ + int i; + u32 nritems; + u64 blocknr; + int ret; + + nritems = btrfs_header_nritems(&node->header); + for (i = 0; i < nritems; i++) { + blocknr = btrfs_node_blockptr(node, i); + ret = readahead_tree_block(root, blocknr); + if (ret) + break; + } +} + +static int defrag_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + int cache_only) +{ + struct buffer_head *next; + struct buffer_head *cur; + u64 blocknr; + int ret = 0; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + while(*level > 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (!cache_only && *level > 1 && path->slots[*level] == 0) + reada_defrag(root, btrfs_buffer_node(cur)); + + if (btrfs_header_level(btrfs_buffer_header(cur)) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(btrfs_buffer_header(cur))) + break; + + if (*level == 1) { + ret = btrfs_realloc_node(trans, root, + path->nodes[*level], + cache_only); + break; + } + blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur), + path->slots[*level]); + + if 
(cache_only) { + next = btrfs_find_tree_block(root, blocknr); + if (!next || !buffer_uptodate(next) || + buffer_locked(next)) { + brelse(next); + path->slots[*level]++; + continue; + } + } else { + next = read_tree_block(root, blocknr); + } + ret = btrfs_cow_block(trans, root, next, path->nodes[*level], + path->slots[*level], &next); + BUG_ON(ret); + ret = btrfs_realloc_node(trans, root, next, cache_only); + BUG_ON(ret); + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + btrfs_block_release(root, path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(btrfs_buffer_header(next)); + path->slots[*level] = 0; + } + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + btrfs_block_release(root, path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + WARN_ON(ret); + return 0; +} + +static int defrag_walk_up(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + int cache_only) +{ + int i; + int slot; + struct btrfs_node *node; + + for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems( + btrfs_buffer_header(path->nodes[i])) - 1) { + path->slots[i]++; + *level = i; + node = btrfs_buffer_node(path->nodes[i]); + WARN_ON(i == 0); + btrfs_disk_key_to_cpu(&root->defrag_progress, + &node->ptrs[path->slots[i]].key); + root->defrag_level = i; + return 0; + } else { + btrfs_block_release(root, path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only) +{ + struct btrfs_path *path = NULL; + struct buffer_head *tmp; + int ret = 0; + int wret; + int level; + int orig_level; + int i; + int num_runs = 0; + + if (root->ref_cows == 0) { + goto out; + } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(btrfs_buffer_header(root->node)); + 
orig_level = level; + if (level == 0) { + goto out; + } + if (root->defrag_progress.objectid == 0) { + get_bh(root->node); + ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp); + BUG_ON(ret); + ret = btrfs_realloc_node(trans, root, root->node, cache_only); + BUG_ON(ret); + path->nodes[level] = root->node; + path->slots[level] = 0; + } else { + level = root->defrag_level; + path->lowest_level = level; + wret = btrfs_search_slot(trans, root, &root->defrag_progress, + path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + while(level > 0 && !path->nodes[level]) + level--; + if (!path->nodes[level]) { + ret = 0; + goto out; + } + } + + while(1) { + wret = defrag_walk_down(trans, root, path, &level, cache_only); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = defrag_walk_up(trans, root, path, &level, cache_only); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + if (num_runs++ > 8) { + ret = -EAGAIN; + break; + } + } + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + btrfs_block_release(root, path->nodes[i]); + path->nodes[i] = 0; + } + } +out: + if (path) + btrfs_free_path(path); + if (ret != -EAGAIN) { + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + } + return ret; +} -- cgit v1.2.3 From a52d9a8033c454cd9b4697cfafb467509fc1693f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Aug 2007 16:49:44 -0400 Subject: Btrfs: Extent based page cache code. This uses an rbtree of extents and tests instead of buffer heads. 
Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 3 +- fs/btrfs/btrfs_inode.h | 3 + fs/btrfs/ctree.h | 7 + fs/btrfs/disk-io.h | 1 - fs/btrfs/extent_map.c | 1656 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_map.h | 89 +++ fs/btrfs/file.c | 295 ++++----- fs/btrfs/inode.c | 816 +++++++++--------------- fs/btrfs/super.c | 2 + 9 files changed, 2203 insertions(+), 669 deletions(-) create mode 100644 fs/btrfs/extent_map.c create mode 100644 fs/btrfs/extent_map.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9321438c5eef..ea7a22870350 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -4,7 +4,8 @@ ifneq ($(KERNELRELEASE),) obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ - transaction.o bit-radix.o inode.o file.o tree-defrag.o + transaction.o bit-radix.o inode.o file.o tree-defrag.o \ + extent_map.o #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o dir-item.o hash.o file-item.o inode-item.o \ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index f446f161f736..d1d5af471c3c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -19,11 +19,14 @@ #ifndef __BTRFS_I__ #define __BTRFS_I__ +#include "extent_map.h" + /* in memory btrfs inode */ struct btrfs_inode { struct btrfs_root *root; struct btrfs_block_group_cache *block_group; struct btrfs_key location; + struct extent_map_tree extent_tree; struct inode vfs_inode; /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c7f5161271d3..947f061ed118 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1176,7 +1176,14 @@ int btrfs_commit_write(struct file *file, struct page *page, unsigned from, unsigned to); int btrfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *result, int create); +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t 
page_offset, u64 start, u64 end, + int create); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); /* file.c */ +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end); extern struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 81fd18cbd824..5261733b8735 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -77,7 +77,6 @@ int btrfs_insert_dev_radix(struct btrfs_root *root, u64 num_blocks); int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh, u64 logical); -int btrfs_releasepage(struct page *page, gfp_t flags); void btrfs_btree_balance_dirty(struct btrfs_root *root); int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct buffer_head *bh); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 000000000000..d378edf0964e --- /dev/null +++ b/fs/btrfs/extent_map.c @@ -0,0 +1,1656 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extent_map.h" + +static struct kmem_cache *extent_map_cache; +static struct kmem_cache *extent_state_cache; + +struct tree_entry { + u64 start; + u64 end; + int in_tree; + struct rb_node rb_node; +}; + +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) +#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) + +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + +static LIST_HEAD(all_states); +spinlock_t state_lock = SPIN_LOCK_UNLOCKED; + +void __init extent_map_init(void) +{ + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | + SLAB_DESTROY_BY_RCU, + NULL); + 
extent_state_cache = kmem_cache_create("extent_state",
+					sizeof(struct extent_state), 0,
+					SLAB_RECLAIM_ACCOUNT |
+					SLAB_DESTROY_BY_RCU,
+					NULL);
+}
+
+/*
+ * module teardown: report and release every extent_state still linked on
+ * all_states, then destroy both slab caches if they were created.
+ */
+void __exit extent_map_exit(void)
+{
+	while(!list_empty(&all_states)) {
+		struct extent_state *state;
+		struct list_head *cur = all_states.next;
+		state = list_entry(cur, struct extent_state, list);
+		printk("found leaked state %Lu %Lu state %d in_tree %d\n",
+		       state->start, state->end, state->state, state->in_tree);
+		list_del(&state->list);
+		/*
+		 * states are allocated from extent_state_cache (see
+		 * alloc_extent_state), so they go back to that cache
+		 * rather than through kfree()
+		 */
+		kmem_cache_free(extent_state_cache, state);
+	}
+	if (extent_map_cache)
+		kmem_cache_destroy(extent_map_cache);
+	if (extent_state_cache)
+		kmem_cache_destroy(extent_state_cache);
+}
+
+/*
+ * set up an empty tree: both rb roots (mappings and state) start empty,
+ * the rwlock is initialized and 'mapping' is remembered.  'mask' is
+ * accepted but not used here.
+ */
+void extent_map_tree_init(struct extent_map_tree *tree,
+			  struct address_space *mapping, gfp_t mask)
+{
+	tree->map.rb_node = NULL;
+	tree->state.rb_node = NULL;
+	rwlock_init(&tree->lock);
+	tree->mapping = mapping;
+}
+EXPORT_SYMBOL(extent_map_tree_init);
+
+/*
+ * allocate an extent_map from its slab cache.  On success the struct is
+ * marked as not in any tree and carries one reference; a failed
+ * allocation is handed back to the caller unchanged.
+ */
+struct extent_map *alloc_extent_map(gfp_t mask)
+{
+	struct extent_map *em;
+	em = kmem_cache_alloc(extent_map_cache, mask);
+	if (!em || IS_ERR(em))
+		return em;
+	em->in_tree = 0;
+	atomic_set(&em->refs, 1);
+	return em;
+}
+EXPORT_SYMBOL(alloc_extent_map);
+
+/*
+ * drop one reference on 'em'; the last reference frees it.  Freeing a
+ * struct that is still flagged in_tree triggers a warning.
+ */
+void free_extent_map(struct extent_map *em)
+{
+	if (atomic_dec_and_test(&em->refs)) {
+		WARN_ON(em->in_tree);
+		kmem_cache_free(extent_map_cache, em);
+	}
+}
+EXPORT_SYMBOL(free_extent_map);
+
+
+/*
+ * allocate an extent_state with no bits set, one reference and an
+ * initialized wait queue, and link it on the all_states leak list.
+ *
+ * The list lock saves and restores the interrupt state so this is safe
+ * to call whether or not the caller already has interrupts disabled.
+ */
+struct extent_state *alloc_extent_state(gfp_t mask)
+{
+	struct extent_state *state;
+	unsigned long flags;
+
+	state = kmem_cache_alloc(extent_state_cache, mask);
+	if (!state || IS_ERR(state))
+		return state;
+	state->state = 0;
+	state->in_tree = 0;
+	atomic_set(&state->refs, 1);
+	init_waitqueue_head(&state->wq);
+	spin_lock_irqsave(&state_lock, flags);
+	list_add(&state->list, &all_states);
+	spin_unlock_irqrestore(&state_lock, flags);
+	return state;
+}
+EXPORT_SYMBOL(alloc_extent_state);
+
+/*
+ * drop one reference on 'state'; the last reference unlinks it from
+ * all_states and returns it to the slab cache.
+ *
+ * merge_state() and clear_state_bit() call this while the tree lock is
+ * held with interrupts off, so the list lock must restore the previous
+ * interrupt state instead of unconditionally re-enabling interrupts.
+ */
+void free_extent_state(struct extent_state *state)
+{
+	if (atomic_dec_and_test(&state->refs)) {
+		unsigned long flags;
+
+		WARN_ON(state->in_tree);
+		spin_lock_irqsave(&state_lock, flags);
+		list_del_init(&state->list);
+		spin_unlock_irqrestore(&state_lock, flags);
+		kmem_cache_free(extent_state_cache, state);
+	}
+}
+EXPORT_SYMBOL(free_extent_state);
+
+/*
+ * link 'node' into 'root', using 'offset' as the search key.
+ *
+ * Returns NULL after a successful insert (the entry is then flagged
+ * in_tree).  If 'offset' falls inside the [start, end] range of an entry
+ * that is already present, nothing is inserted and that entry's node is
+ * returned.
+ *
+ * Nodes are viewed through struct tree_entry; presumably extent_map and
+ * extent_state begin with the same fields -- confirm in extent_map.h.
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+				   struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct tree_entry *entry;
+
+	while(*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+
+		if (offset < entry->start)
+			p = &(*p)->rb_left;
+		else if (offset > entry->end)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	entry = rb_entry(node, struct tree_entry, rb_node);
+	entry->in_tree = 1;
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/*
+ * find the entry whose [start, end] range contains 'offset'.
+ *
+ * Returns that entry's node, or NULL when no entry contains 'offset'.
+ * In the NULL case, if 'prev_ret' is not NULL it is set to the first
+ * entry that ends at or after 'offset' (NULL when there is none).
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+				     struct rb_node **prev_ret)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct tree_entry *entry;
+	struct tree_entry *prev_entry = NULL;
+
+	while(n) {
+		entry = rb_entry(n, struct tree_entry, rb_node);
+		prev = n;
+		prev_entry = entry;
+
+		if (offset < entry->start)
+			n = n->rb_left;
+		else if (offset > entry->end)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	if (!prev_ret)
+		return NULL;
+	/* the last node visited may sit before 'offset'; step forward */
+	while(prev && offset > prev_entry->end) {
+		prev = rb_next(prev);
+		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+	}
+	*prev_ret = prev;
+	return NULL;
+}
+
+/*
+ * returns the entry containing 'offset', or failing that the first entry
+ * that ends at or after it, or NULL if there is no such entry.
+ */
+static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
+{
+	struct rb_node *prev;
+	struct rb_node *ret;
+	ret = __tree_search(root, offset, &prev);
+	if (!ret)
+		return prev;
+	return ret;
+}
+
+/*
+ * unlink the entry containing 'offset' from 'root' and clear its in_tree
+ * flag.  Returns -ENOENT when no entry contains 'offset'.  The entry
+ * itself is not freed.
+ */
+static int tree_delete(struct rb_root *root, u64 offset)
+{
+	struct rb_node *node;
+	struct tree_entry *entry;
+
+	node = __tree_search(root, offset, NULL);
+	if (!node)
+		return -ENOENT;
+	entry = rb_entry(node, struct tree_entry, rb_node);
+	entry->in_tree = 0;
+	rb_erase(node, root);
+	return 0;
+}
+
+/*
+ * add_extent_mapping tries a simple backward merge with existing
+ 
* mappings.  The extent_map struct passed in will be inserted into
+ * the tree directly (no copies made, just a reference taken).
+ */
+int add_extent_mapping(struct extent_map_tree *tree,
+		       struct extent_map *em)
+{
+	struct extent_map *left = NULL;
+	struct rb_node *node;
+	int ret = 0;
+
+	write_lock_irq(&tree->lock);
+
+	/* the insert is keyed on the last byte of the new mapping */
+	node = tree_insert(&tree->map, em->end, &em->rb_node);
+	if (node) {
+		left = rb_entry(node, struct extent_map, rb_node);
+		printk("found extent map %Lu %Lu on insert of %Lu %Lu\n", left->start, left->end, em->start, em->end);
+		ret = -EEXIST;
+		goto unlock;
+	}
+
+	/* this is the reference the tree keeps on em */
+	atomic_inc(&em->refs);
+
+	/* no backward merge is tried for a mapping that starts at zero */
+	if (em->start == 0)
+		goto unlock;
+
+	node = rb_prev(&em->rb_node);
+	if (!node)
+		goto unlock;
+	left = rb_entry(node, struct extent_map, rb_node);
+
+	/* the previous mapping has to end right where this one starts */
+	if (left->end + 1 != em->start)
+		goto unlock;
+
+	/*
+	 * and either both have block_start == 0, or the blocks of em
+	 * directly follow the blocks of the previous mapping
+	 */
+	if (!(em->block_start == 0 && left->block_start == 0) &&
+	    em->block_start != left->block_end + 1)
+		goto unlock;
+
+	/* em grows backwards over the old mapping, which is dropped */
+	em->start = left->start;
+	em->block_start = left->block_start;
+	rb_erase(&left->rb_node, &tree->map);
+	left->in_tree = 0;
+	free_extent_map(left);
+
+unlock:
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+EXPORT_SYMBOL(add_extent_mapping);
+
+/*
+ * lookup_extent_mapping returns the first extent_map struct in the
+ * tree that intersects the [start, end] (inclusive) range.  There may
+ * be additional objects in the tree that intersect, so check the object
+ * returned carefully to make sure you don't need additional lookups.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 end)
+{
+	struct extent_map *found = NULL;
+	struct rb_node *node;
+
+	read_lock_irq(&tree->lock);
+
+	node = tree_search(&tree->map, start);
+	if (!node)
+		goto unlock;
+	if (IS_ERR(node)) {
+		found = ERR_PTR(PTR_ERR(node));
+		goto unlock;
+	}
+
+	found = rb_entry(node, struct extent_map, rb_node);
+
+	/*
+	 * the search can hand back a mapping that lies after 'start';
+	 * only return it (with an extra reference) when it really
+	 * overlaps the requested range
+	 */
+	if (found->end >= start && found->start <= end)
+		atomic_inc(&found->refs);
+	else
+		found = NULL;
+
+unlock:
+	read_unlock_irq(&tree->lock);
+	return found;
+}
+EXPORT_SYMBOL(lookup_extent_mapping);
+
+/*
+ * removes an extent_map struct from the tree.  No reference counts are
+ * dropped, and no checks are done to see if the range is in use
+ */
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+	int err;
+
+	write_lock_irq(&tree->lock);
+	/* the mapping is located through its last byte */
+	err = tree_delete(&tree->map, em->end);
+	write_unlock_irq(&tree->lock);
+
+	return err;
+}
+EXPORT_SYMBOL(remove_extent_mapping);
+
+/*
+ * utility function to look for merge candidates inside a given range.
+ * Any extents with matching state are merged together into a single
+ * extent in the tree.  Extents with EXTENT_IO in their state field
+ * are not merged because the end_io handlers need to be able to do
+ * operations on them without sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static int merge_state(struct extent_map_tree *tree,
+		       struct extent_state *state)
+{
+	struct extent_state *other;
+	struct rb_node *other_node;
+
+	/* states that are locked or under writeback are left alone */
+	if (state->state & EXTENT_IOBITS)
+		return 0;
+
+	/*
+	 * backward merge: when the previous state ends right before this
+	 * one and carries identical bits, 'state' grows down over it and
+	 * the previous state is removed and released
+	 */
+	other_node = rb_prev(&state->rb_node);
+	if (other_node) {
+		other = rb_entry(other_node, struct extent_state, rb_node);
+		if (other->end == state->start - 1 &&
+		    other->state == state->state) {
+			state->start = other->start;
+			other->in_tree = 0;
+			rb_erase(&other->rb_node, &tree->state);
+			free_extent_state(other);
+		}
+	}
+	/*
+	 * forward merge: here it is the next state that survives.  It
+	 * grows down over 'state', and 'state' itself is removed from the
+	 * tree and has a reference dropped, so callers must not rely on
+	 * 'state' after this function returns.
+	 */
+	other_node = rb_next(&state->rb_node);
+	if (other_node) {
+		other = rb_entry(other_node, struct extent_state, rb_node);
+		if (other->start == state->end + 1 &&
+		    other->state == state->state) {
+			other->start = state->start;
+			state->in_tree = 0;
+			rb_erase(&state->rb_node, &tree->state);
+			free_extent_state(state);
+		}
+	}
+	return 0;
+}
+
+/*
+ * insert an extent_state struct into the tree.  'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally.  This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_map_tree *tree,
+			struct extent_state *state, u64 start, u64 end,
+			int bits)
+{
+	struct extent_state *clash;
+	struct rb_node *hit;
+
+	/* an inverted range is only reported; the insert still goes ahead */
+	if (end < start) {
+		printk("end < start %Lu %Lu\n", end, start);
+		WARN_ON(1);
+	}
+
+	state->state |= bits;
+	state->start = start;
+	state->end = end;
+
+	/* warn about an inclusive end with its low 12 bits clear */
+	if ((end & 4095) == 0) {
+		printk("insert state %Lu %Lu strange end\n", start, end);
+		WARN_ON(1);
+	}
+
+	hit = tree_insert(&tree->state, end, &state->rb_node);
+	if (!hit) {
+		/* linked in, now fold it into any matching neighbours */
+		merge_state(tree, state);
+		return 0;
+	}
+
+	/* 'end' landed inside a state that is already in the tree */
+	clash = rb_entry(hit, struct extent_state, rb_node);
+	printk("found node %Lu %Lu on insert of %Lu %Lu\n", clash->start, clash->end, start, end);
+	free_extent_state(state);
+	return -EEXIST;
+}
+
+/*
+ * split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half.  'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end].  After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
+		       struct extent_state *prealloc, u64 split)
+{
+	struct rb_node *node;
+	/*
+	 * prealloc takes over the lower part [orig->start, split - 1] with
+	 * a copy of orig's bits.  orig is shrunk before the insert so that
+	 * the search key (prealloc->end) no longer falls inside orig.
+	 */
+	prealloc->start = orig->start;
+	prealloc->end = split - 1;
+	prealloc->state = orig->state;
+	orig->start = split;
+	if ((prealloc->end & 4095) == 0) {
+		printk("insert state %Lu %Lu strange end\n", prealloc->start,
+		       prealloc->end);
+		WARN_ON(1);
+	}
+	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
+	if (node) {
+		/*
+		 * collision with an existing state: prealloc is released
+		 * and -EEXIST returned.  orig->start has already been moved
+		 * to 'split' at this point and is not put back.
+		 */
+		struct extent_state *found;
+		found = rb_entry(node, struct extent_state, rb_node);
+printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
+		free_extent_state(prealloc);
+		return -EEXIST;
+	}
+	return 0;
+}
+
+/*
+ * utility function to clear some bits in an extent state struct.
+ * it will optionally wake up any one waiting on this state (wake == 1), or
+ * forcibly remove the state from the tree (delete == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static int clear_state_bit(struct extent_map_tree *tree,
+			    struct extent_state *state, int bits, int wake,
+			    int delete)
+{
+	/* return value: the subset of 'bits' that was set before clearing */
+	int ret = state->state & bits;
+	state->state &= ~bits;
+	if (wake)
+		wake_up(&state->wq);
+	if (delete || state->state == 0) {
+		if (state->in_tree) {
+			rb_erase(&state->rb_node, &tree->state);
+			state->in_tree = 0;
+			/* drops a reference; 'state' may be gone after this */
+			free_extent_state(state);
+		} else {
+			WARN_ON(1);
+		}
+	} else {
+		/* bits remain: see if it now matches a neighbour */
+		merge_state(tree, state);
+	}
+	return ret;
+}
+
+/*
+ * clear some bits on a range in the tree.  This may require splitting
+ * or inserting elements in the tree, so the gfp mask is used to
+ * indicate which allocations or sleeping are allowed.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns < 0 on error, > 0 if any of the
+ * bits were already set, or zero if none of the bits were already set.
+ */
+int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
+		     int bits, int wake, int delete, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	int err;
+	int set = 0;
+
+again:
+	/*
+	 * a spare state for splitting is only allocated when the mask
+	 * allows waiting; without it 'prealloc' stays NULL and the split
+	 * paths below must not be reached
+	 */
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find the extents that end after
+	 * our range starts
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node)
+		goto out;
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start > end)
+		goto out;
+	WARN_ON(state->end < start);
+
+	/*
+	 *     | ---- desired range ---- |
+	 *  | state | or
+	 *  | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip
+	 * bits on second half.
+	 *
+	 * If the extent we found extends past our range, we
+	 * just split and search again.  It'll get split again
+	 * the next time though.
+	 *
+	 * If the extent we found is inside our range, we clear
+	 * the desired bit on it.
+	 */
+
+	if (state->start < start) {
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			/* read state->end first, clearing may free state */
+			start = state->end + 1;
+			set |= clear_state_bit(tree, state, bits,
+					wake, delete);
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and clear the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		if (wake)
+			wake_up(&state->wq);
+		/* after the split, prealloc is the part inside the range */
+		set |= clear_state_bit(tree, prealloc, bits,
+				       wake, delete);
+		prealloc = NULL;
+		goto out;
+	}
+
+	/* the state lies completely inside the range */
+	start = state->end + 1;
+	set |= clear_state_bit(tree, state, bits, wake, delete);
+	goto search_again;
+
+out:
+	write_unlock_irq(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return set;
+
+search_again:
+	/*
+	 * [start, end] is inclusive, so there is still one byte left to
+	 * look at when start == end; only stop once start has moved past
+	 * end (same test as set_extent_bit)
+	 */
+	if (start > end)
+		goto out;
+	write_unlock_irq(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(clear_extent_bit);
+
+/*
+ * sleep until somebody wakes the wait queue of 'state'.
+ *
+ * Called with the tree read lock held; the lock is dropped around
+ * schedule() and taken again before returning.  Always returns 0.
+ */
+static int wait_on_state(struct extent_map_tree *tree,
+			 struct extent_state *state)
+{
+	DEFINE_WAIT(wait);
+	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+	read_unlock_irq(&tree->lock);
+	schedule();
+	read_lock_irq(&tree->lock);
+	finish_wait(&state->wq, &wait);
+	return 0;
+}
+
+/*
+ * waits for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
+{
+	struct extent_state *state;
+	struct rb_node *node;
+
+	read_lock_irq(&tree->lock);
+again:
+	while (1) {
+		/*
+		 * this search will find all the extents that end after
+		 * our range starts
+		 */
+		node = tree_search(&tree->state, start);
+		if (!node)
+			break;
+
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		if (state->start > end)
+			goto out;
+
+		if (state->state & bits) {
+			start = state->start;
+			/* pin the state while the tree lock is dropped */
+			atomic_inc(&state->refs);
+			wait_on_state(tree, state);
+			free_extent_state(state);
+			goto again;
+		}
+		start = state->end + 1;
+
+		if (start > end)
+			break;
+
+		if (need_resched()) {
+			read_unlock_irq(&tree->lock);
+			cond_resched();
+			read_lock_irq(&tree->lock);
+		}
+	}
+out:
+	read_unlock_irq(&tree->lock);
+	return 0;
+}
+EXPORT_SYMBOL(wait_extent_bit);
+
+/*
+ * set some bits on a range in the tree.  This may require allocations
+ * or sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
+ * range already has the desired bits set.  The start of the existing
+ * range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive
+ * This takes the tree lock.
+ */
+int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
+		   int exclusive, u64 *failed_start, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	int err = 0;
+	int set;
+	u64 last_start;
+	u64 last_end;
+again:
+	/*
+	 * a spare state for inserts and splits is only allocated when the
+	 * mask allows waiting
+	 */
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node) {
+		/* nothing at or after 'start': cover the whole range */
+		err = insert_state(tree, prealloc, start, end, bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		goto out;
+	}
+
+	state = rb_entry(node, struct extent_state, rb_node);
+	last_start = state->start;
+	last_end = state->end;
+
+	/*
+	 * | ---- desired range ---- |
+	 * | state |
+	 *
+	 * Just lock what we found and keep going
+	 */
+	if (state->start == start && state->end <= end) {
+		set = state->state & bits;
+		if (set && exclusive) {
+			*failed_start = state->start;
+			err = -EEXIST;
+			goto out;
+		}
+		state->state |= bits;
+		/* read state->end first, merging may free state */
+		start = state->end + 1;
+		merge_state(tree, state);
+		goto search_again;
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 * | state |
+	 *   or
+	 * | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on
+	 * second half.
+	 *
+	 * If the extent we found extends past our
+	 * range, we just split and search again.  It'll get split
+	 * again the next time though.
+	 *
+	 * If the extent we found is inside our range, we set the
+	 * desired bit on it.
+	 */
+	if (state->start < start) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			state->state |= bits;
+			start = state->end + 1;
+			merge_state(tree, state);
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+
+	/*
+	 * | ---- desired range ---- |
+	 *     | state | or               | state |
+	 *
+	 * There's a hole, we need to insert something in it and
+	 * ignore the extent we found.
+	 *
+	 * This has to be checked before the tail split below.  A state
+	 * that starts inside the range and runs past 'end' matches both
+	 * tests, and taking the tail split first would return without
+	 * ever setting the bits on [start, state->start - 1].
+	 */
+	if (state->start > start) {
+		u64 this_end;
+		if (end < last_start)
+			this_end = end;
+		else
+			this_end = last_start -1;
+		err = insert_state(tree, prealloc, start, this_end,
+				   bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		if (err)
+			goto out;
+		start = this_end + 1;
+		goto search_again;
+	}
+
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and set the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		/* after the split, prealloc is the part inside the range */
+		prealloc->state |= bits;
+		merge_state(tree, prealloc);
+		prealloc = NULL;
+		goto out;
+	}
+	goto search_again;
+
+out:
+	write_unlock_irq(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err;
+
+search_again:
+	if (start > end)
+		goto out;
+	write_unlock_irq(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(set_extent_bit);
+
+/* wrappers around set/clear extent bit */
+int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_dirty);
+
+int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_DIRTY, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_dirty);
+
+int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
+		   gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_new);
+
+int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_new);
+
+int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
+			gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_uptodate);
+
+int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
+			  gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_uptodate);
+
+int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
+			 gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
+			      0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_writeback);
+
+/* clearing writeback also wakes anybody waiting on the range */
+int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
+			   gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_writeback);
+
+int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
+}
+EXPORT_SYMBOL(wait_on_extent_writeback);
+
+/*
+ * locks a range in ascending order, waiting for any locked regions
+ * it hits on the way.  [start,end] are inclusive, and this will sleep.
+ *
+ * If the mask does not allow waiting, a collision is not retried and
+ * -EEXIST is returned to the caller.
+ */
+int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+	int err;
+	u64 failed_start;
+	while (1) {
+		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+				     &failed_start, mask);
+		if (err == -EEXIST && (mask & __GFP_WAIT)) {
+			/* wait for the holder, then resume where we failed */
+			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+			start = failed_start;
+		} else {
+			break;
+		}
+		WARN_ON(start > end);
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_extent);
+
+/* clears EXTENT_LOCKED on the range and wakes any waiters */
+int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
+		  gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+}
+EXPORT_SYMBOL(unlock_extent);
+
+/*
+ * helper function to set pages and extents in the tree dirty
+ *
+ * Every page in the range must already be in the page cache
+ * (BUG_ON otherwise).
+ */
+int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(tree->mapping, index);
+		BUG_ON(!page);
+		__set_page_dirty_nobuffers(page);
+		page_cache_release(page);
+		index++;
+	}
+	set_extent_dirty(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_dirty);
+
+/*
+ * helper function to set both pages and extents in the tree writeback
+ *
+ * Every page in the range must already be in the page cache
+ * (BUG_ON otherwise).
+ */
+int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(tree->mapping, index);
+		BUG_ON(!page);
+		set_page_writeback(page);
+		page_cache_release(page);
+		index++;
+	}
+	set_extent_writeback(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_writeback);
+
+/*
+ * helper function to lock both pages and extents in the tree.
+ * pages must be locked first.
+ *
+ * NOTE(review): the result of lock_extent() is not checked, and the
+ * undo loop only drops the reference taken by its own find_get_page();
+ * whether the reference from grab_cache_page() is meant to stay with
+ * the page is not visible from here -- confirm against the callers.
+ */
+int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+	int err;
+
+	while (index <= end_index) {
+		page = grab_cache_page(tree->mapping, index);
+		if (!page) {
+			err = -ENOMEM;
+			goto failed;
+		}
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto failed;
+		}
+		index++;
+	}
+	lock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+
+failed:
+	/*
+	 * we failed above in getting the page at 'index', so we undo here
+	 * up to but not including the page at 'index'
+	 */
+	end_index = index;
+	index = start >> PAGE_CACHE_SHIFT;
+	while (index < end_index) {
+		page = find_get_page(tree->mapping, index);
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_range);
+
+/*
+ * helper function to unlock both pages and extents in the tree.
+ */
+int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(tree->mapping, index);
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+	}
+	unlock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(unlock_range);
+
+/*
+ * searches a range in the state tree for a given mask.
+ * If 'filled' == 1, this returns 1 only if every extent in the tree
+ * has the bits set.  Otherwise, 1 is returned if any bit in the
+ * range is found set.
+ *
+ * With 'filled' set, a gap before a state inside the range also gives 0.
+ * The tree read lock is taken internally.
+ */
+static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
+			  int bits, int filled)
+{
+	struct extent_state *state = NULL;
+	struct rb_node *node;
+	int bitset = 0;
+
+	read_lock_irq(&tree->lock);
+	node = tree_search(&tree->state, start);
+	while (node && start <= end) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->start > end)
+			break;
+
+		/* a hole in front of this state means "not filled" */
+		if (filled && state->start > start) {
+			bitset = 0;
+			break;
+		}
+		if (state->state & bits) {
+			bitset = 1;
+			if (!filled)
+				break;
+		} else if (filled) {
+			bitset = 0;
+			break;
+		}
+		start = state->end + 1;
+		if (start > end)
+			break;
+		node = rb_next(node);
+	}
+	read_unlock_irq(&tree->lock);
+	return bitset;
+}
+
+/*
+ * helper function to set a given page up to date if all the
+ * extents in the tree for that page are up to date
+ */
+static int check_page_uptodate(struct extent_map_tree *tree,
+			       struct page *page)
+{
+	/* widen the page index before shifting so the offset is not truncated */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+		SetPageUptodate(page);
+	return 0;
+}
+
+/*
+ * helper function to unlock a page if all the extents in the tree
+ * for that page are unlocked
+ */
+static int check_page_locked(struct extent_map_tree *tree,
+			     struct page *page)
+{
+	/* widen the page index before shifting so the offset is not truncated */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+		unlock_page(page);
+	return 0;
+}
+
+/*
+ * helper function to end page writeback if all the extents
+ * in the tree for that page are done with writeback
+ */
+static int check_page_writeback(struct extent_map_tree *tree,
+			     struct page *page)
+{
+	/* widen the page index before shifting so the offset is not truncated */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
+		end_page_writeback(page);
+	return 0;
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static int end_bio_extent_writepage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+
+	/* nothing is done until the whole bio has completed */
+	if (bio->bi_size)
+		return 1;
+
+	/* walk the bio_vecs from the last one back to the first */
+	do {
+		struct page *page = bvec->bv_page;
+		/* widen the page index before shifting (see helpers above) */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (!uptodate) {
+			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+
+		if (whole_page)
+			end_page_writeback(page);
+		else
+			check_page_writeback(tree, page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static int end_bio_extent_readpage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+
+	/* nothing is done until the whole bio has completed */
+	if (bio->bi_size)
+		return 1;
+
+	/* walk the bio_vecs from the last one back to the first */
+	do {
+		struct page *page = bvec->bv_page;
+		/*
+		 * widen the page index before shifting so the byte offset
+		 * is not truncated when unsigned long is narrower than u64
+		 */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			if (whole_page)
+				SetPageUptodate(page);
+			else
+				check_page_uptodate(tree, page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+		/* a partial page stays locked while other ranges are locked */
+		if (whole_page)
+			unlock_page(page);
+		else
+			check_page_locked(tree, page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * IO done from prepare_write is pretty simple, we just unlock
+ * the structs in the extent tree when done, and set the uptodate bits
+ * as appropriate.
+ */
+static int end_bio_extent_preparewrite(struct bio *bio,
+				       unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+
+	/* nothing is done until the whole bio has completed */
+	if (bio->bi_size)
+		return 1;
+
+	/* walk the bio_vecs from the last one back to the first */
+	do {
+		struct page *page = bvec->bv_page;
+		/*
+		 * widen the page index before shifting so the byte offset
+		 * is not truncated when unsigned long is narrower than u64
+		 */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * build a single-segment bio for 'size' bytes at 'offset' inside 'page',
+ * aimed at 'sector' on 'bdev', and submit it.  'end_io_func' runs on
+ * completion and finds the tree in bio->bi_private.
+ *
+ * Returns -EOPNOTSUPP if the bio is flagged BIO_EOPNOTSUPP after
+ * submission, 0 otherwise.
+ */
+static int submit_extent_page(int rw, struct extent_map_tree *tree,
+			      struct page *page, sector_t sector,
+			      size_t size, unsigned long offset,
+			      struct block_device *bdev,
+			      bio_end_io_t end_io_func)
+{
+	struct bio *bio;
+	int ret = 0;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+
+	bio->bi_sector = sector;
+	bio->bi_bdev = bdev;
+	bio->bi_io_vec[0].bv_page = page;
+	bio->bi_io_vec[0].bv_len = size;
+	bio->bi_io_vec[0].bv_offset = offset;
+
+	bio->bi_vcnt = 1;
+	bio->bi_idx = 0;
+	bio->bi_size = size;
+
+	bio->bi_end_io = end_io_func;
+	bio->bi_private = tree;
+
+	/* extra reference so the flags can be checked after submission */
+	bio_get(bio);
+	submit_bio(rw, bio);
+
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+
+	bio_put(bio);
+	return ret;
+}
+
+/*
+ * basic readpage implementation.
Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ */
+int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent)
+{
+	struct inode *inode = page->mapping->host;
+	/*
+	 * widen the page index before shifting so the byte offset is not
+	 * truncated when unsigned long is narrower than u64
+	 */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	u64 cur_end;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	size_t page_offset = 0;
+	size_t iosize;
+	size_t blocksize = inode->i_sb->s_blocksize;
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	end = page_end;
+	lock_extent(tree, start, end, GFP_NOFS);
+
+	while (cur <= end) {
+		/* past i_size: zero the rest of the page and stop */
+		if (cur >= last_byte) {
+			iosize = PAGE_CACHE_SIZE - page_offset;
+			zero_user_page(page, page_offset, iosize, KM_USER0);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			break;
+		}
+		em = get_extent(inode, page, page_offset, cur, end, 0);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			unlock_extent(tree, cur, end, GFP_NOFS);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(em->end < cur);
+		BUG_ON(end < cur);
+
+		/* bytes left in this mapping or page, rounded up to a block */
+		iosize = min(em->end - cur, end - cur) + 1;
+		cur_end = min(em->end, end);
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		free_extent_map(em);
+		em = NULL;
+
+		/* we've found a hole, just zero and go on */
+		if (block_start == 0) {
+			zero_user_page(page, page_offset, iosize, KM_USER0);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		/* the get_extent function already copied into the page */
+		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset, bdev,
+					 end_bio_extent_readpage);
+		if (ret)
+			SetPageError(page);
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+	/* no IO was submitted, so nobody else will finish the page */
+	if (!nr) {
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_read_full_page);
+
+/*
+ * the writepage semantics are similar to regular writepage.  extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback.  Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	/*
+	 * widen the page index before shifting so the byte offset is not
+	 * truncated when unsigned long is narrower than u64
+	 */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	size_t page_offset = 0;
+	size_t iosize;
+	size_t blocksize;
+	loff_t i_size = i_size_read(inode);
+	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+
+	/* page lies entirely beyond i_size: nothing to write */
+	if (page->index > end_index) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		unlock_page(page);
+		return 0;
+	}
+
+	/* page straddles i_size: zero the part past the end of the file */
+	if (page->index == end_index) {
+		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
+		zero_user_page(page, offset,
+			       PAGE_CACHE_SIZE - offset, KM_USER0);
+	}
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	end = page_end;
+	lock_extent(tree, start, page_end, GFP_NOFS);
+
+	if (last_byte <= start) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		goto done;
+	}
+
+	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
+	blocksize = inode->i_sb->s_blocksize;
+
+	while (cur <= end) {
+		if (cur >= last_byte) {
+			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+			break;
+		}
+		em = get_extent(inode, page, page_offset, cur, end, 1);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(em->end < cur);
+		BUG_ON(end < cur);
+		/* bytes left in this mapping or page, rounded up to a block */
+		iosize = min(em->end - cur, end - cur) + 1;
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		free_extent_map(em);
+		em = NULL;
+
+		/* holes and inline extents have no blocks to write here */
+		if (block_start == 0 || block_start == EXTENT_MAP_INLINE) {
+			clear_extent_dirty(tree, cur,
+					   cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		/* leave this out until we have a page_mkwrite call */
+		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
+				   EXTENT_DIRTY, 0)) {
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
+		set_range_writeback(tree, cur, cur + iosize - 1);
+		ret = submit_extent_page(WRITE, tree, page,
+					 sector, iosize, page_offset, bdev,
+					 end_bio_extent_writepage);
+		if (ret)
+			SetPageError(page);
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+done:
+	WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0));
+	unlock_extent(tree, start, page_end, GFP_NOFS);
+	unlock_page(page);
+	return 0;
+}
+EXPORT_SYMBOL(extent_write_full_page);
+
+/*
+ * basic invalidatepage code, this waits on any locked or writeback
+ * ranges corresponding to the page, and then deletes any extent state
+ * records from the tree
+ */
+int extent_invalidatepage(struct extent_map_tree *tree,
+			  struct page *page, unsigned long offset)
+{
+	u64 start = (page->index << PAGE_CACHE_SHIFT);
+	u64 end = start + 
PAGE_CACHE_SIZE - 1; + size_t blocksize = page->mapping->host->i_sb->s_blocksize; + + start += (offset + blocksize -1) & ~(blocksize - 1); + if (start > end) + return 0; + + lock_extent(tree, start, end, GFP_NOFS); + wait_on_extent_writeback(tree, start, end); + clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY, + 1, 1, GFP_NOFS); + return 0; +} +EXPORT_SYMBOL(extent_invalidatepage); + +/* + * simple commit_write call, set_range_dirty is used to mark both + * the pages and the extent records as dirty + */ +int extent_commit_write(struct extent_map_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + if (!PagePrivate(page)) { + SetPagePrivate(page); + set_page_private(page, 1); + page_cache_get(page); + } + + set_page_dirty(page); + + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} +EXPORT_SYMBOL(extent_commit_write); + +int extent_prepare_write(struct extent_map_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent) +{ + u64 page_start = page->index << PAGE_CACHE_SHIFT; + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 block_start; + u64 orig_block_start; + u64 block_end; + u64 cur_end; + struct extent_map *em; + unsigned blocksize = 1 << inode->i_blkbits; + size_t page_offset = 0; + size_t block_off_start; + size_t block_off_end; + int err = 0; + int iocount = 0; + int ret = 0; + int isnew; + + if (!PagePrivate(page)) { + SetPagePrivate(page); + set_page_private(page, 1); + page_cache_get(page); + } + block_start = (page_start + from) & ~((u64)blocksize - 1); + block_end = (page_start + to - 1) | (blocksize - 1); + orig_block_start = block_start; + + lock_extent(tree, page_start, page_end, GFP_NOFS); + while(block_start <= block_end) { + em = get_extent(inode, page, page_offset, block_start, + block_end, 1); + if (IS_ERR(em) || !em) { + 
goto err; + } + cur_end = min(block_end, em->end); + block_off_start = block_start & (PAGE_CACHE_SIZE - 1); + block_off_end = block_off_start + blocksize; + isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); + + if (!PageUptodate(page) && isnew && + (block_off_end > to || block_off_start < from)) { + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + if (block_off_end > to) + memset(kaddr + to, 0, block_off_end - to); + if (block_off_start < from) + memset(kaddr + block_off_start, 0, + from - block_off_start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + if (!isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, + EXTENT_UPTODATE, 1)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; + sector = (em->block_start + extent_offset) >> 9; + iosize = (cur_end - block_start + blocksize - 1) & + ~((u64)blocksize - 1); + /* + * we've already got the extent locked, but we + * need to split the state such that our end_bio + * handler can clear the lock. + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, + EXTENT_LOCKED, 0, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + end_bio_extent_preparewrite); + iocount++; + block_start = block_start + iosize; + } else { + set_extent_uptodate(tree, block_start, cur_end, + GFP_NOFS); + unlock_extent(tree, block_start, cur_end, GFP_NOFS); + block_start = cur_end + 1; + } + page_offset = block_start & (PAGE_CACHE_SIZE - 1); + free_extent_map(em); + } + if (iocount) { + wait_extent_bit(tree, orig_block_start, + block_end, EXTENT_LOCKED); + } + check_page_uptodate(tree, page); +err: + /* FIXME, zero out newly allocated blocks on error */ + return err; +} +EXPORT_SYMBOL(extent_prepare_write); + +/* + * a helper for releasepage. 
As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page) +{ + struct extent_map *em; + u64 start = page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + u64 orig_start = start; + + while (start <= end) { + em = lookup_extent_mapping(tree, start, end); + if (!em || IS_ERR(em)) + break; + if (test_range_bit(tree, em->start, em->end, + EXTENT_LOCKED, 0)) { + free_extent_map(em); + start = em->end + 1; +printk("range still locked %Lu %Lu\n", em->start, em->end); + break; + } + remove_extent_mapping(tree, em); + start = em->end + 1; + /* once for the rb tree */ + free_extent_map(em); + /* once for us */ + free_extent_map(em); + } + WARN_ON(test_range_bit(tree, orig_start, end, EXTENT_WRITEBACK, 0)); + clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE, + 1, 1, GFP_NOFS); + return 1; +} +EXPORT_SYMBOL(try_release_extent_mapping); + diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h new file mode 100644 index 000000000000..108944aab4bd --- /dev/null +++ b/fs/btrfs/extent_map.h @@ -0,0 +1,89 @@ +#ifndef __EXTENTMAP__ +#define __EXTENTMAP__ + +#include + +#define EXTENT_MAP_INLINE (u64)-2 +#define EXTENT_MAP_DELALLOC (u64)-1 + +struct extent_map_tree { + struct rb_root map; + struct rb_root state; + struct address_space *mapping; + rwlock_t lock; +}; + +/* note, this must start with the same fields as fs/extent_map.c:tree_entry */ +struct extent_map { + u64 start; + u64 end; /* inclusive */ + int in_tree; + struct rb_node rb_node; + /* block_start and block_end are in bytes */ + u64 block_start; + u64 block_end; /* inclusive */ + struct block_device *bdev; + atomic_t refs; +}; + +/* note, this must start with the same fields as fs/extent_map.c:tree_entry */ +struct extent_state { + u64 start; + u64 end; /* inclusive */ + int in_tree; + struct rb_node rb_node; + 
wait_queue_head_t wq; + atomic_t refs; + unsigned long state; + struct list_head list; +}; + +struct extent_buffer { + u64 start; + u64 end; /* inclusive */ + char *addr; + struct page *pages[]; +}; + +typedef struct extent_map *(get_extent_t)(struct inode *inode, + struct page *page, + size_t page_offset, + u64 start, u64 end, + int create); + +void extent_map_tree_init(struct extent_map_tree *tree, + struct address_space *mapping, gfp_t mask); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 end); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em); +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page); +int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask); +int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask); +struct extent_map *alloc_extent_map(gfp_t mask); +void free_extent_map(struct extent_map *em); +int extent_read_full_page(struct extent_map_tree *tree, struct page *page, + get_extent_t *get_extent); +void __init extent_map_init(void); +void __exit extent_map_exit(void); +int extent_clean_all_trees(struct extent_map_tree *tree); +int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask); +int extent_invalidatepage(struct extent_map_tree *tree, + struct page *page, unsigned long offset); +int extent_write_full_page(struct extent_map_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_prepare_write(struct extent_map_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent); +int extent_commit_write(struct extent_map_tree *tree, + 
struct inode *inode, struct page *page, + unsigned from, unsigned to); +int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end); +#endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6933ab11a5cd..71a481894ab6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -81,14 +81,14 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } } -static int insert_inline_extent(struct btrfs_root *root, struct inode *inode, +static int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 offset, ssize_t size, - struct buffer_head *bh) + struct page *page, size_t page_offset) { struct btrfs_key key; struct btrfs_path *path; char *ptr, *kaddr; - struct btrfs_trans_handle *trans; struct btrfs_file_extent_item *ei; u32 datasize; int err = 0; @@ -98,8 +98,6 @@ static int insert_inline_extent(struct btrfs_root *root, struct inode *inode, if (!path) return -ENOMEM; - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); key.objectid = inode->i_ino; @@ -122,18 +120,13 @@ static int insert_inline_extent(struct btrfs_root *root, struct inode *inode, BTRFS_FILE_EXTENT_INLINE); ptr = btrfs_file_extent_inline_start(ei); - kaddr = kmap_atomic(bh->b_page, KM_USER0); + kaddr = kmap_atomic(page, KM_USER0); btrfs_memcpy(root, path->nodes[0]->b_data, - ptr, kaddr + bh_offset(bh), - size); + ptr, kaddr + page_offset, size); kunmap_atomic(kaddr, KM_USER0); btrfs_mark_buffer_dirty(path->nodes[0]); fail: btrfs_free_path(path); - ret = btrfs_end_transaction(trans, root); - if (ret && !err) - err = ret; - mutex_unlock(&root->fs_info->fs_mutex); return err; } @@ -145,45 +138,143 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans, loff_t pos, size_t write_bytes) { - int i; - int offset; int err = 0; - int ret; - int this_write; + int i; struct inode *inode = file->f_path.dentry->d_inode; - struct buffer_head *bh; + struct 
extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key ins; + u64 hint_block; + u64 num_blocks; + u64 start_pos; + u64 end_of_last_block; + u64 end_pos = pos + write_bytes; + loff_t isize = i_size_read(inode); - for (i = 0; i < num_pages; i++) { - offset = pos & (PAGE_CACHE_SIZE -1); - this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes); + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; - /* FIXME, one block at a time */ - bh = page_buffers(pages[i]); + em->bdev = inode->i_sb->s_bdev; - if (buffer_mapped(bh) && bh->b_blocknr == 0) { - ret = insert_inline_extent(root, inode, - pages[i]->index << PAGE_CACHE_SHIFT, - offset + this_write, bh); - if (ret) { - err = ret; - goto failed; - } - } + start_pos = pos & ~((u64)root->blocksize - 1); + num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >> + inode->i_blkbits; - ret = btrfs_commit_write(file, pages[i], offset, - offset + this_write); - pos += this_write; - if (ret) { - err = ret; + end_of_last_block = start_pos + (num_blocks << inode->i_blkbits) - 1; + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out_unlock; + } + btrfs_set_trans_block_group(trans, inode); + inode->i_blocks += num_blocks << 3; + hint_block = 0; + + if ((end_of_last_block & 4095) == 0) { + printk("strange end of last %Lu %lu %Lu\n", start_pos, write_bytes, end_of_last_block); + } + set_extent_uptodate(em_tree, start_pos, end_of_last_block, GFP_NOFS); + + /* FIXME...EIEIO, ENOSPC and more */ + + /* step one, delete the existing extents in this range */ + /* FIXME blocksize != pagesize */ + if (start_pos < inode->i_size) { + err = btrfs_drop_extents(trans, root, inode, + start_pos, (pos + write_bytes + root->blocksize -1) & + ~((u64)root->blocksize - 1), &hint_block); + if (err) + goto failed; + } + + /* insert any holes we need to create */ + if (inode->i_size < start_pos) { + u64 
last_pos_in_file; + u64 hole_size; + u64 mask = root->blocksize - 1; + last_pos_in_file = (isize + mask) & ~mask; + hole_size = (start_pos - last_pos_in_file + mask) & ~mask; + hole_size >>= inode->i_blkbits; + if (last_pos_in_file < start_pos) { + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, + last_pos_in_file, + 0, 0, hole_size); + } + if (err) goto failed; + } + + /* + * either allocate an extent for the new bytes or setup the key + * to show we are doing inline data in the extent + */ + if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size || + pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) { + err = btrfs_alloc_extent(trans, root, inode->i_ino, + num_blocks, 0, hint_block, (u64)-1, + &ins, 1); + BUG_ON(err); + err = btrfs_insert_file_extent(trans, root, inode->i_ino, + start_pos, ins.objectid, ins.offset, + ins.offset); + BUG_ON(err); + em->start = start_pos; + em->end = end_of_last_block; + em->block_start = ins.objectid << inode->i_blkbits; + em->block_end = em->block_start + + (ins.offset << inode->i_blkbits) - 1; + set_extent_dirty(em_tree, start_pos, end_of_last_block, + GFP_NOFS); + err = add_extent_mapping(em_tree, em); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + __set_page_dirty_nobuffers(p); } - WARN_ON(this_write > write_bytes); - write_bytes -= this_write; + } else { + struct page *p = pages[0]; + err = insert_inline_extent(trans, root, inode, start_pos, + end_pos - start_pos, p, 0); + BUG_ON(err); + em->start = start_pos; + em->end = end_pos; + em->block_start = EXTENT_MAP_INLINE; + em->block_end = EXTENT_MAP_INLINE; + add_extent_mapping(em_tree, em); + } + if (end_pos > isize) { + i_size_write(inode, end_pos); + btrfs_update_inode(trans, root, inode); } failed: + err = btrfs_end_transaction(trans, root); +out_unlock: + mutex_unlock(&root->fs_info->fs_mutex); + free_extent_map(em); return err; } +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 
end) +{ + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + + while(1) { + em = lookup_extent_mapping(em_tree, start, end); + if (!em) + break; + remove_extent_mapping(em_tree, em); + /* once for us */ + free_extent_map(em); + /* once for the tree*/ + free_extent_map(em); + } + return 0; +} + /* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number @@ -213,6 +304,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, int found_inline; int recow; + btrfs_drop_extent_cache(inode, start, end - 1); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -434,18 +527,9 @@ static int prepare_pages(struct btrfs_root *root, int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = file->f_path.dentry->d_inode; - int offset; int err = 0; - int this_write; - struct buffer_head *bh; - struct buffer_head *head; - loff_t isize = i_size_read(inode); - struct btrfs_trans_handle *trans; - u64 hint_block; u64 num_blocks; - u64 alloc_extent_start; u64 start_pos; - struct btrfs_key ins; start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >> @@ -457,119 +541,17 @@ static int prepare_pages(struct btrfs_root *root, pages[i] = grab_cache_page(inode->i_mapping, index + i); if (!pages[i]) { err = -ENOMEM; - goto failed_release; + BUG_ON(1); } cancel_dirty_page(pages[i], PAGE_CACHE_SIZE); wait_on_page_writeback(pages[i]); - } - - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - err = -ENOMEM; - mutex_unlock(&root->fs_info->fs_mutex); - goto out_unlock; - } - btrfs_set_trans_block_group(trans, inode); - /* FIXME blocksize != 4096 */ - inode->i_blocks += num_blocks << 3; - hint_block = 0; - - /* FIXME...EIEIO, ENOSPC and more */ - - /* step one, delete the existing extents in this range */ - /* FIXME blocksize != pagesize */ - if 
(start_pos < inode->i_size) { - err = btrfs_drop_extents(trans, root, inode, - start_pos, (pos + write_bytes + root->blocksize -1) & - ~((u64)root->blocksize - 1), &hint_block); - if (err) - goto failed_release; - } - - /* insert any holes we need to create */ - if (inode->i_size < start_pos) { - u64 last_pos_in_file; - u64 hole_size; - u64 mask = root->blocksize - 1; - last_pos_in_file = (isize + mask) & ~mask; - hole_size = (start_pos - last_pos_in_file + mask) & ~mask; - hole_size >>= inode->i_blkbits; - if (last_pos_in_file < start_pos) { - err = btrfs_insert_file_extent(trans, root, - inode->i_ino, - last_pos_in_file, - 0, 0, hole_size); - } - if (err) - goto failed_release; - } - - /* - * either allocate an extent for the new bytes or setup the key - * to show we are doing inline data in the extent - */ - if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size || - pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) { - err = btrfs_alloc_extent(trans, root, inode->i_ino, - num_blocks, 0, hint_block, (u64)-1, - &ins, 1); - if (err) - goto failed_truncate; - err = btrfs_insert_file_extent(trans, root, inode->i_ino, - start_pos, ins.objectid, ins.offset, - ins.offset); - if (err) - goto failed_truncate; - } else { - ins.offset = 0; - ins.objectid = 0; - } - BUG_ON(err); - alloc_extent_start = ins.objectid; - err = btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - - for (i = 0; i < num_pages; i++) { - offset = pos & (PAGE_CACHE_SIZE -1); - this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes); - if (!page_has_buffers(pages[i])) { - create_empty_buffers(pages[i], - root->fs_info->sb->s_blocksize, - (1 << BH_Uptodate)); + if (!PagePrivate(pages[i])) { + SetPagePrivate(pages[i]); + set_page_private(pages[i], 1); + page_cache_get(pages[i]); } - head = page_buffers(pages[i]); - bh = head; - do { - err = btrfs_map_bh_to_logical(root, bh, - alloc_extent_start); - BUG_ON(err); - if (err) - goto 
failed_truncate; - bh = bh->b_this_page; - if (alloc_extent_start) - alloc_extent_start++; - } while (bh != head); - pos += this_write; - WARN_ON(this_write > write_bytes); - write_bytes -= this_write; } return 0; - -failed_release: - btrfs_drop_pages(pages, num_pages); - return err; - -failed_truncate: - btrfs_drop_pages(pages, num_pages); - if (pos > isize) - vmtruncate(inode, isize); - return err; - -out_unlock: - mutex_unlock(&root->fs_info->fs_mutex); - goto failed_release; - } static ssize_t btrfs_file_write(struct file *file, const char __user *buf, @@ -685,7 +667,6 @@ out: page_cache_release(pinned[1]); *ppos = pos; current->backing_dev_info = NULL; - mark_inode_dirty(inode); return num_written ? num_written : err; } @@ -714,8 +695,8 @@ static int btrfs_sync_file(struct file *file, mutex_unlock(&root->fs_info->trans_mutex); /* - * ok we haven't committed the transaction yet, lets do a commit - */ + * ok we haven't committed the transaction yet, lets do a commit + */ trans = btrfs_start_transaction(root, 1); if (!trans) { ret = -ENOMEM; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 504d37dfa6c9..fd4cb65fcf1f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -121,6 +121,8 @@ void btrfs_read_locked_inode(struct inode *inode) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, + inode->i_mapping, GFP_NOFS); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; @@ -169,7 +171,7 @@ static void fill_inode_item(struct btrfs_inode_item *item, BTRFS_I(inode)->block_group->key.objectid); } -static int btrfs_update_inode(struct btrfs_trans_handle *trans, +int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { @@ -400,6 +402,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, int found_extent; int del_item; + btrfs_drop_extent_cache(inode, inode->i_size, 
(u64)-1); path = btrfs_alloc_path(); path->reada = -1; BUG_ON(!path); @@ -511,6 +514,50 @@ error: return ret; } +static int btrfs_cow_one_page(struct btrfs_trans_handle *trans, + struct inode *inode, struct page *page, + size_t zero_start) +{ + char *kaddr; + int ret = 0; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 alloc_hint = 0; + u64 page_start = page->index << PAGE_CACHE_SHIFT; + struct btrfs_key ins; + + if (!PagePrivate(page)) { + SetPagePrivate(page); + set_page_private(page, 1); + page_cache_get(page); + } + + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_drop_extents(trans, root, inode, + page_start, page_start + PAGE_CACHE_SIZE, + &alloc_hint); + if (ret) + goto out; + ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0, + alloc_hint, (u64)-1, &ins, 1); + if (ret) + goto out; + ret = btrfs_insert_file_extent(trans, root, inode->i_ino, + page_start, ins.objectid, 1, 1); + if (ret) + goto out; + SetPageChecked(page); + kaddr = kmap(page); + if (zero_start != PAGE_CACHE_SIZE) { + memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + flush_dcache_page(page); + } + kunmap(page); + +out: + return ret; +} + /* * taken from block_truncate_page, but does cow as it zeros out * any bytes left in the last page in the file. 
@@ -518,16 +565,14 @@ error: static int btrfs_truncate_page(struct address_space *mapping, loff_t from) { struct inode *inode = mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; unsigned blocksize = 1 << inode->i_blkbits; pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); struct page *page; - char *kaddr; int ret = 0; - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 alloc_hint = 0; - struct btrfs_key ins; - struct btrfs_trans_handle *trans; + u64 page_start; if ((offset & (blocksize - 1)) == 0) goto out; @@ -536,7 +581,6 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) page = grab_cache_page(mapping, index); if (!page) goto out; - if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); lock_page(page); @@ -545,37 +589,24 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) goto out; } } + page_start = page->index << PAGE_CACHE_SHIFT; + mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - - ret = btrfs_drop_extents(trans, root, inode, - page->index << PAGE_CACHE_SHIFT, - (page->index + 1) << PAGE_CACHE_SHIFT, - &alloc_hint); - if (ret) - goto out; - ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0, - alloc_hint, (u64)-1, &ins, 1); - if (ret) - goto out; - ret = btrfs_insert_file_extent(trans, root, inode->i_ino, - page->index << PAGE_CACHE_SHIFT, - ins.objectid, 1, 1); - if (ret) - goto out; - SetPageChecked(page); - kaddr = kmap(page); - memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); - flush_dcache_page(page); - ret = btrfs_csum_file_block(trans, root, inode->i_ino, - page->index << PAGE_CACHE_SHIFT, - kaddr, PAGE_CACHE_SIZE); - kunmap(page); + ret = btrfs_cow_one_page(trans, inode, page, offset); + if (!ret) { + char *kaddr = kmap(page); + ret = btrfs_csum_file_block(trans, root, inode->i_ino, + page_start, kaddr, 
PAGE_CACHE_SIZE); + kunmap(page); + } + set_extent_dirty(&BTRFS_I(inode)->extent_tree, + page_start, page_start + PAGE_CACHE_SIZE - 1, + GFP_NOFS); + set_page_dirty(page); btrfs_end_transaction(trans, root); mutex_unlock(&root->fs_info->fs_mutex); - set_page_dirty(page); unlock_page(page); page_cache_release(page); out: @@ -1095,6 +1126,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_aops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, + inode->i_mapping, GFP_NOFS); } dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); @@ -1238,6 +1271,182 @@ out_unlock: return err; } +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t page_offset, u64 start, u64 end, + int create) +{ + int ret; + int err = 0; + u64 blocknr; + u64 extent_start = 0; + u64 extent_end = 0; + u64 objectid = inode->i_ino; + u32 found_type; + int failed_insert = 0; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *item; + struct btrfs_leaf *leaf; + struct btrfs_disk_key *found_key; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_trans_handle *trans = NULL; + + path = btrfs_alloc_path(); + BUG_ON(!path); + mutex_lock(&root->fs_info->fs_mutex); + +again: + em = lookup_extent_mapping(em_tree, start, end); + if (em) { + goto out; + } + if (!em) { + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + em->start = 0; + em->end = 0; + } + em->bdev = inode->i_sb->s_bdev; + ret = btrfs_lookup_file_extent(NULL, root, path, + objectid, start, 0); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret != 0) { + if (path->slots[0] == 0) + goto not_found; + path->slots[0]--; + } + + item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0], + struct 
btrfs_file_extent_item); + leaf = btrfs_buffer_leaf(path->nodes[0]); + blocknr = btrfs_file_extent_disk_blocknr(item); + blocknr += btrfs_file_extent_offset(item); + + /* are we inside the extent that was found? */ + found_key = &leaf->items[path->slots[0]].key; + found_type = btrfs_disk_key_type(found_key); + if (btrfs_disk_key_objectid(found_key) != objectid || + found_type != BTRFS_EXTENT_DATA_KEY) { + goto not_found; + } + + found_type = btrfs_file_extent_type(item); + extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key); + if (found_type == BTRFS_FILE_EXTENT_REG) { + extent_end = extent_start + + (btrfs_file_extent_num_blocks(item) << inode->i_blkbits); + err = 0; + if (start < extent_start || start > extent_end) { + em->start = start; + if (start < extent_start) { + em->end = extent_end - 1; + } else { + em->end = end; + } + goto not_found_em; + } + if (btrfs_file_extent_disk_blocknr(item) == 0) { + em->start = extent_start; + em->end = extent_end - 1; + em->block_start = 0; + em->block_end = 0; + goto insert; + } + em->block_start = blocknr << inode->i_blkbits; + em->block_end = em->block_start + + (btrfs_file_extent_num_blocks(item) << + inode->i_blkbits) - 1; + em->start = extent_start; + em->end = extent_end - 1; + goto insert; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + char *ptr; + char *map; + u32 size; + + size = btrfs_file_extent_inline_len(leaf->items + + path->slots[0]); + extent_end = extent_start + size; + if (start < extent_start || start > extent_end) { + em->start = start; + if (start < extent_start) { + em->end = extent_end - 1; + } else { + em->end = end; + } + goto not_found_em; + } + em->block_start = EXTENT_MAP_INLINE; + em->block_end = EXTENT_MAP_INLINE; + em->start = extent_start; + em->end = extent_end - 1; + if (!page) { + goto insert; + } + ptr = btrfs_file_extent_inline_start(item); + map = kmap(page); + memcpy(map + page_offset, ptr, size); + flush_dcache_page(result->b_page); + kunmap(page); + 
set_extent_uptodate(em_tree, extent_start, + extent_end, GFP_NOFS); + goto insert; + } else { + printk("unkknown found_type %d\n", found_type); + WARN_ON(1); + } +not_found: + em->start = start; + em->end = end; +not_found_em: + em->block_start = 0; + em->block_end = 0; +insert: + btrfs_release_path(root, path); + if (em->start > start || em->end < start) { + printk("bad extent! %Lu %Lu start %Lu end %Lu\n", em->start, em->end, start, end); + WARN_ON(1); + err = -EIO; + goto out; + } + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + free_extent_map(em); + failed_insert++; + if (failed_insert > 5) { + printk("failing to insert %Lu %Lu\n", start, end); + err = -EIO; + goto out; + } + em = NULL; + goto again; + } + err = 0; +out: + btrfs_free_path(path); + if (trans) { + ret = btrfs_end_transaction(trans, root); + if (!err) + err = ret; + } + mutex_unlock(&root->fs_info->fs_mutex); + if (err) { + free_extent_map(em); + WARN_ON(1); + return ERR_PTR(err); + } + return em; +} + + /* * FIBMAP and others want to pass in a fake buffer head. They need to * use BTRFS_GET_BLOCK_NO_DIRECT to make sure we don't try to memcpy @@ -1398,46 +1607,22 @@ int btrfs_get_block(struct inode *inode, sector_t iblock, return err; } -static int btrfs_get_block_csum(struct inode *inode, sector_t iblock, - struct buffer_head *result, int create) -{ - int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct page *page = result->b_page; - u64 offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(result); - struct btrfs_csum_item *item; - struct btrfs_path *path = NULL; - - mutex_lock(&root->fs_info->fs_mutex); - ret = btrfs_get_block_lock(inode, iblock, result, create); - if (ret) - goto out; - - path = btrfs_alloc_path(); - item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, offset, 0); - if (IS_ERR(item)) { - ret = PTR_ERR(item); - /* a csum that isn't present is a preallocated region. 
*/ - if (ret == -ENOENT || ret == -EFBIG) - ret = 0; - result->b_private = NULL; - goto out; - } - memcpy((char *)&result->b_private, &item->csum, BTRFS_CRC32_SIZE); -out: - if (path) - btrfs_free_path(path); - mutex_unlock(&root->fs_info->fs_mutex); - return ret; -} - static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *result, int create) { struct btrfs_root *root = BTRFS_I(inode)->root; - mutex_lock(&root->fs_info->fs_mutex); - btrfs_get_block_lock(inode, iblock, result, BTRFS_GET_BLOCK_NO_DIRECT); - mutex_unlock(&root->fs_info->fs_mutex); + u64 start = iblock << inode->i_blkbits; + u64 end = start + root->blocksize -1; + struct extent_map *em; + + em = btrfs_get_extent(inode, NULL, 0, start, end, 0); + if (em && !IS_ERR(em) && em->block_start != EXTENT_MAP_INLINE && + em->block_start != 0) { + u64 offset; + offset = start - em->start; + start = (em->block_start + offset) >> inode->i_blkbits; + btrfs_map_bh_to_logical(root, result, start); + } return 0; } @@ -1449,442 +1634,50 @@ static sector_t btrfs_bmap(struct address_space *as, sector_t block) static int btrfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - return block_prepare_write(page, from, to, btrfs_get_block); + return extent_prepare_write(&BTRFS_I(page->mapping->host)->extent_tree, + page->mapping->host, page, from, to, + btrfs_get_extent); } -static void buffer_io_error(struct buffer_head *bh) +int btrfs_readpage(struct file *file, struct page *page) { - char b[BDEVNAME_SIZE]; - - printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", - bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); + struct extent_map_tree *tree; + tree = &BTRFS_I(page->mapping->host)->extent_tree; + return extent_read_full_page(tree, page, btrfs_get_extent); } - -/* - * I/O completion handler for block_read_full_page() - pages - * which come unlocked at the end of I/O. 
- */ -static void btrfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) +static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { - unsigned long flags; - struct buffer_head *first; - struct buffer_head *tmp; - struct page *page; - int page_uptodate = 1; - struct inode *inode; - int ret; - - BUG_ON(!buffer_async_read(bh)); - - page = bh->b_page; - inode = page->mapping->host; - if (uptodate) { - void *kaddr; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - if (bh->b_private) { - char csum[BTRFS_CRC32_SIZE]; - kaddr = kmap_atomic(page, KM_IRQ0); - ret = btrfs_csum_data(root, kaddr + bh_offset(bh), - bh->b_size, csum); - BUG_ON(ret); - if (memcmp(csum, &bh->b_private, BTRFS_CRC32_SIZE)) { - u64 offset; - offset = (page->index << PAGE_CACHE_SHIFT) + - bh_offset(bh); - printk("btrfs csum failed ino %lu off %llu\n", - page->mapping->host->i_ino, - (unsigned long long)offset); - memset(kaddr + bh_offset(bh), 1, bh->b_size); - flush_dcache_page(page); - } - kunmap_atomic(kaddr, KM_IRQ0); - } - set_buffer_uptodate(bh); - } else { - clear_buffer_uptodate(bh); - if (printk_ratelimit()) - buffer_io_error(bh); - SetPageError(page); - } - - /* - * Be _very_ careful from here on. Bad things can happen if - * two buffer heads end IO at almost the same time and both - * decide that the page is now completely done. - */ - first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); - clear_buffer_async_read(bh); - unlock_buffer(bh); - tmp = bh; - do { - if (!buffer_uptodate(tmp)) - page_uptodate = 0; - if (buffer_async_read(tmp)) { - BUG_ON(!buffer_locked(tmp)); - goto still_busy; - } - tmp = tmp->b_this_page; - } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - - /* - * If none of the buffers had errors and they are all - * uptodate then we can set the page uptodate. 
- */ - if (page_uptodate && !PageError(page)) - SetPageUptodate(page); - unlock_page(page); - return; - -still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; + struct extent_map_tree *tree; + tree = &BTRFS_I(page->mapping->host)->extent_tree; + return extent_write_full_page(tree, page, btrfs_get_extent, wbc); } -/* - * Generic "read page" function for block devices that have the normal - * get_block functionality. This is most of the block device filesystems. - * Reads the page asynchronously --- the unlock_buffer() and - * set/clear_buffer_uptodate() functions propagate buffer state into the - * page struct once IO has completed. - */ -int btrfs_readpage(struct file *file, struct page *page) +static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags) { - struct inode *inode = page->mapping->host; - sector_t iblock, lblock; - struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - unsigned int blocksize; - int nr, i; - int fully_mapped = 1; - - BUG_ON(!PageLocked(page)); - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); - - iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; - bh = head; - nr = 0; - i = 0; - - do { - if (buffer_uptodate(bh)) - continue; - - if (!buffer_mapped(bh)) { - int err = 0; - - fully_mapped = 0; - if (iblock < lblock) { - WARN_ON(bh->b_size != blocksize); - err = btrfs_get_block_csum(inode, iblock, - bh, 0); - if (err) - SetPageError(page); - } - if (!buffer_mapped(bh)) { - void *kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + i * blocksize, 0, blocksize); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - if (!err) - set_buffer_uptodate(bh); - continue; - } - /* - * get_block() might have updated the buffer - * synchronously - */ - if (buffer_uptodate(bh)) - continue; - } - arr[nr++] = 
bh; - } while (i++, iblock++, (bh = bh->b_this_page) != head); - - if (fully_mapped) - SetPageMappedToDisk(page); - - if (!nr) { - /* - * All buffers are uptodate - we can set the page uptodate - * as well. But not if get_block() returned an error. - */ - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); - return 0; - } - - /* Stage two: lock the buffers */ - for (i = 0; i < nr; i++) { - bh = arr[i]; - lock_buffer(bh); - bh->b_end_io = btrfs_end_buffer_async_read; - set_buffer_async_read(bh); - } - - /* - * Stage 3: start the IO. Check for uptodateness - * inside the buffer lock in case another process reading - * the underlying blockdev brought it uptodate (the sct fix). - */ - for (i = 0; i < nr; i++) { - bh = arr[i]; - if (buffer_uptodate(bh)) - btrfs_end_buffer_async_read(bh, 1); - else - submit_bh(READ, bh); - } - return 0; -} - -/* - * Aside from a tiny bit of packed file data handling, this is the - * same as the generic code. - * - * While block_write_full_page is writing back the dirty buffers under - * the page lock, whoever dirtied the buffers may decide to clean them - * again at any time. We handle that by only looking at the buffer - * state inside lock_buffer(). - * - * If block_write_full_page() is called for regular writeback - * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a - * locked buffer. This only can happen if someone has written the buffer - * directly, with submit_bh(). At the address_space level PageWriteback - * prevents this contention from occurring. 
- */ -static int __btrfs_write_full_page(struct inode *inode, struct page *page, - struct writeback_control *wbc) -{ - int err; - sector_t block; - sector_t last_block; - struct buffer_head *bh, *head; - const unsigned blocksize = 1 << inode->i_blkbits; - int nr_underway = 0; - struct btrfs_root *root = BTRFS_I(inode)->root; - - BUG_ON(!PageLocked(page)); - - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; - - /* no csumming allowed when from PF_MEMALLOC */ - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } + struct extent_map_tree *tree; + int ret; - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + if (page->private != 1) { + WARN_ON(1); + return try_to_free_buffers(page); } - - /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers - * here, and the (potentially unmapped) buffers may become dirty at - * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the page stays dirty. - * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; - * handle that here by just cleaning them. - */ - - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - head = page_buffers(page); - bh = head; - - /* - * Get all the dirty buffers mapped to disk addresses and - * handle any aliases from the underlying blockdev's mapping. - */ - do { - if (block > last_block) { - /* - * mapped buffers outside i_size will occur, because - * this page can be outside i_size when there is a - * truncate in progress. 
- */ - /* - * The buffer was zeroed by block_write_full_page() - */ - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { - WARN_ON(bh->b_size != blocksize); - err = btrfs_get_block(inode, block, bh, 0); - if (err) { - goto recover; - } - if (buffer_new(bh)) { - /* blockdev mappings never come here */ - clear_buffer_new(bh); - } - } - bh = bh->b_this_page; - block++; - } while (bh != head); - - do { - if (!buffer_mapped(bh)) - continue; - /* - * If it's a fully non-blocking write attempt and we cannot - * lock the buffer then redirty the page. Note that this can - * potentially cause a busy-wait loop from pdflush and kswapd - * activity, but those code paths have their own higher-level - * throttling. - */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { - lock_buffer(bh); - } else if (test_set_buffer_locked(bh)) { - redirty_page_for_writepage(wbc, page); - continue; - } - if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) { - struct btrfs_trans_handle *trans; - int ret; - u64 off = page->index << PAGE_CACHE_SHIFT; - char *kaddr; - - off += bh_offset(bh); - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - kaddr = kmap(page); - btrfs_csum_file_block(trans, root, inode->i_ino, - off, kaddr + bh_offset(bh), - bh->b_size); - kunmap(page); - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); - mutex_unlock(&root->fs_info->fs_mutex); - mark_buffer_async_write(bh); - } else { - unlock_buffer(bh); - } - } while ((bh = bh->b_this_page) != head); - - /* - * The page and its buffers are protected by PageWriteback(), so we can - * drop the bh refcounts early. 
- */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - submit_bh(WRITE, bh); - nr_underway++; - } - bh = next; - } while (bh != head); - unlock_page(page); - - err = 0; -done: - if (nr_underway == 0) { - /* - * The page was marked dirty, but the buffers were - * clean. Someone wrote them back by hand with - * ll_rw_block/submit_bh. A rare case. - */ - int uptodate = 1; - do { - if (!buffer_uptodate(bh)) { - uptodate = 0; - break; - } - bh = bh->b_this_page; - } while (bh != head); - if (uptodate) - SetPageUptodate(page); - end_page_writeback(page); + tree = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(tree, page); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); } - return err; - -recover: - /* - * ENOSPC, or some other error. We may already have added some - * blocks to the file, so we need to write these out to avoid - * exposing stale data. - * The page is currently locked and not marked for writeback - */ - bh = head; - /* Recovery: lock and submit the mapped buffers */ - do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { - lock_buffer(bh); - mark_buffer_async_write(bh); - } else { - /* - * The buffer may have been set dirty during - * attachment to a dirty page. 
- */ - clear_buffer_dirty(bh); - } - } while ((bh = bh->b_this_page) != head); - SetPageError(page); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - clear_buffer_dirty(bh); - submit_bh(WRITE, bh); - nr_underway++; - } - bh = next; - } while (bh != head); - unlock_page(page); - goto done; + return ret; } -static int btrfs_writepage(struct page *page, struct writeback_control *wbc) +static void btrfs_invalidatepage(struct page *page, unsigned long offset) { - struct inode * const inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset; - void *kaddr; - - /* Is the page fully inside i_size? */ - if (page->index < end_index) - return __btrfs_write_full_page(inode, page, wbc); - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); - if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. - */ - block_invalidatepage(page, 0); - unlock_page(page); - return 0; /* don't care */ - } + struct extent_map_tree *tree; - /* - * The page straddles i_size. It must be zeroed out on each and every - * writepage invokation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." 
- */ - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - return __btrfs_write_full_page(inode, page, wbc); + tree = &BTRFS_I(page->mapping->host)->extent_tree; + extent_invalidatepage(tree, page, offset); + btrfs_releasepage(page, GFP_NOFS); } /* @@ -1905,28 +1698,39 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc) int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; unsigned long end; loff_t size; int ret = -EINVAL; + u64 page_start; lock_page(page); wait_on_page_writeback(page); size = i_size_read(inode); + page_start = page->index << PAGE_CACHE_SHIFT; + if ((page->mapping != inode->i_mapping) || - ((page->index << PAGE_CACHE_SHIFT) > size)) { + (page_start > size)) { /* page got truncated out from underneath us */ goto out_unlock; } /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) + if (page_start + PAGE_CACHE_SIZE > size) end = size & ~PAGE_CACHE_MASK; else end = PAGE_CACHE_SIZE; - ret = btrfs_prepare_write(NULL, page, 0, end); - if (!ret) - ret = btrfs_commit_write(NULL, page, 0, end); + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_cow_one_page(trans, inode, page, end); + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + set_extent_dirty(&BTRFS_I(inode)->extent_tree, + page_start, page_start + PAGE_CACHE_SIZE - 1, + GFP_NOFS); + set_page_dirty(page); out_unlock: unlock_page(page); @@ -1962,21 +1766,8 @@ static void btrfs_truncate(struct inode *inode) int btrfs_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; - struct buffer_head *bh; - loff_t pos = 
((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - - SetPageUptodate(page); - bh = page_buffers(page); - set_buffer_uptodate(bh); - if (buffer_mapped(bh) && bh->b_blocknr != 0) { - set_page_dirty(page); - } - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - return 0; + return extent_commit_write(&BTRFS_I(page->mapping->host)->extent_tree, + page->mapping->host, page, from, to); } static int create_subvol(struct btrfs_root *root, char *name, int namelen) @@ -2471,6 +2262,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_aops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, + inode->i_mapping, GFP_NOFS); } dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); @@ -2553,6 +2346,9 @@ static struct address_space_operations btrfs_aops = { .prepare_write = btrfs_prepare_write, .commit_write = btrfs_commit_write, .bmap = btrfs_bmap, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, }; static struct address_space_operations btrfs_symlink_aops = { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 74f3de47423c..7b7c32c73a2c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -185,6 +185,7 @@ static int __init init_btrfs_fs(void) err = btrfs_init_cachep(); if (err) return err; + extent_map_init(); return register_filesystem(&btrfs_fs_type); } @@ -192,6 +193,7 @@ static void __exit exit_btrfs_fs(void) { btrfs_exit_transaction_sys(); btrfs_destroy_cachep(); + extent_map_exit(); unregister_filesystem(&btrfs_fs_type); } -- cgit v1.2.3 From 58176a9604c5db1784d2c979aea472b3be40b6f0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Aug 2007 15:47:34 -0400 Subject: Btrfs: Add per-root block accounting and sysfs entries Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 54 
+++++++++++ fs/btrfs/disk-io.c | 31 ++++++- fs/btrfs/disk-io.h | 3 +- fs/btrfs/extent-tree.c | 17 +++- fs/btrfs/inode.c | 13 ++- fs/btrfs/super.c | 14 +++ fs/btrfs/sysfs.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/transaction.c | 34 ++++++- 9 files changed, 393 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ea7a22870350..ee505c87ecf3 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -5,7 +5,7 @@ obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ - extent_map.o + extent_map.o sysfs.o #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o dir-item.o hash.o file-item.o inode-item.o \ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 947f061ed118..ffeee546a0d7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "bit-radix.h" struct btrfs_trans_handle; @@ -313,6 +314,8 @@ struct btrfs_fs_info { struct list_head trans_list; struct list_head dead_roots; struct delayed_work trans_work; + struct kobject super_kobj; + struct completion kobj_unregister; int do_barriers; int closing; }; @@ -328,6 +331,8 @@ struct btrfs_root { struct btrfs_key root_key; struct btrfs_fs_info *fs_info; struct inode *inode; + struct kobject root_kobj; + struct completion kobj_unregister; u64 objectid; u64 last_trans; u32 blocksize; @@ -338,6 +343,7 @@ struct btrfs_root { struct btrfs_key defrag_progress; int defrag_running; int defrag_level; + char *name; }; /* the lower bits in the key flags defines the item type */ @@ -814,6 +820,28 @@ static inline void btrfs_set_root_flags(struct btrfs_root_item *item, u32 val) item->flags = cpu_to_le32(val); } +static inline void btrfs_set_root_blocks_used(struct btrfs_root_item *item, + u64 val) +{ + 
item->blocks_used = cpu_to_le64(val); +} + +static inline u64 btrfs_root_blocks_used(struct btrfs_root_item *item) +{ + return le64_to_cpu(item->blocks_used); +} + +static inline void btrfs_set_root_block_limit(struct btrfs_root_item *item, + u64 val) +{ + item->block_limit = cpu_to_le64(val); +} + +static inline u64 btrfs_root_block_limit(struct btrfs_root_item *item) +{ + return le64_to_cpu(item->block_limit); +} + static inline u64 btrfs_super_blocknr(struct btrfs_super_block *s) { return le64_to_cpu(s->blocknr); @@ -1014,6 +1042,23 @@ static inline void btrfs_memmove(struct btrfs_root *root, memmove(dst, src, nr); } +static inline int btrfs_set_root_name(struct btrfs_root *root, + const char *name, int len) +{ + /* if we already have a name just free it */ + if (root->name) + kfree(root->name); + + root->name = kmalloc(len+1, GFP_KERNEL); + if (!root->name) + return -ENOMEM; + + memcpy(root->name, name, len); + root->name[len] ='\0'; + + return 0; +} + /* helper function to cast into the data area of the leaf. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_leaf_data(leaf) + \ @@ -1191,4 +1236,13 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root, int cache_only); + +/* sysfs.c */ +int btrfs_init_sysfs(void); +void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs); +int btrfs_sysfs_add_root(struct btrfs_root *root); +void btrfs_sysfs_del_root(struct btrfs_root *root); +void btrfs_sysfs_del_super(struct btrfs_fs_info *root); + #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b2f79878d51a..c25ef0a68f18 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -294,9 +294,12 @@ static int __setup_root(int blocksize, root->last_trans = 0; root->highest_inode = 0; root->last_inode_alloc = 0; + root->name = NULL; memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + memset(&root->root_kobj, 0, sizeof(root->root_kobj)); + init_completion(&root->kobj_unregister); root->defrag_running = 0; root->defrag_level = 0; root->root_key.objectid = objectid; @@ -384,7 +387,8 @@ insert: } struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, - struct btrfs_key *location) + struct btrfs_key *location, + const char *name, int namelen) { struct btrfs_root *root; int ret; @@ -405,6 +409,22 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, kfree(root); return ERR_PTR(ret); } + + ret = btrfs_set_root_name(root, name, namelen); + if (ret) { + brelse(root->node); + kfree(root); + return ERR_PTR(ret); + } + + ret = btrfs_sysfs_add_root(root); + if (ret) { + brelse(root->node); + kfree(root->name); + kfree(root); + return ERR_PTR(ret); + } + return root; } @@ -433,6 +453,8 @@ struct btrfs_root *open_ctree(struct super_block *sb) INIT_RADIX_TREE(&fs_info->block_group_data_radix, 
GFP_KERNEL); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); + memset(&fs_info->super_kobj, 0, sizeof(fs_info->super_kobj)); + init_completion(&fs_info->kobj_unregister); sb_set_blocksize(sb, 4096); fs_info->running_transaction = NULL; fs_info->last_trans_committed = 0; @@ -500,8 +522,10 @@ struct btrfs_root *open_ctree(struct super_block *sb) fs_info->generation = btrfs_super_generation(disk_super) + 1; ret = btrfs_find_dead_roots(tree_root); - if (ret) + if (ret) { + mutex_unlock(&fs_info->fs_mutex); goto fail_tree_root; + } mutex_unlock(&fs_info->fs_mutex); return tree_root; @@ -553,12 +577,15 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); + btrfs_sysfs_del_root(root); if (root->inode) iput(root->inode); if (root->node) brelse(root->node); if (root->commit_root) brelse(root->commit_root); + if (root->name) + kfree(root->name); kfree(root); return 0; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5261733b8735..da6bb72750f2 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -66,7 +66,8 @@ struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr); int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len, char *result); struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, - struct btrfs_key *location); + struct btrfs_key *location, + const char *name, int namelen); struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info, struct btrfs_key *location); u64 bh_blocknr(struct buffer_head *bh); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c31e84d42653..ff3f7c2be605 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -858,16 +858,23 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_extent_refs(ei, refs); btrfs_mark_buffer_dirty(path->nodes[0]); if (refs == 
0) { - u64 super_blocks_used; + u64 super_blocks_used, root_blocks_used; if (pin) { ret = pin_down_block(root, blocknr, 0); BUG_ON(ret); } + /* block accounting for super block */ super_blocks_used = btrfs_super_blocks_used(&info->super_copy); btrfs_set_super_blocks_used(&info->super_copy, super_blocks_used - num_blocks); + + /* block accounting for root item */ + root_blocks_used = btrfs_root_blocks_used(&root->root_item); + btrfs_set_root_blocks_used(&root->root_item, + root_blocks_used - num_blocks); + ret = btrfs_del_item(trans, extent_root, path); if (ret) { return ret; @@ -1175,7 +1182,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, { int ret; int pending_ret; - u64 super_blocks_used; + u64 super_blocks_used, root_blocks_used; u64 search_start = 0; struct btrfs_fs_info *info = root->fs_info; struct btrfs_root *extent_root = info->extent_root; @@ -1193,10 +1200,16 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, if (ret) return ret; + /* block accounting for super block */ super_blocks_used = btrfs_super_blocks_used(&info->super_copy); btrfs_set_super_blocks_used(&info->super_copy, super_blocks_used + num_blocks); + /* block accounting for root item */ + root_blocks_used = btrfs_root_blocks_used(&root->root_item); + btrfs_set_root_blocks_used(&root->root_item, root_blocks_used + + num_blocks); + if (root == extent_root) { BUG_ON(num_blocks != 1); set_radix_bit(&root->fs_info->extent_ins_radix, ins->objectid); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dc181089aa74..2e3918e6049e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -733,7 +733,8 @@ out: */ static int fixup_tree_root_location(struct btrfs_root *root, struct btrfs_key *location, - struct btrfs_root **sub_root) + struct btrfs_root **sub_root, + struct dentry *dentry) { struct btrfs_path *path; struct btrfs_root_item *ri; @@ -747,7 +748,9 @@ static int fixup_tree_root_location(struct btrfs_root *root, BUG_ON(!path); mutex_lock(&root->fs_info->fs_mutex); - 
*sub_root = btrfs_read_fs_root(root->fs_info, location); + *sub_root = btrfs_read_fs_root(root->fs_info, location, + dentry->d_name.name, + dentry->d_name.len); if (IS_ERR(*sub_root)) return PTR_ERR(*sub_root); @@ -812,7 +815,8 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(ret); inode = NULL; if (location.objectid) { - ret = fixup_tree_root_location(root, &location, &sub_root); + ret = fixup_tree_root_location(root, &location, &sub_root, + dentry); if (ret < 0) return ERR_PTR(ret); if (ret > 0) @@ -1829,6 +1833,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen) btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol)); btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_blocks_used(&root_item, 0); memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); root_item.drop_level = 0; brelse(subvol); @@ -1865,7 +1870,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen) if (ret) goto fail_commit; - new_root = btrfs_read_fs_root(root->fs_info, &key); + new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen); BUG_ON(!new_root); trans = btrfs_start_transaction(new_root, 1); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7da95538745f..a68101ae1a3b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -45,12 +45,14 @@ static struct super_operations btrfs_super_ops; static void btrfs_put_super (struct super_block * sb) { struct btrfs_root *root = btrfs_sb(sb); + struct btrfs_fs_info *fs = root->fs_info; int ret; ret = close_ctree(root); if (ret) { printk("close ctree returns %d\n", ret); } + btrfs_sysfs_del_super(fs); sb->s_fs_info = NULL; } @@ -101,6 +103,12 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent) err = -ENOMEM; goto fail_close; } + + /* this does the super kobj at the same time */ + err = btrfs_sysfs_add_super(tree_root->fs_info); + if (err) + goto fail_close; + sb->s_root = root_dentry; 
btrfs_transaction_queue_work(tree_root, HZ * 30); return 0; @@ -182,6 +190,11 @@ static struct super_operations btrfs_super_ops = { static int __init init_btrfs_fs(void) { int err; + + err = btrfs_init_sysfs(); + if (err) + return err; + btrfs_init_transaction_sys(); err = btrfs_init_cachep(); if (err) @@ -196,6 +209,7 @@ static void __exit exit_btrfs_fs(void) btrfs_destroy_cachep(); extent_map_exit(); unregister_filesystem(&btrfs_fs_type); + btrfs_exit_sysfs(); } module_init(init_btrfs_fs) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index db8917e5b256..2058783373eb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -16,6 +16,242 @@ * Boston, MA 021110-1307, USA. */ +#include +#include +#include +#include +#include +#include +#include + #include "ctree.h" #include "disk-io.h" #include "transaction.h" + +static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_blocks_used(&root->root_item)); +} + +static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_block_limit(&root->root_item)); +} + +static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_blocks_used(fs->disk_super)); +} + +static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_total_blocks(fs->disk_super)); +} + +static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_blocksize(fs->disk_super)); +} + +/* this is for root attrs (subvols/snapshots) */ +struct btrfs_root_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_root *, char *); + ssize_t (*store)(struct btrfs_root *, const char *, size_t); +}; + 
+#define ROOT_ATTR(name, mode, show, store) \ +static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, show, store) + +ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); +ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); + +static struct attribute *btrfs_root_attrs[] = { + &btrfs_root_attr_blocks_used.attr, + &btrfs_root_attr_block_limit.attr, + NULL, +}; + +/* this is for super attrs (actual full fs) */ +struct btrfs_super_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_fs_info *, char *); + ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); +}; + +#define SUPER_ATTR(name, mode, show, store) \ +static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, show, store) + +SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); +SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); +SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); + +static struct attribute *btrfs_super_attrs[] = { + &btrfs_super_attr_blocks_used.attr, + &btrfs_super_attr_total_blocks.attr, + &btrfs_super_attr_blocksize.attr, + NULL, +}; + +static ssize_t btrfs_super_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->show ? a->show(fs, buf) : 0; +} + +static ssize_t btrfs_super_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->store ? 
a->store(fs, buf, len) : 0; +} + +static ssize_t btrfs_root_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + + return a->show ? a->show(root, buf) : 0; +} + +static ssize_t btrfs_root_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + return a->store ? a->store(root, buf, len) : 0; +} + +static void btrfs_super_release(struct kobject *kobj) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + complete(&fs->kobj_unregister); +} + +static void btrfs_root_release(struct kobject *kobj) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + complete(&root->kobj_unregister); +} + +static struct sysfs_ops btrfs_super_attr_ops = { + .show = btrfs_super_attr_show, + .store = btrfs_super_attr_store, +}; + +static struct sysfs_ops btrfs_root_attr_ops = { + .show = btrfs_root_attr_show, + .store = btrfs_root_attr_store, +}; + +static struct kobj_type btrfs_root_ktype = { + .default_attrs = btrfs_root_attrs, + .sysfs_ops = &btrfs_root_attr_ops, + .release = btrfs_root_release, +}; + +static struct kobj_type btrfs_super_ktype = { + .default_attrs = btrfs_super_attrs, + .sysfs_ops = &btrfs_super_attr_ops, + .release = btrfs_super_release, +}; + +static struct kset btrfs_kset = { + .kobj = {.name = "btrfs"}, +}; + +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs) +{ + int error; + + fs->super_kobj.kset = &btrfs_kset; + fs->super_kobj.ktype = &btrfs_super_ktype; + + error = kobject_set_name(&fs->super_kobj, "%s", + fs->sb->s_id); + if (error) + goto fail; + + error = kobject_register(&fs->super_kobj); + if (error) + goto 
fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for super failed\n"); + return error; +} + +int btrfs_sysfs_add_root(struct btrfs_root *root) +{ + int error; + + root->root_kobj.ktype = &btrfs_root_ktype; + root->root_kobj.parent = &root->fs_info->super_kobj; + + error = kobject_set_name(&root->root_kobj, "%s", root->name); + if (error) { + goto fail; + } + + error = kobject_register(&root->root_kobj); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for root failed\n"); + return error; +} + +void btrfs_sysfs_del_root(struct btrfs_root *root) +{ + kobject_unregister(&root->root_kobj); + wait_for_completion(&root->kobj_unregister); +} + +void btrfs_sysfs_del_super(struct btrfs_fs_info *fs) +{ + kobject_unregister(&fs->super_kobj); + wait_for_completion(&fs->kobj_unregister); +} + +int btrfs_init_sysfs() +{ + kobj_set_kset_s(&btrfs_kset, fs_subsys); + return kset_register(&btrfs_kset); +} + +void btrfs_exit_sysfs() +{ + kset_unregister(&btrfs_kset); +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 18abea802794..2b15daa3a9f2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -236,6 +236,7 @@ static int wait_for_commit(struct btrfs_root *root, struct dirty_root { struct list_head list; struct btrfs_root *root; + struct btrfs_root *latest_root; }; int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list) @@ -278,6 +279,15 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans, btrfs_root_blocknr(&root->root_item)); brelse(root->commit_root); root->commit_root = NULL; + + /* make sure to update the root on disk + * so we get any updates to the block used + * counts + */ + err = btrfs_update_root(trans, + root->fs_info->tree_root, + &root->root_key, + &root->root_item); continue; } dirty = kmalloc(sizeof(*dirty), GFP_NOFS); @@ -291,6 +301,7 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans, memcpy(dirty->root, root, sizeof(*root)); 
dirty->root->node = root->commit_root; + dirty->latest_root = root; root->commit_root = NULL; root->root_key.offset = root->fs_info->generation; @@ -384,20 +395,29 @@ static int drop_dirty_roots(struct btrfs_root *tree_root, { struct dirty_root *dirty; struct btrfs_trans_handle *trans; + u64 num_blocks; + u64 blocks_used; int ret = 0; int err; while(!list_empty(list)) { + struct btrfs_root *root; + mutex_lock(&tree_root->fs_info->fs_mutex); dirty = list_entry(list->next, struct dirty_root, list); list_del_init(&dirty->list); + num_blocks = btrfs_root_blocks_used(&dirty->root->root_item); + root = dirty->latest_root; + while(1) { trans = btrfs_start_transaction(tree_root, 1); + ret = btrfs_drop_snapshot(trans, dirty->root); if (ret != -EAGAIN) { break; } + err = btrfs_update_root(trans, tree_root, &dirty->root->root_key, @@ -414,9 +434,19 @@ static int drop_dirty_roots(struct btrfs_root *tree_root, mutex_lock(&tree_root->fs_info->fs_mutex); } BUG_ON(ret); + + num_blocks -= btrfs_root_blocks_used(&dirty->root->root_item); + blocks_used = btrfs_root_blocks_used(&root->root_item); + if (num_blocks) { + record_root_in_trans(root); + btrfs_set_root_blocks_used(&root->root_item, + blocks_used - num_blocks); + } ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); - if (ret) + if (ret) { + BUG(); break; + } ret = btrfs_end_transaction(trans, tree_root); BUG_ON(ret); @@ -534,10 +564,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wake_up(&cur_trans->commit_wait); put_transaction(cur_trans); put_transaction(cur_trans); + if (root->fs_info->closing) list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); else list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); + mutex_unlock(&root->fs_info->trans_mutex); kmem_cache_free(btrfs_trans_handle_cachep, trans); -- cgit v1.2.3 From 84a5d5eefd2b31c3cb97dc078589870bf85c7f23 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 14 Sep 2007 09:43:53 -0400 Subject: Btrfs: add 
modules_install target Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ee505c87ecf3..058a2dae65b4 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -18,6 +18,8 @@ else KERNELDIR := /lib/modules/`uname -r`/build all:: $(MAKE) -C $(KERNELDIR) M=`pwd` modules +modules_install:: + $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install clean:: $(MAKE) -C $(KERNELDIR) M=`pwd` clean endif -- cgit v1.2.3 From 432eba0882a32d6181ba44044f6e576226134784 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Fri, 14 Sep 2007 10:22:19 -0400 Subject: Btrfs: Simplify makefile Single-colons will do here. Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 058a2dae65b4..6341cf7a9f91 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -16,10 +16,11 @@ else # Normal Makefile KERNELDIR := /lib/modules/`uname -r`/build -all:: +all: $(MAKE) -C $(KERNELDIR) M=`pwd` modules -modules_install:: +modules_install: $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install -clean:: +clean: $(MAKE) -C $(KERNELDIR) M=`pwd` clean + endif -- cgit v1.2.3 From 5f39d397dfbe140a14edecd4e73c34ce23c4f9ee Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:14:19 -0400 Subject: Btrfs: Create extent_buffer interface for large blocksizes Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 + fs/btrfs/ctree.c | 1484 ++++++++++++++++++++++++++---------------------- fs/btrfs/ctree.h | 849 ++++++++++----------------- fs/btrfs/dir-item.c | 102 ++-- fs/btrfs/disk-io.c | 391 ++++++------- fs/btrfs/disk-io.h | 55 +- fs/btrfs/extent-tree.c | 294 +++++----- fs/btrfs/extent_map.c | 672 +++++++++++++++++++++- fs/btrfs/extent_map.h | 75 ++- fs/btrfs/file-item.c | 63 +- fs/btrfs/file.c | 129 +++-- fs/btrfs/inode-item.c | 22 +- 
fs/btrfs/inode-map.c | 17 +- fs/btrfs/inode.c | 435 ++++++++------ fs/btrfs/print-tree.c | 96 ++-- fs/btrfs/print-tree.h | 4 +- fs/btrfs/root-tree.c | 51 +- fs/btrfs/super.c | 7 +- fs/btrfs/sysfs.c | 10 +- fs/btrfs/transaction.c | 61 +- fs/btrfs/transaction.h | 5 +- fs/btrfs/tree-defrag.c | 58 +- 22 files changed, 2794 insertions(+), 2088 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 6341cf7a9f91..a3b51085d7f0 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -23,4 +23,6 @@ modules_install: clean: $(MAKE) -C $(KERNELDIR) M=`pwd` clean +tester: + $(MAKE) -C $(KERNELDIR) M=`pwd` tree-defrag.o transaction.o sysfs.o super.o root-tree.o inode-map.o inode-item.o inode.o file-item.o file.o extent_map.o disk-io.o ctree.o dir-item.o extent-tree.o endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b41f48ade419..f60920e8a0e0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -16,21 +16,24 @@ * Boston, MA 021110-1307, USA. 
*/ +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "print-tree.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *ins_key, struct btrfs_path *path, int data_size); -static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct buffer_head *dst, struct buffer_head - *src); -static int balance_node_right(struct btrfs_trans_handle *trans, struct - btrfs_root *root, struct buffer_head *dst_buf, - struct buffer_head *src_buf); +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); @@ -62,40 +65,38 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) for (i = 0; i < BTRFS_MAX_LEVEL; i++) { if (!p->nodes[i]) break; - btrfs_block_release(root, p->nodes[i]); + free_extent_buffer(p->nodes[i]); } memset(p, 0, sizeof(*p)); } -static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct buffer_head *buf, struct buffer_head - *parent, int parent_slot, struct buffer_head - **cow_ret, u64 search_start, u64 empty_size) +static int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size) { - struct buffer_head *cow; - struct btrfs_node *cow_node; + struct extent_buffer *cow; int ret = 0; int different_trans = 0; WARN_ON(root->ref_cows && trans->transid != 
root->last_trans); - WARN_ON(!buffer_uptodate(buf)); + cow = btrfs_alloc_free_block(trans, root, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); - cow_node = btrfs_buffer_node(cow); - if (buf->b_size != root->blocksize || cow->b_size != root->blocksize) + if (buf->len != root->sectorsize || cow->len != root->sectorsize) WARN_ON(1); - memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize); - btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow)); - btrfs_set_header_generation(&cow_node->header, trans->transid); - btrfs_set_header_owner(&cow_node->header, root->root_key.objectid); + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_blocknr(cow, extent_buffer_blocknr(cow)); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, root->root_key.objectid); - WARN_ON(btrfs_header_generation(btrfs_buffer_header(buf)) > - trans->transid); - if (btrfs_header_generation(btrfs_buffer_header(buf)) != - trans->transid) { + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (btrfs_header_generation(buf) != trans->transid) { different_trans = 1; ret = btrfs_inc_ref(trans, root, buf); if (ret) @@ -106,29 +107,29 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root if (buf == root->node) { root->node = cow; - get_bh(cow); + extent_buffer_get(cow); if (buf != root->commit_root) { - btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1); + btrfs_free_extent(trans, root, + extent_buffer_blocknr(buf), 1, 1); } - btrfs_block_release(root, buf); + free_extent_buffer(buf); } else { - btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot, - bh_blocknr(cow)); + btrfs_set_node_blockptr(parent, parent_slot, + extent_buffer_blocknr(cow)); btrfs_mark_buffer_dirty(parent); - WARN_ON(btrfs_header_generation(btrfs_buffer_header(parent)) != - trans->transid); - btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1); + WARN_ON(btrfs_header_generation(parent) != trans->transid); + 
btrfs_free_extent(trans, root, extent_buffer_blocknr(buf),1,1); } - btrfs_block_release(root, buf); + free_extent_buffer(buf); btrfs_mark_buffer_dirty(cow); *cow_ret = cow; return 0; } -int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct buffer_head *buf, struct buffer_head - *parent, int parent_slot, struct buffer_head - **cow_ret) +int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret) { u64 search_start; if (trans->transaction != root->fs_info->running_transaction) { @@ -141,13 +142,12 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root root->fs_info->generation); WARN_ON(1); } - if (btrfs_header_generation(btrfs_buffer_header(buf)) == - trans->transid) { + if (btrfs_header_generation(buf) == trans->transid) { *cow_ret = buf; return 0; } - search_start = bh_blocknr(buf) & ~((u64)65535); + search_start = extent_buffer_blocknr(buf) & ~((u64)65535); return __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0); } @@ -161,9 +161,11 @@ static int close_blocks(u64 blocknr, u64 other) return 0; } -static int should_defrag_leaf(struct buffer_head *bh) +#if 0 +static int should_defrag_leaf(struct extent_buffer *eb) { - struct btrfs_leaf *leaf = btrfs_buffer_leaf(bh); + return 0; + struct btrfs_leaf *leaf = btrfs_buffer_leaf(eb); struct btrfs_disk_key *key; u32 nritems; @@ -188,14 +190,17 @@ static int should_defrag_leaf(struct buffer_head *bh) } return 0; } +#endif int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct buffer_head *parent, + struct btrfs_root *root, struct extent_buffer *parent, int cache_only, u64 *last_ret) { + return 0; +#if 0 struct btrfs_node *parent_node; - struct buffer_head *cur_bh; - struct buffer_head *tmp_bh; + struct extent_buffer *cur_eb; + struct extent_buffer *tmp_eb; u64 blocknr; u64 
search_start = *last_ret; u64 last_block = 0; @@ -281,6 +286,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, brelse(tmp_bh); } return err; +#endif } /* @@ -289,12 +295,12 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * which is the stop of the leaf data stack */ static inline unsigned int leaf_data_end(struct btrfs_root *root, - struct btrfs_leaf *leaf) + struct extent_buffer *leaf) { - u32 nr = btrfs_header_nritems(&leaf->header); + u32 nr = btrfs_header_nritems(leaf); if (nr == 0) return BTRFS_LEAF_DATA_SIZE(root); - return btrfs_item_offset(leaf->items + nr - 1); + return btrfs_item_offset_nr(leaf, nr - 1); } /* @@ -310,9 +316,9 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) return 1; if (k1.objectid < k2->objectid) return -1; - if (k1.flags > k2->flags) + if (k1.type > k2->type) return 1; - if (k1.flags < k2->flags) + if (k1.type < k2->type) return -1; if (k1.offset > k2->offset) return 1; @@ -324,37 +330,39 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) static int check_node(struct btrfs_root *root, struct btrfs_path *path, int level) { - struct btrfs_node *parent = NULL; - struct btrfs_node *node = btrfs_buffer_node(path->nodes[level]); + struct extent_buffer *parent = NULL; + struct extent_buffer *node = path->nodes[level]; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key node_key; int parent_slot; int slot; struct btrfs_key cpukey; - u32 nritems = btrfs_header_nritems(&node->header); + u32 nritems = btrfs_header_nritems(node); if (path->nodes[level + 1]) - parent = btrfs_buffer_node(path->nodes[level + 1]); + parent = path->nodes[level + 1]; slot = path->slots[level]; - BUG_ON(!buffer_uptodate(path->nodes[level])); BUG_ON(nritems == 0); if (parent) { - struct btrfs_disk_key *parent_key; - parent_slot = path->slots[level + 1]; - parent_key = &parent->ptrs[parent_slot].key; - BUG_ON(memcmp(parent_key, &node->ptrs[0].key, + btrfs_node_key(parent, &parent_key, 
parent_slot); + btrfs_node_key(node, &node_key, 0); + BUG_ON(memcmp(&parent_key, &node_key, sizeof(struct btrfs_disk_key))); BUG_ON(btrfs_node_blockptr(parent, parent_slot) != - btrfs_header_blocknr(&node->header)); + btrfs_header_blocknr(node)); } BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); if (slot != 0) { - btrfs_disk_key_to_cpu(&cpukey, &node->ptrs[slot - 1].key); - BUG_ON(comp_keys(&node->ptrs[slot].key, &cpukey) <= 0); + btrfs_node_key_to_cpu(node, &cpukey, slot - 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) <= 0); } if (slot < nritems - 1) { - btrfs_disk_key_to_cpu(&cpukey, &node->ptrs[slot + 1].key); - BUG_ON(comp_keys(&node->ptrs[slot].key, &cpukey) >= 0); + btrfs_node_key_to_cpu(node, &cpukey, slot + 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) >= 0); } return 0; } @@ -362,83 +370,172 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path, static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, int level) { - struct btrfs_leaf *leaf = btrfs_buffer_leaf(path->nodes[level]); - struct btrfs_node *parent = NULL; + struct extent_buffer *leaf = path->nodes[level]; + struct extent_buffer *parent = NULL; int parent_slot; - int slot = path->slots[0]; struct btrfs_key cpukey; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key leaf_key; + int slot = path->slots[0]; - u32 nritems = btrfs_header_nritems(&leaf->header); + u32 nritems = btrfs_header_nritems(leaf); if (path->nodes[level + 1]) - parent = btrfs_buffer_node(path->nodes[level + 1]); - - BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); + parent = path->nodes[level + 1]; if (nritems == 0) return 0; if (parent) { - struct btrfs_disk_key *parent_key; - parent_slot = path->slots[level + 1]; - parent_key = &parent->ptrs[parent_slot].key; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_item_key(leaf, &leaf_key, 0); - BUG_ON(memcmp(parent_key, &leaf->items[0].key, + 
BUG_ON(memcmp(&parent_key, &leaf_key, sizeof(struct btrfs_disk_key))); BUG_ON(btrfs_node_blockptr(parent, parent_slot) != - btrfs_header_blocknr(&leaf->header)); + btrfs_header_blocknr(leaf)); + } +#if 0 + for (i = 0; nritems > 1 && i < nritems - 2; i++) { + btrfs_item_key_to_cpu(leaf, &cpukey, i + 1); + btrfs_item_key(leaf, &leaf_key, i); + if (comp_keys(&leaf_key, &cpukey) >= 0) { + btrfs_print_leaf(root, leaf); + printk("slot %d offset bad key\n", i); + BUG_ON(1); + } + if (btrfs_item_offset_nr(leaf, i) != + btrfs_item_end_nr(leaf, i + 1)) { + btrfs_print_leaf(root, leaf); + printk("slot %d offset bad\n", i); + BUG_ON(1); + } + if (i == 0) { + if (btrfs_item_offset_nr(leaf, i) + + btrfs_item_size_nr(leaf, i) != + BTRFS_LEAF_DATA_SIZE(root)) { + btrfs_print_leaf(root, leaf); + printk("slot %d first offset bad\n", i); + BUG_ON(1); + } + } } - if (slot != 0) { - btrfs_disk_key_to_cpu(&cpukey, &leaf->items[slot - 1].key); - BUG_ON(comp_keys(&leaf->items[slot].key, &cpukey) <= 0); - BUG_ON(btrfs_item_offset(leaf->items + slot - 1) != - btrfs_item_end(leaf->items + slot)); + if (nritems > 0) { + if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) { + btrfs_print_leaf(root, leaf); + printk("slot %d bad size \n", nritems - 1); + BUG_ON(1); + } + } +#endif + if (slot != 0 && slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); + if (comp_keys(&leaf_key, &cpukey) <= 0) { + btrfs_print_leaf(root, leaf); + printk("slot %d offset bad key\n", slot); + BUG_ON(1); + } + if (btrfs_item_offset_nr(leaf, slot - 1) != + btrfs_item_end_nr(leaf, slot)) { + btrfs_print_leaf(root, leaf); + printk("slot %d offset bad\n", slot); + BUG_ON(1); + } } if (slot < nritems - 1) { - btrfs_disk_key_to_cpu(&cpukey, &leaf->items[slot + 1].key); - BUG_ON(comp_keys(&leaf->items[slot].key, &cpukey) >= 0); - BUG_ON(btrfs_item_offset(leaf->items + slot) != - btrfs_item_end(leaf->items + slot + 1)); + btrfs_item_key(leaf, &leaf_key, slot); + 
btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); + BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + btrfs_print_leaf(root, leaf); + printk("slot %d offset bad\n", slot); + BUG_ON(1); + } } - BUG_ON(btrfs_item_offset(leaf->items) + - btrfs_item_size(leaf->items) != BTRFS_LEAF_DATA_SIZE(root)); + BUG_ON(btrfs_item_offset_nr(leaf, 0) + + btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); return 0; } static int check_block(struct btrfs_root *root, struct btrfs_path *path, int level) { - struct btrfs_node *node = btrfs_buffer_node(path->nodes[level]); - if (memcmp(node->header.fsid, root->fs_info->disk_super->fsid, - sizeof(node->header.fsid))) - BUG(); + struct extent_buffer *buf = path->nodes[level]; + char fsid[BTRFS_FSID_SIZE]; + + read_extent_buffer(buf, fsid, (unsigned long)btrfs_header_fsid(buf), + BTRFS_FSID_SIZE); + + if (memcmp(fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) { + int i = 0; + printk("warning bad block %Lu\n", buf->start); + if (!btrfs_buffer_uptodate(buf)) { + WARN_ON(1); + } + for (i = 0; i < BTRFS_FSID_SIZE; i++) { + printk("%x:%x ", root->fs_info->fsid[i], fsid[i]); + } + printk("\n"); + // BUG(); + } if (level == 0) return check_leaf(root, path, level); return check_node(root, path, level); } /* - * search for key in the array p. items p are item_size apart - * and there are 'max' items in p + * search for key in the extent_buffer. The items start at offset p, + * and they are item_size apart. There are 'max' items in p. + * * the slot in the array is returned via slot, and it points to * the place where you would insert key if it is not found in * the array. 
* * slot may point to max if the key is bigger than all of the keys */ -static int generic_bin_search(char *p, int item_size, struct btrfs_key *key, - int max, int *slot) +static int generic_bin_search(struct extent_buffer *eb, unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) { int low = 0; int high = max; int mid; int ret; struct btrfs_disk_key *tmp; + struct btrfs_disk_key unaligned; + unsigned long offset; + char *map_token = NULL; + char *kaddr = NULL; + unsigned long map_start = 0; + unsigned long map_len = 0; while(low < high) { mid = (low + high) / 2; - tmp = (struct btrfs_disk_key *)(p + mid * item_size); + offset = p + mid * item_size; + + if (!map_token || offset < map_start || + (offset + sizeof(struct btrfs_disk_key)) > + map_start + map_len) { + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + map_extent_buffer(eb, offset, &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + + } + if (offset + sizeof(struct btrfs_disk_key) > + map_start + map_len) { + unmap_extent_buffer(eb, map_token, KM_USER0); + read_extent_buffer(eb, &unaligned, + offset, sizeof(unaligned)); + map_token = NULL; + tmp = &unaligned; + } else { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } ret = comp_keys(tmp, key); if (ret < 0) @@ -447,10 +544,13 @@ static int generic_bin_search(char *p, int item_size, struct btrfs_key *key, high = mid; else { *slot = mid; + unmap_extent_buffer(eb, map_token, KM_USER0); return 0; } } *slot = low; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); return 1; } @@ -458,46 +558,42 @@ static int generic_bin_search(char *p, int item_size, struct btrfs_key *key, * simple bin_search frontend that does the right thing for * leaves vs nodes */ -static int bin_search(struct btrfs_node *c, struct btrfs_key *key, int *slot) +static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) { - if (btrfs_is_leaf(c)) { - struct btrfs_leaf *l = (struct 
btrfs_leaf *)c; - return generic_bin_search((void *)l->items, + if (level == 0) { + return generic_bin_search(eb, + offsetof(struct btrfs_leaf, items), sizeof(struct btrfs_item), - key, btrfs_header_nritems(&c->header), + key, btrfs_header_nritems(eb), slot); } else { - return generic_bin_search((void *)c->ptrs, + return generic_bin_search(eb, + offsetof(struct btrfs_node, ptrs), sizeof(struct btrfs_key_ptr), - key, btrfs_header_nritems(&c->header), + key, btrfs_header_nritems(eb), slot); } return -1; } -static struct buffer_head *read_node_slot(struct btrfs_root *root, - struct buffer_head *parent_buf, - int slot) +static struct extent_buffer *read_node_slot(struct btrfs_root *root, + struct extent_buffer *parent, int slot) { - struct btrfs_node *node = btrfs_buffer_node(parent_buf); if (slot < 0) return NULL; - if (slot >= btrfs_header_nritems(&node->header)) + if (slot >= btrfs_header_nritems(parent)) return NULL; - return read_tree_block(root, btrfs_node_blockptr(node, slot)); + return read_tree_block(root, btrfs_node_blockptr(parent, slot)); } static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { - struct buffer_head *right_buf; - struct buffer_head *mid_buf; - struct buffer_head *left_buf; - struct buffer_head *parent_buf = NULL; - struct btrfs_node *right = NULL; - struct btrfs_node *mid; - struct btrfs_node *left = NULL; - struct btrfs_node *parent = NULL; + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; int ret = 0; int wret; int pslot; @@ -508,60 +604,57 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root if (level == 0) return 0; - mid_buf = path->nodes[level]; - mid = btrfs_buffer_node(mid_buf); + mid = path->nodes[level]; orig_ptr = btrfs_node_blockptr(mid, orig_slot); if (level < BTRFS_MAX_LEVEL - 1) - parent_buf = path->nodes[level + 1]; + parent = 
path->nodes[level + 1]; pslot = path->slots[level + 1]; /* * deal with the case where there is only one pointer in the root * by promoting the node below to a root */ - if (!parent_buf) { - struct buffer_head *child; - u64 blocknr = bh_blocknr(mid_buf); + if (!parent) { + struct extent_buffer *child; + u64 blocknr = extent_buffer_blocknr(mid); - if (btrfs_header_nritems(&mid->header) != 1) + if (btrfs_header_nritems(mid) != 1) return 0; /* promote the child to a root */ - child = read_node_slot(root, mid_buf, 0); + child = read_node_slot(root, mid, 0); BUG_ON(!child); root->node = child; path->nodes[level] = NULL; - clean_tree_block(trans, root, mid_buf); - wait_on_buffer(mid_buf); + clean_tree_block(trans, root, mid); + wait_on_tree_block_writeback(root, mid); /* once for the path */ - btrfs_block_release(root, mid_buf); + free_extent_buffer(mid); /* once for the root ptr */ - btrfs_block_release(root, mid_buf); + free_extent_buffer(mid); return btrfs_free_extent(trans, root, blocknr, 1, 1); } - parent = btrfs_buffer_node(parent_buf); - - if (btrfs_header_nritems(&mid->header) > + if (btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (btrfs_header_nritems(&mid->header) < 2) + if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; - left_buf = read_node_slot(root, parent_buf, pslot - 1); - if (left_buf) { - wret = btrfs_cow_block(trans, root, left_buf, - parent_buf, pslot - 1, &left_buf); + left = read_node_slot(root, parent, pslot - 1); + if (left) { + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left); if (wret) { ret = wret; goto enospc; } } - right_buf = read_node_slot(root, parent_buf, pslot + 1); - if (right_buf) { - wret = btrfs_cow_block(trans, root, right_buf, - parent_buf, pslot + 1, &right_buf); + right = read_node_slot(root, parent, pslot + 1); + if (right) { + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right); if (wret) { ret = wret; goto enospc; @@ -569,30 +662,27 @@ static int 
balance_level(struct btrfs_trans_handle *trans, struct btrfs_root } /* first, try to make some room in the middle buffer */ - if (left_buf) { - left = btrfs_buffer_node(left_buf); - orig_slot += btrfs_header_nritems(&left->header); - wret = push_node_left(trans, root, left_buf, mid_buf); + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, root, left, mid); if (wret < 0) ret = wret; - if (btrfs_header_nritems(&mid->header) < 2) + if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; } /* * then try to empty the right most buffer into the middle */ - if (right_buf) { - right = btrfs_buffer_node(right_buf); - wret = push_node_left(trans, root, mid_buf, right_buf); + if (right) { + wret = push_node_left(trans, root, mid, right); if (wret < 0 && wret != -ENOSPC) ret = wret; - if (btrfs_header_nritems(&right->header) == 0) { - u64 blocknr = bh_blocknr(right_buf); - clean_tree_block(trans, root, right_buf); - wait_on_buffer(right_buf); - btrfs_block_release(root, right_buf); - right_buf = NULL; + if (btrfs_header_nritems(right) == 0) { + u64 blocknr = extent_buffer_blocknr(right); + clean_tree_block(trans, root, right); + wait_on_tree_block_writeback(root, right); + free_extent_buffer(right); right = NULL; wret = del_ptr(trans, root, path, level + 1, pslot + 1); @@ -602,14 +692,13 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root if (wret) ret = wret; } else { - btrfs_memcpy(root, parent, - &parent->ptrs[pslot + 1].key, - &right->ptrs[0].key, - sizeof(struct btrfs_disk_key)); - btrfs_mark_buffer_dirty(parent_buf); + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + btrfs_set_node_key(parent, &right_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); } } - if (btrfs_header_nritems(&mid->header) == 1) { + if (btrfs_header_nritems(mid) == 1) { /* * we're not allowed to leave a node with one item in the * tree during a delete. 
A deletion from lower in the tree @@ -619,21 +708,20 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root * otherwise we would have pulled some pointers from the * right */ - BUG_ON(!left_buf); - wret = balance_node_right(trans, root, mid_buf, left_buf); + BUG_ON(!left); + wret = balance_node_right(trans, root, mid, left); if (wret < 0) { ret = wret; goto enospc; } BUG_ON(wret == 1); } - if (btrfs_header_nritems(&mid->header) == 0) { + if (btrfs_header_nritems(mid) == 0) { /* we've managed to empty the middle node, drop it */ - u64 blocknr = bh_blocknr(mid_buf); - clean_tree_block(trans, root, mid_buf); - wait_on_buffer(mid_buf); - btrfs_block_release(root, mid_buf); - mid_buf = NULL; + u64 blocknr = extent_buffer_blocknr(mid); + clean_tree_block(trans, root, mid); + wait_on_tree_block_writeback(root, mid); + free_extent_buffer(mid); mid = NULL; wret = del_ptr(trans, root, path, level + 1, pslot); if (wret) @@ -643,37 +731,36 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root ret = wret; } else { /* update the parent key to reflect our changes */ - btrfs_memcpy(root, parent, - &parent->ptrs[pslot].key, &mid->ptrs[0].key, - sizeof(struct btrfs_disk_key)); - btrfs_mark_buffer_dirty(parent_buf); + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(parent); } /* update the path */ - if (left_buf) { - if (btrfs_header_nritems(&left->header) > orig_slot) { - get_bh(left_buf); - path->nodes[level] = left_buf; + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + extent_buffer_get(left); + path->nodes[level] = left; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; - if (mid_buf) - btrfs_block_release(root, mid_buf); + if (mid) + free_extent_buffer(mid); } else { - orig_slot -= btrfs_header_nritems(&left->header); + orig_slot -= btrfs_header_nritems(left); path->slots[level] = orig_slot; } } /* double check we 
haven't messed things up */ check_block(root, path, level); if (orig_ptr != - btrfs_node_blockptr(btrfs_buffer_node(path->nodes[level]), - path->slots[level])) + btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); enospc: - if (right_buf) - btrfs_block_release(root, right_buf); - if (left_buf) - btrfs_block_release(root, left_buf); + if (right) + free_extent_buffer(right); + if (left) + free_extent_buffer(left); return ret; } @@ -682,14 +769,10 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { - struct buffer_head *right_buf; - struct buffer_head *mid_buf; - struct buffer_head *left_buf; - struct buffer_head *parent_buf = NULL; - struct btrfs_node *right = NULL; - struct btrfs_node *mid; - struct btrfs_node *left = NULL; - struct btrfs_node *parent = NULL; + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; int ret = 0; int wret; int pslot; @@ -699,107 +782,101 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (level == 0) return 1; - mid_buf = path->nodes[level]; - mid = btrfs_buffer_node(mid_buf); + mid = path->nodes[level]; orig_ptr = btrfs_node_blockptr(mid, orig_slot); if (level < BTRFS_MAX_LEVEL - 1) - parent_buf = path->nodes[level + 1]; + parent = path->nodes[level + 1]; pslot = path->slots[level + 1]; - if (!parent_buf) + if (!parent) return 1; - parent = btrfs_buffer_node(parent_buf); - left_buf = read_node_slot(root, parent_buf, pslot - 1); + left = read_node_slot(root, parent, pslot - 1); /* first, try to make some room in the middle buffer */ - if (left_buf) { + if (left) { u32 left_nr; - left = btrfs_buffer_node(left_buf); - left_nr = btrfs_header_nritems(&left->header); + left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; } else { - ret = btrfs_cow_block(trans, root, left_buf, parent_buf, - 
pslot - 1, &left_buf); + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left); if (ret) wret = 1; else { - left = btrfs_buffer_node(left_buf); wret = push_node_left(trans, root, - left_buf, mid_buf); + left, mid); } } if (wret < 0) ret = wret; if (wret == 0) { + struct btrfs_disk_key disk_key; orig_slot += left_nr; - btrfs_memcpy(root, parent, - &parent->ptrs[pslot].key, - &mid->ptrs[0].key, - sizeof(struct btrfs_disk_key)); - btrfs_mark_buffer_dirty(parent_buf); - if (btrfs_header_nritems(&left->header) > orig_slot) { - path->nodes[level] = left_buf; + btrfs_node_key(mid, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; - btrfs_block_release(root, mid_buf); + free_extent_buffer(mid); } else { orig_slot -= - btrfs_header_nritems(&left->header); + btrfs_header_nritems(left); path->slots[level] = orig_slot; - btrfs_block_release(root, left_buf); + free_extent_buffer(left); } check_node(root, path, level); return 0; } - btrfs_block_release(root, left_buf); + free_extent_buffer(left); } - right_buf = read_node_slot(root, parent_buf, pslot + 1); + right= read_node_slot(root, parent, pslot + 1); /* * then try to empty the right most buffer into the middle */ - if (right_buf) { + if (right) { u32 right_nr; - right = btrfs_buffer_node(right_buf); - right_nr = btrfs_header_nritems(&right->header); + right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; } else { - ret = btrfs_cow_block(trans, root, right_buf, - parent_buf, pslot + 1, - &right_buf); + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right); if (ret) wret = 1; else { - right = btrfs_buffer_node(right_buf); wret = balance_node_right(trans, root, - right_buf, mid_buf); + right, mid); } } if (wret < 0) ret = wret; if (wret == 0) { - 
btrfs_memcpy(root, parent, - &parent->ptrs[pslot + 1].key, - &right->ptrs[0].key, - sizeof(struct btrfs_disk_key)); - btrfs_mark_buffer_dirty(parent_buf); - if (btrfs_header_nritems(&mid->header) <= orig_slot) { - path->nodes[level] = right_buf; + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; path->slots[level + 1] += 1; path->slots[level] = orig_slot - - btrfs_header_nritems(&mid->header); - btrfs_block_release(root, mid_buf); + btrfs_header_nritems(mid); + free_extent_buffer(mid); } else { - btrfs_block_release(root, right_buf); + free_extent_buffer(right); } check_node(root, path, level); return 0; } - btrfs_block_release(root, right_buf); + free_extent_buffer(right); } check_node(root, path, level); return 1; @@ -811,10 +888,9 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans, static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, int level, int slot) { - struct btrfs_node *node; + struct extent_buffer *node; int i; u32 nritems; - u64 item_objectid; u64 blocknr; u64 search; u64 cluster_start; @@ -823,7 +899,7 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, int direction = path->reada; struct radix_tree_root found; unsigned long gang[8]; - struct buffer_head *bh; + struct extent_buffer *eb; if (level == 0) return; @@ -831,18 +907,17 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, if (!path->nodes[level]) return; - node = btrfs_buffer_node(path->nodes[level]); + node = path->nodes[level]; search = btrfs_node_blockptr(node, slot); - bh = btrfs_find_tree_block(root, search); - if (bh) { - brelse(bh); + eb = btrfs_find_tree_block(root, search); + if (eb) { + free_extent_buffer(eb); return; } init_bit_radix(&found); - nritems = btrfs_header_nritems(&node->header); + 
nritems = btrfs_header_nritems(node); for (i = slot; i < nritems; i++) { - item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key); blocknr = btrfs_node_blockptr(node, i); set_radix_bit(&found, blocknr); } @@ -886,8 +961,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { - struct buffer_head *b; - struct btrfs_node *c; + struct extent_buffer *b; u64 blocknr; int slot; int ret; @@ -901,10 +975,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex)); again: b = root->node; - get_bh(b); + extent_buffer_get(b); while (b) { - c = btrfs_buffer_node(b); - level = btrfs_header_level(&c->header); + level = btrfs_header_level(b); if (cow) { int wret; wret = btrfs_cow_block(trans, root, b, @@ -912,32 +985,30 @@ again: p->slots[level + 1], &b); if (wret) { - btrfs_block_release(root, b); + free_extent_buffer(b); return wret; } - c = btrfs_buffer_node(b); } BUG_ON(!cow && ins_len); - if (level != btrfs_header_level(&c->header)) + if (level != btrfs_header_level(b)) WARN_ON(1); - level = btrfs_header_level(&c->header); + level = btrfs_header_level(b); p->nodes[level] = b; ret = check_block(root, p, level); if (ret) return -1; - ret = bin_search(c, key, &slot); - if (!btrfs_is_leaf(c)) { + ret = bin_search(b, key, level, &slot); + if (level != 0) { if (ret && slot > 0) slot -= 1; p->slots[level] = slot; - if (ins_len > 0 && btrfs_header_nritems(&c->header) >= + if (ins_len > 0 && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { int sret = split_node(trans, root, p, level); BUG_ON(sret > 0); if (sret) return sret; b = p->nodes[level]; - c = btrfs_buffer_node(b); slot = p->slots[level]; } else if (ins_len < 0) { int sret = balance_level(trans, root, p, @@ -947,22 +1018,19 @@ again: b = p->nodes[level]; if (!b) goto again; - c = btrfs_buffer_node(b); slot = p->slots[level]; - 
BUG_ON(btrfs_header_nritems(&c->header) == 1); + BUG_ON(btrfs_header_nritems(b) == 1); } /* this is only true while dropping a snapshot */ if (level == lowest_level) break; - blocknr = btrfs_node_blockptr(c, slot); + blocknr = btrfs_node_blockptr(b, slot); if (should_reada) reada_for_search(root, p, level, slot); - b = read_tree_block(root, btrfs_node_blockptr(c, slot)); - + b = read_tree_block(root, btrfs_node_blockptr(b, slot)); } else { - struct btrfs_leaf *l = (struct btrfs_leaf *)c; p->slots[level] = slot; - if (ins_len > 0 && btrfs_leaf_free_space(root, l) < + if (ins_len > 0 && btrfs_leaf_free_space(root, b) < sizeof(struct btrfs_item) + ins_len) { int sret = split_leaf(trans, root, key, p, ins_len); @@ -986,19 +1054,20 @@ again: * If this fails to write a tree block, it returns -1, but continues * fixing up the blocks in ram so the tree is consistent. */ -static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, struct btrfs_disk_key - *key, int level) +static int fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) { int i; int ret = 0; + struct extent_buffer *t; + for (i = level; i < BTRFS_MAX_LEVEL; i++) { - struct btrfs_node *t; int tslot = path->slots[i]; if (!path->nodes[i]) break; - t = btrfs_buffer_node(path->nodes[i]); - btrfs_memcpy(root, t, &t->ptrs[tslot].key, key, sizeof(*key)); + t = path->nodes[i]; + btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) break; @@ -1014,18 +1083,16 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root * error, and > 0 if there was no room in the left hand block. 
*/ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct buffer_head *dst_buf, struct - buffer_head *src_buf) + *root, struct extent_buffer *dst, + struct extent_buffer *src) { - struct btrfs_node *src = btrfs_buffer_node(src_buf); - struct btrfs_node *dst = btrfs_buffer_node(dst_buf); int push_items = 0; int src_nritems; int dst_nritems; int ret = 0; - src_nritems = btrfs_header_nritems(&src->header); - dst_nritems = btrfs_header_nritems(&dst->header); + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; if (push_items <= 0) { @@ -1035,17 +1102,21 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root if (src_nritems < push_items) push_items = src_nritems; - btrfs_memcpy(root, dst, dst->ptrs + dst_nritems, src->ptrs, - push_items * sizeof(struct btrfs_key_ptr)); + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst_nritems), + btrfs_node_key_ptr_offset(0), + push_items * sizeof(struct btrfs_key_ptr)); + if (push_items < src_nritems) { - btrfs_memmove(root, src, src->ptrs, src->ptrs + push_items, - (src_nritems - push_items) * - sizeof(struct btrfs_key_ptr)); - } - btrfs_set_header_nritems(&src->header, src_nritems - push_items); - btrfs_set_header_nritems(&dst->header, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src_buf); - btrfs_mark_buffer_dirty(dst_buf); + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(push_items), + (src_nritems - push_items) * + sizeof(struct btrfs_key_ptr)); + } + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); return ret; } @@ -1058,24 +1129,22 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root * * this will only push up to 1/2 the contents of the left node over */ 
-static int balance_node_right(struct btrfs_trans_handle *trans, struct - btrfs_root *root, struct buffer_head *dst_buf, - struct buffer_head *src_buf) +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst, + struct extent_buffer *src) { - struct btrfs_node *src = btrfs_buffer_node(src_buf); - struct btrfs_node *dst = btrfs_buffer_node(dst_buf); int push_items = 0; int max_push; int src_nritems; int dst_nritems; int ret = 0; - src_nritems = btrfs_header_nritems(&src->header); - dst_nritems = btrfs_header_nritems(&dst->header); + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; - if (push_items <= 0) { + if (push_items <= 0) return 1; - } max_push = src_nritems / 2 + 1; /* don't try to empty the node */ @@ -1085,18 +1154,21 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct if (max_push < push_items) push_items = max_push; - btrfs_memmove(root, dst, dst->ptrs + push_items, dst->ptrs, - dst_nritems * sizeof(struct btrfs_key_ptr)); + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), + btrfs_node_key_ptr_offset(0), + (dst_nritems) * + sizeof(struct btrfs_key_ptr)); - btrfs_memcpy(root, dst, dst->ptrs, - src->ptrs + src_nritems - push_items, - push_items * sizeof(struct btrfs_key_ptr)); + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(src_nritems - push_items), + push_items * sizeof(struct btrfs_key_ptr)); - btrfs_set_header_nritems(&src->header, src_nritems - push_items); - btrfs_set_header_nritems(&dst->header, dst_nritems + push_items); + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src_buf); - btrfs_mark_buffer_dirty(dst_buf); + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); return ret; } @@ -1107,45 +1179,46 @@ static 
int balance_node_right(struct btrfs_trans_handle *trans, struct * * returns zero on success or < 0 on failure. */ -static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int level) +static int insert_new_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) { - struct buffer_head *t; - struct btrfs_node *lower; - struct btrfs_node *c; - struct btrfs_disk_key *lower_key; + struct extent_buffer *lower; + struct extent_buffer *c; + struct btrfs_disk_key lower_key; BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); - t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr, 0); - if (IS_ERR(t)) - return PTR_ERR(t); - c = btrfs_buffer_node(t); - memset(c, 0, root->blocksize); - btrfs_set_header_nritems(&c->header, 1); - btrfs_set_header_level(&c->header, level); - btrfs_set_header_blocknr(&c->header, bh_blocknr(t)); - btrfs_set_header_generation(&c->header, trans->transid); - btrfs_set_header_owner(&c->header, root->root_key.objectid); - lower = btrfs_buffer_node(path->nodes[level-1]); - memcpy(c->header.fsid, root->fs_info->disk_super->fsid, - sizeof(c->header.fsid)); - if (btrfs_is_leaf(lower)) - lower_key = &((struct btrfs_leaf *)lower)->items[0].key; + c = btrfs_alloc_free_block(trans, root, + extent_buffer_blocknr(root->node), 0); + if (IS_ERR(c)) + return PTR_ERR(c); + memset_extent_buffer(c, 0, 0, root->nodesize); + btrfs_set_header_nritems(c, 1); + btrfs_set_header_level(c, level); + btrfs_set_header_blocknr(c, extent_buffer_blocknr(c)); + btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_owner(c, root->root_key.objectid); + lower = path->nodes[level-1]; + + write_extent_buffer(c, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(c), + BTRFS_FSID_SIZE); + if (level == 1) + btrfs_item_key(lower, &lower_key, 0); else - lower_key = &lower->ptrs[0].key; - btrfs_memcpy(root, c, &c->ptrs[0].key, lower_key, - 
sizeof(struct btrfs_disk_key)); - btrfs_set_node_blockptr(c, 0, bh_blocknr(path->nodes[level - 1])); + btrfs_node_key(lower, &lower_key, 0); + btrfs_set_node_key(c, &lower_key, 0); + btrfs_set_node_blockptr(c, 0, extent_buffer_blocknr(lower)); - btrfs_mark_buffer_dirty(t); + btrfs_mark_buffer_dirty(c); /* the super has an extra ref to root->node */ - btrfs_block_release(root, root->node); - root->node = t; - get_bh(t); - path->nodes[level] = t; + free_extent_buffer(root->node); + root->node = c; + extent_buffer_get(c); + path->nodes[level] = c; path->slots[level] = 0; return 0; } @@ -1163,26 +1236,26 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key *key, u64 blocknr, int slot, int level) { - struct btrfs_node *lower; + struct extent_buffer *lower; int nritems; BUG_ON(!path->nodes[level]); - lower = btrfs_buffer_node(path->nodes[level]); - nritems = btrfs_header_nritems(&lower->header); + lower = path->nodes[level]; + nritems = btrfs_header_nritems(lower); if (slot > nritems) BUG(); if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) BUG(); if (slot != nritems) { - btrfs_memmove(root, lower, lower->ptrs + slot + 1, - lower->ptrs + slot, + memmove_extent_buffer(lower, + btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } - btrfs_memcpy(root, lower, &lower->ptrs[slot].key, - key, sizeof(struct btrfs_disk_key)); + btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, blocknr); - btrfs_set_header_nritems(&lower->header, nritems + 1); - btrfs_mark_buffer_dirty(path->nodes[level]); + btrfs_set_header_nritems(lower, nritems + 1); + btrfs_mark_buffer_dirty(lower); check_node(root, path, level); return 0; } @@ -1199,69 +1272,73 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, 
int level) { - struct buffer_head *t; - struct btrfs_node *c; - struct buffer_head *split_buffer; - struct btrfs_node *split; + struct extent_buffer *c; + struct extent_buffer *split; + struct btrfs_disk_key disk_key; int mid; int ret; int wret; u32 c_nritems; - t = path->nodes[level]; - c = btrfs_buffer_node(t); - if (t == root->node) { + c = path->nodes[level]; + if (c == root->node) { /* trying to split the root, lets make a new one */ ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; } else { ret = push_nodes_for_insert(trans, root, path, level); - t = path->nodes[level]; - c = btrfs_buffer_node(t); - if (!ret && - btrfs_header_nritems(&c->header) < + c = path->nodes[level]; + if (!ret && btrfs_header_nritems(c) < BTRFS_NODEPTRS_PER_BLOCK(root) - 1) return 0; if (ret < 0) return ret; } - c_nritems = btrfs_header_nritems(&c->header); - split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr, 0); - if (IS_ERR(split_buffer)) - return PTR_ERR(split_buffer); + c_nritems = btrfs_header_nritems(c); + split = btrfs_alloc_free_block(trans, root, + extent_buffer_blocknr(c), 0); + if (IS_ERR(split)) + return PTR_ERR(split); + + btrfs_set_header_flags(split, btrfs_header_flags(c)); + btrfs_set_header_level(split, btrfs_header_level(c)); + btrfs_set_header_blocknr(split, extent_buffer_blocknr(split)); + btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_owner(split, root->root_key.objectid); + write_extent_buffer(split, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(split), + BTRFS_FSID_SIZE); - split = btrfs_buffer_node(split_buffer); - btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header)); - btrfs_set_header_level(&split->header, btrfs_header_level(&c->header)); - btrfs_set_header_blocknr(&split->header, bh_blocknr(split_buffer)); - btrfs_set_header_generation(&split->header, trans->transid); - btrfs_set_header_owner(&split->header, root->root_key.objectid); - memcpy(split->header.fsid, 
root->fs_info->disk_super->fsid, - sizeof(split->header.fsid)); mid = (c_nritems + 1) / 2; - btrfs_memcpy(root, split, split->ptrs, c->ptrs + mid, - (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); - btrfs_set_header_nritems(&split->header, c_nritems - mid); - btrfs_set_header_nritems(&c->header, mid); + + copy_extent_buffer(split, c, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(mid), + (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); + btrfs_set_header_nritems(split, c_nritems - mid); + btrfs_set_header_nritems(c, mid); ret = 0; - btrfs_mark_buffer_dirty(t); - btrfs_mark_buffer_dirty(split_buffer); - wret = insert_ptr(trans, root, path, &split->ptrs[0].key, - bh_blocknr(split_buffer), path->slots[level + 1] + 1, + btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(split); + + btrfs_node_key(split, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, + extent_buffer_blocknr(split), + path->slots[level + 1] + 1, level + 1); if (wret) ret = wret; if (path->slots[level] >= mid) { path->slots[level] -= mid; - btrfs_block_release(root, t); - path->nodes[level] = split_buffer; + free_extent_buffer(c); + path->nodes[level] = split; path->slots[level + 1] += 1; } else { - btrfs_block_release(root, split_buffer); + free_extent_buffer(split); } return ret; } @@ -1271,16 +1348,16 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root * and nr indicate which items in the leaf to check. 
This totals up the * space used both by the item structs and the item data */ -static int leaf_space_used(struct btrfs_leaf *l, int start, int nr) +static int leaf_space_used(struct extent_buffer *l, int start, int nr) { int data_len; - int nritems = btrfs_header_nritems(&l->header); + int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - data_len = btrfs_item_end(l->items + start); - data_len = data_len - btrfs_item_offset(l->items + end); + data_len = btrfs_item_end_nr(l, start); + data_len = data_len - btrfs_item_offset_nr(l, end); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; @@ -1291,10 +1368,17 @@ static int leaf_space_used(struct btrfs_leaf *l, int start, int nr) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf) +int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf) { - int nritems = btrfs_header_nritems(&leaf->header); - return BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + int nritems = btrfs_header_nritems(leaf); + int ret; + ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + if (ret < 0) { + printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n", + ret, BTRFS_LEAF_DATA_SIZE(root), + leaf_space_used(leaf, 0, nritems), nritems); + } + return ret; } /* @@ -1307,12 +1391,10 @@ int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf) static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size) { - struct buffer_head *left_buf = path->nodes[0]; - struct btrfs_leaf *left = btrfs_buffer_leaf(left_buf); - struct btrfs_leaf *right; - struct buffer_head *right_buf; - struct buffer_head *upper; - struct btrfs_node *upper_node; + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer 
*right; + struct extent_buffer *upper; + struct btrfs_disk_key disk_key; int slot; int i; int free_space; @@ -1321,6 +1403,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_item *item; u32 left_nritems; u32 right_nritems; + u32 data_end; int ret; slot = path->slots[1]; @@ -1328,102 +1411,109 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 1; } upper = path->nodes[1]; - upper_node = btrfs_buffer_node(upper); - if (slot >= btrfs_header_nritems(&upper_node->header) - 1) { + if (slot >= btrfs_header_nritems(upper) - 1) return 1; - } - right_buf = read_tree_block(root, - btrfs_node_blockptr(btrfs_buffer_node(upper), slot + 1)); - right = btrfs_buffer_leaf(right_buf); + + right = read_tree_block(root, btrfs_node_blockptr(upper, slot + 1)); free_space = btrfs_leaf_free_space(root, right); if (free_space < data_size + sizeof(struct btrfs_item)) { - btrfs_block_release(root, right_buf); + free_extent_buffer(right); return 1; } + /* cow and double check */ - ret = btrfs_cow_block(trans, root, right_buf, upper, - slot + 1, &right_buf); + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right); if (ret) { - btrfs_block_release(root, right_buf); + free_extent_buffer(right); return 1; } - right = btrfs_buffer_leaf(right_buf); free_space = btrfs_leaf_free_space(root, right); if (free_space < data_size + sizeof(struct btrfs_item)) { - btrfs_block_release(root, right_buf); + free_extent_buffer(right); return 1; } - left_nritems = btrfs_header_nritems(&left->header); + left_nritems = btrfs_header_nritems(left); if (left_nritems == 0) { - btrfs_block_release(root, right_buf); + free_extent_buffer(right); return 1; } + for (i = left_nritems - 1; i >= 1; i--) { - item = left->items + i; + item = btrfs_item_nr(left, i); if (path->slots[0] == i) push_space += data_size + sizeof(*item); - if (btrfs_item_size(item) + sizeof(*item) + push_space > + if (btrfs_item_size(left, item) + 
sizeof(*item) + push_space > free_space) break; push_items++; - push_space += btrfs_item_size(item) + sizeof(*item); + push_space += btrfs_item_size(left, item) + sizeof(*item); } + if (push_items == 0) { - btrfs_block_release(root, right_buf); + free_extent_buffer(right); return 1; } + if (push_items == left_nritems) WARN_ON(1); - right_nritems = btrfs_header_nritems(&right->header); + /* push left to right */ - push_space = btrfs_item_end(left->items + left_nritems - push_items); + right_nritems = btrfs_header_nritems(right); + push_space = btrfs_item_end_nr(left, left_nritems - push_items); push_space -= leaf_data_end(root, left); + /* make room in the right data area */ - btrfs_memmove(root, right, btrfs_leaf_data(right) + - leaf_data_end(root, right) - push_space, - btrfs_leaf_data(right) + - leaf_data_end(root, right), BTRFS_LEAF_DATA_SIZE(root) - - leaf_data_end(root, right)); + data_end = leaf_data_end(root, right); + memmove_extent_buffer(right, + btrfs_leaf_data(right) + data_end - push_space, + btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_SIZE(root) - data_end); + /* copy from the left data area */ - btrfs_memcpy(root, right, btrfs_leaf_data(right) + + copy_extent_buffer(right, left, btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - push_space, btrfs_leaf_data(left) + leaf_data_end(root, left), push_space); - btrfs_memmove(root, right, right->items + push_items, right->items, - right_nritems * sizeof(struct btrfs_item)); + + memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), + btrfs_item_nr_offset(0), + right_nritems * sizeof(struct btrfs_item)); + /* copy the items from left to right */ - btrfs_memcpy(root, right, right->items, left->items + - left_nritems - push_items, - push_items * sizeof(struct btrfs_item)); + copy_extent_buffer(right, left, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left_nritems - push_items), + push_items * sizeof(struct btrfs_item)); /* update the item pointers */ right_nritems += push_items; - 
btrfs_set_header_nritems(&right->header, right_nritems); + btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { - btrfs_set_item_offset(right->items + i, push_space - - btrfs_item_size(right->items + i)); - push_space = btrfs_item_offset(right->items + i); + item = btrfs_item_nr(right, i); + btrfs_set_item_offset(right, item, push_space - + btrfs_item_size(right, item)); + push_space = btrfs_item_offset(right, item); } left_nritems -= push_items; - btrfs_set_header_nritems(&left->header, left_nritems); + btrfs_set_header_nritems(left, left_nritems); - btrfs_mark_buffer_dirty(left_buf); - btrfs_mark_buffer_dirty(right_buf); + btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(right); - btrfs_memcpy(root, upper_node, &upper_node->ptrs[slot + 1].key, - &right->items[0].key, sizeof(struct btrfs_disk_key)); + btrfs_item_key(right, &disk_key, 0); + btrfs_set_node_key(upper, &disk_key, slot + 1); btrfs_mark_buffer_dirty(upper); /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; - btrfs_block_release(root, path->nodes[0]); - path->nodes[0] = right_buf; + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; path->slots[1] += 1; } else { - btrfs_block_release(root, right_buf); + free_extent_buffer(right); } if (path->nodes[1]) check_node(root, path, 1); @@ -1436,10 +1526,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size) { - struct buffer_head *right_buf = path->nodes[0]; - struct btrfs_leaf *right = btrfs_buffer_leaf(right_buf); - struct buffer_head *t; - struct btrfs_leaf *left; + struct btrfs_disk_key disk_key; + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; int slot; int i; int free_space; @@ -1447,119 +1536,128 @@ static int 
push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root int push_items = 0; struct btrfs_item *item; u32 old_left_nritems; + u32 right_nritems; int ret = 0; int wret; slot = path->slots[1]; - if (slot == 0) { + if (slot == 0) return 1; - } - if (!path->nodes[1]) { + if (!path->nodes[1]) return 1; - } - t = read_tree_block(root, - btrfs_node_blockptr(btrfs_buffer_node(path->nodes[1]), slot - 1)); - left = btrfs_buffer_leaf(t); + + left = read_tree_block(root, btrfs_node_blockptr(path->nodes[1], + slot - 1)); free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size + sizeof(struct btrfs_item)) { - btrfs_block_release(root, t); + free_extent_buffer(left); return 1; } /* cow and double check */ - ret = btrfs_cow_block(trans, root, t, path->nodes[1], slot - 1, &t); + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ - btrfs_block_release(root, t); + free_extent_buffer(left); return 1; } - left = btrfs_buffer_leaf(t); free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size + sizeof(struct btrfs_item)) { - btrfs_block_release(root, t); + free_extent_buffer(left); return 1; } - if (btrfs_header_nritems(&right->header) == 0) { - btrfs_block_release(root, t); + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) { + free_extent_buffer(left); return 1; } - for (i = 0; i < btrfs_header_nritems(&right->header) - 1; i++) { - item = right->items + i; + for (i = 0; i < right_nritems - 1; i++) { + item = btrfs_item_nr(right, i); if (path->slots[0] == i) push_space += data_size + sizeof(*item); - if (btrfs_item_size(item) + sizeof(*item) + push_space > + if (btrfs_item_size(right, item) + sizeof(*item) + push_space > free_space) break; push_items++; - push_space += btrfs_item_size(item) + sizeof(*item); + push_space += btrfs_item_size(right, item) + sizeof(*item); } if (push_items == 0) { - btrfs_block_release(root, t); + 
free_extent_buffer(left); return 1; } - if (push_items == btrfs_header_nritems(&right->header)) + if (push_items == btrfs_header_nritems(right)) WARN_ON(1); + /* push data from right to left */ - btrfs_memcpy(root, left, left->items + - btrfs_header_nritems(&left->header), - right->items, push_items * sizeof(struct btrfs_item)); + copy_extent_buffer(left, right, + btrfs_item_nr_offset(btrfs_header_nritems(left)), + btrfs_item_nr_offset(0), + push_items * sizeof(struct btrfs_item)); + push_space = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_offset(right->items + push_items -1); - btrfs_memcpy(root, left, btrfs_leaf_data(left) + + btrfs_item_offset_nr(right, push_items -1); + + copy_extent_buffer(left, right, btrfs_leaf_data(left) + leaf_data_end(root, left) - push_space, btrfs_leaf_data(right) + - btrfs_item_offset(right->items + push_items - 1), + btrfs_item_offset_nr(right, push_items - 1), push_space); - old_left_nritems = btrfs_header_nritems(&left->header); + old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems < 0); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { - u32 ioff = btrfs_item_offset(left->items + i); - btrfs_set_item_offset(left->items + i, ioff - - (BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_offset(left->items + - old_left_nritems - 1))); + u32 ioff; + item = btrfs_item_nr(left, i); + ioff = btrfs_item_offset(left, item); + btrfs_set_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_offset_nr(left, old_left_nritems - 1))); } - btrfs_set_header_nritems(&left->header, old_left_nritems + push_items); + btrfs_set_header_nritems(left, old_left_nritems + push_items); /* fixup right node */ - push_space = btrfs_item_offset(right->items + push_items - 1) - - leaf_data_end(root, right); - btrfs_memmove(root, right, btrfs_leaf_data(right) + - BTRFS_LEAF_DATA_SIZE(root) - push_space, - btrfs_leaf_data(right) + - leaf_data_end(root, right), push_space); - btrfs_memmove(root, right, right->items, 
right->items + push_items, - (btrfs_header_nritems(&right->header) - push_items) * - sizeof(struct btrfs_item)); - btrfs_set_header_nritems(&right->header, - btrfs_header_nritems(&right->header) - - push_items); + push_space = btrfs_item_offset_nr(right, push_items - 1) - + leaf_data_end(root, right); + memmove_extent_buffer(right, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(right) + + leaf_data_end(root, right), push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(push_items), + (btrfs_header_nritems(right) - push_items) * + sizeof(struct btrfs_item)); + + right_nritems = btrfs_header_nritems(right) - push_items; + btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(root); - for (i = 0; i < btrfs_header_nritems(&right->header); i++) { - btrfs_set_item_offset(right->items + i, push_space - - btrfs_item_size(right->items + i)); - push_space = btrfs_item_offset(right->items + i); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + btrfs_set_item_offset(right, item, push_space - + btrfs_item_size(right, item)); + push_space = btrfs_item_offset(right, item); } - btrfs_mark_buffer_dirty(t); - btrfs_mark_buffer_dirty(right_buf); + btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(right); - wret = fixup_low_keys(trans, root, path, &right->items[0].key, 1); + btrfs_item_key(right, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, 1); if (wret) ret = wret; /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; - btrfs_block_release(root, path->nodes[0]); - path->nodes[0] = t; + free_extent_buffer(path->nodes[0]); + path->nodes[0] = left; path->slots[1] -= 1; } else { - btrfs_block_release(root, t); + free_extent_buffer(left); path->slots[0] -= push_items; } BUG_ON(path->slots[0] < 0); @@ -1578,13 +1676,11 @@ static int split_leaf(struct 
btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *ins_key, struct btrfs_path *path, int data_size) { - struct buffer_head *l_buf; - struct btrfs_leaf *l; + struct extent_buffer *l; u32 nritems; int mid; int slot; - struct btrfs_leaf *right; - struct buffer_head *right_buffer; + struct extent_buffer *right; int space_needed = data_size + sizeof(struct btrfs_item); int data_copy_size; int rt_data_off; @@ -1603,8 +1699,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root if (wret < 0) return wret; } - l_buf = path->nodes[0]; - l = btrfs_buffer_leaf(l_buf); + l = path->nodes[0]; /* did the pushes work? */ if (btrfs_leaf_free_space(root, l) >= @@ -1617,36 +1712,38 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } slot = path->slots[0]; - nritems = btrfs_header_nritems(&l->header); + nritems = btrfs_header_nritems(l); mid = (nritems + 1)/ 2; - right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0); - if (IS_ERR(right_buffer)) - return PTR_ERR(right_buffer); - - right = btrfs_buffer_leaf(right_buffer); - memset(&right->header, 0, sizeof(right->header)); - btrfs_set_header_blocknr(&right->header, bh_blocknr(right_buffer)); - btrfs_set_header_generation(&right->header, trans->transid); - btrfs_set_header_owner(&right->header, root->root_key.objectid); - btrfs_set_header_level(&right->header, 0); - memcpy(right->header.fsid, root->fs_info->disk_super->fsid, - sizeof(right->header.fsid)); + right = btrfs_alloc_free_block(trans, root, + extent_buffer_blocknr(l), 0); + if (IS_ERR(right)) + return PTR_ERR(right); + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_blocknr(right, extent_buffer_blocknr(right)); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned 
long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); + if (mid <= slot) { if (nritems == 1 || leaf_space_used(l, mid, nritems - mid) + space_needed > BTRFS_LEAF_DATA_SIZE(root)) { if (slot >= nritems) { btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(&right->header, 0); + btrfs_set_header_nritems(right, 0); wret = insert_ptr(trans, root, path, &disk_key, - bh_blocknr(right_buffer), + extent_buffer_blocknr(right), path->slots[1] + 1, 1); if (wret) ret = wret; - btrfs_block_release(root, path->nodes[0]); - path->nodes[0] = right_buffer; + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; path->slots[0] = 0; path->slots[1] += 1; return ret; @@ -1659,15 +1756,15 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root BTRFS_LEAF_DATA_SIZE(root)) { if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(&right->header, 0); + btrfs_set_header_nritems(right, 0); wret = insert_ptr(trans, root, path, &disk_key, - bh_blocknr(right_buffer), + extent_buffer_blocknr(right), path->slots[1], 1); if (wret) ret = wret; - btrfs_block_release(root, path->nodes[0]); - path->nodes[0] = right_buffer; + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) { wret = fixup_low_keys(trans, root, @@ -1681,61 +1778,74 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root double_split = 1; } } - btrfs_set_header_nritems(&right->header, nritems - mid); - data_copy_size = btrfs_item_end(l->items + mid) - - leaf_data_end(root, l); - btrfs_memcpy(root, right, right->items, l->items + mid, - (nritems - mid) * sizeof(struct btrfs_item)); - btrfs_memcpy(root, right, + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + 
copy_extent_buffer(right, l, btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - data_copy_size, btrfs_leaf_data(l) + leaf_data_end(root, l), data_copy_size); + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_end(l->items + mid); + btrfs_item_end_nr(l, mid); - for (i = 0; i < btrfs_header_nritems(&right->header); i++) { - u32 ioff = btrfs_item_offset(right->items + i); - btrfs_set_item_offset(right->items + i, ioff + rt_data_off); + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); } - btrfs_set_header_nritems(&l->header, mid); + btrfs_set_header_nritems(l, mid); ret = 0; - wret = insert_ptr(trans, root, path, &right->items[0].key, - bh_blocknr(right_buffer), path->slots[1] + 1, 1); + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, + extent_buffer_blocknr(right), path->slots[1] + 1, 1); if (wret) ret = wret; - btrfs_mark_buffer_dirty(right_buffer); - btrfs_mark_buffer_dirty(l_buf); + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); BUG_ON(path->slots[0] != slot); + if (mid <= slot) { - btrfs_block_release(root, path->nodes[0]); - path->nodes[0] = right_buffer; + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; path->slots[0] -= mid; path->slots[1] += 1; } else - btrfs_block_release(root, right_buffer); + free_extent_buffer(right); + BUG_ON(path->slots[0] < 0); check_node(root, path, 1); + check_leaf(root, path, 0); if (!double_split) return ret; - right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0); - if (IS_ERR(right_buffer)) - return PTR_ERR(right_buffer); - - right = btrfs_buffer_leaf(right_buffer); - memset(&right->header, 0, sizeof(right->header)); - btrfs_set_header_blocknr(&right->header, bh_blocknr(right_buffer)); - btrfs_set_header_generation(&right->header, trans->transid); - btrfs_set_header_owner(&right->header, 
root->root_key.objectid); - btrfs_set_header_level(&right->header, 0); - memcpy(right->header.fsid, root->fs_info->disk_super->fsid, - sizeof(right->header.fsid)); + + right = btrfs_alloc_free_block(trans, root, + extent_buffer_blocknr(l), 0); + if (IS_ERR(right)) + return PTR_ERR(right); + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_blocknr(right, extent_buffer_blocknr(right)); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); + btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(&right->header, 0); + btrfs_set_header_nritems(right, 0); wret = insert_ptr(trans, root, path, &disk_key, - bh_blocknr(right_buffer), + extent_buffer_blocknr(right), path->slots[1], 1); if (wret) ret = wret; @@ -1744,8 +1854,8 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root if (wret) ret = wret; } - btrfs_block_release(root, path->nodes[0]); - path->nodes[0] = right_buffer; + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; path->slots[0] = 0; check_node(root, path, 1); check_leaf(root, path, 0); @@ -1760,8 +1870,8 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, int ret = 0; int slot; int slot_orig; - struct btrfs_leaf *leaf; - struct buffer_head *leaf_buf; + struct extent_buffer *leaf; + struct btrfs_item *item; u32 nritems; unsigned int data_end; unsigned int old_data_start; @@ -1770,15 +1880,14 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, int i; slot_orig = path->slots[0]; - leaf_buf = path->nodes[0]; - leaf = btrfs_buffer_leaf(leaf_buf); + leaf = path->nodes[0]; - nritems = btrfs_header_nritems(&leaf->header); + nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); slot = path->slots[0]; - old_data_start = 
btrfs_item_offset(leaf->items + slot); - old_size = btrfs_item_size(leaf->items + slot); + old_data_start = btrfs_item_offset_nr(leaf, slot); + old_size = btrfs_item_size_nr(leaf, slot); BUG_ON(old_size <= new_size); size_diff = old_size - new_size; @@ -1790,32 +1899,38 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, */ /* first correct the data pointers */ for (i = slot; i < nritems; i++) { - u32 ioff = btrfs_item_offset(leaf->items + i); - btrfs_set_item_offset(leaf->items + i, - ioff + size_diff); + u32 ioff; + item = btrfs_item_nr(leaf, i); + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + size_diff); } /* shift the data */ - btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + data_end + size_diff, btrfs_leaf_data(leaf) + data_end, old_data_start + new_size - data_end); - btrfs_set_item_size(leaf->items + slot, new_size); - btrfs_mark_buffer_dirty(leaf_buf); + + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, new_size); + btrfs_mark_buffer_dirty(leaf); ret = 0; - if (btrfs_leaf_free_space(root, leaf) < 0) + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); BUG(); + } check_leaf(root, path, 0); return ret; } -int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, u32 data_size) +int btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) { int ret = 0; int slot; int slot_orig; - struct btrfs_leaf *leaf; - struct buffer_head *leaf_buf; + struct extent_buffer *leaf; + struct btrfs_item *item; u32 nritems; unsigned int data_end; unsigned int old_data; @@ -1823,16 +1938,17 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root int i; slot_orig = path->slots[0]; - leaf_buf = path->nodes[0]; - leaf = btrfs_buffer_leaf(leaf_buf); + leaf = path->nodes[0]; - nritems = 
btrfs_header_nritems(&leaf->header); + nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); - if (btrfs_leaf_free_space(root, leaf) < data_size) + if (btrfs_leaf_free_space(root, leaf) < data_size) { + btrfs_print_leaf(root, leaf); BUG(); + } slot = path->slots[0]; - old_data = btrfs_item_end(leaf->items + slot); + old_data = btrfs_item_end_nr(leaf, slot); BUG_ON(slot < 0); BUG_ON(slot >= nritems); @@ -1842,22 +1958,28 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root */ /* first correct the data pointers */ for (i = slot; i < nritems; i++) { - u32 ioff = btrfs_item_offset(leaf->items + i); - btrfs_set_item_offset(leaf->items + i, - ioff - data_size); + u32 ioff; + item = btrfs_item_nr(leaf, i); + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - data_size); } + /* shift the data */ - btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + data_end - data_size, btrfs_leaf_data(leaf) + data_end, old_data - data_end); + data_end = old_data; - old_size = btrfs_item_size(leaf->items + slot); - btrfs_set_item_size(leaf->items + slot, old_size + data_size); - btrfs_mark_buffer_dirty(leaf_buf); + old_size = btrfs_item_size_nr(leaf, slot); + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); ret = 0; - if (btrfs_leaf_free_space(root, leaf) < 0) + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); BUG(); + } check_leaf(root, path, 0); return ret; } @@ -1866,15 +1988,16 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root * Given a key and some data, insert an item into the tree. * This does all the path init required, making room in the tree if needed. 
*/ -int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, struct btrfs_key - *cpu_key, u32 data_size) +int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 data_size) { + struct extent_buffer *leaf; + struct btrfs_item *item; int ret = 0; int slot; int slot_orig; - struct btrfs_leaf *leaf; - struct buffer_head *leaf_buf; u32 nritems; unsigned int data_end; struct btrfs_disk_key disk_key; @@ -1884,6 +2007,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root /* create a root if there isn't one */ if (!root->node) BUG(); + ret = btrfs_search_slot(trans, root, cpu_key, path, data_size, 1); if (ret == 0) { return -EEXIST; @@ -1892,57 +2016,68 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root goto out; slot_orig = path->slots[0]; - leaf_buf = path->nodes[0]; - leaf = btrfs_buffer_leaf(leaf_buf); + leaf = path->nodes[0]; - nritems = btrfs_header_nritems(&leaf->header); + nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); if (btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item) + data_size) { BUG(); } + slot = path->slots[0]; BUG_ON(slot < 0); + if (slot != nritems) { int i; - unsigned int old_data = btrfs_item_end(leaf->items + slot); + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk("slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } /* * item0..itemN ... dataN.offset..dataN.size .. 
data0.size */ /* first correct the data pointers */ for (i = slot; i < nritems; i++) { - u32 ioff = btrfs_item_offset(leaf->items + i); - btrfs_set_item_offset(leaf->items + i, - ioff - data_size); + u32 ioff; + item = btrfs_item_nr(leaf, i); + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - data_size); } /* shift the items */ - btrfs_memmove(root, leaf, leaf->items + slot + 1, - leaf->items + slot, + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), + btrfs_item_nr_offset(slot), (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ - btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + data_end - data_size, btrfs_leaf_data(leaf) + data_end, old_data - data_end); data_end = old_data; } + /* setup the item for the new data */ - btrfs_memcpy(root, leaf, &leaf->items[slot].key, &disk_key, - sizeof(struct btrfs_disk_key)); - btrfs_set_item_offset(leaf->items + slot, data_end - data_size); - btrfs_set_item_size(leaf->items + slot, data_size); - btrfs_set_header_nritems(&leaf->header, nritems + 1); - btrfs_mark_buffer_dirty(leaf_buf); + btrfs_set_item_key(leaf, &disk_key, slot); + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_offset(leaf, item, data_end - data_size); + btrfs_set_item_size(leaf, item, data_size); + btrfs_set_header_nritems(leaf, nritems + 1); + btrfs_mark_buffer_dirty(leaf); ret = 0; if (slot == 0) ret = fixup_low_keys(trans, root, path, &disk_key, 1); - if (btrfs_leaf_free_space(root, leaf) < 0) + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); BUG(); + } check_leaf(root, path, 0); out: return ret; @@ -1958,17 +2093,17 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root { int ret = 0; struct btrfs_path *path; - u8 *ptr; + struct extent_buffer *leaf; + unsigned long ptr; path = btrfs_alloc_path(); BUG_ON(!path); ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (!ret) 
{ - ptr = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], u8); - btrfs_memcpy(root, path->nodes[0]->b_data, - ptr, data, data_size); - btrfs_mark_buffer_dirty(path->nodes[0]); + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, data, ptr, data_size); + btrfs_mark_buffer_dirty(leaf); } btrfs_free_path(path); return ret; @@ -1984,30 +2119,30 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot) { - struct btrfs_node *node; - struct buffer_head *parent = path->nodes[level]; + struct extent_buffer *parent = path->nodes[level]; u32 nritems; int ret = 0; int wret; - node = btrfs_buffer_node(parent); - nritems = btrfs_header_nritems(&node->header); + nritems = btrfs_header_nritems(parent); if (slot != nritems -1) { - btrfs_memmove(root, node, node->ptrs + slot, - node->ptrs + slot + 1, + memmove_extent_buffer(parent, + btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } nritems--; - btrfs_set_header_nritems(&node->header, nritems); + btrfs_set_header_nritems(parent, nritems); if (nritems == 0 && parent == root->node) { - struct btrfs_header *header = btrfs_buffer_header(root->node); - BUG_ON(btrfs_header_level(header) != 1); + BUG_ON(btrfs_header_level(root->node) != 1); /* just turn the root into a leaf and break */ - btrfs_set_header_level(header, 0); + btrfs_set_header_level(root->node, 0); } else if (slot == 0) { - wret = fixup_low_keys(trans, root, path, &node->ptrs[0].key, - level + 1); + struct btrfs_disk_key disk_key; + + btrfs_node_key(parent, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); if (wret) ret = wret; } @@ -2023,59 +2158,67 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path) 
{ int slot; - struct btrfs_leaf *leaf; - struct buffer_head *leaf_buf; + struct extent_buffer *leaf; + struct btrfs_item *item; int doff; int dsize; int ret = 0; int wret; u32 nritems; - leaf_buf = path->nodes[0]; - leaf = btrfs_buffer_leaf(leaf_buf); + leaf = path->nodes[0]; slot = path->slots[0]; - doff = btrfs_item_offset(leaf->items + slot); - dsize = btrfs_item_size(leaf->items + slot); - nritems = btrfs_header_nritems(&leaf->header); + doff = btrfs_item_offset_nr(leaf, slot); + dsize = btrfs_item_size_nr(leaf, slot); + nritems = btrfs_header_nritems(leaf); if (slot != nritems - 1) { int i; int data_end = leaf_data_end(root, leaf); - btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) + + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + data_end + dsize, btrfs_leaf_data(leaf) + data_end, doff - data_end); + for (i = slot + 1; i < nritems; i++) { - u32 ioff = btrfs_item_offset(leaf->items + i); - btrfs_set_item_offset(leaf->items + i, ioff + dsize); + u32 ioff; + item = btrfs_item_nr(leaf, i); + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + dsize); } - btrfs_memmove(root, leaf, leaf->items + slot, - leaf->items + slot + 1, + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), + btrfs_item_nr_offset(slot + 1), sizeof(struct btrfs_item) * (nritems - slot - 1)); } - btrfs_set_header_nritems(&leaf->header, nritems - 1); + btrfs_set_header_nritems(leaf, nritems - 1); nritems--; + /* delete the leaf if we've emptied it */ if (nritems == 0) { - if (leaf_buf == root->node) { - btrfs_set_header_level(&leaf->header, 0); + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); } else { - clean_tree_block(trans, root, leaf_buf); - wait_on_buffer(leaf_buf); + clean_tree_block(trans, root, leaf); + wait_on_tree_block_writeback(root, leaf); wret = del_ptr(trans, root, path, 1, path->slots[1]); if (wret) ret = wret; wret = btrfs_free_extent(trans, root, - bh_blocknr(leaf_buf), 1, 1); + extent_buffer_blocknr(leaf), + 1, 1); if 
(wret) ret = wret; } } else { int used = leaf_space_used(leaf, 0, nritems); if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_item_key(leaf, &disk_key, 0); wret = fixup_low_keys(trans, root, path, - &leaf->items[0].key, 1); + &disk_key, 1); if (wret) ret = wret; } @@ -2087,34 +2230,40 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, * for possible call to del_ptr below */ slot = path->slots[1]; - get_bh(leaf_buf); + extent_buffer_get(leaf); + wret = push_leaf_left(trans, root, path, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; - if (path->nodes[0] == leaf_buf && - btrfs_header_nritems(&leaf->header)) { + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { wret = push_leaf_right(trans, root, path, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; } - if (btrfs_header_nritems(&leaf->header) == 0) { - u64 blocknr = bh_blocknr(leaf_buf); - clean_tree_block(trans, root, leaf_buf); - wait_on_buffer(leaf_buf); + + if (btrfs_header_nritems(leaf) == 0) { + u64 blocknr = extent_buffer_blocknr(leaf); + + clean_tree_block(trans, root, leaf); + wait_on_tree_block_writeback(root, leaf); + wret = del_ptr(trans, root, path, 1, slot); if (wret) ret = wret; - btrfs_block_release(root, leaf_buf); + + free_extent_buffer(leaf); wret = btrfs_free_extent(trans, root, blocknr, 1, 1); if (wret) ret = wret; } else { - btrfs_mark_buffer_dirty(leaf_buf); - btrfs_block_release(root, leaf_buf); + btrfs_mark_buffer_dirty(leaf); + free_extent_buffer(leaf); } } else { - btrfs_mark_buffer_dirty(leaf_buf); + btrfs_mark_buffer_dirty(leaf); } } return ret; @@ -2130,25 +2279,27 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) int slot; int level = 1; u64 blocknr; - struct buffer_head *c; - struct btrfs_node *c_node; - struct buffer_head *next = NULL; + struct extent_buffer *c; + struct extent_buffer *next = NULL; while(level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) return 1; + slot = path->slots[level] + 1; c = 
path->nodes[level]; - c_node = btrfs_buffer_node(c); - if (slot >= btrfs_header_nritems(&c_node->header)) { + if (slot >= btrfs_header_nritems(c)) { level++; continue; } - blocknr = btrfs_node_blockptr(c_node, slot); + + blocknr = btrfs_node_blockptr(c, slot); if (next) - btrfs_block_release(root, next); + free_extent_buffer(next); + if (path->reada) reada_for_search(root, path, level, slot); + next = read_tree_block(root, blocknr); break; } @@ -2156,15 +2307,14 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) while(1) { level--; c = path->nodes[level]; - btrfs_block_release(root, c); + free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!level) break; if (path->reada) reada_for_search(root, path, level, 0); - next = read_tree_block(root, - btrfs_node_blockptr(btrfs_buffer_node(next), 0)); + next = read_tree_block(root, btrfs_node_blockptr(next, 0)); } return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 800a3499cc3f..c4b829806855 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -20,10 +20,10 @@ #define __BTRFS__ #include -#include #include #include #include "bit-radix.h" +#include "extent_map.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -32,7 +32,7 @@ extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; -#define BTRFS_MAGIC "_BtRfS_M" +#define BTRFS_MAGIC "_B2RfS_M" #define BTRFS_ROOT_TREE_OBJECTID 1ULL #define BTRFS_EXTENT_TREE_OBJECTID 2ULL @@ -78,41 +78,41 @@ extern struct kmem_cache *btrfs_path_cachep; */ struct btrfs_disk_key { __le64 objectid; - __le32 flags; + u8 type; __le64 offset; } __attribute__ ((__packed__)); struct btrfs_key { u64 objectid; - u32 flags; + u8 type; u64 offset; } __attribute__ ((__packed__)); +#define BTRFS_FSID_SIZE 16 /* * every tree block (leaf or node) starts with this header. 
*/ struct btrfs_header { u8 csum[BTRFS_CSUM_SIZE]; - u8 fsid[16]; /* FS specific uuid */ + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ __le64 blocknr; /* which block this node is supposed to live in */ __le64 generation; __le64 owner; - __le16 nritems; + __le32 nritems; __le16 flags; u8 level; } __attribute__ ((__packed__)); #define BTRFS_MAX_LEVEL 8 -#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->blocksize - \ +#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ sizeof(struct btrfs_header)) / \ (sizeof(struct btrfs_disk_key) + sizeof(u64))) #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) -#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->blocksize)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) - \ sizeof(struct btrfs_file_extent_item)) -struct buffer_head; /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc @@ -128,7 +128,9 @@ struct btrfs_super_block { __le64 total_blocks; __le64 blocks_used; __le64 root_dir_objectid; - __le32 blocksize; + __le32 sectorsize; + __le32 nodesize; + __le32 leafsize; } __attribute__ ((__packed__)); /* @@ -138,7 +140,7 @@ struct btrfs_super_block { struct btrfs_item { struct btrfs_disk_key key; __le32 offset; - __le16 size; + __le32 size; } __attribute__ ((__packed__)); /* @@ -176,7 +178,7 @@ struct btrfs_node { * used while walking the tree. 
*/ struct btrfs_path { - struct buffer_head *nodes[BTRFS_MAX_LEVEL]; + struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; int reada; int lowest_level; @@ -292,6 +294,7 @@ struct btrfs_block_group_cache { }; struct btrfs_fs_info { + u8 fsid[BTRFS_FSID_SIZE]; struct btrfs_root *extent_root; struct btrfs_root *tree_root; struct radix_tree_root fs_roots_radix; @@ -304,9 +307,8 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; struct btrfs_transaction *running_transaction; - struct btrfs_super_block *disk_super; struct btrfs_super_block super_copy; - struct buffer_head *sb_buffer; + struct extent_buffer *sb_buffer; struct super_block *sb; struct inode *btree_inode; struct mutex trans_mutex; @@ -325,8 +327,8 @@ struct btrfs_fs_info { * and for the extent tree extent_root root. */ struct btrfs_root { - struct buffer_head *node; - struct buffer_head *commit_root; + struct extent_buffer *node; + struct extent_buffer *commit_root; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; @@ -336,7 +338,16 @@ struct btrfs_root { struct rw_semaphore snap_sem; u64 objectid; u64 last_trans; - u32 blocksize; + + /* data allocations are done in sectorsize units */ + u32 sectorsize; + + /* node allocations are done in nodesize units */ + u32 nodesize; + + /* leaf allocations are done in leafsize units */ + u32 leafsize; + u32 type; u64 highest_inode; u64 last_inode_alloc; @@ -347,12 +358,6 @@ struct btrfs_root { char *name; }; -/* the lower bits in the key flags defines the item type */ -#define BTRFS_KEY_TYPE_MAX 256 -#define BTRFS_KEY_TYPE_SHIFT 24 -#define BTRFS_KEY_TYPE_MASK (((u32)BTRFS_KEY_TYPE_MAX - 1) << \ - BTRFS_KEY_TYPE_SHIFT) - /* * inode items have the data typically returned from stat and store other * info about object characteristics. 
There is one for every file and dir in @@ -402,246 +407,253 @@ struct btrfs_root { */ #define BTRFS_STRING_ITEM_KEY 253 +/* some macros to generate set/get funcs for the struct fields. This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple + * one for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +#define read_eb_member(eb, ptr, type, member, result) ( \ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) ( \ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + __le##bits res; \ + read_eb_member(eb, s, type, member, &res); \ + return le##bits##_to_cpu(res); \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + val = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val); \ +} + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +{ \ + __le##bits res; \ + read_eb_member(eb, NULL, type, member, &res); \ + return le##bits##_to_cpu(res); \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, \ + u##bits val) \ +{ \ + val = cpu_to_le##bits(val); \ + write_eb_member(eb, NULL, type, member, &val); \ +} -static inline u64 btrfs_block_group_used(struct btrfs_block_group_item *bi) -{ - return le64_to_cpu(bi->used); -} - -static inline void btrfs_set_block_group_used(struct - btrfs_block_group_item *bi, - u64 val) -{ - bi->used = cpu_to_le64(val); -} - -static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i) -{ - return le64_to_cpu(i->generation); -} - -static inline void 
btrfs_set_inode_generation(struct btrfs_inode_item *i, - u64 val) -{ - i->generation = cpu_to_le64(val); -} - -static inline u64 btrfs_inode_size(struct btrfs_inode_item *i) -{ - return le64_to_cpu(i->size); -} - -static inline void btrfs_set_inode_size(struct btrfs_inode_item *i, u64 val) -{ - i->size = cpu_to_le64(val); -} - -static inline u64 btrfs_inode_nblocks(struct btrfs_inode_item *i) -{ - return le64_to_cpu(i->nblocks); -} - -static inline void btrfs_set_inode_nblocks(struct btrfs_inode_item *i, u64 val) -{ - i->nblocks = cpu_to_le64(val); -} - -static inline u64 btrfs_inode_block_group(struct btrfs_inode_item *i) -{ - return le64_to_cpu(i->block_group); -} - -static inline void btrfs_set_inode_block_group(struct btrfs_inode_item *i, - u64 val) -{ - i->block_group = cpu_to_le64(val); -} - -static inline u32 btrfs_inode_nlink(struct btrfs_inode_item *i) -{ - return le32_to_cpu(i->nlink); -} - -static inline void btrfs_set_inode_nlink(struct btrfs_inode_item *i, u32 val) -{ - i->nlink = cpu_to_le32(val); -} - -static inline u32 btrfs_inode_uid(struct btrfs_inode_item *i) -{ - return le32_to_cpu(i->uid); -} - -static inline void btrfs_set_inode_uid(struct btrfs_inode_item *i, u32 val) -{ - i->uid = cpu_to_le32(val); -} - -static inline u32 btrfs_inode_gid(struct btrfs_inode_item *i) -{ - return le32_to_cpu(i->gid); -} - -static inline void btrfs_set_inode_gid(struct btrfs_inode_item *i, u32 val) -{ - i->gid = cpu_to_le32(val); -} - -static inline u32 btrfs_inode_mode(struct btrfs_inode_item *i) -{ - return le32_to_cpu(i->mode); -} - -static inline void btrfs_set_inode_mode(struct btrfs_inode_item *i, u32 val) -{ - i->mode = cpu_to_le32(val); +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(type *s) \ +{ \ + return le##bits##_to_cpu(s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + s->member = cpu_to_le##bits(val); \ } -static inline u32 btrfs_inode_rdev(struct 
btrfs_inode_item *i) -{ - return le32_to_cpu(i->rdev); -} +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, + used, 64); -static inline void btrfs_set_inode_rdev(struct btrfs_inode_item *i, u32 val) -{ - i->rdev = cpu_to_le32(val); -} +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nblocks, struct btrfs_inode_item, nblocks, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 32); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 16); +BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item, + compat_flags, 16); -static inline u16 btrfs_inode_flags(struct btrfs_inode_item *i) +static inline struct btrfs_inode_timespec * +btrfs_inode_atime(struct btrfs_inode_item *inode_item) { - return le16_to_cpu(i->flags); + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, atime); + return (struct btrfs_inode_timespec *)ptr; } -static inline void btrfs_set_inode_flags(struct btrfs_inode_item *i, u16 val) +static inline struct btrfs_inode_timespec * +btrfs_inode_mtime(struct btrfs_inode_item *inode_item) { - i->flags = cpu_to_le16(val); + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, mtime); + return (struct btrfs_inode_timespec *)ptr; } -static inline u16 btrfs_inode_compat_flags(struct btrfs_inode_item *i) +static 
inline struct btrfs_inode_timespec * +btrfs_inode_ctime(struct btrfs_inode_item *inode_item) { - return le16_to_cpu(i->compat_flags); + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, ctime); + return (struct btrfs_inode_timespec *)ptr; } -static inline void btrfs_set_inode_compat_flags(struct btrfs_inode_item *i, - u16 val) +static inline struct btrfs_inode_timespec * +btrfs_inode_otime(struct btrfs_inode_item *inode_item) { - i->compat_flags = cpu_to_le16(val); + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, otime); + return (struct btrfs_inode_timespec *)ptr; } -static inline u64 btrfs_timespec_sec(struct btrfs_inode_timespec *ts) -{ - return le64_to_cpu(ts->sec); -} +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_inode_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_inode_timespec, nsec, 32); -static inline void btrfs_set_timespec_sec(struct btrfs_inode_timespec *ts, - u64 val) -{ - ts->sec = cpu_to_le64(val); -} +/* struct btrfs_extent_item */ +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); +BTRFS_SETGET_FUNCS(extent_owner, struct btrfs_extent_item, owner, 32); -static inline u32 btrfs_timespec_nsec(struct btrfs_inode_timespec *ts) -{ - return le32_to_cpu(ts->nsec); -} +BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, + refs, 32); +BTRFS_SETGET_STACK_FUNCS(stack_extent_owner, struct btrfs_extent_item, + owner, 32); -static inline void btrfs_set_timespec_nsec(struct btrfs_inode_timespec *ts, - u32 val) -{ - ts->nsec = cpu_to_le32(val); -} +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); -static inline u32 btrfs_extent_refs(struct btrfs_extent_item *ei) +static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) { - return le32_to_cpu(ei->refs); + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return 
btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); } -static inline void btrfs_set_extent_refs(struct btrfs_extent_item *ei, u32 val) +static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, + int nr, u64 val) { - ei->refs = cpu_to_le32(val); + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); } -static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei) +static unsigned long btrfs_node_key_ptr_offset(int nr) { - return le64_to_cpu(ei->owner); + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; } -static inline void btrfs_set_extent_owner(struct btrfs_extent_item *ei, u64 val) +static void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) { - ei->owner = cpu_to_le64(val); + unsigned long ptr; + ptr = btrfs_node_key_ptr_offset(nr); + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); } - -static inline u64 btrfs_node_blockptr(struct btrfs_node *n, int nr) +static inline void btrfs_set_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) { - return le64_to_cpu(n->ptrs[nr].blockptr); + unsigned long ptr; + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); } +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); -static inline void btrfs_set_node_blockptr(struct btrfs_node *n, int nr, - u64 val) +static inline unsigned long btrfs_item_nr_offset(int nr) { - n->ptrs[nr].blockptr = cpu_to_le64(val); + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; } -static inline u32 btrfs_item_offset(struct btrfs_item *item) +static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, + int nr) { - return 
le32_to_cpu(item->offset); + return (struct btrfs_item *)btrfs_item_nr_offset(nr); } -static inline void btrfs_set_item_offset(struct btrfs_item *item, u32 val) +static inline u32 btrfs_item_end(struct extent_buffer *eb, + struct btrfs_item *item) { - item->offset = cpu_to_le32(val); + return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); } -static inline u32 btrfs_item_end(struct btrfs_item *item) +static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) { - return le32_to_cpu(item->offset) + le16_to_cpu(item->size); + return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); } -static inline u16 btrfs_item_size(struct btrfs_item *item) +static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) { - return le16_to_cpu(item->size); + return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); } -static inline void btrfs_set_item_size(struct btrfs_item *item, u16 val) +static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) { - item->size = cpu_to_le16(val); + return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); } -static inline u16 btrfs_dir_flags(struct btrfs_dir_item *d) +static inline void btrfs_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) { - return le16_to_cpu(d->flags); + struct btrfs_item *item = btrfs_item_nr(eb, nr); + read_eb_member(eb, item, struct btrfs_item, key, disk_key); } -static inline void btrfs_set_dir_flags(struct btrfs_dir_item *d, u16 val) +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) { - d->flags = cpu_to_le16(val); + struct btrfs_item *item = btrfs_item_nr(eb, nr); + write_eb_member(eb, item, struct btrfs_item, key, disk_key); } -static inline u8 btrfs_dir_type(struct btrfs_dir_item *d) -{ - return d->type; -} +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, flags, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct 
btrfs_dir_item, name_len, 16); -static inline void btrfs_set_dir_type(struct btrfs_dir_item *d, u8 val) +static inline void btrfs_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) { - d->type = val; + read_eb_member(eb, item, struct btrfs_dir_item, location, key); } -static inline u16 btrfs_dir_name_len(struct btrfs_dir_item *d) +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) { - return le16_to_cpu(d->name_len); + write_eb_member(eb, item, struct btrfs_dir_item, location, key); } -static inline void btrfs_set_dir_name_len(struct btrfs_dir_item *d, u16 val) -{ - d->name_len = cpu_to_le16(val); -} +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, struct btrfs_disk_key *disk) { cpu->offset = le64_to_cpu(disk->offset); - cpu->flags = le32_to_cpu(disk->flags); + cpu->type = disk->type; cpu->objectid = le64_to_cpu(disk->objectid); } @@ -649,400 +661,167 @@ static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, struct btrfs_key *cpu) { disk->offset = cpu_to_le64(cpu->offset); - disk->flags = cpu_to_le32(cpu->flags); + disk->type = cpu->type; disk->objectid = cpu_to_le64(cpu->objectid); } -static inline u64 btrfs_disk_key_objectid(struct btrfs_disk_key *disk) -{ - return le64_to_cpu(disk->objectid); -} - -static inline void btrfs_set_disk_key_objectid(struct btrfs_disk_key *disk, - u64 val) -{ - disk->objectid = cpu_to_le64(val); -} - -static inline u64 btrfs_disk_key_offset(struct btrfs_disk_key *disk) -{ - return le64_to_cpu(disk->offset); -} - -static inline void btrfs_set_disk_key_offset(struct btrfs_disk_key *disk, - u64 val) -{ - disk->offset = 
cpu_to_le64(val); -} - -static inline u32 btrfs_disk_key_flags(struct btrfs_disk_key *disk) -{ - return le32_to_cpu(disk->flags); -} - -static inline void btrfs_set_disk_key_flags(struct btrfs_disk_key *disk, - u32 val) -{ - disk->flags = cpu_to_le32(val); -} - -static inline u32 btrfs_disk_key_type(struct btrfs_disk_key *key) -{ - return le32_to_cpu(key->flags) >> BTRFS_KEY_TYPE_SHIFT; -} - -static inline void btrfs_set_disk_key_type(struct btrfs_disk_key *key, - u32 val) -{ - u32 flags = btrfs_disk_key_flags(key); - BUG_ON(val >= BTRFS_KEY_TYPE_MAX); - val = val << BTRFS_KEY_TYPE_SHIFT; - flags = (flags & ~BTRFS_KEY_TYPE_MASK) | val; - btrfs_set_disk_key_flags(key, flags); -} - -static inline u32 btrfs_key_type(struct btrfs_key *key) -{ - return key->flags >> BTRFS_KEY_TYPE_SHIFT; -} - -static inline void btrfs_set_key_type(struct btrfs_key *key, u32 val) -{ - BUG_ON(val >= BTRFS_KEY_TYPE_MAX); - val = val << BTRFS_KEY_TYPE_SHIFT; - key->flags = (key->flags & ~(BTRFS_KEY_TYPE_MASK)) | val; -} - -static inline u64 btrfs_header_blocknr(struct btrfs_header *h) -{ - return le64_to_cpu(h->blocknr); -} - -static inline void btrfs_set_header_blocknr(struct btrfs_header *h, u64 blocknr) -{ - h->blocknr = cpu_to_le64(blocknr); -} - -static inline u64 btrfs_header_generation(struct btrfs_header *h) +static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) { - return le64_to_cpu(h->generation); + struct btrfs_disk_key disk_key; + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); } -static inline void btrfs_set_header_generation(struct btrfs_header *h, - u64 val) +static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) { - h->generation = cpu_to_le64(val); + struct btrfs_disk_key disk_key; + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); } -static inline u64 btrfs_header_owner(struct btrfs_header *h) +static inline void 
btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_key *key) { - return le64_to_cpu(h->owner); + struct btrfs_disk_key disk_key; + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); } -static inline void btrfs_set_header_owner(struct btrfs_header *h, - u64 val) -{ - h->owner = cpu_to_le64(val); -} - -static inline u16 btrfs_header_nritems(struct btrfs_header *h) -{ - return le16_to_cpu(h->nritems); -} - -static inline void btrfs_set_header_nritems(struct btrfs_header *h, u16 val) -{ - h->nritems = cpu_to_le16(val); -} - -static inline u16 btrfs_header_flags(struct btrfs_header *h) -{ - return le16_to_cpu(h->flags); -} - -static inline void btrfs_set_header_flags(struct btrfs_header *h, u16 val) -{ - h->flags = cpu_to_le16(val); -} - -static inline int btrfs_header_level(struct btrfs_header *h) -{ - return h->level; -} - -static inline void btrfs_set_header_level(struct btrfs_header *h, int level) -{ - BUG_ON(level > BTRFS_MAX_LEVEL); - h->level = level; -} - -static inline int btrfs_is_leaf(struct btrfs_node *n) -{ - return (btrfs_header_level(&n->header) == 0); -} - -static inline u64 btrfs_root_blocknr(struct btrfs_root_item *item) -{ - return le64_to_cpu(item->blocknr); -} - -static inline void btrfs_set_root_blocknr(struct btrfs_root_item *item, u64 val) -{ - item->blocknr = cpu_to_le64(val); -} - -static inline u64 btrfs_root_dirid(struct btrfs_root_item *item) -{ - return le64_to_cpu(item->root_dirid); -} - -static inline void btrfs_set_root_dirid(struct btrfs_root_item *item, u64 val) -{ - item->root_dirid = cpu_to_le64(val); -} - -static inline u32 btrfs_root_refs(struct btrfs_root_item *item) -{ - return le32_to_cpu(item->refs); -} - -static inline void btrfs_set_root_refs(struct btrfs_root_item *item, u32 val) -{ - item->refs = cpu_to_le32(val); -} - -static inline u32 btrfs_root_flags(struct btrfs_root_item *item) -{ - return le32_to_cpu(item->flags); -} - -static inline void 
btrfs_set_root_flags(struct btrfs_root_item *item, u32 val) -{ - item->flags = cpu_to_le32(val); -} - -static inline void btrfs_set_root_blocks_used(struct btrfs_root_item *item, - u64 val) -{ - item->blocks_used = cpu_to_le64(val); -} - -static inline u64 btrfs_root_blocks_used(struct btrfs_root_item *item) -{ - return le64_to_cpu(item->blocks_used); -} - -static inline void btrfs_set_root_block_limit(struct btrfs_root_item *item, - u64 val) -{ - item->block_limit = cpu_to_le64(val); -} - -static inline u64 btrfs_root_block_limit(struct btrfs_root_item *item) -{ - return le64_to_cpu(item->block_limit); -} -static inline u64 btrfs_super_blocknr(struct btrfs_super_block *s) +static inline u8 btrfs_key_type(struct btrfs_key *key) { - return le64_to_cpu(s->blocknr); + return key->type; } -static inline void btrfs_set_super_blocknr(struct btrfs_super_block *s, u64 val) +static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) { - s->blocknr = cpu_to_le64(val); + key->type = val; } -static inline u64 btrfs_super_generation(struct btrfs_super_block *s) -{ - return le64_to_cpu(s->generation); -} +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_blocknr, struct btrfs_header, blocknr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 16); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); -static inline void btrfs_set_super_generation(struct btrfs_super_block *s, - u64 val) +static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) { - s->generation = cpu_to_le64(val); + unsigned long ptr = offsetof(struct btrfs_header, fsid); + return (u8 *)ptr; } -static inline u64 btrfs_super_root(struct btrfs_super_block *s) +static inline u8 *btrfs_super_fsid(struct extent_buffer 
*eb) { - return le64_to_cpu(s->root); + unsigned long ptr = offsetof(struct btrfs_super_block, fsid); + return (u8 *)ptr; } -static inline void btrfs_set_super_root(struct btrfs_super_block *s, u64 val) +static inline u8 *btrfs_header_csum(struct extent_buffer *eb) { - s->root = cpu_to_le64(val); + unsigned long ptr = offsetof(struct btrfs_header, csum); + return (u8 *)ptr; } -static inline u64 btrfs_super_total_blocks(struct btrfs_super_block *s) +static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb) { - return le64_to_cpu(s->total_blocks); + return NULL; } -static inline void btrfs_set_super_total_blocks(struct btrfs_super_block *s, - u64 val) +static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb) { - s->total_blocks = cpu_to_le64(val); + return NULL; } -static inline u64 btrfs_super_blocks_used(struct btrfs_super_block *s) +static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb) { - return le64_to_cpu(s->blocks_used); + return NULL; } -static inline void btrfs_set_super_blocks_used(struct btrfs_super_block *s, - u64 val) +static inline int btrfs_is_leaf(struct extent_buffer *eb) { - s->blocks_used = cpu_to_le64(val); + return (btrfs_header_level(eb) == 0); } -static inline u32 btrfs_super_blocksize(struct btrfs_super_block *s) -{ - return le32_to_cpu(s->blocksize); -} +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_blocknr, struct btrfs_root_item, blocknr, 64); -static inline void btrfs_set_super_blocksize(struct btrfs_super_block *s, - u32 val) -{ - s->blocksize = cpu_to_le32(val); -} +BTRFS_SETGET_STACK_FUNCS(root_blocknr, struct btrfs_root_item, blocknr, 64); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 32); 
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, blocks_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, block_limit, 64); -static inline u64 btrfs_super_root_dir(struct btrfs_super_block *s) -{ - return le64_to_cpu(s->root_dir_objectid); -} +/* struct btrfs_super_block */ +BTRFS_SETGET_STACK_FUNCS(super_blocknr, struct btrfs_super_block, blocknr, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_total_blocks, struct btrfs_super_block, + total_blocks, 64); +BTRFS_SETGET_STACK_FUNCS(super_blocks_used, struct btrfs_super_block, + blocks_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, + leafsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); -static inline void btrfs_set_super_root_dir(struct btrfs_super_block *s, u64 - val) +static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) { - s->root_dir_objectid = cpu_to_le64(val); + return offsetof(struct btrfs_leaf, items); } -static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l) -{ - return (u8 *)l->items; -} +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); -static inline int btrfs_file_extent_type(struct btrfs_file_extent_item *e) -{ - return e->type; -} -static inline void btrfs_set_file_extent_type(struct btrfs_file_extent_item *e, - u8 val) -{ - e->type = val; -} - -static inline char *btrfs_file_extent_inline_start(struct +static inline unsigned long btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) { - return (char *)(&e->disk_blocknr); + unsigned long offset = (unsigned long)e; + 
offset += offsetof(struct btrfs_file_extent_item, disk_blocknr); + return offset; } static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) { - return (unsigned long)(&((struct - btrfs_file_extent_item *)NULL)->disk_blocknr) + datasize; -} - -static inline u32 btrfs_file_extent_inline_len(struct btrfs_item *e) -{ - struct btrfs_file_extent_item *fe = NULL; - return btrfs_item_size(e) - (unsigned long)(&fe->disk_blocknr); -} - -static inline u64 btrfs_file_extent_disk_blocknr(struct btrfs_file_extent_item - *e) -{ - return le64_to_cpu(e->disk_blocknr); + return offsetof(struct btrfs_file_extent_item, disk_blocknr) + datasize; } -static inline void btrfs_set_file_extent_disk_blocknr(struct - btrfs_file_extent_item - *e, u64 val) +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + struct btrfs_item *e) { - e->disk_blocknr = cpu_to_le64(val); + unsigned long offset; + offset = offsetof(struct btrfs_file_extent_item, disk_blocknr); + return btrfs_item_size(eb, e) - offset; } -static inline u64 btrfs_file_extent_generation(struct btrfs_file_extent_item *e) -{ - return le64_to_cpu(e->generation); -} - -static inline void btrfs_set_file_extent_generation(struct - btrfs_file_extent_item *e, - u64 val) -{ - e->generation = cpu_to_le64(val); -} - -static inline u64 btrfs_file_extent_disk_num_blocks(struct - btrfs_file_extent_item *e) -{ - return le64_to_cpu(e->disk_num_blocks); -} - -static inline void btrfs_set_file_extent_disk_num_blocks(struct - btrfs_file_extent_item - *e, u64 val) -{ - e->disk_num_blocks = cpu_to_le64(val); -} - -static inline u64 btrfs_file_extent_offset(struct btrfs_file_extent_item *e) -{ - return le64_to_cpu(e->offset); -} - -static inline void btrfs_set_file_extent_offset(struct btrfs_file_extent_item - *e, u64 val) -{ - e->offset = cpu_to_le64(val); -} - -static inline u64 btrfs_file_extent_num_blocks(struct btrfs_file_extent_item - *e) -{ - return le64_to_cpu(e->num_blocks); -} - -static inline void 
btrfs_set_file_extent_num_blocks(struct - btrfs_file_extent_item *e, - u64 val) -{ - e->num_blocks = cpu_to_le64(val); -} +BTRFS_SETGET_FUNCS(file_extent_disk_blocknr, struct btrfs_file_extent_item, + disk_blocknr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_blocks, struct btrfs_file_extent_item, + disk_num_blocks, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_blocks, struct btrfs_file_extent_item, + num_blocks, 64); static inline struct btrfs_root *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } -static inline void btrfs_check_bounds(void *vptr, size_t len, - void *vcontainer, size_t container_len) -{ - char *ptr = vptr; - char *container = vcontainer; - WARN_ON(ptr < container); - WARN_ON(ptr + len > container + container_len); -} - -static inline void btrfs_memcpy(struct btrfs_root *root, - void *dst_block, - void *dst, const void *src, size_t nr) -{ - btrfs_check_bounds(dst, nr, dst_block, root->fs_info->sb->s_blocksize); - memcpy(dst, src, nr); -} - -static inline void btrfs_memmove(struct btrfs_root *root, - void *dst_block, - void *dst, void *src, size_t nr) -{ - btrfs_check_bounds(dst, nr, dst_block, root->fs_info->sb->s_blocksize); - memmove(dst, src, nr); -} - static inline int btrfs_set_root_name(struct btrfs_root *root, const char *name, int len) { @@ -1063,7 +842,11 @@ static inline int btrfs_set_root_name(struct btrfs_root *root, /* helper function to cast into the data area of the leaf. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_leaf_data(leaf) + \ - btrfs_item_offset((leaf)->items + (slot)))) + btrfs_item_offset_nr(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) /* mount option defines and helpers */ #define BTRFS_MOUNT_SUBVOL 0x000001 @@ -1084,7 +867,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, int data, int owner); int btrfs_inc_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root); -struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 hint, u64 empty_size); int btrfs_alloc_extent(struct btrfs_trans_handle *trans, @@ -1092,7 +875,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, u64 num_blocks, u64 empty_size, u64 search_start, u64 search_end, struct btrfs_key *ins, int data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct buffer_head *buf); + struct extent_buffer *buf); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 blocknr, u64 num_blocks, int pin); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, @@ -1106,10 +889,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_root *root); /* ctree.c */ -int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct buffer_head *buf, struct buffer_head - *parent, int parent_slot, struct buffer_head - **cow_ret); +int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret); int btrfs_extend_item(struct btrfs_trans_handle *trans, struct 
btrfs_root *root, struct btrfs_path *path, u32 data_size); int btrfs_truncate_item(struct btrfs_trans_handle *trans, @@ -1120,7 +903,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct buffer_head *parent, + struct btrfs_root *root, struct extent_buffer *parent, int cache_only, u64 *last_ret); void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); @@ -1134,7 +917,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *cpu_key, u32 data_size); int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); -int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf); +int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* root-item.c */ @@ -1179,9 +962,9 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid); /* inode-item.c */ -int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, u64 objectid, struct btrfs_inode_item - *inode_item); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); @@ -1224,8 +1007,6 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, struct btrfs_root *root); int btrfs_commit_write(struct file *file, struct page *page, unsigned from, unsigned to); -int btrfs_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *result, 
int create); struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, size_t page_offset, u64 start, u64 end, int create); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 49db5fa7ced3..6f19de41b878 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -33,7 +33,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle int ret; char *ptr; struct btrfs_item *item; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { @@ -49,11 +49,11 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); - leaf = btrfs_buffer_leaf(path->nodes[0]); - item = leaf->items + path->slots[0]; + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); ptr = btrfs_item_ptr(leaf, path->slots[0], char); - BUG_ON(data_size > btrfs_item_size(item)); - ptr += btrfs_item_size(item) - data_size; + BUG_ON(data_size > btrfs_item_size(leaf, item)); + ptr += btrfs_item_size(leaf, item) - data_size; return (struct btrfs_dir_item *)ptr; } @@ -65,12 +65,13 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root int ret2 = 0; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - char *name_ptr; + struct extent_buffer *leaf; + unsigned long name_ptr; struct btrfs_key key; + struct btrfs_disk_key disk_key; u32 data_size; key.objectid = dir; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); ret = btrfs_name_hash(name, name_len, &key.offset); BUG_ON(ret); @@ -85,14 +86,16 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - btrfs_cpu_key_to_disk(&dir_item->location, location); - btrfs_set_dir_type(dir_item, type); - btrfs_set_dir_flags(dir_item, 0); - btrfs_set_dir_name_len(dir_item, name_len); - name_ptr = (char *)(dir_item + 1); + leaf = path->nodes[0]; + 
btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_flags(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + name_ptr = (unsigned long)(dir_item + 1); - btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len); - btrfs_mark_buffer_dirty(path->nodes[0]); + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ @@ -110,13 +113,15 @@ second_insert: ret2 = PTR_ERR(dir_item); goto out; } - btrfs_cpu_key_to_disk(&dir_item->location, location); - btrfs_set_dir_type(dir_item, type); - btrfs_set_dir_flags(dir_item, 0); - btrfs_set_dir_name_len(dir_item, name_len); - name_ptr = (char *)(dir_item + 1); - btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len); - btrfs_mark_buffer_dirty(path->nodes[0]); + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_flags(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + name_ptr = (unsigned long)(dir_item + 1); + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); if (ret) @@ -136,14 +141,15 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_key key; int ins_len = mod < 0 ? 
-1 : 0; int cow = mod != 0; - struct btrfs_disk_key *found_key; - struct btrfs_leaf *leaf; + struct btrfs_key found_key; + struct extent_buffer *leaf; key.objectid = dir; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + ret = btrfs_name_hash(name, name_len, &key.offset); BUG_ON(ret); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); @@ -152,12 +158,13 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, return NULL; path->slots[0]--; } - leaf = btrfs_buffer_leaf(path->nodes[0]); - found_key = &leaf->items[path->slots[0]].key; - if (btrfs_disk_key_objectid(found_key) != dir || - btrfs_disk_key_type(found_key) != BTRFS_DIR_ITEM_KEY || - btrfs_disk_key_offset(found_key) != key.offset) + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || + found_key.offset != key.offset) return NULL; return btrfs_match_dir_item_name(root, path, name, name_len); @@ -176,7 +183,6 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, int cow = mod != 0; key.objectid = dir; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); key.offset = objectid; @@ -193,21 +199,22 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, const char *name, int name_len) { struct btrfs_dir_item *dir_item; - char *name_ptr; + unsigned long name_ptr; u32 total_len; u32 cur = 0; u32 this_len; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; - leaf = btrfs_buffer_leaf(path->nodes[0]); + leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); - total_len = btrfs_item_size(leaf->items + path->slots[0]); + total_len = btrfs_item_size_nr(leaf, path->slots[0]); while(cur < total_len) { - this_len = sizeof(*dir_item) + btrfs_dir_name_len(dir_item); - name_ptr = (char *)(dir_item + 1); + this_len = sizeof(*dir_item) 
+ + btrfs_dir_name_len(leaf, dir_item); + name_ptr = (unsigned long)(dir_item + 1); - if (btrfs_dir_name_len(dir_item) == name_len && - memcmp(name_ptr, name, name_len) == 0) + if (btrfs_dir_name_len(leaf, dir_item) == name_len && + memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; cur += this_len; @@ -223,20 +230,23 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di) { - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; u32 sub_item_len; u32 item_len; int ret = 0; - leaf = btrfs_buffer_leaf(path->nodes[0]); - sub_item_len = sizeof(*di) + btrfs_dir_name_len(di); - item_len = btrfs_item_size(leaf->items + path->slots[0]); - if (sub_item_len == btrfs_item_size(leaf->items + path->slots[0])) { + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di); + item_len = btrfs_item_size_nr(leaf, path->slots[0]); + if (sub_item_len == item_len) { ret = btrfs_del_item(trans, root, path); } else { - char *ptr = (char *)di; - char *start = btrfs_item_ptr(leaf, path->slots[0], char); - btrfs_memmove(root, leaf, ptr, ptr + sub_item_len, + /* MARKER */ + unsigned long ptr = (unsigned long)di; + unsigned long start; + + start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); ret = btrfs_truncate_item(trans, root, path, item_len - sub_item_len); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ef6d76f2ec20..0c1f90cbedb0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -23,138 +23,132 @@ #include #include #include +#include // for block_sync_page #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -u64 bh_blocknr(struct buffer_head *bh) +#if 0 +static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) { - return bh->b_blocknr; -} - -static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf) -{ - 
struct btrfs_node *node = btrfs_buffer_node(buf); - if (bh_blocknr(buf) != btrfs_header_blocknr(&node->header)) { - printk(KERN_CRIT "bh_blocknr(buf) is %llu, header is %llu\n", - (unsigned long long)bh_blocknr(buf), - (unsigned long long)btrfs_header_blocknr(&node->header)); + if (extent_buffer_blocknr(buf) != btrfs_header_blocknr(buf)) { + printk(KERN_CRIT "buf blocknr(buf) is %llu, header is %llu\n", + (unsigned long long)extent_buffer_blocknr(buf), + (unsigned long long)btrfs_header_blocknr(buf)); return 1; } return 0; } +#endif -struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr) +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 blocknr) { - struct address_space *mapping = root->fs_info->btree_inode->i_mapping; - int blockbits = root->fs_info->sb->s_blocksize_bits; - unsigned long index = blocknr >> (PAGE_CACHE_SHIFT - blockbits); - struct page *page; - struct buffer_head *bh; - struct buffer_head *head; - struct buffer_head *ret = NULL; - - - page = find_lock_page(mapping, index); - if (!page) - return NULL; - - if (!page_has_buffers(page)) - goto out_unlock; + struct inode *btree_inode = root->fs_info->btree_inode; + return find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree, + blocknr * root->sectorsize, + root->sectorsize, GFP_NOFS); +} - head = page_buffers(page); - bh = head; - do { - if (buffer_mapped(bh) && bh_blocknr(bh) == blocknr) { - ret = bh; - get_bh(bh); - goto out_unlock; - } - bh = bh->b_this_page; - } while (bh != head); -out_unlock: - unlock_page(page); - page_cache_release(page); - return ret; +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 blocknr) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + return alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree, + blocknr * root->sectorsize, + root->sectorsize, GFP_NOFS); } -int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh, - u64 logical) +struct extent_map 
*btree_get_extent(struct inode *inode, struct page *page, + size_t page_offset, u64 start, u64 end, + int create) { - if (logical == 0) { - bh->b_bdev = NULL; - bh->b_blocknr = 0; - set_buffer_mapped(bh); - } else { - map_bh(bh, root->fs_info->sb, logical); + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + +again: + em = lookup_extent_mapping(em_tree, start, end); + if (em) { + goto out; } - return 0; + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + em->start = 0; + em->end = (i_size_read(inode) & ~((u64)PAGE_CACHE_SIZE -1)) - 1; + em->block_start = 0; + em->block_end = em->end; + em->bdev = inode->i_sb->s_bdev; + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + free_extent_map(em); + em = NULL; + goto again; + } else if (ret) { + em = ERR_PTR(ret); + } +out: + return em; } -struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 blocknr) +static int btree_writepage(struct page *page, struct writeback_control *wbc) { - struct address_space *mapping = root->fs_info->btree_inode->i_mapping; - int blockbits = root->fs_info->sb->s_blocksize_bits; - unsigned long index = blocknr >> (PAGE_CACHE_SHIFT - blockbits); - struct page *page; - struct buffer_head *bh; - struct buffer_head *head; - struct buffer_head *ret = NULL; - int err; - u64 first_block = index << (PAGE_CACHE_SHIFT - blockbits); + struct extent_map_tree *tree; + tree = &BTRFS_I(page->mapping->host)->extent_tree; + return extent_write_full_page(tree, page, btree_get_extent, wbc); +} +int btree_readpage(struct file *file, struct page *page) +{ + struct extent_map_tree *tree; + tree = &BTRFS_I(page->mapping->host)->extent_tree; + return extent_read_full_page(tree, page, btree_get_extent); +} - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) - return NULL; +static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + struct extent_map_tree 
*tree; + int ret; - if (!page_has_buffers(page)) - create_empty_buffers(page, root->fs_info->sb->s_blocksize, 0); - head = page_buffers(page); - bh = head; - do { - if (!buffer_mapped(bh)) { - err = btrfs_map_bh_to_logical(root, bh, first_block); - BUG_ON(err); - } - if (bh_blocknr(bh) == blocknr) { - ret = bh; - get_bh(bh); - goto out_unlock; - } - bh = bh->b_this_page; - first_block++; - } while (bh != head); -out_unlock: - unlock_page(page); - if (ret) - touch_buffer(ret); - page_cache_release(page); + BUG_ON(page->private != 1); + tree = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(tree, page); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } return ret; } -static int btree_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) +static void btree_invalidatepage(struct page *page, unsigned long offset) { - int err; - struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root; - err = btrfs_map_bh_to_logical(root, bh, iblock); - return err; + struct extent_map_tree *tree; + tree = &BTRFS_I(page->mapping->host)->extent_tree; + extent_invalidatepage(tree, page, offset); + btree_releasepage(page, GFP_NOFS); } int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len, char *result) { + return 0; +#if 0 u32 crc; crc = crc32c(0, data, len); memcpy(result, &crc, BTRFS_CRC32_SIZE); return 0; +#endif } -static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh, +#if 0 +static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { + return 0; char result[BTRFS_CRC32_SIZE]; int ret; struct btrfs_node *node; @@ -176,7 +170,9 @@ static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh, } return 0; } +#endif +#if 0 static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct buffer_head *bh; @@ -195,87 +191,65 @@ static int btree_writepage(struct 
page *page, struct writeback_control *wbc) } while (bh != head); return block_write_full_page(page, btree_get_block, wbc); } - -static int btree_readpage(struct file * file, struct page * page) -{ - return block_read_full_page(page, btree_get_block); -} +#endif static struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepage = btree_writepage, + .releasepage = btree_releasepage, + .invalidatepage = btree_invalidatepage, .sync_page = block_sync_page, }; int readahead_tree_block(struct btrfs_root *root, u64 blocknr) { - struct buffer_head *bh = NULL; + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; int ret = 0; - bh = btrfs_find_create_tree_block(root, blocknr); - if (!bh) + buf = btrfs_find_create_tree_block(root, blocknr); + if (!buf) return 0; - if (buffer_uptodate(bh)) { - ret = 1; - goto done; - } - if (test_set_buffer_locked(bh)) { - ret = 1; - goto done; - } - if (!buffer_uptodate(bh)) { - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); - } else { - unlock_buffer(bh); - ret = 1; - } -done: - brelse(bh); + read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree, + buf, 0); + free_extent_buffer(buf); return ret; } -struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr) +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr) { - struct buffer_head *bh = NULL; - - bh = btrfs_find_create_tree_block(root, blocknr); - if (!bh) - return bh; - if (buffer_uptodate(bh)) - goto uptodate; - lock_buffer(bh); - if (!buffer_uptodate(bh)) { - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - goto fail; - } else { - unlock_buffer(bh); - } -uptodate: - if (!buffer_checked(bh)) { - csum_tree_block(root, bh, 1); - set_buffer_checked(bh); - } - if (check_tree_block(root, bh)) - goto fail; - return bh; -fail: - brelse(bh); - return NULL; + struct extent_buffer *buf = 
NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + + buf = btrfs_find_create_tree_block(root, blocknr); + if (!buf) + return NULL; + read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree, + buf, 1); + return buf; } int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct buffer_head *buf) + struct extent_buffer *buf) { - WARN_ON(atomic_read(&buf->b_count) == 0); - lock_buffer(buf); - clear_buffer_dirty(buf); - unlock_buffer(buf); + struct inode *btree_inode = root->fs_info->btree_inode; + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf); + return 0; +} + +int wait_on_tree_block_writeback(struct btrfs_root *root, + struct extent_buffer *buf) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->extent_tree, + buf); + return 0; +} + +int set_tree_block_dirty(struct btrfs_root *root, struct extent_buffer *buf) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf); return 0; } @@ -287,7 +261,9 @@ static int __setup_root(int blocksize, root->node = NULL; root->inode = NULL; root->commit_root = NULL; - root->blocksize = blocksize; + root->sectorsize = blocksize; + root->nodesize = blocksize; + root->leafsize = blocksize; root->ref_cows = 0; root->fs_info = fs_info; root->objectid = objectid; @@ -332,7 +308,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info, struct btrfs_root *root; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_path *path; - struct btrfs_leaf *l; + struct extent_buffer *l; u64 highest_inode; int ret = 0; @@ -361,11 +337,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info, ret = -ENOENT; goto out; } - l = btrfs_buffer_leaf(path->nodes[0]); - memcpy(&root->root_item, - btrfs_item_ptr(l, path->slots[0], struct btrfs_root_item), + l = path->nodes[0]; + read_extent_buffer(l, 
&root->root_item, + btrfs_item_ptr_offset(l, path->slots[0]), sizeof(root->root_item)); - memcpy(&root->root_key, location, sizeof(*location)); ret = 0; out: btrfs_release_path(root, path); @@ -406,21 +381,21 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, (unsigned long)root->root_key.objectid, root); if (ret) { - brelse(root->node); + free_extent_buffer(root->node); kfree(root); return ERR_PTR(ret); } ret = btrfs_set_root_name(root, name, namelen); if (ret) { - brelse(root->node); + free_extent_buffer(root->node); kfree(root); return ERR_PTR(ret); } ret = btrfs_sysfs_add_root(root); if (ret) { - brelse(root->node); + free_extent_buffer(root->node); kfree(root->name); kfree(root); return ERR_PTR(ret); @@ -471,6 +446,9 @@ struct btrfs_root *open_ctree(struct super_block *sb) fs_info->btree_inode->i_nlink = 1; fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size; fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, + fs_info->btree_inode->i_mapping, + GFP_NOFS); fs_info->do_barriers = 1; fs_info->closing = 0; @@ -493,10 +471,14 @@ struct btrfs_root *open_ctree(struct super_block *sb) if (!fs_info->sb_buffer) goto fail_iput; - disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data; - fs_info->disk_super = disk_super; - memcpy(&fs_info->super_copy, disk_super, sizeof(fs_info->super_copy)); + read_extent_buffer(fs_info->sb_buffer, &fs_info->super_copy, 0, + sizeof(fs_info->super_copy)); + + read_extent_buffer(fs_info->sb_buffer, fs_info->fsid, + (unsigned long)btrfs_super_fsid(fs_info->sb_buffer), + BTRFS_FSID_SIZE); + disk_super = &fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_sb_buffer; @@ -530,9 +512,9 @@ struct btrfs_root *open_ctree(struct super_block *sb) return tree_root; fail_tree_root: - btrfs_block_release(tree_root, tree_root->node); + free_extent_buffer(tree_root->node); fail_sb_buffer: - btrfs_block_release(tree_root, 
fs_info->sb_buffer); + free_extent_buffer(fs_info->sb_buffer); fail_iput: iput(fs_info->btree_inode); fail: @@ -546,31 +528,13 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - struct buffer_head *bh = root->fs_info->sb_buffer; - - lock_buffer(bh); - WARN_ON(atomic_read(&bh->b_count) < 1); - clear_buffer_dirty(bh); - csum_tree_block(root, bh, 0); - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - if (root->fs_info->do_barriers) - ret = submit_bh(WRITE_BARRIER, bh); - else - ret = submit_bh(WRITE, bh); - if (ret == -EOPNOTSUPP) { - get_bh(bh); - lock_buffer(bh); - set_buffer_uptodate(bh); - root->fs_info->do_barriers = 0; - ret = submit_bh(WRITE, bh); - } - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - WARN_ON(1); - return -EIO; - } - return 0; + struct extent_buffer *super = root->fs_info->sb_buffer; + struct inode *btree_inode = root->fs_info->btree_inode; + + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, super); + ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping, + super->start, super->len); + return ret; } int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) @@ -581,9 +545,9 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (root->inode) iput(root->inode); if (root->node) - brelse(root->node); + free_extent_buffer(root->node); if (root->commit_root) - brelse(root->commit_root); + free_extent_buffer(root->commit_root); if (root->name) kfree(root->name); kfree(root); @@ -629,12 +593,10 @@ int close_ctree(struct btrfs_root *root) mutex_unlock(&fs_info->fs_mutex); if (fs_info->extent_root->node) - btrfs_block_release(fs_info->extent_root, - fs_info->extent_root->node); + free_extent_buffer(fs_info->extent_root->node); if (fs_info->tree_root->node) - btrfs_block_release(fs_info->tree_root, - fs_info->tree_root->node); - btrfs_block_release(root, fs_info->sb_buffer); + free_extent_buffer(fs_info->tree_root->node); + 
free_extent_buffer(fs_info->sb_buffer); truncate_inode_pages(fs_info->btree_inode->i_mapping, 0); iput(fs_info->btree_inode); @@ -645,25 +607,32 @@ int close_ctree(struct btrfs_root *root) return 0; } -void btrfs_mark_buffer_dirty(struct buffer_head *bh) +int btrfs_buffer_uptodate(struct extent_buffer *buf) +{ + struct inode *btree_inode = buf->pages[0]->mapping->host; + return extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, buf); +} + +int btrfs_set_buffer_uptodate(struct extent_buffer *buf) { - struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root; - u64 transid = btrfs_header_generation(btrfs_buffer_header(bh)); + struct inode *btree_inode = buf->pages[0]->mapping->host; + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, + buf); +} - WARN_ON(!atomic_read(&bh->b_count)); +void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +{ + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; + u64 transid = btrfs_header_generation(buf); + struct inode *btree_inode = root->fs_info->btree_inode; if (transid != root->fs_info->generation) { printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n", - (unsigned long long)bh->b_blocknr, + (unsigned long long)extent_buffer_blocknr(buf), transid, root->fs_info->generation); WARN_ON(1); } - mark_buffer_dirty(bh); -} - -void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf) -{ - brelse(buf); + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf); } void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index fc6b520c6e0b..70d9413c599f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -19,68 +19,35 @@ #ifndef __DISKIO__ #define __DISKIO__ -#include - #define BTRFS_SUPER_INFO_OFFSET (16 * 1024) -enum btrfs_bh_state_bits { - BH_Checked = BH_PrivateStart, - BH_Defrag, - BH_DefragDone, -}; -BUFFER_FNS(Checked, checked); -BUFFER_FNS(Defrag, 
defrag); -BUFFER_FNS(DefragDone, defrag_done); - -static inline struct btrfs_node *btrfs_buffer_node(struct buffer_head *bh) -{ - return (struct btrfs_node *)bh->b_data; -} - -static inline struct btrfs_leaf *btrfs_buffer_leaf(struct buffer_head *bh) -{ - return (struct btrfs_leaf *)bh->b_data; -} - -static inline struct btrfs_header *btrfs_buffer_header(struct buffer_head *bh) -{ - return &((struct btrfs_node *)bh->b_data)->header; -} - -struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr); +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr); int readahead_tree_block(struct btrfs_root *root, u64 blocknr); -struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 blocknr); -int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct buffer_head *buf); -int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct buffer_head *buf); +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 blocknr); int clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct buffer_head *buf); -int btrfs_commit_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_root *root, struct extent_buffer *buf); struct btrfs_root *open_ctree(struct super_block *sb); int close_ctree(struct btrfs_root *root); -void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root); -struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr); -int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len, - char *result); +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 blocknr); struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *location, const char *name, int namelen); struct btrfs_root 
*btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info, struct btrfs_key *location); -u64 bh_blocknr(struct buffer_head *bh); int btrfs_insert_dev_radix(struct btrfs_root *root, struct block_device *bdev, u64 device_id, u64 block_start, u64 num_blocks); -int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh, - u64 logical); void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -void btrfs_mark_buffer_dirty(struct buffer_head *bh); +void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +int btrfs_buffer_uptodate(struct extent_buffer *buf); +int btrfs_set_buffer_uptodate(struct extent_buffer *buf); +int wait_on_tree_block_writeback(struct btrfs_root *root, + struct extent_buffer *buf); #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f261a8326cdf..089c41cbca74 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,7 +33,7 @@ static int cache_block_group(struct btrfs_root *root, struct btrfs_path *path; int ret; struct btrfs_key key; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; struct radix_tree_root *extent_radix; int slot; u64 i; @@ -56,7 +56,6 @@ static int cache_block_group(struct btrfs_root *root, path->reada = 2; first_free = block_group->key.objectid; key.objectid = block_group->key.objectid; - key.flags = 0; key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); @@ -69,9 +68,9 @@ static int cache_block_group(struct btrfs_root *root, path->slots[0]--; while(1) { - leaf = btrfs_buffer_leaf(path->nodes[0]); + leaf = path->nodes[0]; slot = path->slots[0]; - if (slot >= btrfs_header_nritems(&leaf->header)) { + if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto err; @@ -82,7 +81,7 @@ static int cache_block_group(struct btrfs_root *root, } } - btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key); + btrfs_item_key_to_cpu(leaf, &key, 
slot); if (key.objectid < block_group->key.objectid) { if (key.objectid + key.offset > first_free) first_free = key.objectid + key.offset; @@ -116,8 +115,7 @@ next: hole_size = block_group->key.objectid + block_group->key.offset - last; for (i = 0; i < hole_size; i++) { - set_radix_bit(extent_radix, - last + i); + set_radix_bit(extent_radix, last + i); } } block_group->cached = 1; @@ -366,7 +364,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path; int ret; struct btrfs_key key; - struct btrfs_leaf *l; + struct extent_buffer *l; struct btrfs_extent_item *item; u32 refs; @@ -375,7 +373,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = blocknr; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); key.offset = num_blocks; ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, @@ -386,10 +383,10 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, BUG(); } BUG_ON(ret != 0); - l = btrfs_buffer_leaf(path->nodes[0]); + l = path->nodes[0]; item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(item); - btrfs_set_extent_refs(item, refs + 1); + refs = btrfs_extent_refs(l, item); + btrfs_set_extent_refs(l, item, refs + 1); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(root->fs_info->extent_root, path); @@ -414,23 +411,25 @@ static int lookup_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path; int ret; struct btrfs_key key; - struct btrfs_leaf *l; + struct extent_buffer *l; struct btrfs_extent_item *item; path = btrfs_alloc_path(); key.objectid = blocknr; key.offset = num_blocks; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out; - if (ret != 0) + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk("failed to find block number %Lu\n", blocknr); BUG(); - l = 
btrfs_buffer_leaf(path->nodes[0]); + } + l = path->nodes[0]; item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); - *refs = btrfs_extent_refs(item); + *refs = btrfs_extent_refs(l, item); out: btrfs_free_path(path); return 0; @@ -439,16 +438,16 @@ out: int btrfs_inc_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - return btrfs_inc_extent_ref(trans, root, bh_blocknr(root->node), 1); + return btrfs_inc_extent_ref(trans, root, + extent_buffer_blocknr(root->node), 1); } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct buffer_head *buf) + struct extent_buffer *buf) { u64 blocknr; - struct btrfs_node *buf_node; - struct btrfs_leaf *buf_leaf; - struct btrfs_disk_key *key; + u32 nritems; + struct btrfs_key key; struct btrfs_file_extent_item *fi; int i; int leaf; @@ -458,31 +457,31 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (!root->ref_cows) return 0; - buf_node = btrfs_buffer_node(buf); - leaf = btrfs_is_leaf(buf_node); - buf_leaf = btrfs_buffer_leaf(buf); - for (i = 0; i < btrfs_header_nritems(&buf_node->header); i++) { + + leaf = btrfs_is_leaf(buf); + nritems = btrfs_header_nritems(buf); + for (i = 0; i < nritems; i++) { if (leaf) { u64 disk_blocknr; - key = &buf_leaf->items[i].key; - if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY) + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) continue; - fi = btrfs_item_ptr(buf_leaf, i, + fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(fi) == + if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; - disk_blocknr = btrfs_file_extent_disk_blocknr(fi); + disk_blocknr = btrfs_file_extent_disk_blocknr(buf, fi); if (disk_blocknr == 0) continue; ret = btrfs_inc_extent_ref(trans, root, disk_blocknr, - btrfs_file_extent_disk_num_blocks(fi)); + btrfs_file_extent_disk_num_blocks(buf, fi)); if (ret) { faili = i; goto fail; 
} } else { - blocknr = btrfs_node_blockptr(buf_node, i); + blocknr = btrfs_node_blockptr(buf, i); ret = btrfs_inc_extent_ref(trans, root, blocknr, 1); if (ret) { faili = i; @@ -496,22 +495,23 @@ fail: for (i =0; i < faili; i++) { if (leaf) { u64 disk_blocknr; - key = &buf_leaf->items[i].key; - if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY) + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) continue; - fi = btrfs_item_ptr(buf_leaf, i, + fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(fi) == + if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; - disk_blocknr = btrfs_file_extent_disk_blocknr(fi); + disk_blocknr = btrfs_file_extent_disk_blocknr(buf, fi); if (disk_blocknr == 0) continue; err = btrfs_free_extent(trans, root, disk_blocknr, - btrfs_file_extent_disk_num_blocks(fi), 0); + btrfs_file_extent_disk_num_blocks(buf, + fi), 0); BUG_ON(err); } else { - blocknr = btrfs_node_blockptr(buf_node, i); + blocknr = btrfs_node_blockptr(buf, i); err = btrfs_free_extent(trans, root, blocknr, 1, 0); BUG_ON(err); } @@ -527,16 +527,18 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, int ret; int pending_ret; struct btrfs_root *extent_root = root->fs_info->extent_root; - struct btrfs_block_group_item *bi; + unsigned long bi; + struct extent_buffer *leaf; ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); if (ret < 0) goto fail; BUG_ON(ret); - bi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0], - struct btrfs_block_group_item); - memcpy(bi, &cache->item, sizeof(*bi)); - btrfs_mark_buffer_dirty(path->nodes[0]); + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); btrfs_release_path(extent_root, path); fail: finish_current_insert(trans, extent_root); @@ -768,11 +770,11 @@ static int 
finish_current_insert(struct btrfs_trans_handle *trans, struct unsigned long gang[8]; struct btrfs_fs_info *info = extent_root->fs_info; - btrfs_set_extent_refs(&extent_item, 1); + btrfs_set_stack_extent_refs(&extent_item, 1); ins.offset = 1; - ins.flags = 0; btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY); - btrfs_set_extent_owner(&extent_item, extent_root->root_key.objectid); + btrfs_set_stack_extent_owner(&extent_item, + extent_root->root_key.objectid); while(1) { ret = find_first_radix_bit(&info->extent_ins_radix, gang, 0, @@ -795,23 +797,20 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending) { int err; - struct btrfs_header *header; - struct buffer_head *bh; + struct extent_buffer *buf; if (!pending) { - bh = btrfs_find_tree_block(root, blocknr); - if (bh) { - if (buffer_uptodate(bh)) { + buf = btrfs_find_tree_block(root, blocknr); + if (buf) { + if (btrfs_buffer_uptodate(buf)) { u64 transid = root->fs_info->running_transaction->transid; - header = btrfs_buffer_header(bh); - if (btrfs_header_generation(header) == - transid) { - btrfs_block_release(root, bh); + if (btrfs_header_generation(buf) == transid) { + free_extent_buffer(buf); return 0; } } - btrfs_block_release(root, bh); + free_extent_buffer(buf); } err = set_radix_bit(&root->fs_info->pinned_radix, blocknr); if (!err) { @@ -839,12 +838,12 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_key key; struct btrfs_fs_info *info = root->fs_info; struct btrfs_root *extent_root = info->extent_root; + struct extent_buffer *leaf; int ret; struct btrfs_extent_item *ei; u32 refs; key.objectid = blocknr; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); key.offset = num_blocks; @@ -856,12 +855,16 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root if (ret < 0) return ret; BUG_ON(ret); - ei = 
btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0], + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - BUG_ON(ei->refs == 0); - refs = btrfs_extent_refs(ei) - 1; - btrfs_set_extent_refs(ei, refs); - btrfs_mark_buffer_dirty(path->nodes[0]); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs -= 1; + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); + if (refs == 0) { u64 super_blocks_used, root_blocks_used; @@ -876,8 +879,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root super_blocks_used - num_blocks); /* block accounting for root item */ - root_blocks_used = btrfs_root_blocks_used(&root->root_item); - btrfs_set_root_blocks_used(&root->root_item, + root_blocks_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, root_blocks_used - num_blocks); ret = btrfs_del_item(trans, extent_root, path); @@ -984,7 +987,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root u64 test_block; u64 orig_search_start = search_start; int start_found; - struct btrfs_leaf *l; + struct extent_buffer *l; struct btrfs_root * root = orig_root->fs_info->extent_root; struct btrfs_fs_info *info = root->fs_info; int total_needed = num_blocks; @@ -994,10 +997,10 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root int wrapped = 0; WARN_ON(num_blocks < 1); - ins->flags = 0; btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); - level = btrfs_header_level(btrfs_buffer_header(root->node)); + level = btrfs_header_level(root->node); + if (search_end == (u64)-1) search_end = btrfs_super_total_blocks(&info->super_copy); if (hint_block) { @@ -1034,8 +1037,9 @@ check_failed: path->slots[0]--; } - l = btrfs_buffer_leaf(path->nodes[0]); - btrfs_disk_key_to_cpu(&key, &l->items[path->slots[0]].key); + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + /* * a rare case, go back 
one key if we hit a block group item * instead of an extent item @@ -1055,9 +1059,9 @@ check_failed: } while (1) { - l = btrfs_buffer_leaf(path->nodes[0]); + l = path->nodes[0]; slot = path->slots[0]; - if (slot >= btrfs_header_nritems(&l->header)) { + if (slot >= btrfs_header_nritems(l)) { ret = btrfs_next_leaf(root, path); if (ret == 0) continue; @@ -1075,7 +1079,7 @@ check_failed: goto check_pending; } - btrfs_disk_key_to_cpu(&key, &l->items[slot].key); + btrfs_item_key_to_cpu(l, &key, slot); if (key.objectid >= search_start && key.objectid > last_block && start_found) { if (last_block < search_start) @@ -1183,8 +1187,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root = info->extent_root; struct btrfs_extent_item extent_item; - btrfs_set_extent_refs(&extent_item, 1); - btrfs_set_extent_owner(&extent_item, owner); + btrfs_set_stack_extent_refs(&extent_item, 1); + btrfs_set_stack_extent_owner(&extent_item, owner); WARN_ON(num_blocks < 1); ret = find_free_extent(trans, root, num_blocks, empty_size, @@ -1201,8 +1205,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, num_blocks); /* block accounting for root item */ - root_blocks_used = btrfs_root_blocks_used(&root->root_item); - btrfs_set_root_blocks_used(&root->root_item, root_blocks_used + + root_blocks_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, root_blocks_used + num_blocks); if (root == extent_root) { @@ -1241,13 +1245,13 @@ update_block: * helper function to allocate a block for a given tree * returns the tree buffer or NULL. 
*/ -struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 hint, - u64 empty_size) +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 hint, + u64 empty_size) { struct btrfs_key ins; int ret; - struct buffer_head *buf; + struct extent_buffer *buf; ret = btrfs_alloc_extent(trans, root, root->root_key.objectid, 1, empty_size, hint, (u64)-1, &ins, 0); @@ -1260,53 +1264,57 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, btrfs_free_extent(trans, root, ins.objectid, 1, 0); return ERR_PTR(-ENOMEM); } - WARN_ON(buffer_dirty(buf)); - set_buffer_uptodate(buf); + btrfs_set_buffer_uptodate(buf); + set_extent_dirty(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + /* set_buffer_checked(buf); set_buffer_defrag(buf); - set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index); + */ + /* FIXME!!!!!!!!!!!!!!!! + set_radix_bit(&trans->transaction->dirty_pages, buf->pages[0]->index); + */ trans->blocks_used++; return buf; } static int drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct buffer_head *cur) + struct btrfs_root *root, struct extent_buffer *leaf) { - struct btrfs_disk_key *key; - struct btrfs_leaf *leaf; + struct btrfs_key key; struct btrfs_file_extent_item *fi; int i; int nritems; int ret; - BUG_ON(!btrfs_is_leaf(btrfs_buffer_node(cur))); - leaf = btrfs_buffer_leaf(cur); - nritems = btrfs_header_nritems(&leaf->header); + BUG_ON(!btrfs_is_leaf(leaf)); + nritems = btrfs_header_nritems(leaf); for (i = 0; i < nritems; i++) { u64 disk_blocknr; - key = &leaf->items[i].key; - if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY) + + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(fi) == BTRFS_FILE_EXTENT_INLINE) + if 
(btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) continue; /* * FIXME make sure to insert a trans record that * repeats the snapshot del on crash */ - disk_blocknr = btrfs_file_extent_disk_blocknr(fi); + disk_blocknr = btrfs_file_extent_disk_blocknr(leaf, fi); if (disk_blocknr == 0) continue; ret = btrfs_free_extent(trans, root, disk_blocknr, - btrfs_file_extent_disk_num_blocks(fi), - 0); + btrfs_file_extent_disk_num_blocks(leaf, fi), 0); BUG_ON(ret); } return 0; } static void reada_walk_down(struct btrfs_root *root, - struct btrfs_node *node) + struct extent_buffer *node) { int i; u32 nritems; @@ -1314,7 +1322,7 @@ static void reada_walk_down(struct btrfs_root *root, int ret; u32 refs; - nritems = btrfs_header_nritems(&node->header); + nritems = btrfs_header_nritems(node); for (i = 0; i < nritems; i++) { blocknr = btrfs_node_blockptr(node, i); ret = lookup_extent_ref(NULL, root, blocknr, 1, &refs); @@ -1337,16 +1345,17 @@ static void reada_walk_down(struct btrfs_root *root, static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level) { - struct buffer_head *next; - struct buffer_head *cur; + struct extent_buffer *next; + struct extent_buffer *cur; u64 blocknr; int ret; u32 refs; WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); - ret = lookup_extent_ref(trans, root, bh_blocknr(path->nodes[*level]), - 1, &refs); + ret = lookup_extent_ref(trans, root, + extent_buffer_blocknr(path->nodes[*level]), + 1, &refs); BUG_ON(ret); if (refs > 1) goto out; @@ -1360,21 +1369,20 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root cur = path->nodes[*level]; if (*level > 0 && path->slots[*level] == 0) - reada_walk_down(root, btrfs_buffer_node(cur)); + reada_walk_down(root, cur); - if (btrfs_header_level(btrfs_buffer_header(cur)) != *level) + if (btrfs_header_level(cur) != *level) WARN_ON(1); if (path->slots[*level] >= - btrfs_header_nritems(btrfs_buffer_header(cur))) + 
btrfs_header_nritems(cur)) break; if (*level == 0) { ret = drop_leaf_ref(trans, root, cur); BUG_ON(ret); break; } - blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur), - path->slots[*level]); + blocknr = btrfs_node_blockptr(cur, path->slots[*level]); ret = lookup_extent_ref(trans, root, blocknr, 1, &refs); BUG_ON(ret); if (refs != 1) { @@ -1384,8 +1392,8 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root continue; } next = btrfs_find_tree_block(root, blocknr); - if (!next || !buffer_uptodate(next)) { - brelse(next); + if (!next || !btrfs_buffer_uptodate(next)) { + free_extent_buffer(next); mutex_unlock(&root->fs_info->fs_mutex); next = read_tree_block(root, blocknr); mutex_lock(&root->fs_info->fs_mutex); @@ -1395,7 +1403,7 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root BUG_ON(ret); if (refs != 1) { path->slots[*level]++; - brelse(next); + free_extent_buffer(next); ret = btrfs_free_extent(trans, root, blocknr, 1, 1); BUG_ON(ret); @@ -1404,17 +1412,17 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root } WARN_ON(*level <= 0); if (path->nodes[*level-1]) - btrfs_block_release(root, path->nodes[*level-1]); + free_extent_buffer(path->nodes[*level-1]); path->nodes[*level-1] = next; - *level = btrfs_header_level(btrfs_buffer_header(next)); + *level = btrfs_header_level(next); path->slots[*level] = 0; } out: WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); ret = btrfs_free_extent(trans, root, - bh_blocknr(path->nodes[*level]), 1, 1); - btrfs_block_release(root, path->nodes[*level]); + extent_buffer_blocknr(path->nodes[*level]), 1, 1); + free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; *level += 1; BUG_ON(ret); @@ -1436,24 +1444,24 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot < btrfs_header_nritems( - 
btrfs_buffer_header(path->nodes[i])) - 1) { - struct btrfs_node *node; - node = btrfs_buffer_node(path->nodes[i]); + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + node = path->nodes[i]; path->slots[i]++; *level = i; WARN_ON(*level == 0); + btrfs_node_key(node, &disk_key, path->slots[i]); memcpy(&root_item->drop_progress, - &node->ptrs[path->slots[i]].key, - sizeof(root_item->drop_progress)); + &disk_key, sizeof(disk_key)); root_item->drop_level = i; return 0; } else { ret = btrfs_free_extent(trans, root, - bh_blocknr(path->nodes[*level]), - 1, 1); + extent_buffer_blocknr(path->nodes[*level]), + 1, 1); BUG_ON(ret); - btrfs_block_release(root, path->nodes[*level]); + free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; *level = i + 1; } @@ -1480,15 +1488,15 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root path = btrfs_alloc_path(); BUG_ON(!path); - level = btrfs_header_level(btrfs_buffer_header(root->node)); + level = btrfs_header_level(root->node); orig_level = level; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { path->nodes[level] = root->node; path->slots[level] = 0; } else { struct btrfs_key key; - struct btrfs_disk_key *found_key; - struct btrfs_node *node; + struct btrfs_disk_key found_key; + struct extent_buffer *node; btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); level = root_item->drop_level; @@ -1498,10 +1506,10 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root ret = wret; goto out; } - node = btrfs_buffer_node(path->nodes[level]); - found_key = &node->ptrs[path->slots[level]].key; - WARN_ON(memcmp(found_key, &root_item->drop_progress, - sizeof(*found_key))); + node = path->nodes[level]; + btrfs_node_key(node, &found_key, path->slots[level]); + WARN_ON(memcmp(&found_key, &root_item->drop_progress, + sizeof(found_key))); } while(1) { wret = walk_down_tree(trans, root, path, &level); 
@@ -1516,12 +1524,12 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root if (wret < 0) ret = wret; ret = -EAGAIN; - get_bh(root->node); + extent_buffer_get(root->node); break; } for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { - btrfs_block_release(root, path->nodes[i]); + free_extent_buffer(path->nodes[i]); path->nodes[i] = 0; } } @@ -1581,13 +1589,12 @@ int btrfs_read_block_groups(struct btrfs_root *root) struct btrfs_path *path; int ret; int err = 0; - struct btrfs_block_group_item *bi; struct btrfs_block_group_cache *cache; struct btrfs_fs_info *info = root->fs_info; struct radix_tree_root *radix; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; u64 group_size_blocks; u64 used; @@ -1596,7 +1603,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) root = info->extent_root; key.objectid = 0; key.offset = group_size_blocks; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); path = btrfs_alloc_path(); @@ -1610,18 +1616,18 @@ int btrfs_read_block_groups(struct btrfs_root *root) err = ret; break; } - leaf = btrfs_buffer_leaf(path->nodes[0]); - btrfs_disk_key_to_cpu(&found_key, - &leaf->items[path->slots[0]].key); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); cache = kmalloc(sizeof(*cache), GFP_NOFS); if (!cache) { err = -1; break; } - bi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_block_group_item); - if (bi->flags & BTRFS_BLOCK_GROUP_DATA) { + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) { radix = &info->block_group_data_radix; cache->data = 1; } else { @@ -1629,7 +1635,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache->data = 0; } - memcpy(&cache->item, bi, sizeof(*bi)); memcpy(&cache->key, &found_key, sizeof(found_key)); cache->last_alloc = cache->key.objectid; 
cache->first_free = cache->key.objectid; @@ -1640,11 +1645,12 @@ int btrfs_read_block_groups(struct btrfs_root *root) key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(root, path); + ret = radix_tree_insert(radix, found_key.objectid + found_key.offset - 1, (void *)cache); BUG_ON(ret); - used = btrfs_block_group_used(bi); + used = btrfs_block_group_used(&cache->item); if (used < div_factor(key.offset, 8)) { radix_tree_tag_set(radix, found_key.objectid + found_key.offset - 1, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 0ab368e091f9..55fefdfab84c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -26,16 +26,6 @@ struct tree_entry { struct rb_node rb_node; }; -/* bits for the extent state */ -#define EXTENT_DIRTY 1 -#define EXTENT_WRITEBACK (1 << 1) -#define EXTENT_UPTODATE (1 << 2) -#define EXTENT_LOCKED (1 << 3) -#define EXTENT_NEW (1 << 4) -#define EXTENT_DELALLOC (1 << 5) - -#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) - void __init extent_map_init(void) { extent_map_cache = btrfs_cache_create("extent_map", @@ -223,7 +213,8 @@ int add_extent_mapping(struct extent_map_tree *tree, if (rb) prev = rb_entry(rb, struct extent_map, rb_node); if (prev && prev->end + 1 == em->start && - ((em->block_start == 0 && prev->block_start == 0) || + ((em->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || (em->block_start == prev->block_end + 1))) { em->start = prev->start; em->block_start = prev->block_start; @@ -926,6 +917,40 @@ int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end) } EXPORT_SYMBOL(set_range_writeback); +int find_first_extent_bit(struct extent_map_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 1; + + write_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. 
+ */ + node = tree_search(&tree->state, start); + if (!node || IS_ERR(node)) { + goto out; + } + + while(1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->state & bits) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + } + node = rb_next(node); + if (!node) + break; + } +out: + write_unlock_irq(&tree->lock); + return ret; +} +EXPORT_SYMBOL(find_first_extent_bit); + u64 find_lock_delalloc_range(struct extent_map_tree *tree, u64 start, u64 lock_start, u64 *end, u64 max_bytes) { @@ -1450,7 +1475,7 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page, em = NULL; /* we've found a hole, just zero and go on */ - if (block_start == 0) { + if (block_start == EXTENT_MAP_HOLE) { zero_user_page(page, page_offset, iosize, KM_USER0); set_extent_uptodate(tree, cur, cur + iosize - 1, GFP_NOFS); @@ -1593,7 +1618,8 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, free_extent_map(em); em = NULL; - if (block_start == 0 || block_start == EXTENT_MAP_INLINE) { + if (block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); cur = cur + iosize; @@ -1630,7 +1656,6 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page, nr++; } done: - WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0)); unlock_extent(tree, start, page_end, GFP_NOFS); unlock_page(page); return 0; @@ -1827,8 +1852,623 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock, // XXX(hch): block 0 is valid in some cases, e.g. 
XFS RT device if (em->block_start == EXTENT_MAP_INLINE || - em->block_start == 0) - return 0; + em->block_start == EXTENT_MAP_HOLE) + return 0; return (em->block_start + start - em->start) >> inode->i_blkbits; } + +struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, + u64 start, unsigned long len, + gfp_t mask) +{ + unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT) + 1; + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 0; + + eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask); + if (!eb || IS_ERR(eb)) + return NULL; + + eb->start = start; + eb->len = len; + atomic_set(&eb->refs, 1); + + for (i = 0; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); + if (!p) + goto fail; + eb->pages[i] = p; + if (!PageUptodate(p)) + uptodate = 0; + unlock_page(p); + } + if (uptodate) + eb->flags |= EXTENT_UPTODATE; + return eb; +fail: + free_extent_buffer(eb); + return NULL; +} +EXPORT_SYMBOL(alloc_extent_buffer); + +struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, + u64 start, unsigned long len, + gfp_t mask) +{ + unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT) + 1; + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct page *p; + struct address_space *mapping = tree->mapping; + + eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask); + if (!eb || IS_ERR(eb)) + return NULL; + + eb->start = start; + eb->len = len; + atomic_set(&eb->refs, 1); + + for (i = 0; i < num_pages; i++, index++) { + p = find_get_page(mapping, index); + if (!p) + goto fail; + eb->pages[i] = p; + } + return eb; +fail: + free_extent_buffer(eb); + return NULL; +} +EXPORT_SYMBOL(find_extent_buffer); + +void free_extent_buffer(struct extent_buffer *eb) +{ + 
unsigned long i; + unsigned long num_pages; + + if (!eb) + return; + + if (!atomic_dec_and_test(&eb->refs)) + return; + + num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT) + 1; + + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + page_cache_release(eb->pages[i]); + } + kfree(eb); +} +EXPORT_SYMBOL(free_extent_buffer); + +int clear_extent_buffer_dirty(struct extent_map_tree *tree, + struct extent_buffer *eb) +{ + int set; + unsigned long i; + unsigned long num_pages; + struct page *page; + + u64 start = eb->start; + u64 end = start + eb->len - 1; + + set = clear_extent_dirty(tree, start, end, GFP_NOFS); + num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT) + 1; + + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + lock_page(page); + /* + * if we're on the last page or the first page and the + * block isn't aligned on a page boundary, do extra checks + * to make sure we don't clean page that is partially dirty + */ + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) { + start = page->index << PAGE_CACHE_SHIFT; + end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, + EXTENT_DIRTY, 0)) { + unlock_page(page); + continue; + } + } + clear_page_dirty_for_io(page); + unlock_page(page); + } + return 0; +} +EXPORT_SYMBOL(clear_extent_buffer_dirty); + +int wait_on_extent_buffer_writeback(struct extent_map_tree *tree, + struct extent_buffer *eb) +{ + return wait_on_extent_writeback(tree, eb->start, + eb->start + eb->len - 1); +} +EXPORT_SYMBOL(wait_on_extent_buffer_writeback); + +int set_extent_buffer_dirty(struct extent_map_tree *tree, + struct extent_buffer *eb) +{ + return set_range_dirty(tree, eb->start, eb->start + eb->len - 1); +} +EXPORT_SYMBOL(set_extent_buffer_dirty); + +int set_extent_buffer_uptodate(struct extent_map_tree *tree, + struct extent_buffer *eb) 
+{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT) + 1; + + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } + SetPageUptodate(page); + } + return 0; +} +EXPORT_SYMBOL(set_extent_buffer_uptodate); + +int extent_buffer_uptodate(struct extent_map_tree *tree, + struct extent_buffer *eb) +{ + if (eb->flags & EXTENT_UPTODATE) + return 1; + return test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1); +} +EXPORT_SYMBOL(extent_buffer_uptodate); + +int read_extent_buffer_pages(struct extent_map_tree *tree, + struct extent_buffer *eb, int wait) +{ + unsigned long i; + struct page *page; + int err; + int ret = 0; + unsigned long num_pages; + + if (eb->flags & EXTENT_UPTODATE) + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1)) { + return 0; + } + + num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT) + 1; + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + if (PageUptodate(page)) { + continue; + } + if (!wait) { + if (TestSetPageLocked(page)) { + continue; + } + } else { + lock_page(page); + } + if (!PageUptodate(page)) { + err = page->mapping->a_ops->readpage(NULL, page); + if (err) { + ret = err; + } + } else { + unlock_page(page); + } + } + + if (ret || !wait) { + return ret; + } + + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + wait_on_page_locked(page); + if (!PageUptodate(page)) { + ret = -EIO; + } + } + eb->flags |= EXTENT_UPTODATE; + return ret; +} +EXPORT_SYMBOL(read_extent_buffer_pages); + +void read_extent_buffer(struct extent_buffer *eb, 
void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + page = eb->pages[i]; + offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); + if (i == 0) + offset += start_offset; + + while(len > 0) { + WARN_ON(!PageUptodate(page)); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + // kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); + memcpy(dst, kaddr + offset, cur); + // kunmap_atomic(kaddr, KM_USER0); + + dst += cur; + len -= cur; + offset = 0; + i++; + page = eb->pages[i]; + } +} +EXPORT_SYMBOL(read_extent_buffer); + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + size_t offset; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = (i << PAGE_CACHE_SHIFT) - offset; + } + + // kaddr = kmap_atomic(eb->pages[i], km); + kaddr = page_address(eb->pages[i]); + *token = kaddr; + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} +EXPORT_SYMBOL(map_extent_buffer); + +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) +{ + // kunmap_atomic(token, km); +} +EXPORT_SYMBOL(unmap_extent_buffer); + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = 
(start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + page = eb->pages[i]; + offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); + if (i == 0) + offset += start_offset; + + while(len > 0) { + WARN_ON(!PageUptodate(page)); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + // kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); + ret = memcmp(ptr, kaddr + offset, cur); + // kunmap_atomic(kaddr, KM_USER0); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + page = eb->pages[i]; + } + return ret; +} +EXPORT_SYMBOL(memcmp_extent_buffer); + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + page = eb->pages[i]; + offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1); + if (i == 0) + offset += start_offset; + + while(len > 0) { + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + // kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); + memcpy(kaddr + offset, src, cur); + // kunmap_atomic(kaddr, KM_USER0); + + src += cur; + len -= cur; + offset = 0; + i++; + page = eb->pages[i]; + } +} +EXPORT_SYMBOL(write_extent_buffer); + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + page = eb->pages[i]; + offset = start & ((unsigned 
long)PAGE_CACHE_SIZE - 1); + if (i == 0) + offset += start_offset; + + while(len > 0) { + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + // kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); + memset(kaddr + offset, c, cur); + // kunmap_atomic(kaddr, KM_USER0); + + len -= cur; + offset = 0; + i++; + page = eb->pages[i]; + } +} +EXPORT_SYMBOL(memset_extent_buffer); + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = dst_offset & ((unsigned long)PAGE_CACHE_SIZE - 1); + if (i == 0) + offset += start_offset; + + while(len > 0) { + page = dst->pages[i]; + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + // kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + // kunmap_atomic(kaddr, KM_USER1); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} +EXPORT_SYMBOL(copy_extent_buffer); + +static void move_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + // char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); + if (dst_page == src_page) { + memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); + } else { + // char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *src_kaddr = page_address(src_page); + char *p = dst_kaddr + dst_off + len; + char *s = src_kaddr + src_off + len; + + while (len--) + *--p = *--s; + + // kunmap_atomic(src_kaddr, KM_USER1); + } + // kunmap_atomic(dst_kaddr, KM_USER0); +} + 
+static void copy_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + //kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); + char *src_kaddr; + + if (dst_page != src_page) + src_kaddr = page_address(src_page); // kmap_atomic(src_page, KM_USER1); + else + src_kaddr = dst_kaddr; + + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + /* + kunmap_atomic(dst_kaddr, KM_USER0); + if (dst_page != src_page) + kunmap_atomic(src_kaddr, KM_USER1); + */ +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk("memmove bogus src_offset %lu move len %lu len %lu\n", + src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk("memmove bogus dst_offset %lu move len %lu len %lu\n", + dst_offset, len, dst->len); + BUG_ON(1); + } + + while(len > 0) { + dst_off_in_page = dst_offset & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = src_offset & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + if (src_i == 0) + src_off_in_page += start_offset; + if (dst_i == 0) + dst_off_in_page += start_offset; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min(cur, (unsigned long)(PAGE_CACHE_SIZE - + dst_off_in_page)); + + copy_pages(dst->pages[dst_i], dst->pages[src_i], + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} +EXPORT_SYMBOL(memcpy_extent_buffer); + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long 
src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk("memmove bogus src_offset %lu move len %lu len %lu\n", + src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk("memmove bogus dst_offset %lu move len %lu len %lu\n", + dst_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset < src_offset) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while(len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = dst_end & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = src_end & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + if (src_i == 0) + src_off_in_page += start_offset; + if (dst_i == 0) + dst_off_in_page += start_offset; + + cur = min(len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); +// printk("move pages orig dst %lu src %lu len %lu, this %lu %lu %lu\n", dst_offset, src_offset, len, dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); + move_pages(dst->pages[dst_i], dst->pages[src_i], + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur - 1; + src_end -= cur - 1; + len -= cur; + } +} +EXPORT_SYMBOL(memmove_extent_buffer); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index d913ce01248d..430b997a70f6 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,9 +3,20 @@ #include +#define EXTENT_MAP_HOLE (u64)-3 #define EXTENT_MAP_INLINE (u64)-2 #define EXTENT_MAP_DELALLOC (u64)-1 +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) 
+#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + + struct extent_map_ops { int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); int (*writepage_io_hook)(struct page *page, u64 start, u64 end); @@ -51,10 +62,13 @@ struct extent_state { struct list_head list; }; +#define EXTENT_BUFFER_SIZE(nr) (sizeof(struct extent_buffer) + \ + (nr - 1) * sizeof(struct page *)) struct extent_buffer { u64 start; - u64 end; /* inclusive */ - char *addr; + unsigned long len; + atomic_t refs; + int flags; struct page *pages[]; }; @@ -87,8 +101,12 @@ int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask); +int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, + gfp_t mask); int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask); +int find_first_extent_bit(struct extent_map_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits); int extent_invalidatepage(struct extent_map_tree *tree, struct page *page, unsigned long offset); int extent_write_full_page(struct extent_map_tree *tree, struct page *page, @@ -106,4 +124,57 @@ int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end); int set_state_private(struct extent_map_tree *tree, u64 start, u64 private); int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, + u64 start, unsigned long len, + gfp_t mask); +struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, + u64 start, unsigned long len, + gfp_t mask); +void free_extent_buffer(struct extent_buffer *eb); +int read_extent_buffer_pages(struct extent_map_tree *tree, + struct extent_buffer *eb, int wait); + +static inline void extent_buffer_get(struct extent_buffer *eb) 
+{ + atomic_inc(&eb->refs); +} + +static inline u64 extent_buffer_blocknr(struct extent_buffer *eb) +{ + return eb->start / 4096; +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len); +void read_extent_buffer(struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +void write_extent_buffer(struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len); +int wait_on_extent_buffer_writeback(struct extent_map_tree *tree, + struct extent_buffer *eb); +int clear_extent_buffer_dirty(struct extent_map_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_map_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_map_tree *tree, + struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_map_tree *tree, + struct extent_buffer *eb); +int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ea4dd4c5fce3..226f6d028c3f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,12 +34,12 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *item; struct btrfs_key file_key; struct btrfs_path 
*path; + struct extent_buffer *leaf; path = btrfs_alloc_path(); BUG_ON(!path); file_key.objectid = objectid; file_key.offset = pos; - file_key.flags = 0; btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); ret = btrfs_insert_empty_item(trans, root, path, &file_key, @@ -47,15 +47,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, if (ret < 0) goto out; BUG_ON(ret); - item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0], + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_blocknr(item, offset); - btrfs_set_file_extent_disk_num_blocks(item, disk_num_blocks); - btrfs_set_file_extent_offset(item, 0); - btrfs_set_file_extent_num_blocks(item, num_blocks); - btrfs_set_file_extent_generation(item, trans->transid); - btrfs_set_file_extent_type(item, BTRFS_FILE_EXTENT_REG); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_set_file_extent_disk_blocknr(leaf, item, offset); + btrfs_set_file_extent_disk_num_blocks(leaf, item, disk_num_blocks); + btrfs_set_file_extent_offset(leaf, item, 0); + btrfs_set_file_extent_num_blocks(leaf, item, num_blocks); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); return ret; @@ -71,32 +72,30 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_key file_key; struct btrfs_key found_key; struct btrfs_csum_item *item; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; u64 csum_offset = 0; int csums_in_item; file_key.objectid = objectid; file_key.offset = offset; - file_key.flags = 0; btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY); ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; - leaf = btrfs_buffer_leaf(path->nodes[0]); + leaf = path->nodes[0]; if (ret > 0) { ret = 1; if (path->slots[0] == 0) goto fail; 
path->slots[0]--; - btrfs_disk_key_to_cpu(&found_key, - &leaf->items[path->slots[0]].key); + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY || found_key.objectid != objectid) { goto fail; } csum_offset = (offset - found_key.offset) >> root->fs_info->sb->s_blocksize_bits; - csums_in_item = btrfs_item_size(leaf->items + path->slots[0]); + csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= BTRFS_CRC32_SIZE; if (csum_offset >= csums_in_item) { @@ -127,7 +126,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, file_key.objectid = objectid; file_key.offset = offset; - file_key.flags = 0; btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); return ret; @@ -138,12 +136,14 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans, u64 objectid, u64 offset, char *data, size_t len) { + return 0; +#if 0 int ret; struct btrfs_key file_key; struct btrfs_key found_key; struct btrfs_path *path; struct btrfs_csum_item *item; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; u64 csum_offset; path = btrfs_alloc_path(); @@ -161,8 +161,8 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans, if (ret == -EFBIG) { u32 item_size; /* we found one, but it isn't big enough yet */ - leaf = btrfs_buffer_leaf(path->nodes[0]); - item_size = btrfs_item_size(leaf->items + path->slots[0]); + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); if ((item_size / BTRFS_CRC32_SIZE) >= MAX_CSUM_ITEMS(root)) { /* already at max size, make a new one */ goto insert; @@ -188,8 +188,8 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans, goto insert; } path->slots[0]--; - leaf = btrfs_buffer_leaf(path->nodes[0]); - btrfs_disk_key_to_cpu(&found_key, &leaf->items[path->slots[0]].key); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); csum_offset = 
(offset - found_key.offset) >> root->fs_info->sb->s_blocksize_bits; if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY || @@ -197,10 +197,10 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans, csum_offset >= MAX_CSUM_ITEMS(root)) { goto insert; } - if (csum_offset >= btrfs_item_size(leaf->items + path->slots[0]) / + if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / BTRFS_CRC32_SIZE) { u32 diff = (csum_offset + 1) * BTRFS_CRC32_SIZE; - diff = diff - btrfs_item_size(leaf->items + path->slots[0]); + diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); if (diff != BTRFS_CRC32_SIZE) goto insert; ret = btrfs_extend_item(trans, root, path, diff); @@ -220,21 +220,20 @@ insert: goto fail; } csum: - item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0], - struct btrfs_csum_item); + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); ret = 0; item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * BTRFS_CRC32_SIZE); found: - btrfs_check_bounds(&item->csum, BTRFS_CRC32_SIZE, - path->nodes[0]->b_data, - root->fs_info->sb->s_blocksize); + /* FIXME!!!!!!!!!!!! 
*/ ret = btrfs_csum_data(root, data, len, &item->csum); btrfs_mark_buffer_dirty(path->nodes[0]); fail: btrfs_release_path(root, path); btrfs_free_path(path); return ret; +#endif } int btrfs_csum_truncate(struct btrfs_trans_handle *trans, @@ -242,21 +241,21 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans, u64 isize) { struct btrfs_key key; - struct btrfs_leaf *leaf = btrfs_buffer_leaf(path->nodes[0]); + struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; int ret; u32 new_item_size; u64 new_item_span; u64 blocks; - btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key); + btrfs_item_key_to_cpu(leaf, &key, slot); if (isize <= key.offset) return 0; new_item_span = isize - key.offset; - blocks = (new_item_span + root->blocksize - 1) >> + blocks = (new_item_span + root->sectorsize - 1) >> root->fs_info->sb->s_blocksize_bits; new_item_size = blocks * BTRFS_CRC32_SIZE; - if (new_item_size >= btrfs_item_size(leaf->items + slot)) + if (new_item_size >= btrfs_item_size_nr(leaf, slot)) return 0; ret = btrfs_truncate_item(trans, root, path, new_item_size); BUG_ON(ret); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4cc459c943ec..1734ca695555 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -16,7 +16,6 @@ * Boston, MA 021110-1307, USA. 
*/ -#include #include #include #include @@ -88,7 +87,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, { struct btrfs_key key; struct btrfs_path *path; - char *ptr, *kaddr; + struct extent_buffer *leaf; + char *kaddr; + unsigned long ptr; struct btrfs_file_extent_item *ei; u32 datasize; int err = 0; @@ -102,7 +103,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, key.objectid = inode->i_ino; key.offset = offset; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); BUG_ON(size >= PAGE_CACHE_SIZE); datasize = btrfs_file_extent_calc_inline_size(size); @@ -113,18 +113,17 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, err = ret; goto fail; } - ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(ei, trans->transid); - btrfs_set_file_extent_type(ei, - BTRFS_FILE_EXTENT_INLINE); + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); ptr = btrfs_file_extent_inline_start(ei); kaddr = kmap_atomic(page, KM_USER0); - btrfs_memcpy(root, path->nodes[0]->b_data, - ptr, kaddr + page_offset, size); + write_extent_buffer(leaf, kaddr + page_offset, ptr, size); kunmap_atomic(kaddr, KM_USER0); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(leaf); fail: btrfs_free_path(path); return err; @@ -156,8 +155,8 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans, em->bdev = inode->i_sb->s_bdev; - start_pos = pos & ~((u64)root->blocksize - 1); - num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >> + start_pos = pos & ~((u64)root->sectorsize - 1); + num_blocks = (write_bytes + pos - start_pos + root->sectorsize - 1) >> inode->i_blkbits; down_read(&BTRFS_I(inode)->root->snap_sem); @@ -184,7 +183,7 @@ 
static int dirty_and_release_pages(struct btrfs_trans_handle *trans, if (inode->i_size < start_pos) { u64 last_pos_in_file; u64 hole_size; - u64 mask = root->blocksize - 1; + u64 mask = root->sectorsize - 1; last_pos_in_file = (isize + mask) & ~mask; hole_size = (start_pos - last_pos_in_file + mask) & ~mask; @@ -227,8 +226,8 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans, /* step one, delete the existing extents in this range */ /* FIXME blocksize != pagesize */ err = btrfs_drop_extents(trans, root, inode, start_pos, - (pos + write_bytes + root->blocksize -1) & - ~((u64)root->blocksize - 1), &hint_block); + (pos + write_bytes + root->sectorsize -1) & + ~((u64)root->sectorsize - 1), &hint_block); if (err) goto failed; @@ -288,7 +287,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; int slot; struct btrfs_file_extent_item *extent; u64 extent_end = 0; @@ -327,10 +326,10 @@ next_slot: found_extent = 0; found_inline = 0; extent = NULL; - leaf = btrfs_buffer_leaf(path->nodes[0]); + leaf = path->nodes[0]; slot = path->slots[0]; ret = 0; - btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key); + btrfs_item_key_to_cpu(leaf, &key, slot); if (key.offset >= end || key.objectid != inode->i_ino) { goto out; } @@ -344,17 +343,18 @@ next_slot: if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(extent); + found_type = btrfs_file_extent_type(leaf, extent); if (found_type == BTRFS_FILE_EXTENT_REG) { extent_end = key.offset + - (btrfs_file_extent_num_blocks(extent) << + (btrfs_file_extent_num_blocks(leaf, extent) << inode->i_blkbits); found_extent = 1; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + struct btrfs_item *item; + item = btrfs_item_nr(leaf, slot); found_inline = 1; extent_end = key.offset + - btrfs_file_extent_inline_len(leaf->items + - 
slot); + btrfs_file_extent_inline_len(leaf, item); } } else { extent_end = search_start; @@ -365,8 +365,7 @@ next_slot: search_start >= extent_end) { int nextret; u32 nritems; - nritems = btrfs_header_nritems( - btrfs_buffer_header(path->nodes[0])); + nritems = btrfs_header_nritems(leaf); if (slot >= nritems - 1) { nextret = btrfs_next_leaf(root, path); if (nextret) @@ -380,7 +379,7 @@ next_slot: /* FIXME, there's only one inline extent allowed right now */ if (found_inline) { - u64 mask = root->blocksize - 1; + u64 mask = root->sectorsize - 1; search_start = (extent_end + mask) & ~mask; } else search_start = extent_end; @@ -388,10 +387,13 @@ next_slot: if (end < extent_end && end >= key.offset) { if (found_extent) { u64 disk_blocknr = - btrfs_file_extent_disk_blocknr(extent); + btrfs_file_extent_disk_blocknr(leaf,extent); u64 disk_num_blocks = - btrfs_file_extent_disk_num_blocks(extent); - memcpy(&old, extent, sizeof(old)); + btrfs_file_extent_disk_num_blocks(leaf, + extent); + read_extent_buffer(leaf, &old, + (unsigned long)extent, + sizeof(old)); if (disk_blocknr != 0) { ret = btrfs_inc_extent_ref(trans, root, disk_blocknr, disk_num_blocks); @@ -406,20 +408,24 @@ next_slot: u64 new_num; u64 old_num; keep = 1; - WARN_ON(start & (root->blocksize - 1)); + WARN_ON(start & (root->sectorsize - 1)); if (found_extent) { new_num = (start - key.offset) >> inode->i_blkbits; - old_num = btrfs_file_extent_num_blocks(extent); + old_num = btrfs_file_extent_num_blocks(leaf, + extent); *hint_block = - btrfs_file_extent_disk_blocknr(extent); - if (btrfs_file_extent_disk_blocknr(extent)) { + btrfs_file_extent_disk_blocknr(leaf, + extent); + if (btrfs_file_extent_disk_blocknr(leaf, + extent)) { inode->i_blocks -= (old_num - new_num) << 3; } - btrfs_set_file_extent_num_blocks(extent, + btrfs_set_file_extent_num_blocks(leaf, + extent, new_num); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(leaf); } else { WARN_ON(1); } @@ -431,13 +437,17 @@ next_slot: u64 
extent_num_blocks = 0; if (found_extent) { disk_blocknr = - btrfs_file_extent_disk_blocknr(extent); + btrfs_file_extent_disk_blocknr(leaf, + extent); disk_num_blocks = - btrfs_file_extent_disk_num_blocks(extent); + btrfs_file_extent_disk_num_blocks(leaf, + extent); extent_num_blocks = - btrfs_file_extent_num_blocks(extent); + btrfs_file_extent_num_blocks(leaf, + extent); *hint_block = - btrfs_file_extent_disk_blocknr(extent); + btrfs_file_extent_disk_blocknr(leaf, + extent); } ret = btrfs_del_item(trans, root, path); /* TODO update progress marker and return */ @@ -464,42 +474,37 @@ next_slot: struct btrfs_key ins; ins.objectid = inode->i_ino; ins.offset = end; - ins.flags = 0; btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); btrfs_release_path(root, path); ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*extent)); + leaf = path->nodes[0]; if (ret) { - btrfs_print_leaf(root, btrfs_buffer_leaf(path->nodes[0])); - printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu keep was %d\n", ret , ins.objectid, ins.flags, ins.offset, start, end, key.offset, extent_end, keep); + btrfs_print_leaf(root, leaf); + printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu keep was %d\n", ret , ins.objectid, ins.type, ins.offset, start, end, key.offset, extent_end, keep); } BUG_ON(ret); - extent = btrfs_item_ptr( - btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_blocknr(extent, - btrfs_file_extent_disk_blocknr(&old)); - btrfs_set_file_extent_disk_num_blocks(extent, - btrfs_file_extent_disk_num_blocks(&old)); - - btrfs_set_file_extent_offset(extent, - btrfs_file_extent_offset(&old) + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + write_extent_buffer(leaf, &old, + (unsigned long)extent, sizeof(old)); + + btrfs_set_file_extent_offset(leaf, extent, + le64_to_cpu(old.offset) + ((end - key.offset) >> inode->i_blkbits)); - 
WARN_ON(btrfs_file_extent_num_blocks(&old) < + WARN_ON(le64_to_cpu(old.num_blocks) < (extent_end - end) >> inode->i_blkbits); - btrfs_set_file_extent_num_blocks(extent, + btrfs_set_file_extent_num_blocks(leaf, extent, (extent_end - end) >> inode->i_blkbits); - btrfs_set_file_extent_type(extent, + btrfs_set_file_extent_type(leaf, extent, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_generation(extent, - btrfs_file_extent_generation(&old)); btrfs_mark_buffer_dirty(path->nodes[0]); - if (btrfs_file_extent_disk_blocknr(&old) != 0) { + if (le64_to_cpu(old.disk_blocknr) != 0) { inode->i_blocks += - btrfs_file_extent_num_blocks(extent) << 3; + btrfs_file_extent_num_blocks(leaf, + extent) << 3; } ret = 0; goto out; @@ -529,8 +534,8 @@ static int prepare_pages(struct btrfs_root *root, u64 num_blocks; u64 start_pos; - start_pos = pos & ~((u64)root->blocksize - 1); - num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >> + start_pos = pos & ~((u64)root->sectorsize - 1); + num_blocks = (write_bytes + pos - start_pos + root->sectorsize - 1) >> inode->i_blkbits; memset(pages, 0, num_pages * sizeof(struct page *)); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b78346177539..35d2608f8918 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -20,24 +20,18 @@ #include "disk-io.h" #include "transaction.h" -int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, u64 objectid, struct btrfs_inode_item - *inode_item) +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid) { - struct btrfs_path *path; struct btrfs_key key; int ret; key.objectid = objectid; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - path = btrfs_alloc_path(); - BUG_ON(!path); - ret = btrfs_insert_item(trans, root, &key, inode_item, - sizeof(*inode_item)); - btrfs_release_path(root, path); - btrfs_free_path(path); + ret = 
btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); if (ret == 0 && objectid > root->highest_inode) root->highest_inode = objectid; return ret; @@ -51,15 +45,15 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root int cow = mod != 0; int ret; int slot; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; struct btrfs_key found_key; ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && location->offset == (u64)-1 && path->slots[0] != 0) { slot = path->slots[0] - 1; - leaf = btrfs_buffer_leaf(path->nodes[0]); - btrfs_disk_key_to_cpu(&found_key, &leaf->items[slot].key); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid == location->objectid && btrfs_key_type(&found_key) == btrfs_key_type(location)) { path->slots[0]--; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 405470866254..ab74977adf5c 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -24,8 +24,9 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) { struct btrfs_path *path; int ret; - struct btrfs_leaf *l; + struct extent_buffer *l; struct btrfs_key search_key; + struct btrfs_key found_key; int slot; path = btrfs_alloc_path(); @@ -39,8 +40,9 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) BUG_ON(ret == 0); if (path->slots[0] > 0) { slot = path->slots[0] - 1; - l = btrfs_buffer_leaf(path->nodes[0]); - *objectid = btrfs_disk_key_objectid(&l->items[slot].key); + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = found_key.objectid; } else { *objectid = BTRFS_FIRST_FREE_OBJECTID; } @@ -64,13 +66,12 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, int slot = 0; u64 last_ino = 0; int start_found; - struct btrfs_leaf *l; + struct extent_buffer *l; struct btrfs_key search_key; u64 search_start = dirid; path = 
btrfs_alloc_path(); BUG_ON(!path); - search_key.flags = 0; search_start = root->last_inode_alloc; search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); search_key.objectid = search_start; @@ -86,9 +87,9 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, path->slots[0]--; while (1) { - l = btrfs_buffer_leaf(path->nodes[0]); + l = path->nodes[0]; slot = path->slots[0]; - if (slot >= btrfs_header_nritems(&l->header)) { + if (slot >= btrfs_header_nritems(l)) { ret = btrfs_next_leaf(root, path); if (ret == 0) continue; @@ -103,7 +104,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, last_ino : search_start; goto found; } - btrfs_disk_key_to_cpu(&key, &l->items[slot].key); + btrfs_item_key_to_cpu(l, &key, slot); if (key.objectid >= search_start) { if (start_found) { if (last_ino < search_start) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b03d40a907ca..fbe2836364e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -159,10 +159,8 @@ out: int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end) { - char csum[BTRFS_CRC32_SIZE]; size_t offset = start - (page->index << PAGE_CACHE_SHIFT); struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; char *kaddr; u64 private; @@ -173,11 +171,15 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end) if (ret) { goto zeroit; } + /* + struct btrfs_root *root = BTRFS_I(inode)->root; + char csum[BTRFS_CRC32_SIZE]; ret = btrfs_csum_data(root, kaddr + offset, end - start + 1, csum); BUG_ON(ret); if (memcmp(csum, &private, BTRFS_CRC32_SIZE)) { goto zeroit; } + */ kunmap_atomic(kaddr, KM_IRQ0); return 0; @@ -192,7 +194,9 @@ zeroit: void btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; + struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; + struct btrfs_inode_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; 
struct btrfs_key location; u64 alloc_group_block; @@ -205,29 +209,37 @@ void btrfs_read_locked_inode(struct inode *inode) memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); - if (ret) { + if (ret) goto make_bad; - } - inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], - struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(inode_item); - inode->i_nlink = btrfs_inode_nlink(inode_item); - inode->i_uid = btrfs_inode_uid(inode_item); - inode->i_gid = btrfs_inode_gid(inode_item); - inode->i_size = btrfs_inode_size(inode_item); - inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime); - inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime); - inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime); - inode->i_blocks = btrfs_inode_nblocks(inode_item); - inode->i_generation = btrfs_inode_generation(inode_item); + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + inode->i_uid = btrfs_inode_uid(leaf, inode_item); + inode->i_gid = btrfs_inode_gid(leaf, inode_item); + inode->i_size = btrfs_inode_size(leaf, inode_item); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_ctime.tv_nsec = 
btrfs_timespec_nsec(leaf, tspec); + + inode->i_blocks = btrfs_inode_nblocks(leaf, inode_item); + inode->i_generation = btrfs_inode_generation(leaf, inode_item); inode->i_rdev = 0; - rdev = btrfs_inode_rdev(inode_item); - alloc_group_block = btrfs_inode_block_group(inode_item); + rdev = btrfs_inode_rdev(leaf, inode_item); + + alloc_group_block = btrfs_inode_block_group(leaf, inode_item); BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info, alloc_group_block); @@ -267,24 +279,35 @@ make_bad: make_bad_inode(inode); } -static void fill_inode_item(struct btrfs_inode_item *item, +static void fill_inode_item(struct extent_buffer *leaf, + struct btrfs_inode_item *item, struct inode *inode) { - btrfs_set_inode_uid(item, inode->i_uid); - btrfs_set_inode_gid(item, inode->i_gid); - btrfs_set_inode_size(item, inode->i_size); - btrfs_set_inode_mode(item, inode->i_mode); - btrfs_set_inode_nlink(item, inode->i_nlink); - btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec); - btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec); - btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec); - btrfs_set_inode_nblocks(item, inode->i_blocks); - btrfs_set_inode_generation(item, inode->i_generation); - btrfs_set_inode_rdev(item, inode->i_rdev); - btrfs_set_inode_block_group(item, + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_size(leaf, item, inode->i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + 
inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nblocks(leaf, item, inode->i_blocks); + btrfs_set_inode_generation(leaf, item, inode->i_generation); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group->key.objectid); } @@ -294,6 +317,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, { struct btrfs_inode_item *inode_item; struct btrfs_path *path; + struct extent_buffer *leaf; int ret; path = btrfs_alloc_path(); @@ -306,12 +330,12 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, goto failed; } - inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - fill_inode_item(inode_item, inode); - btrfs_mark_buffer_dirty(path->nodes[0]); + fill_inode_item(leaf, inode_item, inode); + btrfs_mark_buffer_dirty(leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: @@ -330,8 +354,9 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans, const char *name = dentry->d_name.name; int name_len = dentry->d_name.len; int ret = 0; - u64 objectid; + struct extent_buffer *leaf; struct btrfs_dir_item *di; + struct btrfs_key key; path = btrfs_alloc_path(); if (!path) { @@ -349,14 +374,15 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans, ret = -ENOENT; goto err; } - objectid = btrfs_disk_key_objectid(&di->location); + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto err; btrfs_release_path(root, path); di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, - objectid, name, name_len, -1); + 
key.objectid, name, name_len, -1); if (IS_ERR(di)) { ret = PTR_ERR(di); goto err; @@ -391,12 +417,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) root = BTRFS_I(dir)->root; mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); ret = btrfs_unlink_trans(trans, root, dir, dentry); nr = trans->blocks_used; + btrfs_end_transaction(trans, root); mutex_unlock(&root->fs_info->fs_mutex); btrfs_btree_balance_dirty(root, nr); + return ret; } @@ -411,7 +440,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct btrfs_key found_key; int found_type; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; char *goodnames = ".."; unsigned long nr; @@ -419,10 +448,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) BUG_ON(!path); mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); key.objectid = inode->i_ino; key.offset = (u64)-1; - key.flags = (u32)-1; + key.type = (u8)-1; while(1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) { @@ -435,9 +465,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) goto out; } path->slots[0]--; - leaf = btrfs_buffer_leaf(path->nodes[0]); - btrfs_disk_key_to_cpu(&found_key, - &leaf->items[path->slots[0]].key); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); if (found_key.objectid != inode->i_ino) { err = -ENOENT; @@ -513,9 +542,9 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_disk_key *found_key; + struct btrfs_key found_key; u32 found_type; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; u64 extent_start = 0; u64 extent_num_blocks = 0; @@ -527,10 +556,12 @@ 
static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); path->reada = -1; BUG_ON(!path); + /* FIXME, add redo link to tree so we don't leak on crash */ key.objectid = inode->i_ino; key.offset = (u64)-1; - key.flags = (u32)-1; + key.type = (u8)-1; + while(1) { btrfs_init_path(path); fi = NULL; @@ -542,27 +573,28 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, BUG_ON(path->slots[0] == 0); path->slots[0]--; } - leaf = btrfs_buffer_leaf(path->nodes[0]); - found_key = &leaf->items[path->slots[0]].key; - found_type = btrfs_disk_key_type(found_key); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); - if (btrfs_disk_key_objectid(found_key) != inode->i_ino) + if (found_key.objectid != inode->i_ino) break; + if (found_type != BTRFS_CSUM_ITEM_KEY && found_type != BTRFS_DIR_ITEM_KEY && found_type != BTRFS_DIR_INDEX_KEY && found_type != BTRFS_EXTENT_DATA_KEY) break; - item_end = btrfs_disk_key_offset(found_key); + item_end = found_key.offset; if (found_type == BTRFS_EXTENT_DATA_KEY) { - fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if (btrfs_file_extent_type(fi) != + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_INLINE) { - item_end += btrfs_file_extent_num_blocks(fi) << - inode->i_blkbits; + item_end += + btrfs_file_extent_num_blocks(leaf, fi) << + inode->i_blkbits; } } if (found_type == BTRFS_CSUM_ITEM_KEY) { @@ -583,7 +615,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, btrfs_set_key_type(&key, found_type); continue; } - if (btrfs_disk_key_offset(found_key) >= inode->i_size) + if (found_key.offset >= inode->i_size) del_item = 1; else del_item = 0; @@ -591,30 +623,31 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, /* FIXME, shrink the extent if the ref count is only 1 */ if 
(found_type == BTRFS_EXTENT_DATA_KEY && - btrfs_file_extent_type(fi) != + btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; - extent_start = btrfs_file_extent_disk_blocknr(fi); + extent_start = btrfs_file_extent_disk_blocknr(leaf, fi); if (!del_item) { u64 orig_num_blocks = - btrfs_file_extent_num_blocks(fi); + btrfs_file_extent_num_blocks(leaf, fi); extent_num_blocks = inode->i_size - - btrfs_disk_key_offset(found_key) + - root->blocksize - 1; + found_key.offset + root->sectorsize - 1; extent_num_blocks >>= inode->i_blkbits; - btrfs_set_file_extent_num_blocks(fi, + btrfs_set_file_extent_num_blocks(leaf, fi, extent_num_blocks); num_dec = (orig_num_blocks - extent_num_blocks) << 3; if (extent_start != 0) { inode->i_blocks -= num_dec; } - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(leaf); } else { extent_num_blocks = - btrfs_file_extent_disk_num_blocks(fi); + btrfs_file_extent_disk_num_blocks(leaf, + fi); /* FIXME blocksize != 4096 */ - num_dec = btrfs_file_extent_num_blocks(fi) << 3; + num_dec = btrfs_file_extent_num_blocks(leaf, + fi) << 3; if (extent_start != 0) { found_extent = 1; inode->i_blocks -= num_dec; @@ -725,7 +758,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 mask = root->blocksize - 1; + u64 mask = root->sectorsize - 1; u64 pos = (inode->i_size + mask) & ~mask; u64 block_end = attr->ia_size | mask; u64 hole_size; @@ -771,9 +804,11 @@ void btrfs_delete_inode(struct inode *inode) if (is_bad_inode(inode)) { goto no_delete; } + inode->i_size = 0; mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); ret = btrfs_truncate_in_trans(trans, root, inode); if (ret) @@ -782,6 +817,7 @@ void btrfs_delete_inode(struct inode *inode) if (ret) goto no_delete_lock; nr = trans->blocks_used; + 
btrfs_end_transaction(trans, root); mutex_unlock(&root->fs_info->fs_mutex); btrfs_btree_balance_dirty(root, nr); @@ -819,7 +855,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = 0; goto out; } - btrfs_disk_key_to_cpu(location, &di->location); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); out: btrfs_release_path(root, path); btrfs_free_path(path); @@ -856,7 +892,6 @@ static int fixup_tree_root_location(struct btrfs_root *root, ri = &(*sub_root)->root_item; location->objectid = btrfs_root_dirid(ri); - location->flags = 0; btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); location->offset = 0; @@ -908,11 +943,14 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); + mutex_lock(&root->fs_info->fs_mutex); ret = btrfs_inode_by_name(dir, dentry, &location); mutex_unlock(&root->fs_info->fs_mutex); + if (ret < 0) return ERR_PTR(ret); + inode = NULL; if (location.objectid) { ret = fixup_tree_root_location(root, &location, &sub_root, @@ -952,10 +990,11 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct btrfs_item *item; struct btrfs_dir_item *di; struct btrfs_key key; + struct btrfs_key found_key; struct btrfs_path *path; int ret; u32 nritems; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; int slot; int advance; unsigned char d_type; @@ -964,15 +1003,19 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir) u32 di_total; u32 di_len; int key_type = BTRFS_DIR_INDEX_KEY; + char tmp_name[32]; + char *name_ptr; + int name_len; /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) key_type = BTRFS_DIR_ITEM_KEY; + mutex_lock(&root->fs_info->fs_mutex); key.objectid = inode->i_ino; - key.flags = 0; btrfs_set_key_type(&key, key_type); key.offset = filp->f_pos; + path = btrfs_alloc_path(); path->reada = 2; ret = 
btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -980,16 +1023,16 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto err; advance = 0; while(1) { - leaf = btrfs_buffer_leaf(path->nodes[0]); - nritems = btrfs_header_nritems(&leaf->header); + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; if (advance || slot >= nritems) { if (slot >= nritems -1) { ret = btrfs_next_leaf(root, path); if (ret) break; - leaf = btrfs_buffer_leaf(path->nodes[0]); - nritems = btrfs_header_nritems(&leaf->header); + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; } else { slot++; @@ -997,28 +1040,48 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } } advance = 1; - item = leaf->items + slot; - if (btrfs_disk_key_objectid(&item->key) != key.objectid) + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) break; - if (btrfs_disk_key_type(&item->key) != key_type) + if (btrfs_key_type(&found_key) != key_type) break; - if (btrfs_disk_key_offset(&item->key) < filp->f_pos) + if (found_key.offset < filp->f_pos) continue; - filp->f_pos = btrfs_disk_key_offset(&item->key); + + filp->f_pos = found_key.offset; advance = 1; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); di_cur = 0; - di_total = btrfs_item_size(leaf->items + slot); + di_total = btrfs_item_size(leaf, item); while(di_cur < di_total) { - d_type = btrfs_filetype_table[btrfs_dir_type(di)]; - over = filldir(dirent, (const char *)(di + 1), - btrfs_dir_name_len(di), - btrfs_disk_key_offset(&item->key), - btrfs_disk_key_objectid(&di->location), + struct btrfs_key location; + + name_len = btrfs_dir_name_len(leaf, di); + if (name_len < 32) { + name_ptr = tmp_name; + } else { + name_ptr = kmalloc(name_len, GFP_NOFS); + BUG_ON(!name_ptr); + } + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + + d_type 
= btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + btrfs_dir_item_key_to_cpu(leaf, di, &location); + + over = filldir(dirent, name_ptr, name_len, + found_key.offset, + location.objectid, d_type); + + if (name_ptr != tmp_name) + kfree(name_ptr); + if (over) goto nopos; - di_len = btrfs_dir_name_len(di) + sizeof(*di); + di_len = btrfs_dir_name_len(leaf, di) + sizeof(*di); di_cur += di_len; di = (struct btrfs_dir_item *)((char *)di + di_len); } @@ -1075,11 +1138,15 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, int mode) { struct inode *inode; - struct btrfs_inode_item inode_item; + struct btrfs_inode_item *inode_item; struct btrfs_key *location; + struct btrfs_path *path; int ret; int owner; + path = btrfs_alloc_path(); + BUG_ON(!path); + inode = new_inode(root->fs_info->sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -1095,24 +1162,32 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, group = btrfs_find_block_group(root, group, 0, 0, owner); BTRFS_I(inode)->block_group = group; + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto fail; + inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; inode->i_mode = mode; inode->i_ino = objectid; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - fill_inode_item(&inode_item, inode); + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(path->nodes[0], inode_item, inode); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + location = &BTRFS_I(inode)->location; location->objectid = objectid; - location->flags = 0; location->offset = 0; btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - ret = btrfs_insert_inode(trans, root, objectid, &inode_item); - if (ret) - return ERR_PTR(ret); insert_inode_hash(inode); return inode; +fail: + btrfs_free_path(path); + return ERR_PTR(ret); } static inline u8 btrfs_inode_type(struct inode *inode) @@ 
-1127,8 +1202,8 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root; struct inode *parent_inode; + key.objectid = inode->i_ino; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; @@ -1285,14 +1360,18 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); atomic_inc(&inode->i_count); err = btrfs_add_nondir(trans, dentry, inode); + if (err) drop_inode = 1; + dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); + if (err) drop_inode = 1; @@ -1321,13 +1400,13 @@ static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans, key.objectid = objectid; key.offset = 0; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid, &key, BTRFS_FT_DIR); if (ret) goto error; + key.objectid = dirid; ret = btrfs_insert_dir_item(trans, root, buf, 2, objectid, &key, BTRFS_FT_DIR); @@ -1350,6 +1429,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); + if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_unlock; @@ -1367,6 +1447,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) err = PTR_ERR(inode); goto out_fail; } + drop_on_err = 1; inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; @@ -1380,9 +1461,11 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) err = btrfs_update_inode(trans, root, inode); if (err) goto out_fail; + err = btrfs_add_link(trans, dentry, inode); if (err) goto out_fail; + d_instantiate(dentry, inode); drop_on_err = 0; 
dir->i_sb->s_dirt = 1; @@ -1392,6 +1475,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) out_fail: nr = trans->blocks_used; btrfs_end_transaction(trans, root); + out_unlock: mutex_unlock(&root->fs_info->fs_mutex); if (drop_on_err) @@ -1415,8 +1499,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_file_extent_item *item; - struct btrfs_leaf *leaf; - struct btrfs_disk_key *found_key; + struct extent_buffer *leaf; + struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct btrfs_trans_handle *trans = NULL; @@ -1436,8 +1520,8 @@ again: err = -ENOMEM; goto out; } - em->start = 0; - em->end = 0; + em->start = EXTENT_MAP_HOLE; + em->end = EXTENT_MAP_HOLE; } em->bdev = inode->i_sb->s_bdev; ret = btrfs_lookup_file_extent(NULL, root, path, @@ -1453,25 +1537,27 @@ again: path->slots[0]--; } - item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0], + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - leaf = btrfs_buffer_leaf(path->nodes[0]); - blocknr = btrfs_file_extent_disk_blocknr(item); - blocknr += btrfs_file_extent_offset(item); + + blocknr = btrfs_file_extent_disk_blocknr(leaf, item); + blocknr += btrfs_file_extent_offset(leaf, item); /* are we inside the extent that was found? 
*/ - found_key = &leaf->items[path->slots[0]].key; - found_type = btrfs_disk_key_type(found_key); - if (btrfs_disk_key_objectid(found_key) != objectid || + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + if (found_key.objectid != objectid || found_type != BTRFS_EXTENT_DATA_KEY) { goto not_found; } - found_type = btrfs_file_extent_type(item); - extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key); + found_type = btrfs_file_extent_type(leaf, item); + extent_start = found_key.offset; if (found_type == BTRFS_FILE_EXTENT_REG) { extent_end = extent_start + - (btrfs_file_extent_num_blocks(item) << inode->i_blkbits); + (btrfs_file_extent_num_blocks(leaf, item) << + inode->i_blkbits); err = 0; if (start < extent_start || start >= extent_end) { em->start = start; @@ -1484,28 +1570,29 @@ again: } goto not_found_em; } - if (btrfs_file_extent_disk_blocknr(item) == 0) { + if (btrfs_file_extent_disk_blocknr(leaf, item) == 0) { em->start = extent_start; em->end = extent_end - 1; - em->block_start = 0; - em->block_end = 0; + em->block_start = EXTENT_MAP_HOLE; + em->block_end = EXTENT_MAP_HOLE; goto insert; } em->block_start = blocknr << inode->i_blkbits; em->block_end = em->block_start + - (btrfs_file_extent_num_blocks(item) << + (btrfs_file_extent_num_blocks(leaf, item) << inode->i_blkbits) - 1; em->start = extent_start; em->end = extent_end - 1; goto insert; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - char *ptr; + unsigned long ptr; char *map; u32 size; - size = btrfs_file_extent_inline_len(leaf->items + - path->slots[0]); - extent_end = extent_start | ((u64)root->blocksize - 1); + size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf, + path->slots[0])); + + extent_end = extent_start | ((u64)root->sectorsize - 1); if (start < extent_start || start >= extent_end) { em->start = start; if (start < extent_start) { @@ -1517,18 +1604,21 @@ again: } goto not_found_em; } + em->block_start = 
EXTENT_MAP_INLINE; em->block_end = EXTENT_MAP_INLINE; em->start = extent_start; em->end = extent_end; + if (!page) { goto insert; } + ptr = btrfs_file_extent_inline_start(item); map = kmap(page); - memcpy(map + page_offset, ptr, size); + read_extent_buffer(leaf, map + page_offset, ptr, size); memset(map + page_offset + size, 0, - root->blocksize - (page_offset + size)); + root->sectorsize - (page_offset + size)); flush_dcache_page(page); kunmap(page); set_extent_uptodate(em_tree, extent_start, @@ -1542,8 +1632,8 @@ not_found: em->start = start; em->end = end; not_found_em: - em->block_start = 0; - em->block_end = 0; + em->block_start = EXTENT_MAP_HOLE; + em->block_end = EXTENT_MAP_HOLE; insert: btrfs_release_path(root, path); if (em->start > start || em->end < start) { @@ -1712,6 +1802,7 @@ static void btrfs_truncate(struct inode *inode) ret = btrfs_truncate_in_trans(trans, root, inode); btrfs_update_inode(trans, root, inode); nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); BUG_ON(ret); mutex_unlock(&root->fs_info->fs_mutex); @@ -1731,8 +1822,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen) struct btrfs_key key; struct btrfs_root_item root_item; struct btrfs_inode_item *inode_item; - struct buffer_head *subvol; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; struct btrfs_root *new_root; struct inode *inode; struct inode *dir; @@ -1746,34 +1836,37 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen) trans = btrfs_start_transaction(root, 1); BUG_ON(!trans); - subvol = btrfs_alloc_free_block(trans, root, 0, 0); - if (IS_ERR(subvol)) - return PTR_ERR(subvol); - leaf = btrfs_buffer_leaf(subvol); - btrfs_set_header_nritems(&leaf->header, 0); - btrfs_set_header_level(&leaf->header, 0); - btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol)); - btrfs_set_header_generation(&leaf->header, trans->transid); - btrfs_set_header_owner(&leaf->header, root->root_key.objectid); - 
memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid, - sizeof(leaf->header.fsid)); - btrfs_mark_buffer_dirty(subvol); + leaf = btrfs_alloc_free_block(trans, root, 0, 0); + if (IS_ERR(leaf)) + return PTR_ERR(leaf); + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_blocknr(leaf, extent_buffer_blocknr(leaf)); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, root->root_key.objectid); + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); inode_item = &root_item.inode; memset(inode_item, 0, sizeof(*inode_item)); - btrfs_set_inode_generation(inode_item, 1); - btrfs_set_inode_size(inode_item, 3); - btrfs_set_inode_nlink(inode_item, 1); - btrfs_set_inode_nblocks(inode_item, 1); - btrfs_set_inode_mode(inode_item, S_IFDIR | 0755); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nblocks = cpu_to_le64(1); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); - btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol)); + btrfs_set_root_blocknr(&root_item, extent_buffer_blocknr(leaf)); btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_blocks_used(&root_item, 0); + btrfs_set_root_used(&root_item, 0); + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); root_item.drop_level = 0; - brelse(subvol); - subvol = NULL; + + free_extent_buffer(leaf); + leaf = NULL; ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, 0, &objectid); @@ -1784,7 +1877,6 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen) key.objectid = objectid; key.offset = 1; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); @@ -1845,7 +1937,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) struct 
btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item new_root_item; - struct buffer_head *tmp; + struct extent_buffer *tmp; int ret; int err; u64 objectid; @@ -1876,10 +1968,11 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) key.objectid = objectid; key.offset = 1; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp); - btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node)); + btrfs_set_root_blocknr(&new_root_item, + extent_buffer_blocknr(root->node)); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &new_root_item); @@ -1904,8 +1997,10 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) fail: nr = trans->blocks_used; err = btrfs_commit_transaction(trans, root); + if (err && !ret) ret = err; + mutex_unlock(&root->fs_info->fs_mutex); up_write(&root->snap_sem); btrfs_btree_balance_dirty(root, nr); @@ -1986,7 +2081,7 @@ static int btrfs_ioctl_snap_create(struct btrfs_root *root, void __user *arg) if (copy_from_user(&vol_args, arg, sizeof(vol_args))) return -EFAULT; - + namelen = strlen(vol_args.name); if (namelen > BTRFS_VOL_NAME_MAX) return -EINVAL; @@ -2164,8 +2259,10 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry, new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { return -ENOTEMPTY; } + mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, new_dir); path = btrfs_alloc_path(); if (!path) { @@ -2177,9 +2274,10 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry, old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; + if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) { struct btrfs_key *location = &BTRFS_I(new_dir)->location; - u64 old_parent_oid; + struct btrfs_key old_parent_key; di = btrfs_lookup_dir_item(trans, root, 
path, old_inode->i_ino, "..", 2, -1); if (IS_ERR(di)) { @@ -2190,7 +2288,7 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry, ret = -ENOENT; goto out_fail; } - old_parent_oid = btrfs_disk_key_objectid(&di->location); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &old_parent_key); ret = btrfs_del_item(trans, root, path); if (ret) { goto out_fail; @@ -2199,7 +2297,7 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry, di = btrfs_lookup_dir_index_item(trans, root, path, old_inode->i_ino, - old_parent_oid, + old_parent_key.objectid, "..", 2, -1); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -2257,8 +2355,9 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, u64 objectid; int name_len; int datasize; - char *ptr; + unsigned long ptr; struct btrfs_file_extent_item *ei; + struct extent_buffer *leaf; unsigned long nr; name_len = strlen(symname) + 1; @@ -2302,7 +2401,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, BUG_ON(!path); key.objectid = inode->i_ino; key.offset = 0; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, @@ -2311,16 +2409,17 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, drop_inode = 1; goto out_unlock; } - ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(ei, trans->transid); - btrfs_set_file_extent_type(ei, + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); ptr = btrfs_file_extent_inline_start(ei); - btrfs_memcpy(root, path->nodes[0]->b_data, - ptr, symname, name_len); - btrfs_mark_buffer_dirty(path->nodes[0]); + write_extent_buffer(leaf, symname, ptr, 
name_len); + btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; inode->i_size = name_len - 1; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index bafa78603bd2..a825ce078a54 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -20,10 +20,10 @@ #include "disk-io.h" #include "print-tree.h" -void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; - u32 nr = btrfs_header_nritems(&l->header); + u32 nr = btrfs_header_nritems(l); struct btrfs_item *item; struct btrfs_extent_item *ei; struct btrfs_root_item *ri; @@ -31,119 +31,113 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l) struct btrfs_inode_item *ii; struct btrfs_block_group_item *bi; struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; u32 type; printk("leaf %llu total ptrs %d free space %d\n", - (unsigned long long)btrfs_header_blocknr(&l->header), nr, + (unsigned long long)btrfs_header_blocknr(l), nr, btrfs_leaf_free_space(root, l)); for (i = 0 ; i < nr ; i++) { - item = l->items + i; - type = btrfs_disk_key_type(&item->key); + item = btrfs_item_nr(l, i); + btrfs_item_key_to_cpu(l, &key, i); + type = btrfs_key_type(&key); printk("\titem %d key (%llu %x %llu) itemoff %d itemsize %d\n", i, - (unsigned long long)btrfs_disk_key_objectid(&item->key), - btrfs_disk_key_flags(&item->key), - (unsigned long long)btrfs_disk_key_offset(&item->key), - btrfs_item_offset(item), - btrfs_item_size(item)); + (unsigned long long)key.objectid, type, + (unsigned long long)key.offset, + btrfs_item_offset(l, item), btrfs_item_size(l, item)); switch (type) { case BTRFS_INODE_ITEM_KEY: ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); printk("\t\tinode generation %llu size %llu mode %o\n", - (unsigned long long)btrfs_inode_generation(ii), - (unsigned long 
long)btrfs_inode_size(ii), - btrfs_inode_mode(ii)); + (unsigned long long)btrfs_inode_generation(l, ii), + (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_mode(l, ii)); break; case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(l, di, &found_key); printk("\t\tdir oid %llu flags %u type %u\n", - (unsigned long long)btrfs_disk_key_objectid( - &di->location), - btrfs_dir_flags(di), - btrfs_dir_type(di)); - printk("\t\tname %.*s\n", - btrfs_dir_name_len(di),(char *)(di + 1)); + (unsigned long long)found_key.objectid, + btrfs_dir_flags(l, di), + btrfs_dir_type(l, di)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); printk("\t\troot data blocknr %llu refs %u\n", - (unsigned long long)btrfs_root_blocknr(ri), - btrfs_root_refs(ri)); + (unsigned long long)btrfs_disk_root_blocknr(l, ri), + btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); printk("\t\textent data refs %u\n", - btrfs_extent_refs(ei)); + btrfs_extent_refs(l, ei)); break; case BTRFS_EXTENT_DATA_KEY: fi = btrfs_item_ptr(l, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(fi) == + if (btrfs_file_extent_type(l, fi) == BTRFS_FILE_EXTENT_INLINE) { printk("\t\tinline extent data size %u\n", - btrfs_file_extent_inline_len(l->items + i)); + btrfs_file_extent_inline_len(l, item)); break; } printk("\t\textent data disk block %llu nr %llu\n", - (unsigned long long)btrfs_file_extent_disk_blocknr(fi), - (unsigned long long)btrfs_file_extent_disk_num_blocks(fi)); + (unsigned long long)btrfs_file_extent_disk_blocknr(l, fi), + (unsigned long long)btrfs_file_extent_disk_num_blocks(l, fi)); printk("\t\textent data offset %llu nr %llu\n", - (unsigned long long)btrfs_file_extent_offset(fi), - (unsigned long long)btrfs_file_extent_num_blocks(fi)); + (unsigned long long)btrfs_file_extent_offset(l, fi), + (unsigned long long)btrfs_file_extent_num_blocks(l, 
fi)); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); printk("\t\tblock group used %llu\n", - (unsigned long long)btrfs_block_group_used(bi)); - break; - case BTRFS_STRING_ITEM_KEY: - printk("\t\titem data %.*s\n", btrfs_item_size(item), - btrfs_leaf_data(l) + btrfs_item_offset(item)); + (unsigned long long)btrfs_disk_block_group_used(l, bi)); break; }; } } -void btrfs_print_tree(struct btrfs_root *root, struct buffer_head *t) +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) { int i; u32 nr; - struct btrfs_node *c; + struct btrfs_key key; - if (!t) + if (!c) return; - c = btrfs_buffer_node(t); - nr = btrfs_header_nritems(&c->header); + nr = btrfs_header_nritems(c); if (btrfs_is_leaf(c)) { - btrfs_print_leaf(root, (struct btrfs_leaf *)c); + btrfs_print_leaf(root, c); return; } printk("node %llu level %d total ptrs %d free spc %u\n", - (unsigned long long)btrfs_header_blocknr(&c->header), - btrfs_header_level(&c->header), nr, + (unsigned long long)btrfs_header_blocknr(c), + btrfs_header_level(c), nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { + btrfs_node_key_to_cpu(c, &key, i); printk("\tkey %d (%llu %u %llu) block %llu\n", i, - (unsigned long long)c->ptrs[i].key.objectid, - c->ptrs[i].key.flags, - (unsigned long long)c->ptrs[i].key.offset, + (unsigned long long)key.objectid, + key.type, + (unsigned long long)key.offset, (unsigned long long)btrfs_node_blockptr(c, i)); } for (i = 0; i < nr; i++) { - struct buffer_head *next_buf = read_tree_block(root, + struct extent_buffer *next = read_tree_block(root, btrfs_node_blockptr(c, i)); - struct btrfs_node *next = btrfs_buffer_node(next_buf); if (btrfs_is_leaf(next) && - btrfs_header_level(&c->header) != 1) + btrfs_header_level(c) != 1) BUG(); - if (btrfs_header_level(&next->header) != - btrfs_header_level(&c->header) - 1) + if (btrfs_header_level(next) != + btrfs_header_level(c) - 1) BUG(); - btrfs_print_tree(root, 
next_buf); - btrfs_block_release(root, next_buf); + btrfs_print_tree(root, next); + free_extent_buffer(next); } } diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 205cd03601ab..da75efe534d5 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -18,6 +18,6 @@ #ifndef __PRINT_TREE_ #define __PRINT_TREE_ -void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l); -void btrfs_print_tree(struct btrfs_root *root, struct buffer_head *t); +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); #endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3b5926dfbeba..88bcdd33f56e 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -26,12 +26,13 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, { struct btrfs_path *path; struct btrfs_key search_key; - struct btrfs_leaf *l; + struct btrfs_key found_key; + struct extent_buffer *l; int ret; int slot; search_key.objectid = objectid; - search_key.flags = (u32)-1; + search_key.type = (u8)-1; search_key.offset = (u64)-1; path = btrfs_alloc_path(); @@ -39,17 +40,19 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto out; + BUG_ON(ret == 0); - l = btrfs_buffer_leaf(path->nodes[0]); + l = path->nodes[0]; BUG_ON(path->slots[0] == 0); slot = path->slots[0] - 1; - if (btrfs_disk_key_objectid(&l->items[slot].key) != objectid) { + btrfs_item_key_to_cpu(l, &found_key, slot); + if (found_key.objectid != objectid) { ret = 1; goto out; } - memcpy(item, btrfs_item_ptr(l, slot, struct btrfs_root_item), - sizeof(*item)); - btrfs_disk_key_to_cpu(key, &l->items[slot].key); + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), + sizeof(*item)); + memcpy(key, &found_key, sizeof(found_key)); ret = 0; out: btrfs_release_path(root, path); @@ -62,10 +65,10 @@ int 
btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *item) { struct btrfs_path *path; - struct btrfs_leaf *l; + struct extent_buffer *l; int ret; int slot; - struct btrfs_root_item *update_item; + unsigned long ptr; path = btrfs_alloc_path(); BUG_ON(!path); @@ -73,10 +76,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret < 0) goto out; BUG_ON(ret != 0); - l = btrfs_buffer_leaf(path->nodes[0]); + l = path->nodes[0]; slot = path->slots[0]; - update_item = btrfs_item_ptr(l, slot, struct btrfs_root_item); - btrfs_memcpy(root, l, update_item, item, sizeof(*item)); + ptr = btrfs_item_ptr_offset(l, slot); + write_extent_buffer(l, item, ptr, sizeof(*item)); btrfs_mark_buffer_dirty(path->nodes[0]); out: btrfs_release_path(root, path); @@ -103,11 +106,10 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, struct btrfs_path *path; int ret; u32 nritems; - struct btrfs_leaf *leaf; + struct extent_buffer *leaf; int slot; key.objectid = objectid; - key.flags = 0; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = 0; path = btrfs_alloc_path(); @@ -117,19 +119,19 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, if (ret < 0) goto err; while(1) { - leaf = btrfs_buffer_leaf(path->nodes[0]); - nritems = btrfs_header_nritems(&leaf->header); + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; if (slot >= nritems) { ret = btrfs_next_leaf(root, path); if (ret) break; - leaf = btrfs_buffer_leaf(path->nodes[0]); - nritems = btrfs_header_nritems(&leaf->header); + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; } - item = leaf->items + slot; - btrfs_disk_key_to_cpu(&key, &item->key); + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &key, slot); if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) goto next; @@ -140,7 +142,7 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, break; ri = 
btrfs_item_ptr(leaf, slot, struct btrfs_root_item); - if (btrfs_root_refs(ri) != 0) + if (btrfs_disk_root_refs(leaf, ri) != 0) goto next; dead_root = btrfs_read_fs_root_no_radix(root->fs_info, &key); @@ -170,6 +172,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; u32 refs; struct btrfs_root_item *ri; + struct extent_buffer *leaf; path = btrfs_alloc_path(); BUG_ON(!path); @@ -177,10 +180,10 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (ret < 0) goto out; BUG_ON(ret != 0); - ri = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), - path->slots[0], struct btrfs_root_item); + leaf = path->nodes[0]; + ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); - refs = btrfs_root_refs(ri); + refs = btrfs_disk_root_refs(leaf, ri); BUG_ON(refs != 0); ret = btrfs_del_item(trans, root, path); out: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 66a01cbbbea1..39a1435c68f1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -41,7 +41,7 @@ #include "ioctl.h" #include "print-tree.h" -#define BTRFS_SUPER_MAGIC 0x9123682E +#define BTRFS_SUPER_MAGIC 0x9123683E static struct super_operations btrfs_super_ops; @@ -115,13 +115,12 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent) return -EIO; } sb->s_fs_info = tree_root; - disk_super = tree_root->fs_info->disk_super; + disk_super = &tree_root->fs_info->super_copy; inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super), tree_root); bi = BTRFS_I(inode); bi->location.objectid = inode->i_ino; bi->location.offset = 0; - bi->location.flags = 0; bi->root = tree_root; btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); @@ -281,6 +280,7 @@ error_s: error_bdev: close_bdev_excl(bdev); error: +printk("get_sb failed\n"); return error; } /* end copy & paste */ @@ -295,6 +295,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data, btrfs_fill_super, 
mnt, subvol_name ? subvol_name : "default"); +printk("btrfs_get_sb returns %d\n", ret); return ret; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 2058783373eb..9654e90eec89 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -31,31 +31,31 @@ static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_blocks_used(&root->root_item)); + (unsigned long long)btrfs_root_used(&root->root_item)); } static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_block_limit(&root->root_item)); + (unsigned long long)btrfs_root_limit(&root->root_item)); } static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_blocks_used(fs->disk_super)); + (unsigned long long)btrfs_super_blocks_used(&fs->super_copy)); } static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_total_blocks(fs->disk_super)); + (unsigned long long)btrfs_super_total_blocks(&fs->super_copy)); } static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_blocksize(fs->disk_super)); + (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); } /* this is for root attrs (subvols/snapshots) */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 087074db0bd5..750f35a37aae 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -66,7 +67,9 @@ static int join_transaction(struct btrfs_root *root) cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); list_add_tail(&cur_trans->list, 
&root->fs_info->trans_list); - init_bit_radix(&cur_trans->dirty_pages); + extent_map_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping, + GFP_NOFS); } else { cur_trans->num_writers++; cur_trans->num_joined++; @@ -88,7 +91,7 @@ static int record_root_in_trans(struct btrfs_root *root) (unsigned long)root->root_key.objectid, BTRFS_ROOT_DEFRAG_TAG); root->commit_root = root->node; - get_bh(root->node); + extent_buffer_get(root->node); } else { WARN_ON(1); } @@ -144,29 +147,30 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans, int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - unsigned long gang[16]; int ret; - int i; int err; int werr = 0; + struct extent_map_tree *dirty_pages; struct page *page; - struct radix_tree_root *dirty_pages; struct inode *btree_inode = root->fs_info->btree_inode; + u64 start; + u64 end; + unsigned long index; if (!trans || !trans->transaction) { return filemap_write_and_wait(btree_inode->i_mapping); } dirty_pages = &trans->transaction->dirty_pages; while(1) { - ret = find_first_radix_bit(dirty_pages, gang, - 0, ARRAY_SIZE(gang)); - if (!ret) + ret = find_first_extent_bit(dirty_pages, 0, &start, &end, + EXTENT_DIRTY); + if (ret) break; - for (i = 0; i < ret; i++) { - /* FIXME EIO */ - clear_radix_bit(dirty_pages, gang[i]); - page = find_lock_page(btree_inode->i_mapping, - gang[i]); + clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); + while(start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (index + 1) << PAGE_CACHE_SHIFT; + page = find_lock_page(btree_inode->i_mapping, index); if (!page) continue; if (PageWriteback(page)) { @@ -202,10 +206,11 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, btrfs_write_dirty_block_groups(trans, extent_root); while(1) { old_extent_block = btrfs_root_blocknr(&extent_root->root_item); - if (old_extent_block == bh_blocknr(extent_root->node)) + if (old_extent_block == + 
extent_buffer_blocknr(extent_root->node)) break; btrfs_set_root_blocknr(&extent_root->root_item, - bh_blocknr(extent_root->node)); + extent_buffer_blocknr(extent_root->node)); ret = btrfs_update_root(trans, tree_root, &extent_root->root_key, &extent_root->root_item); @@ -279,9 +284,9 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); if (root->commit_root == root->node) { - WARN_ON(bh_blocknr(root->node) != + WARN_ON(extent_buffer_blocknr(root->node) != btrfs_root_blocknr(&root->root_item)); - brelse(root->commit_root); + free_extent_buffer(root->commit_root); root->commit_root = NULL; /* make sure to update the root on disk @@ -310,7 +315,7 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans, root->root_key.offset = root->fs_info->generation; btrfs_set_root_blocknr(&root->root_item, - bh_blocknr(root->node)); + extent_buffer_blocknr(root->node)); err = btrfs_insert_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); @@ -389,10 +394,10 @@ int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info) for (i = 0; i < ret; i++) { root = gang[i]; last = root->root_key.objectid + 1; - btrfs_defrag_root(root, 1); + // btrfs_defrag_root(root, 1); } } - btrfs_defrag_root(info->extent_root, 1); + // btrfs_defrag_root(info->extent_root, 1); return err; } @@ -414,7 +419,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root, dirty = list_entry(list->next, struct dirty_root, list); list_del_init(&dirty->list); - num_blocks = btrfs_root_blocks_used(&dirty->root->root_item); + num_blocks = btrfs_root_used(&dirty->root->root_item); root = dirty->latest_root; while(1) { @@ -441,11 +446,11 @@ static int drop_dirty_roots(struct btrfs_root *tree_root, } BUG_ON(ret); - num_blocks -= btrfs_root_blocks_used(&dirty->root->root_item); - blocks_used = btrfs_root_blocks_used(&root->root_item); + num_blocks -= btrfs_root_used(&dirty->root->root_item); + blocks_used = 
btrfs_root_used(&root->root_item); if (num_blocks) { record_root_in_trans(root); - btrfs_set_root_blocks_used(&root->root_item, + btrfs_set_root_used(&root->root_item, blocks_used - num_blocks); } ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); @@ -553,9 +558,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_super_generation(&root->fs_info->super_copy, cur_trans->transid); btrfs_set_super_root(&root->fs_info->super_copy, - bh_blocknr(root->fs_info->tree_root->node)); - memcpy(root->fs_info->disk_super, &root->fs_info->super_copy, - sizeof(root->fs_info->super_copy)); + extent_buffer_blocknr(root->fs_info->tree_root->node)); + + write_extent_buffer(root->fs_info->sb_buffer, + &root->fs_info->super_copy, 0, + sizeof(root->fs_info->super_copy)); btrfs_copy_pinned(root, &pinned_copy); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 4bc328cbb24c..ae39fcfc169a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -28,7 +28,7 @@ struct btrfs_transaction { int use_count; int commit_done; struct list_head list; - struct radix_tree_root dirty_pages; + struct extent_map_tree dirty_pages; unsigned long start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; @@ -83,5 +83,6 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest, int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info); int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); int btrfs_clean_old_snapshots(struct btrfs_root *root); - +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 420597127ed1..daf019afa0a1 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -23,14 +23,14 @@ #include "transaction.h" static void reada_defrag(struct btrfs_root *root, - struct btrfs_node *node) + struct extent_buffer *node) { int i; u32 nritems; u64 blocknr; int ret; - nritems = 
btrfs_header_nritems(&node->header); + nritems = btrfs_header_nritems(node); for (i = 0; i < nritems; i++) { blocknr = btrfs_node_blockptr(node, i); ret = readahead_tree_block(root, blocknr); @@ -44,8 +44,8 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, int cache_only, u64 *last_ret) { - struct buffer_head *next; - struct buffer_head *cur; + struct extent_buffer *next; + struct extent_buffer *cur; u64 blocknr; int ret = 0; int is_extent = 0; @@ -62,13 +62,13 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans, cur = path->nodes[*level]; if (!cache_only && *level > 1 && path->slots[*level] == 0) - reada_defrag(root, btrfs_buffer_node(cur)); + reada_defrag(root, cur); - if (btrfs_header_level(btrfs_buffer_header(cur)) != *level) + if (btrfs_header_level(cur) != *level) WARN_ON(1); if (path->slots[*level] >= - btrfs_header_nritems(btrfs_buffer_header(cur))) + btrfs_header_nritems(cur)) break; if (*level == 1) { @@ -80,14 +80,13 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans, break; } - blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur), - path->slots[*level]); + blocknr = btrfs_node_blockptr(cur, path->slots[*level]); if (cache_only) { next = btrfs_find_tree_block(root, blocknr); - if (!next || !buffer_uptodate(next) || - buffer_locked(next) || !buffer_defrag(next)) { - brelse(next); + /* FIXME, test for defrag */ + if (!next || !btrfs_buffer_uptodate(next)) { + free_extent_buffer(next); path->slots[*level]++; continue; } @@ -106,16 +105,18 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans, WARN_ON(*level <= 0); if (path->nodes[*level-1]) - btrfs_block_release(root, path->nodes[*level-1]); + free_extent_buffer(path->nodes[*level-1]); path->nodes[*level-1] = next; - *level = btrfs_header_level(btrfs_buffer_header(next)); + *level = btrfs_header_level(next); path->slots[*level] = 0; } WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); +#if 0 
clear_buffer_defrag(path->nodes[*level]); clear_buffer_defrag_done(path->nodes[*level]); - btrfs_block_release(root, path->nodes[*level]); +#endif + free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; *level += 1; WARN_ON(ret); @@ -129,24 +130,25 @@ static int defrag_walk_up(struct btrfs_trans_handle *trans, { int i; int slot; - struct btrfs_node *node; + struct extent_buffer *node; for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot < btrfs_header_nritems( - btrfs_buffer_header(path->nodes[i])) - 1) { + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { path->slots[i]++; *level = i; - node = btrfs_buffer_node(path->nodes[i]); + node = path->nodes[i]; WARN_ON(i == 0); - btrfs_disk_key_to_cpu(&root->defrag_progress, - &node->ptrs[path->slots[i]].key); + btrfs_node_key_to_cpu(node, &root->defrag_progress, + path->slots[i]); root->defrag_level = i; return 0; } else { + /* clear_buffer_defrag(path->nodes[*level]); clear_buffer_defrag_done(path->nodes[*level]); - btrfs_block_release(root, path->nodes[*level]); + */ + free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; *level = i + 1; } @@ -158,7 +160,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root, int cache_only) { struct btrfs_path *path = NULL; - struct buffer_head *tmp; + struct extent_buffer *tmp; int ret = 0; int wret; int level; @@ -172,17 +174,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, if (root->ref_cows == 0 && !is_extent) goto out; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; - level = btrfs_header_level(btrfs_buffer_header(root->node)); + level = btrfs_header_level(root->node); orig_level = level; if (level == 0) { goto out; } if (root->defrag_progress.objectid == 0) { - get_bh(root->node); + extent_buffer_get(root->node); ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp); BUG_ON(ret); ret = btrfs_realloc_node(trans, root, root->node, 
cache_only, @@ -200,12 +203,15 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, if (is_extent) btrfs_extent_post_op(trans, root); + if (wret < 0) { ret = wret; goto out; } + while(level > 0 && !path->nodes[level]) level--; + if (!path->nodes[level]) { ret = 0; goto out; @@ -230,7 +236,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, } for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { - btrfs_block_release(root, path->nodes[i]); + free_extent_buffer(path->nodes[i]); path->nodes[i] = 0; } } -- cgit v1.2.3 From 0f82731fc56448c2733f58e1f5db6c2cbfc90652 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 15 Oct 2007 16:18:56 -0400 Subject: Breakout BTRFS_SETGET_FUNCS into a separate C file, the inlines were too big. Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.c | 1 - fs/btrfs/ctree.h | 74 ++----------------------------------- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/struct-funcs.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tree-defrag.c | 2 +- 6 files changed, 104 insertions(+), 74 deletions(-) create mode 100644 fs/btrfs/struct-funcs.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a3b51085d7f0..551743be5f03 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -5,7 +5,7 @@ obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o + extent_map.o sysfs.o struct-funcs.o #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o dir-item.o hash.o file-item.o inode-item.o \ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 54a5d006c562..0c6ed17ac1bc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -16,7 +16,6 @@ * Boston, MA 021110-1307, USA. 
*/ -#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a942a2427228..d1c6f023a302 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -426,77 +426,11 @@ struct btrfs_root { offsetof(type, member), \ sizeof(((type *)0)->member))) +#ifndef BTRFS_SETGET_FUNCS #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(struct extent_buffer *eb, \ - type *s) \ -{ \ - int err; \ - char *map_token; \ - char *kaddr; \ - int unmap_on_exit = (eb->map_token == NULL); \ - unsigned long map_start; \ - unsigned long map_len; \ - unsigned long offset = (unsigned long)s + \ - offsetof(type, member); \ - if (eb->map_token && offset >= eb->map_start && \ - offset + sizeof(((type *)0)->member) <= eb->map_start + \ - eb->map_len) { \ - kaddr = eb->kaddr; \ - map_start = eb->map_start; \ - err = 0; \ - } else { \ - err = map_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ - &map_token, &kaddr, \ - &map_start, &map_len, KM_USER1); \ - } \ - if (!err) { \ - __le##bits *tmp = (__le##bits *)(kaddr + offset - \ - map_start); \ - u##bits res = le##bits##_to_cpu(*tmp); \ - if (unmap_on_exit) \ - unmap_extent_buffer(eb, map_token, KM_USER1); \ - return res; \ - } else { \ - __le##bits res; \ - read_eb_member(eb, s, type, member, &res); \ - return le##bits##_to_cpu(res); \ - } \ -} \ -static inline void btrfs_set_##name(struct extent_buffer *eb, \ - type *s, u##bits val) \ -{ \ - int err; \ - char *map_token; \ - char *kaddr; \ - unsigned long map_start; \ - unsigned long map_len; \ - int unmap_on_exit = (eb->map_token == NULL); \ - unsigned long offset = (unsigned long)s + \ - offsetof(type, member); \ - if (eb->map_token && offset >= eb->map_start && \ - offset + sizeof(((type *)0)->member) <= eb->map_start + \ - eb->map_len) { \ - kaddr = eb->kaddr; \ - map_start = eb->map_start; \ - err = 0; \ - } else { \ - err = map_extent_buffer(eb, offset, \ - sizeof(((type 
*)0)->member), \ - &map_token, &kaddr, \ - &map_start, &map_len, KM_USER1); \ - } \ - if (!err) { \ - __le##bits *tmp = (__le##bits *)(kaddr + offset - \ - map_start); \ - *tmp = cpu_to_le##bits(val); \ - if (unmap_on_exit) \ - unmap_extent_buffer(eb, map_token, KM_USER1); \ - } else { \ - val = cpu_to_le##bits(val); \ - write_eb_member(eb, s, type, member, &val); \ - } \ -} +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); +#endif #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 525fa845d613..e4e68ea96483 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1443,7 +1443,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { free_extent_buffer(path->nodes[i]); - path->nodes[i] = 0; + path->nodes[i] = NULL; } } out: diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c new file mode 100644 index 000000000000..cf68fcf9b552 --- /dev/null +++ b/fs/btrfs/struct-funcs.c @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + unsigned long offset = (unsigned long)s + \ + offsetof(type, member); \ + __le##bits *tmp; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + tmp = (__le##bits *)(eb->kaddr + offset - \ + eb->map_start); \ + return le##bits##_to_cpu(*tmp); \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + __le##bits res; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + read_eb_member(eb, s, type, member, &res); \ + return le##bits##_to_cpu(res); \ + } \ + tmp = (__le##bits *)(kaddr + offset - map_start); \ + res = le##bits##_to_cpu(*tmp); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + return res; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + unsigned long offset = (unsigned long)s + \ + offsetof(type, member); \ + __le##bits *tmp; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + tmp = (__le##bits *)(eb->kaddr + offset - \ + eb->map_start); \ + *tmp = cpu_to_le##bits(val); \ + return; \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + val = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val); \ + 
return; \ + } \ + tmp = (__le##bits *)(kaddr + offset - map_start); \ + *tmp = cpu_to_le##bits(val); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + } \ +} + +#include "ctree.h" + diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index d23216aa4ab5..0d1a1be0995c 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -239,7 +239,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { free_extent_buffer(path->nodes[i]); - path->nodes[i] = 0; + path->nodes[i] = NULL; } } out: -- cgit v1.2.3 From 5103e947b9b7ac18ddb21a04ee3486e94c6504d7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Nov 2007 11:45:54 -0500 Subject: xattr support for btrfs Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/acl.c | 66 +++++++ fs/btrfs/ctree.h | 22 ++- fs/btrfs/dir-item.c | 100 +++++++++- fs/btrfs/inode.c | 15 +- fs/btrfs/print-tree.c | 3 +- fs/btrfs/super.c | 2 + fs/btrfs/xattr.c | 497 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/xattr.h | 58 ++++++ 9 files changed, 751 insertions(+), 14 deletions(-) create mode 100644 fs/btrfs/acl.c create mode 100644 fs/btrfs/xattr.c create mode 100644 fs/btrfs/xattr.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 551743be5f03..d5804c5ca075 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -5,7 +5,7 @@ obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o struct-funcs.o + extent_map.o sysfs.o struct-funcs.o xattr.o acl.o #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o dir-item.o hash.o file-item.o inode-item.o \ diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c new file mode 100644 index 000000000000..aee9f0657c35 --- /dev/null +++ 
b/fs/btrfs/acl.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include "ctree.h" +#include "xattr.h" + +static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS, name, + value, size); +} + +static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS, name, + value, size, flags); +} + +static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT, + name, value, size); +} + +static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT, + name, value, size, flags); +} + +struct xattr_handler btrfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .list = btrfs_xattr_generic_list, + .get = btrfs_xattr_acl_default_get, + .set = btrfs_xattr_acl_default_set, +}; + +struct xattr_handler btrfs_xattr_acl_access_handler = { + .prefix = 
POSIX_ACL_XATTR_ACCESS, + .list = btrfs_xattr_generic_list, + .get = btrfs_xattr_acl_access_get, + .set = btrfs_xattr_acl_access_set, +}; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 030d21d7f98c..27cadae1af63 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -63,7 +63,8 @@ extern struct kmem_cache *btrfs_path_cachep; #define BTRFS_FT_FIFO 5 #define BTRFS_FT_SOCK 6 #define BTRFS_FT_SYMLINK 7 -#define BTRFS_FT_MAX 8 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 /* * the key defines the order in the tree, and so it also defines (optimal) @@ -226,7 +227,7 @@ struct btrfs_inode_item { struct btrfs_dir_item { struct btrfs_disk_key location; - __le16 flags; + __le16 data_len; __le16 name_len; u8 type; } __attribute__ ((__packed__)); @@ -367,7 +368,7 @@ struct btrfs_root { * the FS */ #define BTRFS_INODE_ITEM_KEY 1 - +#define BTRFS_XATTR_ITEM_KEY 2 /* reserve 2-15 close to the inode for later flexibility */ /* @@ -621,7 +622,7 @@ static inline void btrfs_set_item_key(struct extent_buffer *eb, } /* struct btrfs_dir_item */ -BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, flags, 16); +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); @@ -962,6 +963,15 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + u16 name_len, const void *data, u16 data_len, + u64 dir); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); /* inode-map.c */ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, struct btrfs_root *fs_root, @@ -1039,4 +1049,8 @@ int 
btrfs_sysfs_add_root(struct btrfs_root *root); void btrfs_sysfs_del_root(struct btrfs_root *root); void btrfs_sysfs_del_super(struct btrfs_fs_info *root); +/* xattr.c */ +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); +int btrfs_delete_xattrs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); #endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 514a1dc337a8..ddbe12ae0d63 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -43,8 +43,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle return ERR_PTR(-EEXIST); ret = btrfs_extend_item(trans, root, path, data_size); WARN_ON(ret > 0); - if (ret) - return ERR_PTR(ret); } if (ret < 0) return ERR_PTR(ret); @@ -57,6 +55,57 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle return (struct btrfs_dir_item *)ptr; } +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + u16 name_len, const void *data, u16 data_len, + u64 dir) +{ + int ret = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + u32 data_size; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + ret = btrfs_name_hash(name, name_len, &key.offset); + BUG_ON(ret); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + /* + * FIXME: at some point we should handle xattr's that are larger than + * what we can fit in our leaf. We set location to NULL b/c we arent + * pointing at anything else, that will change if we store the xattr + * data in a separate inode. 
+ */ + BUG_ON(IS_ERR(dir_item)); + memset(&location, 0, sizeof(location)); + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, &location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_data_len(leaf, dir_item, data_len); + name_ptr = (unsigned long)(dir_item + 1); + data_ptr = (unsigned long)((char *)name_ptr + name_len); + + write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_free_path(path); + return ret; +} + int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 dir, struct btrfs_key *location, u8 type) @@ -90,7 +139,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_cpu_key_to_disk(&disk_key, location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_type(leaf, dir_item, type); - btrfs_set_dir_flags(leaf, dir_item, 0); + btrfs_set_dir_data_len(leaf, dir_item, 0); btrfs_set_dir_name_len(leaf, dir_item, name_len); name_ptr = (unsigned long)(dir_item + 1); @@ -117,7 +166,7 @@ second_insert: btrfs_cpu_key_to_disk(&disk_key, location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_type(leaf, dir_item, type); - btrfs_set_dir_flags(leaf, dir_item, 0); + btrfs_set_dir_data_len(leaf, dir_item, 0); btrfs_set_dir_name_len(leaf, dir_item, name_len); name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name, name_ptr, name_len); @@ -194,6 +243,43 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod) +{ + int ret; + struct 
btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + ret = btrfs_name_hash(name, name_len, &key.offset); + BUG_ON(ret); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, const char *name, int name_len) @@ -210,7 +296,8 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, total_len = btrfs_item_size_nr(leaf, path->slots[0]); while(cur < total_len) { this_len = sizeof(*dir_item) + - btrfs_dir_name_len(leaf, dir_item); + btrfs_dir_name_len(leaf, dir_item) + + btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); if (btrfs_dir_name_len(leaf, dir_item) == name_len && @@ -236,7 +323,8 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, int ret = 0; leaf = path->nodes[0]; - sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di); + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); item_len = btrfs_item_size_nr(leaf, path->slots[0]); if (sub_item_len == item_len) { ret = btrfs_del_item(trans, root, path); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d6d32465bc9a..37f8e0321af8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -839,6 +840,9 @@ void 
btrfs_delete_inode(struct inode *inode) btrfs_set_trans_block_group(trans, inode); ret = btrfs_truncate_in_trans(trans, root, inode); + if (ret) + goto no_delete_lock; + ret = btrfs_delete_xattrs(trans, root, inode); if (ret) goto no_delete_lock; ret = btrfs_free_inode(trans, root, inode); @@ -1110,7 +1114,8 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (over) goto nopos; - di_len = btrfs_dir_name_len(leaf, di) + sizeof(*di); + di_len = btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di) +sizeof(*di); di_cur += di_len; di = (struct btrfs_dir_item *)((char *)di + di_len); } @@ -2519,6 +2524,10 @@ static struct inode_operations btrfs_dir_inode_operations = { .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = generic_removexattr, }; static struct inode_operations btrfs_dir_ro_inode_operations = { @@ -2567,6 +2576,10 @@ static struct inode_operations btrfs_file_inode_operations = { .truncate = btrfs_truncate, .getattr = btrfs_getattr, .setattr = btrfs_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = generic_removexattr, }; static struct inode_operations btrfs_special_inode_operations = { diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 9f8696c8a8e8..030324febf6c 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -58,9 +58,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(l, di, &found_key); - printk("\t\tdir oid %llu flags %u type %u\n", + printk("\t\tdir oid %llu type %u\n", (unsigned long long)found_key.objectid, - btrfs_dir_flags(l, di), btrfs_dir_type(l, di)); break; case BTRFS_ROOT_ITEM_KEY: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 
f94aa1f97a0a..c46bc3911798 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -40,6 +40,7 @@ #include "btrfs_inode.h" #include "ioctl.h" #include "print-tree.h" +#include "xattr.h" #define BTRFS_SUPER_MAGIC 0x9123683E @@ -106,6 +107,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; + sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; tree_root = open_ctree(sb); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c new file mode 100644 index 000000000000..f4ac5e0bbad1 --- /dev/null +++ b/fs/btrfs/xattr.c @@ -0,0 +1,497 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */
+
+/*
+ * NOTE(review): the names of the five angle-bracket headers below were lost
+ * from this copy of the patch (only bare "#include" survived).  They are
+ * restored from the symbols this file uses -- confirm against the tree.
+ */
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "xattr.h"
+#include "disk-io.h"
+
+/* Maps a BTRFS_XATTR_INDEX_* value to the handler for that namespace. */
+static struct xattr_handler *btrfs_xattr_handler_map[] = {
+	[BTRFS_XATTR_INDEX_USER]		= &btrfs_xattr_user_handler,
+	[BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS]	= &btrfs_xattr_acl_access_handler,
+	[BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &btrfs_xattr_acl_default_handler,
+	[BTRFS_XATTR_INDEX_TRUSTED]		= &btrfs_xattr_trusted_handler,
+	[BTRFS_XATTR_INDEX_SECURITY]		= &btrfs_xattr_security_handler,
+	[BTRFS_XATTR_INDEX_SYSTEM]		= &btrfs_xattr_system_handler,
+};
+
+/* NULL-terminated list of every handler; this is what sb->s_xattr points at. */
+struct xattr_handler *btrfs_xattr_handlers[] = {
+	&btrfs_xattr_user_handler,
+	&btrfs_xattr_acl_access_handler,
+	&btrfs_xattr_acl_default_handler,
+	&btrfs_xattr_trusted_handler,
+	&btrfs_xattr_security_handler,
+	&btrfs_xattr_system_handler,
+	NULL,
+};
+
+/*
+ * Find the handler whose prefix starts the name stored in an extent buffer.
+ *
+ * @param l        - extent buffer holding the stored xattr name
+ * @param name_ptr - offset of the name inside @l
+ * @param name_len - length of the stored name (prefix included)
+ * @return - the matching xattr_handler, NULL if no prefix matches
+ *
+ * use this with listxattr where we don't already know the type of xattr we
+ * have
+ */
+static struct xattr_handler *find_btrfs_xattr_handler(struct extent_buffer *l,
+						      unsigned long name_ptr,
+						      u16 name_len)
+{
+	struct xattr_handler *handler;
+	int i;
+
+	for (i = 0; (handler = btrfs_xattr_handlers[i]) != NULL; i++) {
+		size_t prefix_len = strlen(handler->prefix);
+
+		/* a name shorter than the prefix can never match it */
+		if (name_len < prefix_len)
+			continue;
+
+		if (memcmp_extent_buffer(l, handler->prefix, name_ptr,
+					 prefix_len) == 0)
+			break;
+	}
+
+	/* NULL here means we walked off the end of the list */
+	return handler;
+}
+
+/*
+ * @param name_index - the index for the xattr handler
+ * @return the xattr_handler if we found it, NULL otherwise
+ *
+ * use this if we know the type of the xattr already
+ */
+static struct xattr_handler *btrfs_xattr_handler(int name_index)
+{
+	if (name_index < 0 ||
+	    name_index >= ARRAY_SIZE(btrfs_xattr_handler_map))
+		return NULL;
+
+	return btrfs_xattr_handler_map[name_index];
+}
+
+/*
+ * Build the full on-disk name "<prefix><name>" for the namespace selected by
+ * name_index.
+ *
+ * @return a kmalloc'ed NUL-terminated string the caller must kfree, or NULL
+ *         if name_index has no handler or the allocation fails
+ */
+static inline char *get_name(const char *name, int name_index)
+{
+	struct xattr_handler *handler = btrfs_xattr_handler(name_index);
+	size_t prefix_len, name_len;
+	char *ret;
+
+	if (!handler)
+		return NULL;
+
+	prefix_len = strlen(handler->prefix);
+	name_len = strlen(name);
+
+	ret = kmalloc(prefix_len + name_len + 1, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	memcpy(ret, handler->prefix, prefix_len);
+	/* + 1 copies the terminating NUL along with the name */
+	memcpy(ret + prefix_len, name, name_len + 1);
+
+	return ret;
+}
+
+/*
+ * ->list callback shared by the handlers below: copy one NUL-terminated name
+ * into the caller's list buffer.
+ *
+ * @return name_len + 1 (bytes consumed) on success.  If list is NULL or the
+ *         name does not fit, -ERANGE is returned; because the return type is
+ *         size_t the caller has to convert it back to a signed type to see
+ *         the error.
+ */
+size_t btrfs_xattr_generic_list(struct inode *inode, char *list,
+				size_t list_size, const char *name,
+				size_t name_len)
+{
+	if (!list || name_len + 1 > list_size)
+		return -ERANGE;
+
+	memcpy(list, name, name_len);
+	list[name_len] = '\0';
+
+	return name_len + 1;
+}
+
+/*
+ * Read the value of one xattr.
+ *
+ * @param inode      - inode the attribute belongs to
+ * @param name_index - BTRFS_XATTR_INDEX_* namespace of the attribute
+ * @param attr_name  - attribute name without the namespace prefix
+ * @param buffer     - destination for the value
+ * @param size       - size of buffer; 0 means "only tell me the length"
+ * @return the value length on success, -EOPNOTSUPP for an unknown namespace,
+ *         -EINVAL for an empty name, -ENOMEM, -ENODATA if the attribute does
+ *         not exist, -ERANGE if buffer is too small, or the error reported by
+ *         the tree lookup
+ */
+ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
+			const char *attr_name, void *buffer, size_t size)
+{
+	struct btrfs_dir_item *di;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct xattr_handler *handler = btrfs_xattr_handler(name_index);
+	ssize_t ret = 0;
+	unsigned long data_ptr;
+	size_t data_len;
+	char *name;
+
+	if (!handler)
+		return -EOPNOTSUPP;
+
+	/* just in case... */
+	if (*attr_name == '\0')
+		return -EINVAL;
+
+	name = get_name(attr_name, name_index);
+	if (!name)
+		return -ENOMEM;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		kfree(name);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	/* lookup the xattr by name */
+	di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
+				strlen(name), 0);
+	if (IS_ERR(di)) {
+		/* a failed lookup is not the same thing as "no such attr" */
+		ret = PTR_ERR(di);
+		goto out;
+	}
+	if (!di) {
+		ret = -ENODATA;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	data_len = btrfs_dir_data_len(leaf, di);
+
+	/* if size is 0, that means we want the size of the attr */
+	if (!size) {
+		ret = data_len;
+		goto out;
+	}
+
+	/* now get the data out of our dir_item */
+	if (data_len > size) {
+		ret = -ERANGE;
+		goto out;
+	}
+
+	/* the value is stored right behind the name */
+	data_ptr = (unsigned long)((char *)(di + 1) +
+				   btrfs_dir_name_len(leaf, di));
+	/* copy data_len bytes: the length of the value, not of the name */
+	read_extent_buffer(leaf, buffer, data_ptr, data_len);
+	ret = data_len;
+
+out:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	kfree(name);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Create, replace or remove one xattr inside its own transaction.
+ *
+ * An existing attribute of the same name is always deleted first.  If value
+ * is NULL after such a delete, the call ends there (removal); otherwise a new
+ * item holding value/size is inserted.  XATTR_CREATE fails with -EEXIST when
+ * the attribute exists, XATTR_REPLACE fails with -ENODATA when it does not.
+ * The inode's ctime is bumped and the inode item rewritten once the change
+ * succeeded.
+ *
+ * @return 0 on success or a negative errno
+ */
+int btrfs_xattr_set(struct inode *inode, int name_index,
+		    const char *attr_name, const void *value, size_t size,
+		    int flags)
+{
+	struct btrfs_dir_item *di;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	struct xattr_handler *handler = btrfs_xattr_handler(name_index);
+	char *name;
+	int ret = 0;
+	int mod = 0;	/* set once the inode really changed */
+
+	if (!handler)
+		return -EOPNOTSUPP;
+
+	/* just in case... */
+	if (*attr_name == '\0')
+		return -EINVAL;
+
+	name = get_name(attr_name, name_index);
+	if (!name)
+		return -ENOMEM;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		kfree(name);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+
+	/* first lets see if we already have this xattr */
+	di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
+				strlen(name), -1);
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto out;
+	}
+
+	if (!di) {
+		/* we couldn't find the attr to replace, so error out */
+		if (flags & XATTR_REPLACE) {
+			ret = -ENODATA;
+			goto out;
+		}
+	} else {
+		/* if we want create only exit */
+		if (flags & XATTR_CREATE) {
+			ret = -EEXIST;
+			goto out;
+		}
+
+		/* ok we already have this xattr, lets remove it */
+		ret = btrfs_delete_one_dir_name(trans, root, path, di);
+		if (ret)
+			goto out;
+		btrfs_release_path(root, path);
+
+		/* if we don't have a value then we are removing the xattr */
+		if (!value) {
+			mod = 1;
+			goto out;
+		}
+	}
+
+	/* ok we have to create a completely new xattr */
+	ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
+				      value, size, inode->i_ino);
+	if (ret)
+		goto out;
+	mod = 1;
+
+out:
+	if (mod) {
+		inode->i_ctime = CURRENT_TIME;
+		ret = btrfs_update_inode(trans, root, inode);
+	}
+
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	kfree(name);
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/*
+ * List the names of all xattrs of an inode as consecutive NUL-terminated
+ * strings.
+ *
+ * @param dentry - dentry of the inode to list
+ * @param buffer - destination for the names
+ * @param size   - size of buffer; 0 means "only tell me how much I need"
+ * @return the number of bytes the names take up, or a negative errno
+ *         (-ENOMEM, -ERANGE when buffer is too small, or a tree walk error)
+ *
+ * Only names that match one of our handlers are counted, so the size reported
+ * for size == 0 is the size a later call really fills in.
+ */
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct btrfs_key key, found_key;
+	struct inode *inode = dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct xattr_handler *handler;
+	ssize_t ret, written;
+	int slot, advance = 0;
+	size_t total_size = 0, size_left = size;
+	unsigned long name_ptr;
+	u16 name_len;
+	char *name;
+	u32 nritems;
+
+	/*
+	 * ok we want all objects associated with this id.
+	 * NOTE: we set key.offset = 0; because we want to start with the
+	 * first xattr that we find and walk forward
+	 */
+	key.objectid = inode->i_ino;
+	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+	key.offset = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	/* only touch the path once we know the allocation worked */
+	path->reada = 2;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+
+	/* search for our xattrs */
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto err;
+	ret = 0;
+
+	while (1) {
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		slot = path->slots[0];
+
+		/* this is where we start walking through the path */
+		if (advance || slot >= nritems) {
+			/*
+			 * if we've reached the last slot in this leaf we need
+			 * to go to the next leaf and reset everything.
+			 * "slot + 1" rather than "nritems - 1" so an empty
+			 * leaf cannot wrap the unsigned count around.
+			 */
+			if (slot + 1 >= nritems) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret)
+					break;
+				leaf = path->nodes[0];
+				nritems = btrfs_header_nritems(leaf);
+				slot = path->slots[0];
+			} else {
+				/*
+				 * just walking through the slots on this leaf
+				 */
+				slot++;
+				path->slots[0]++;
+			}
+		}
+		advance = 1;
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/* check to make sure this item is what we want */
+		if (found_key.objectid != key.objectid)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
+			break;
+
+		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+		name_ptr = (unsigned long)(di + 1);
+		name_len = btrfs_dir_name_len(leaf, di);
+
+		/* find our handler for this xattr */
+		handler = find_btrfs_xattr_handler(leaf, name_ptr, name_len);
+		if (!handler) {
+			printk(KERN_ERR "btrfs: unsupported xattr found\n");
+			continue;
+		}
+
+		total_size += name_len + 1;
+
+		/* we are just looking for how big our buffer needs to be */
+		if (!size)
+			continue;
+
+		name = kmalloc(name_len, GFP_KERNEL);
+		if (!name) {
+			ret = -ENOMEM;
+			break;
+		}
+		read_extent_buffer(leaf, name, name_ptr, name_len);
+
+		/*
+		 * call the list function associated with this xattr.  ->list
+		 * returns size_t, so go through a signed type to be able to
+		 * see a negative errno.
+		 */
+		written = handler->list(inode, buffer, size_left, name,
+					name_len);
+		kfree(name);
+
+		if (written < 0) {
+			ret = written;
+			break;
+		}
+
+		size_left -= written;
+		buffer += written;
+	}
+
+	/* a positive ret only means btrfs_next_leaf ran out of leaves */
+	if (ret >= 0)
+		ret = total_size;
+
+err:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/*
+ * delete all the xattrs associated with the inode.  fs_mutex should be
+ * held when we come into here
+ *
+ * @return 0 on success, -ENOMEM, or the error from the search / delete
+ */
+int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct inode *inode)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key, found_key;
+	struct extent_buffer *leaf;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* search past the highest possible offset and step back one item */
+	key.objectid = inode->i_ino;
+	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+	key.offset = (u64)-1;
+
+	while (1) {
+		/* look for our next xattr */
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0)
+			goto out;
+		BUG_ON(ret == 0);
+
+		/*
+		 * NOTE(review): landing on slot 0 ends the walk without
+		 * looking at the preceding leaf -- confirm no xattr item can
+		 * be left behind there.
+		 */
+		if (path->slots[0] == 0)
+			break;
+
+		path->slots[0]--;
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		if (found_key.objectid != key.objectid)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
+			break;
+
+		/* hand a failure back to the caller instead of crashing */
+		ret = btrfs_del_item(trans, root, path);
+		if (ret)
+			goto out;
+		btrfs_release_path(root, path);
+	}
+	ret = 0;
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/*
+ * Handler functions
+ *
+ * Generates the ->get/->set pair of one namespace; both just forward to
+ * btrfs_xattr_get()/btrfs_xattr_set() with the namespace index filled in.
+ * The macro parameter is called "type" so that it does not rewrite the
+ * functions' own "name" argument.
+ */
+#define BTRFS_XATTR_SETGET_FUNCS(type, index)				\
+static int btrfs_xattr_##type##_get(struct inode *inode,		\
+				    const char *name, void *value,	\
+				    size_t size)			\
+{									\
+	return btrfs_xattr_get(inode, index, name, value, size);	\
+}									\
+static int btrfs_xattr_##type##_set(struct inode *inode,		\
+				    const char *name, const void *value,\
+				    size_t size, int flags)		\
+{									\
+	return btrfs_xattr_set(inode, index, name, value, size, flags);	\
+}
+
+BTRFS_XATTR_SETGET_FUNCS(security, BTRFS_XATTR_INDEX_SECURITY)
+BTRFS_XATTR_SETGET_FUNCS(system, BTRFS_XATTR_INDEX_SYSTEM)
+BTRFS_XATTR_SETGET_FUNCS(user, BTRFS_XATTR_INDEX_USER)
+BTRFS_XATTR_SETGET_FUNCS(trusted, BTRFS_XATTR_INDEX_TRUSTED)
+
+struct xattr_handler btrfs_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list	= btrfs_xattr_generic_list,
+	.get	= btrfs_xattr_security_get,
+	.set	= btrfs_xattr_security_set,
+};
+
+struct xattr_handler btrfs_xattr_system_handler = {
+	.prefix = XATTR_SYSTEM_PREFIX,
+	.list	= btrfs_xattr_generic_list,
+	.get	= btrfs_xattr_system_get,
+	.set	= btrfs_xattr_system_set,
+};
+
+struct xattr_handler btrfs_xattr_user_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.list	= btrfs_xattr_generic_list,
+	.get	= btrfs_xattr_user_get,
+	.set	= btrfs_xattr_user_set,
+};
+
+/* trusted.* needs its own prefix, it must not answer for user.* names */
+struct xattr_handler btrfs_xattr_trusted_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.list	= btrfs_xattr_generic_list,
+	.get	= btrfs_xattr_trusted_get,
+	.set	= btrfs_xattr_trusted_set,
+};
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..b2e47e3f2442
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */ + +#ifndef __XATTR__ +#define __XATTR__ + +#include +#include "ctree.h" + +/* Name indexes */ +enum { + BTRFS_XATTR_INDEX_USER, + BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS, + BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT, + BTRFS_XATTR_INDEX_TRUSTED, + BTRFS_XATTR_INDEX_SECURITY, + BTRFS_XATTR_INDEX_SYSTEM, + BTRFS_XATTR_INDEX_END, +}; + +extern struct xattr_handler btrfs_xattr_user_handler; +extern struct xattr_handler btrfs_xattr_trusted_handler; +extern struct xattr_handler btrfs_xattr_acl_access_handler; +extern struct xattr_handler btrfs_xattr_acl_default_handler; +extern struct xattr_handler btrfs_xattr_security_handler; +extern struct xattr_handler btrfs_xattr_system_handler; + +extern struct xattr_handler *btrfs_xattr_handlers[]; + +ssize_t btrfs_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size); +int btrfs_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags); + +/* + * the only reason this is public is for acl.c. There may be a point where + * acl.c doesn't need it, and if thats the case we need to remove it and make + * it static in xattr.c + */ +size_t btrfs_xattr_generic_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len); +#endif /* __XATTR__ */ -- cgit v1.2.3 From dc17ff8f11d129db9e83ab7244769e4eae05e14d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 8 Jan 2008 15:46:30 -0500 Subject: Btrfs: Add data=ordered support This forces file data extents down the disk along with the metadata that references them. The current implementation is fairly simple, and just writes out all of the dirty pages in an inode before the commit. 
Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/btrfs_inode.h | 1 + fs/btrfs/ctree.c | 6 +- fs/btrfs/ctree.h | 7 +- fs/btrfs/disk-io.c | 16 +++- fs/btrfs/disk-io.h | 2 + fs/btrfs/extent-tree.c | 4 +- fs/btrfs/file.c | 5 +- fs/btrfs/inode.c | 46 ++++++---- fs/btrfs/ordered-data.c | 221 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.h | 39 +++++++++ fs/btrfs/transaction.c | 58 +++++++++++++ fs/btrfs/transaction.h | 8 +- 13 files changed, 387 insertions(+), 28 deletions(-) create mode 100644 fs/btrfs/ordered-data.c create mode 100644 fs/btrfs/ordered-data.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d5804c5ca075..ab9a9f8edbf8 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -5,7 +5,7 @@ obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o struct-funcs.o xattr.o acl.o + extent_map.o sysfs.o struct-funcs.o xattr.o acl.o ordered-data.o #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o dir-item.o hash.o file-item.o inode-item.o \ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d1d5af471c3c..f27e633f1742 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -29,6 +29,7 @@ struct btrfs_inode { struct extent_map_tree extent_tree; struct inode vfs_inode; + u64 ordered_trans; /* * transid of the trans_handle that last modified this inode */ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 35c57074a376..43d23148a4fe 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -221,7 +221,9 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret) { u64 search_start; + u64 header_trans; int ret; + if (trans->transaction != root->fs_info->running_transaction) { printk(KERN_CRIT "trans %Lu running %Lu\n", 
trans->transid, root->fs_info->running_transaction->transid); @@ -232,7 +234,9 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, root->fs_info->generation); WARN_ON(1); } - if (btrfs_header_generation(buf) == trans->transid) { + + header_trans = btrfs_header_generation(buf); + if (header_trans == trans->transid) { *cow_ret = buf; return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9873975ce0ee..b55dba58dfaa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -16,8 +16,8 @@ * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS__ -#define __BTRFS__ +#ifndef __BTRFS_CTREE__ +#define __BTRFS_CTREE__ #include #include @@ -363,7 +363,6 @@ struct btrfs_root { struct inode *inode; struct kobject root_kobj; struct completion kobj_unregister; - struct rw_semaphore snap_sem; u64 objectid; u64 last_trans; @@ -1142,6 +1141,8 @@ void btrfs_destroy_cachep(void); long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, struct btrfs_root *root); +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + u64 root_objectid); int btrfs_commit_write(struct file *file, struct page *page, unsigned from, unsigned to); struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a6170ff19e7f..34cf1f1f47be 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -406,7 +406,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); memset(&root->root_kobj, 0, sizeof(root->root_kobj)); init_completion(&root->kobj_unregister); - init_rwsem(&root->snap_sem); root->defrag_running = 0; root->defrag_level = 0; root->root_key.objectid = objectid; @@ -498,6 +497,21 @@ insert: return root; } +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_root *root; + + if 
(root_objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_objectid); + return root; +} + struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8c3cfd02901f..dae9fba8efcd 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -34,6 +34,8 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid); struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *location, const char *name, int namelen); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c906bb19b211..68137cd8506a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1195,7 +1195,9 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes, if (btrfs_buffer_uptodate(buf)) { u64 transid = root->fs_info->running_transaction->transid; - if (btrfs_header_generation(buf) == transid) { + u64 header_transid = + btrfs_header_generation(buf); + if (header_transid == transid) { free_extent_buffer(buf); return 1; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 94c93373cb7d..0a5f4defe59b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -34,6 +34,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" +#include "ordered-data.h" #include "ioctl.h" #include "print-tree.h" @@ -329,6 +330,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, root->fs_info->delalloc_bytes += (end_of_last_block + 1 - start_pos) - existing_delalloc; spin_unlock(&root->fs_info->delalloc_lock); + 
btrfs_add_ordered_inode(inode); } else { u64 aligned_end; /* step one, delete the existing extents in this range */ @@ -724,8 +726,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); - down_read(&BTRFS_I(inode)->root->snap_sem); - mutex_lock(&inode->i_mutex); first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + count) >> PAGE_CACHE_SHIFT; @@ -804,7 +804,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } out: mutex_unlock(&inode->i_mutex); - up_read(&BTRFS_I(inode)->root->snap_sem); out_nolock: kfree(pages); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6d0cd9a7a615..6d6e1ac0a9a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -135,6 +135,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end) alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } + btrfs_add_ordered_inode(inode); out: btrfs_end_transaction(trans, root); return ret; @@ -367,8 +368,8 @@ void btrfs_read_locked_inode(struct inode *inode) path = btrfs_alloc_path(); BUG_ON(!path); mutex_lock(&root->fs_info->fs_mutex); - memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) goto make_bad; @@ -898,7 +899,6 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; - down_read(&root->snap_sem); ret = -ENOMEM; page = grab_cache_page(mapping, index); if (!page) @@ -917,7 +917,6 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) unlock_page(page); page_cache_release(page); - up_read(&BTRFS_I(inode)->root->snap_sem); out: return ret; } @@ -1146,6 +1145,19 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root); } +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + u64 root_objectid) +{ + struct btrfs_iget_args 
args; + args.ino = objectid; + args.root = btrfs_lookup_fs_root(btrfs_sb(s)->fs_info, root_objectid); + + if (!args.root) + return NULL; + + return ilookup5(s, objectid, btrfs_find_actor, (void *)&args); +} + struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, struct btrfs_root *root) { @@ -1336,7 +1348,6 @@ read_dir_items: d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; btrfs_dir_item_key_to_cpu(leaf, di, &location); - over = filldir(dirent, name_ptr, name_len, found_key.offset, location.objectid, @@ -2054,7 +2065,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = -EINVAL; - down_read(&BTRFS_I(inode)->root->snap_sem); lock_page(page); wait_on_page_writeback(page); size = i_size_read(inode); @@ -2075,7 +2085,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = btrfs_cow_one_page(inode, page, end); out_unlock: - up_read(&BTRFS_I(inode)->root->snap_sem); unlock_page(page); out: return ret; @@ -2118,7 +2127,7 @@ static int noinline create_subvol(struct btrfs_root *root, char *name, struct btrfs_root_item root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; - struct btrfs_root *new_root; + struct btrfs_root *new_root = root; struct inode *inode; struct inode *dir; int ret; @@ -2230,7 +2239,7 @@ static int noinline create_subvol(struct btrfs_root *root, char *name, goto fail; fail: nr = trans->blocks_used; - err = btrfs_commit_transaction(trans, root); + err = btrfs_commit_transaction(trans, new_root); if (err && !ret) ret = err; fail_commit: @@ -2253,10 +2262,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) if (!root->ref_cows) return -EINVAL; - down_write(&root->snap_sem); - freeze_bdev(root->fs_info->sb->s_bdev); - thaw_bdev(root->fs_info->sb->s_bdev, root->fs_info->sb); - mutex_lock(&root->fs_info->fs_mutex); ret = btrfs_check_free_space(root, 1, 0); if (ret) @@ -2264,6 +2269,9 @@ static int create_snapshot(struct btrfs_root 
*root, char *name, int namelen) trans = btrfs_start_transaction(root, 1); BUG_ON(!trans); + err = btrfs_commit_transaction(trans, root); + + trans = btrfs_start_transaction(root, 1); ret = btrfs_update_inode(trans, root, root->inode); if (ret) @@ -2272,9 +2280,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, 0, &objectid); if (ret) - goto fail; - - memcpy(&new_root_item, &root->root_item, + goto fail; memcpy(&new_root_item, &root->root_item, sizeof(new_root_item)); key.objectid = objectid; @@ -2285,12 +2291,20 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen) btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp); free_extent_buffer(tmp); + /* write the ordered inodes to force all delayed allocations to + * be filled. Once this is done, we can copy the root + */ + mutex_lock(&root->fs_info->trans_mutex); + btrfs_write_ordered_inodes(trans, root); + mutex_unlock(&root->fs_info->trans_mutex); + btrfs_copy_root(trans, root, root->node, &tmp, objectid); btrfs_set_root_bytenr(&new_root_item, tmp->start); btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp)); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &new_root_item); +printk("new root %Lu node %Lu\n", objectid, tmp->start); free_extent_buffer(tmp); if (ret) goto fail; @@ -2321,7 +2335,6 @@ fail: ret = err; fail_unlock: mutex_unlock(&root->fs_info->fs_mutex); - up_write(&root->snap_sem); btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2608,6 +2621,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) if (!ei) return NULL; ei->last_trans = 0; + ei->ordered_trans = 0; return &ei->vfs_inode; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c new file mode 100644 index 000000000000..411aba84d305 --- /dev/null +++ b/fs/btrfs/ordered-data.c @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * NOTE(review): the names of the two angle-bracket headers below were lost
+ * from this copy of the patch (only bare "#include" survived).  They are
+ * restored from the symbols this file uses -- confirm against the tree.
+ */
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+
+/* one inode with ordered data, keyed by (root_objectid, objectid) */
+struct tree_entry {
+	u64 root_objectid;
+	u64 objectid;
+	struct rb_node rb_node;
+};
+
+/*
+ * returns > 0 if entry passed (root, objectid) is > entry,
+ * < 0 if (root, objectid) < entry and zero if they are equal
+ */
+static int comp_entry(struct tree_entry *entry, u64 root_objectid,
+		      u64 objectid)
+{
+	/* the root id is the major key, the inode number the minor one */
+	if (root_objectid != entry->root_objectid)
+		return root_objectid < entry->root_objectid ? -1 : 1;
+	if (objectid != entry->objectid)
+		return objectid < entry->objectid ? -1 : 1;
+	return 0;
+}
+
+/*
+ * Link node into the tree under the key (root_objectid, objectid).
+ *
+ * @return NULL if node was inserted, or the node already holding that key,
+ *         in which case the tree is left unchanged
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
+				   u64 objectid, struct rb_node *node)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct tree_entry *cur;
+		int cmp;
+
+		parent = *link;
+		cur = rb_entry(parent, struct tree_entry, rb_node);
+		cmp = comp_entry(cur, root_objectid, objectid);
+		if (cmp == 0)
+			return parent;
+
+		link = cmp < 0 ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(node, parent, link);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/*
+ * Look up the exact key (root_objectid, objectid).
+ *
+ * @return the matching node, or NULL.  On a miss, and only if prev_ret is
+ *         not NULL, *prev_ret is set to the first node whose key is greater
+ *         than the one searched for (NULL if there is none).
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
+				     u64 objectid, struct rb_node **prev_ret)
+{
+	struct rb_node *cur = root->rb_node;
+	struct rb_node *last = NULL;	/* last node visited on the way down */
+
+	while (cur) {
+		int cmp = comp_entry(rb_entry(cur, struct tree_entry, rb_node),
+				     root_objectid, objectid);
+
+		if (cmp == 0)
+			return cur;
+
+		last = cur;
+		cur = cmp < 0 ? cur->rb_left : cur->rb_right;
+	}
+	if (!prev_ret)
+		return NULL;
+
+	/*
+	 * step forward past every key <= ours; the entry is only looked at
+	 * while the node is known to be non-NULL
+	 */
+	while (last &&
+	       comp_entry(rb_entry(last, struct tree_entry, rb_node),
+			  root_objectid, objectid) >= 0)
+		last = rb_next(last);
+
+	*prev_ret = last;
+	return NULL;
+}
+
+/* Exact match if there is one, otherwise the first node with a larger key. */
+static inline struct rb_node *tree_search(struct rb_root *root,
+					  u64 root_objectid, u64 objectid)
+{
+	struct rb_node *next;
+	struct rb_node *match;
+
+	match = __tree_search(root, root_objectid, objectid, &next);
+	return match ? match : next;
+}
+
+/*
+ * Record inode in the running transaction's ordered inode tree.
+ *
+ * Nothing is done if the inode's ordered_trans shows it was already recorded
+ * for this (or a later) transaction, or if its key is already in the tree.
+ *
+ * @return 0, or -ENOMEM if the tree entry cannot be allocated
+ */
+int btrfs_add_ordered_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 root_objectid = root->root_key.objectid;
+	u64 transid = root->fs_info->running_transaction->transid;
+	struct btrfs_ordered_inode_tree *tree;
+	struct tree_entry *entry;
+	struct rb_node *existing;
+
+	if (transid <= BTRFS_I(inode)->ordered_trans)
+		return 0;
+
+	tree = &root->fs_info->running_transaction->ordered_inode_tree;
+
+	/* cheap check under the read lock before allocating anything */
+	read_lock(&tree->lock);
+	existing = __tree_search(&tree->tree, root_objectid, inode->i_ino,
+				 NULL);
+	read_unlock(&tree->lock);
+	if (existing)
+		return 0;
+
+	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	if (!entry)
+		return -ENOMEM;
+
+	write_lock(&tree->lock);
+	entry->objectid = inode->i_ino;
+	entry->root_objectid = root_objectid;
+
+	existing = tree_insert(&tree->tree, root_objectid,
+			       inode->i_ino, &entry->rb_node);
+
+	BTRFS_I(inode)->ordered_trans = transid;
+
+	write_unlock(&tree->lock);
+
+	/* somebody inserted the same key between our two lock sections */
+	if (existing)
+		kfree(entry);
+	return 0;
+}
+
+/*
+ * Return the first entry whose key is strictly greater than
+ * (root_objectid, objectid), or NULL if there is none.
+ * The caller must hold tree->lock.
+ */
+static struct tree_entry *
+first_entry_after(struct btrfs_ordered_inode_tree *tree, u64 root_objectid,
+		  u64 objectid)
+{
+	struct rb_node *node;
+
+	/* tree_search may hand back an exact match, which has to be skipped */
+	for (node = tree_search(&tree->tree, root_objectid, objectid);
+	     node; node = rb_next(node)) {
+		struct tree_entry *entry;
+
+		entry = rb_entry(node, struct tree_entry, rb_node);
+		if (comp_entry(entry, root_objectid, objectid) < 0)
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Find the first ordered inode after (*root_objectid, *objectid).
+ *
+ * @return 1 and the key found in *root_objectid / *objectid, or 0 with both
+ *         left untouched if no such entry exists
+ */
+int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				   u64 *root_objectid, u64 *objectid)
+{
+	struct tree_entry *entry;
+	int found = 0;
+
+	write_lock(&tree->lock);
+	entry = first_entry_after(tree, *root_objectid, *objectid);
+	if (entry) {
+		*root_objectid = entry->root_objectid;
+		*objectid = entry->objectid;
+		found = 1;
+	}
+	write_unlock(&tree->lock);
+	return found;
+}
+
+/*
+ * Same as btrfs_find_first_ordered_inode(), but the entry found is also
+ * removed from the tree and freed.
+ */
+int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid)
+{
+	struct tree_entry *entry;
+
+	write_lock(&tree->lock);
+	entry = first_entry_after(tree, *root_objectid, *objectid);
+	if (!entry) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+
+	*root_objectid = entry->root_objectid;
+	*objectid = entry->objectid;
+	rb_erase(&entry->rb_node, &tree->tree);
+	write_unlock(&tree->lock);
+
+	/* the entry is unlinked, so it can be freed outside the lock */
+	kfree(entry);
+	return 1;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..aaf9eb142719
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ORDERED_DATA__ +#define __BTRFS_ORDERED_DATA__ +/* rb-tree of ordered inode entries together with the rwlock taken around accesses to it */ +struct btrfs_ordered_inode_tree { + rwlock_t lock; + struct rb_root tree; +}; +/* initialize the lock and start with an empty tree */ +static inline void +btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) +{ + rwlock_init(&t->lock); + t->tree.rb_node = NULL; +} + +int btrfs_add_ordered_inode(struct inode *inode); +int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree, + u64 *root_objectid, u64 *objectid); +int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree, + u64 *root_objectid, u64 *objectid); +#endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 02721eea9a7a..3ed5868e7c0f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -67,6 +67,7 @@ static int join_transaction(struct btrfs_root *root) cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree); extent_map_tree_init(&cur_trans->dirty_pages, root->fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -473,6 +474,60 @@ static int drop_dirty_roots(struct btrfs_root *tree_root, return ret; } +int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + struct inode *inode; + u64 root_objectid = 0; + u64 objectid = 0; + u64 transid = trans->transid; + int ret; + /* two passes over the ordered inode tree of the transaction; trans_mutex and fs_mutex are released before each inode lookup and writeback call and retaken afterwards. Always returns 0 */ +printk("write ordered trans %Lu\n", transid); + while(1) { /* pass 1: visit every entry in key order and call filemap_fdatawrite() on regular files */ + ret = btrfs_find_first_ordered_inode( + &cur_trans->ordered_inode_tree, + &root_objectid, &objectid); + if (!ret) + break; + + mutex_unlock(&root->fs_info->trans_mutex); +
mutex_unlock(&root->fs_info->fs_mutex); + inode = btrfs_ilookup(root->fs_info->sb, objectid, + root_objectid); + if (inode) { + if (S_ISREG(inode->i_mode)) + filemap_fdatawrite(inode->i_mapping); + iput(inode); + } + mutex_lock(&root->fs_info->fs_mutex); + mutex_lock(&root->fs_info->trans_mutex); + } + while(1) { /* pass 2: remove the entries one at a time, restarting from key (0, 0), and call filemap_write_and_wait() on regular files */ + root_objectid = 0; + objectid = 0; + ret = btrfs_find_del_first_ordered_inode( + &cur_trans->ordered_inode_tree, + &root_objectid, &objectid); + if (!ret) + break; + mutex_unlock(&root->fs_info->trans_mutex); + mutex_unlock(&root->fs_info->fs_mutex); + inode = btrfs_ilookup(root->fs_info->sb, objectid, + root_objectid); + if (inode) { + if (S_ISREG(inode->i_mode)) + filemap_write_and_wait(inode->i_mapping); + iput(inode); + } + mutex_lock(&root->fs_info->fs_mutex); + mutex_lock(&root->fs_info->trans_mutex); + } +printk("done write ordered trans %Lu\n", transid); + return 0; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -550,10 +605,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->fs_mutex); mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); + ret = btrfs_write_ordered_inodes(trans, root); + } while (cur_trans->num_writers > 1 || (cur_trans->num_joined != joined)); WARN_ON(cur_trans != trans->transaction); + ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, &dirty_fs_roots); BUG_ON(ret); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index eef840bca91e..c157ddbe9d1e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -16,9 +16,10 @@ * Boston, MA 021110-1307, USA.
*/ -#ifndef __TRANSACTION__ -#define __TRANSACTION__ +#ifndef __BTRFS_TRANSACTION__ +#define __BTRFS_TRANSACTION__ #include "btrfs_inode.h" +#include "ordered-data.h" struct btrfs_transaction { u64 transid; @@ -30,6 +31,7 @@ struct btrfs_transaction { struct list_head list; struct extent_map_tree dirty_pages; unsigned long start_time; + struct btrfs_ordered_inode_tree ordered_inode_tree; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; }; @@ -90,4 +92,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans, + struct btrfs_root *root); #endif -- cgit v1.2.3 From caaca38b8fdcf63a17647fddae2195b189e19e37 Mon Sep 17 00:00:00 2001 From: Yan Date: Thu, 17 Jan 2008 11:59:48 -0500 Subject: Btrfs: Fix compile on kernel without ACLs enabled Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 5 ++++- fs/btrfs/xattr.c | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ab9a9f8edbf8..72cc3e16741f 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -5,8 +5,11 @@ obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o struct-funcs.o xattr.o acl.o ordered-data.o + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o +ifeq ($(CONFIG_FS_POSIX_ACL),y) +btrfs-y += acl.o +endif #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ # root-tree.o dir-item.o hash.o file-item.o inode-item.o \ # inode-map.o \ diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 4b3bd24af72b..0a4950400f85 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -26,20 +26,22 @@ 
#include "transaction.h" #include "xattr.h" #include "disk-io.h" - static struct xattr_handler *btrfs_xattr_handler_map[] = { [BTRFS_XATTR_INDEX_USER] = &btrfs_xattr_user_handler, +#ifdef CONFIG_FS_POSIX_ACL [BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &btrfs_xattr_acl_access_handler, [BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &btrfs_xattr_acl_default_handler, +#endif [BTRFS_XATTR_INDEX_TRUSTED] = &btrfs_xattr_trusted_handler, [BTRFS_XATTR_INDEX_SECURITY] = &btrfs_xattr_security_handler, [BTRFS_XATTR_INDEX_SYSTEM] = &btrfs_xattr_system_handler, }; - struct xattr_handler *btrfs_xattr_handlers[] = { &btrfs_xattr_user_handler, +#ifdef CONFIG_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, +#endif &btrfs_xattr_trusted_handler, &btrfs_xattr_security_handler, &btrfs_xattr_system_handler, -- cgit v1.2.3 From d1310b2e0cd98eb1348553e69b73827b436dca7b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 24 Jan 2008 16:13:08 -0500 Subject: Btrfs: Split the extent_map code into two parts There is now extent_map for mapping offsets in the file to disk and extent_io for state tracking, IO submission and extent_bufers. The new extent_map code shifts from [start,end] pairs to [start,len], and pushes the locking out into the caller. This allows a few performance optimizations and is easier to use. A number of extent_map usage bugs were fixed, mostly with failing to remove extent_map entries when changing the file. 
Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 3 +- fs/btrfs/btrfs_inode.h | 2 + fs/btrfs/ctree.h | 15 +- fs/btrfs/disk-io.c | 123 +- fs/btrfs/extent-tree.c | 43 +- fs/btrfs/extent_io.c | 3089 +++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 193 +++ fs/btrfs/extent_map.c | 3119 ++---------------------------------------------- fs/btrfs/extent_map.h | 214 +--- fs/btrfs/file.c | 29 +- fs/btrfs/inode.c | 175 +-- fs/btrfs/super.c | 10 +- fs/btrfs/transaction.c | 8 +- fs/btrfs/transaction.h | 2 +- 14 files changed, 3612 insertions(+), 3413 deletions(-) create mode 100644 fs/btrfs/extent_io.c create mode 100644 fs/btrfs/extent_io.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 72cc3e16741f..9c61609f3714 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -5,7 +5,8 @@ obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_io.o ifeq ($(CONFIG_FS_POSIX_ACL),y) btrfs-y += acl.o diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9fc99883c70a..9a0647dc5a0b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -20,6 +20,7 @@ #define __BTRFS_I__ #include "extent_map.h" +#include "extent_io.h" /* in memory btrfs inode */ struct btrfs_inode { @@ -27,6 +28,7 @@ struct btrfs_inode { struct btrfs_block_group_cache *block_group; struct btrfs_key location; struct extent_map_tree extent_tree; + struct extent_io_tree io_tree; struct inode vfs_inode; u64 ordered_trans; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b9f2975b55c9..6c65473e0fe3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -27,6 +27,7 @@ #include #include #include "bit-radix.h" +#include "extent_io.h" #include 
"extent_map.h" struct btrfs_trans_handle; @@ -314,11 +315,11 @@ struct btrfs_fs_info { struct btrfs_root *tree_root; struct radix_tree_root fs_roots_radix; - struct extent_map_tree free_space_cache; - struct extent_map_tree block_group_cache; - struct extent_map_tree pinned_extents; - struct extent_map_tree pending_del; - struct extent_map_tree extent_ins; + struct extent_io_tree free_space_cache; + struct extent_io_tree block_group_cache; + struct extent_io_tree pinned_extents; + struct extent_io_tree pending_del; + struct extent_io_tree extent_ins; u64 generation; u64 last_trans_committed; @@ -956,7 +957,7 @@ u32 btrfs_count_snapshots_in_path(struct btrfs_root *root, u64 first_extent); int btrfs_extent_post_op(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy); +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); struct btrfs_block_group_cache *btrfs_lookup_block_group(struct btrfs_fs_info *info, u64 bytenr); @@ -1001,7 +1002,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root u64 owner_objectid, u64 owner_offset, int pin); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_map_tree *unpin); + struct extent_io_tree *unpin); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5d1f9bca2712..4c4ebea0b2a9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -43,14 +43,14 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) } #endif -static struct extent_map_ops btree_extent_map_ops; +static struct extent_io_ops btree_extent_io_ops; struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { struct inode *btree_inode = root->fs_info->btree_inode; struct extent_buffer *eb; - eb = 
find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree, + eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr, blocksize, GFP_NOFS); return eb; } @@ -61,13 +61,13 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, struct inode *btree_inode = root->fs_info->btree_inode; struct extent_buffer *eb; - eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree, + eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr, blocksize, NULL, GFP_NOFS); return eb; } struct extent_map *btree_get_extent(struct inode *inode, struct page *page, - size_t page_offset, u64 start, u64 end, + size_t page_offset, u64 start, u64 len, int create) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; @@ -75,7 +75,9 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page, int ret; again: - em = lookup_extent_mapping(em_tree, start, end); + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + spin_unlock(&em_tree->lock); if (em) { goto out; } @@ -85,11 +87,14 @@ again: goto out; } em->start = 0; - em->end = (i_size_read(inode) & ~((u64)PAGE_CACHE_SIZE -1)) - 1; + em->len = i_size_read(inode); em->block_start = 0; - em->block_end = em->end; em->bdev = inode->i_sb->s_bdev; + + spin_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret == -EEXIST) { free_extent_map(em); em = NULL; @@ -175,13 +180,13 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { - struct extent_map_tree *tree; + struct extent_io_tree *tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 found_start; int found_level; unsigned long len; struct extent_buffer *eb; - tree = &BTRFS_I(page->mapping->host)->extent_tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; if (page->private == EXTENT_PAGE_PRIVATE) goto out; @@ -230,16 +235,16 @@ static int 
btree_writepage_io_hook(struct page *page, u64 start, u64 end) static int btree_writepage(struct page *page, struct writeback_control *wbc) { - struct extent_map_tree *tree; - tree = &BTRFS_I(page->mapping->host)->extent_tree; + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; return extent_write_full_page(tree, page, btree_get_extent, wbc); } static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_map_tree *tree; - tree = &BTRFS_I(mapping->host)->extent_tree; + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { u64 num_dirty; u64 start = 0; @@ -264,18 +269,20 @@ static int btree_writepages(struct address_space *mapping, int btree_readpage(struct file *file, struct page *page) { - struct extent_map_tree *tree; - tree = &BTRFS_I(page->mapping->host)->extent_tree; + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; return extent_read_full_page(tree, page, btree_get_extent); } static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags) { - struct extent_map_tree *tree; + struct extent_io_tree *tree; + struct extent_map_tree *map; int ret; - tree = &BTRFS_I(page->mapping->host)->extent_tree; - ret = try_release_extent_mapping(tree, page); + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(map, tree, page); if (ret == 1) { ClearPagePrivate(page); set_page_private(page, 0); @@ -286,8 +293,8 @@ static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags) static void btree_invalidatepage(struct page *page, unsigned long offset) { - struct extent_map_tree *tree; - tree = &BTRFS_I(page->mapping->host)->extent_tree; + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; extent_invalidatepage(tree, page, offset); btree_releasepage(page, GFP_NOFS); } @@ -331,7 +338,7 @@ int 
readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) buf = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) return 0; - read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree, + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 0); free_extent_buffer(buf); return ret; @@ -342,40 +349,39 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_map_tree *extent_tree; + struct extent_io_tree *io_tree; u64 end; int ret; - extent_tree = &BTRFS_I(btree_inode)->extent_tree; + io_tree = &BTRFS_I(btree_inode)->io_tree; buf = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) return NULL; - read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree, - buf, 0, 1); + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1); if (buf->flags & EXTENT_CSUM) return buf; end = buf->start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(extent_tree, buf->start, end, EXTENT_CSUM, 1)) { + if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) { buf->flags |= EXTENT_CSUM; return buf; } - lock_extent(extent_tree, buf->start, end, GFP_NOFS); + lock_extent(io_tree, buf->start, end, GFP_NOFS); - if (test_range_bit(extent_tree, buf->start, end, EXTENT_CSUM, 1)) { + if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) { buf->flags |= EXTENT_CSUM; goto out_unlock; } ret = csum_tree_block(root, buf, 1); - set_extent_bits(extent_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS); + set_extent_bits(io_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS); buf->flags |= EXTENT_CSUM; out_unlock: - unlock_extent(extent_tree, buf->start, end, GFP_NOFS); + unlock_extent(io_tree, buf->start, end, GFP_NOFS); return buf; } @@ -385,7 +391,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *btree_inode = root->fs_info->btree_inode; if 
(btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) - clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); return 0; } @@ -394,7 +400,7 @@ int wait_on_tree_block_writeback(struct btrfs_root *root, struct extent_buffer *buf) { struct inode *btree_inode = root->fs_info->btree_inode; - wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->extent_tree, + wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->io_tree, buf); return 0; } @@ -659,20 +665,23 @@ struct btrfs_root *open_ctree(struct super_block *sb) fs_info->btree_inode->i_nlink = 1; fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size; fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, fs_info->btree_inode->i_mapping, GFP_NOFS); - BTRFS_I(fs_info->btree_inode)->extent_tree.ops = &btree_extent_map_ops; + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, + GFP_NOFS); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; - extent_map_tree_init(&fs_info->free_space_cache, + extent_io_tree_init(&fs_info->free_space_cache, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_map_tree_init(&fs_info->block_group_cache, + extent_io_tree_init(&fs_info->block_group_cache, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_map_tree_init(&fs_info->pinned_extents, + extent_io_tree_init(&fs_info->pinned_extents, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_map_tree_init(&fs_info->pending_del, + extent_io_tree_init(&fs_info->pending_del, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_map_tree_init(&fs_info->extent_ins, + extent_io_tree_init(&fs_info->extent_ins, fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; fs_info->closing = 0; @@ -787,7 +796,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct 
btrfs_root if (!btrfs_test_opt(root, NOBARRIER)) blkdev_issue_flush(sb->s_bdev, NULL); - set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, super); + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super); ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping, super->start, super->len); if (!btrfs_test_opt(root, NOBARRIER)) @@ -864,12 +873,12 @@ int close_ctree(struct btrfs_root *root) filemap_write_and_wait(fs_info->btree_inode->i_mapping); - extent_map_tree_empty_lru(&fs_info->free_space_cache); - extent_map_tree_empty_lru(&fs_info->block_group_cache); - extent_map_tree_empty_lru(&fs_info->pinned_extents); - extent_map_tree_empty_lru(&fs_info->pending_del); - extent_map_tree_empty_lru(&fs_info->extent_ins); - extent_map_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->extent_tree); + extent_io_tree_empty_lru(&fs_info->free_space_cache); + extent_io_tree_empty_lru(&fs_info->block_group_cache); + extent_io_tree_empty_lru(&fs_info->pinned_extents); + extent_io_tree_empty_lru(&fs_info->pending_del); + extent_io_tree_empty_lru(&fs_info->extent_ins); + extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); truncate_inode_pages(fs_info->btree_inode->i_mapping, 0); @@ -892,13 +901,13 @@ int close_ctree(struct btrfs_root *root) int btrfs_buffer_uptodate(struct extent_buffer *buf) { struct inode *btree_inode = buf->first_page->mapping->host; - return extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, buf); + return extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); } int btrfs_set_buffer_uptodate(struct extent_buffer *buf) { struct inode *btree_inode = buf->first_page->mapping->host; - return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); } @@ -914,7 +923,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) transid, root->fs_info->generation); WARN_ON(1); } - set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, 
buf); + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } void btrfs_throttle(struct btrfs_root *root) @@ -941,7 +950,7 @@ void btrfs_set_buffer_defrag(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start, + set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS); } @@ -949,7 +958,7 @@ void btrfs_set_buffer_defrag_done(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start, + set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG_DONE, GFP_NOFS); } @@ -958,7 +967,7 @@ int btrfs_buffer_defrag(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - return test_range_bit(&BTRFS_I(btree_inode)->extent_tree, + return test_range_bit(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0); } @@ -966,7 +975,7 @@ int btrfs_buffer_defrag_done(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - return test_range_bit(&BTRFS_I(btree_inode)->extent_tree, + return test_range_bit(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG_DONE, 0); } @@ -975,7 +984,7 @@ int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree, + return 
clear_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG_DONE, GFP_NOFS); } @@ -984,7 +993,7 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree, + return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS); } @@ -993,10 +1002,10 @@ int btrfs_read_buffer(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - return read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree, + return read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1); } -static struct extent_map_ops btree_extent_map_ops = { +static struct extent_io_ops btree_extent_io_ops = { .writepage_io_hook = btree_writepage_io_hook, }; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b69a46691a96..1cf125ab7822 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -63,7 +63,7 @@ static int cache_block_group(struct btrfs_root *root, int ret; struct btrfs_key key; struct extent_buffer *leaf; - struct extent_map_tree *free_space_cache; + struct extent_io_tree *free_space_cache; int slot; u64 last = 0; u64 hole_size; @@ -158,7 +158,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct btrfs_fs_info *info, u64 bytenr) { - struct extent_map_tree *block_group_cache; + struct extent_io_tree *block_group_cache; struct btrfs_block_group_cache *block_group = NULL; u64 ptr; u64 start; @@ -281,7 +281,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, int data, int owner) { struct btrfs_block_group_cache *cache; - struct extent_map_tree *block_group_cache; + struct extent_io_tree *block_group_cache; struct 
btrfs_block_group_cache *found_group = NULL; struct btrfs_fs_info *info = root->fs_info; u64 used; @@ -951,7 +951,7 @@ fail: int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct extent_map_tree *block_group_cache; + struct extent_io_tree *block_group_cache; struct btrfs_block_group_cache *cache; int ret; int err = 0; @@ -1107,12 +1107,12 @@ static int update_pinned_extents(struct btrfs_root *root, return 0; } -int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy) +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) { u64 last = 0; u64 start; u64 end; - struct extent_map_tree *pinned_extents = &root->fs_info->pinned_extents; + struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; int ret; while(1) { @@ -1128,12 +1128,12 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy) int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_map_tree *unpin) + struct extent_io_tree *unpin) { u64 start; u64 end; int ret; - struct extent_map_tree *free_space_cache; + struct extent_io_tree *free_space_cache; free_space_cache = &root->fs_info->free_space_cache; while(1) { @@ -1329,8 +1329,8 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct int err = 0; u64 start; u64 end; - struct extent_map_tree *pending_del; - struct extent_map_tree *pinned_extents; + struct extent_io_tree *pending_del; + struct extent_io_tree *pinned_extents; pending_del = &extent_root->fs_info->pending_del; pinned_extents = &extent_root->fs_info->pinned_extents; @@ -1802,7 +1802,7 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans, set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); - set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->extent_tree, + set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->io_tree, 
buf->start, buf->start + buf->len - 1, EXTENT_CSUM, GFP_NOFS); buf->flags |= EXTENT_CSUM; @@ -2166,7 +2166,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start, unsigned long i; struct page *page; struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct file_ra_state *ra; ra = kzalloc(sizeof(*ra), GFP_NOFS); @@ -2195,15 +2195,14 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start, page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(em_tree, page_start, page_end, GFP_NOFS); + lock_extent(io_tree, page_start, page_end, GFP_NOFS); delalloc_start = page_start; - existing_delalloc = - count_range_bits(&BTRFS_I(inode)->extent_tree, - &delalloc_start, page_end, - PAGE_CACHE_SIZE, EXTENT_DELALLOC); + existing_delalloc = count_range_bits(io_tree, + &delalloc_start, page_end, + PAGE_CACHE_SIZE, EXTENT_DELALLOC); - set_extent_delalloc(em_tree, page_start, + set_extent_delalloc(io_tree, page_start, page_end, GFP_NOFS); spin_lock(&root->fs_info->delalloc_lock); @@ -2211,7 +2210,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start, existing_delalloc; spin_unlock(&root->fs_info->delalloc_lock); - unlock_extent(em_tree, page_start, page_end, GFP_NOFS); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); set_page_dirty(page); unlock_page(page); page_cache_release(page); @@ -2379,7 +2378,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size) u64 cur_byte; u64 total_found; struct btrfs_fs_info *info = root->fs_info; - struct extent_map_tree *block_group_cache; + struct extent_io_tree *block_group_cache; struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; @@ -2561,7 +2560,7 @@ int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache; struct 
btrfs_block_group_item *item; struct btrfs_fs_info *info = root->fs_info; - struct extent_map_tree *block_group_cache; + struct extent_io_tree *block_group_cache; struct btrfs_key key; struct extent_buffer *leaf; int ret; @@ -2645,7 +2644,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) int bit; struct btrfs_block_group_cache *cache; struct btrfs_fs_info *info = root->fs_info; - struct extent_map_tree *block_group_cache; + struct extent_io_tree *block_group_cache; struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c new file mode 100644 index 000000000000..15cc158a0498 --- /dev/null +++ b/fs/btrfs/extent_io.c @@ -0,0 +1,3089 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extent_io.h" +#include "extent_map.h" + +/* temporary define until extent_map moves out of btrfs */ +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *, struct kmem_cache *, + unsigned long)); + +static struct kmem_cache *extent_state_cache; +static struct kmem_cache *extent_buffer_cache; + +static LIST_HEAD(buffers); +static LIST_HEAD(states); + +static spinlock_t state_lock = SPIN_LOCK_UNLOCKED; +#define BUFFER_LRU_MAX 64 +/* rb-tree entry header: start, end, an in_tree flag and the rb_node linkage */ +struct tree_entry { + u64 start; + u64 end; + int in_tree; + struct rb_node rb_node; +}; +/* groups a bio, an extent_io_tree and a get_extent callback; NOTE(review): presumably handed to page writeback helpers later in this file, confirm against its users */ +struct extent_page_data { + struct bio *bio; + struct extent_io_tree *tree; + get_extent_t *get_extent; +}; +/* create the extent_state and extent_buffer caches; returns 0, or -ENOMEM if either cannot be created (the state cache is destroyed again when the buffer cache fails) */ +int __init extent_io_init(void) +{ + extent_state_cache = btrfs_cache_create("extent_state", + sizeof(struct extent_state), 0, + NULL); + if (!extent_state_cache) + return -ENOMEM; + + extent_buffer_cache = btrfs_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + NULL); + if (!extent_buffer_cache) + goto free_state_cache; + return 0; + +free_state_cache: + kmem_cache_destroy(extent_state_cache); +
return -ENOMEM; +} +/* report and free every extent_state still on the global states list, then destroy whichever of the two caches exist */ +void extent_io_exit(void) +{ + struct extent_state *state; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, list); + printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs)); + list_del(&state->list); + kmem_cache_free(extent_state_cache, state); + + } + + if (extent_state_cache) + kmem_cache_destroy(extent_state_cache); + if (extent_buffer_cache) + kmem_cache_destroy(extent_buffer_cache); +} +/* reset tree to an empty state tree with no ops, zero dirty_bytes and an empty buffer LRU, and record mapping; the mask argument is not used in this body */ +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask) +{ + tree->state.rb_node = NULL; + tree->ops = NULL; + tree->dirty_bytes = 0; + rwlock_init(&tree->lock); + spin_lock_init(&tree->lru_lock); + tree->mapping = mapping; + INIT_LIST_HEAD(&tree->buffer_lru); + tree->lru_size = 0; +} +EXPORT_SYMBOL(extent_io_tree_init); +/* unlink every extent_buffer from the buffer_lru list of tree and pass it to free_extent_buffer() */ +void extent_io_tree_empty_lru(struct extent_io_tree *tree) +{ + struct extent_buffer *eb; + while(!list_empty(&tree->buffer_lru)) { + eb = list_entry(tree->buffer_lru.next, struct extent_buffer, + lru); + list_del_init(&eb->lru); + free_extent_buffer(eb); + } +} +EXPORT_SYMBOL(extent_io_tree_empty_lru); +/* allocate an extent_state from extent_state_cache, zero its state, in_tree and private fields, add it to the global states list under state_lock and set refs to 1; a NULL or IS_ERR allocation result is returned unchanged */ +struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; + unsigned long flags; + + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state || IS_ERR(state)) + return state; + state->state = 0; + state->in_tree = 0; + state->private = 0; + + spin_lock_irqsave(&state_lock, flags); + list_add(&state->list, &states); + spin_unlock_irqrestore(&state_lock, flags); + + atomic_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + return state; +} +EXPORT_SYMBOL(alloc_extent_state); + +void free_extent_state(struct extent_state *state) +{ + unsigned long flags; + if (!state) + return; + if (atomic_dec_and_test(&state->refs)) { + WARN_ON(state->in_tree); + spin_lock_irqsave(&state_lock, flags); + list_del(&state->list);
+ spin_unlock_irqrestore(&state_lock, flags); + kmem_cache_free(extent_state_cache, state); + } +} +EXPORT_SYMBOL(free_extent_state); + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct tree_entry *entry; + + while(*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset > entry->end) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct tree_entry, rb_node); + entry->in_tree = 1; + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_node * n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + struct tree_entry *prev_entry = NULL; + + while(n) { + entry = rb_entry(n, struct tree_entry, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset > entry->end) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while(prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + while(prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) +{ + struct rb_node *prev; + struct rb_node *ret; + ret = __tree_search(root, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +/* + * utility function to look for merge candidates 
inside a given range. + * Any extents with matching state are merged together into a single + * extent in the tree. Extents with EXTENT_IO in their state field + * are not merged because the end_io handlers need to be able to do + * operations on them without sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. + */ +static int merge_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *other; + struct rb_node *other_node; + + if (state->state & EXTENT_IOBITS) + return 0; + + other_node = rb_prev(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { + state->start = other->start; + other->in_tree = 0; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); + } + } + other_node = rb_next(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { + other->start = state->start; + state->in_tree = 0; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); + } + } + return 0; +} + +/* + * insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). 
+ */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, u64 start, u64 end, + int bits) +{ + struct rb_node *node; + + if (end < start) { + printk("end < start %Lu %Lu\n", end, start); + WARN_ON(1); + } + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; + state->state |= bits; + state->start = start; + state->end = end; + node = tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end); + free_extent_state(state); + return -EEXIST; + } + merge_state(tree, state); + return 0; +} + +/* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. + */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *node; + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end); + free_extent_state(prealloc); + return -EEXIST; + } + return 0; +} + +/* + * utility function to clear some bits in an extent state struct. 
+ * it will optionally wake up any one waiting on this state (wake == 1), or + * forcibly remove the state from the tree (delete == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) +{ + int ret = state->state & bits; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + WARN_ON(range > tree->dirty_bytes); + tree->dirty_bytes -= range; + } + state->state &= ~bits; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { + if (state->in_tree) { + rb_erase(&state->rb_node, &tree->state); + state->in_tree = 0; + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + } + return ret; +} + +/* + * clear some bits on a range in the tree. This may require splitting + * or inserting elements in the tree, so the gfp mask is used to + * indicate which allocations or sleeping are allowed. + * + * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove + * the given range from the tree regardless of state (ie for truncate). + * + * the range [start, end] is inclusive. + * + * This takes the tree lock, and returns < 0 on error, > 0 if any of the + * bits were already set, or zero if none of the bits were already set. 
+ */ +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + unsigned long flags; + int err; + int set = 0; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + write_lock_irqsave(&tree->lock, flags); + /* + * this search will find the extents that end after + * our range starts + */ + node = tree_search(&tree->state, start); + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > end) + goto out; + WARN_ON(state->end < start); + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip + * bits on second half. + * + * If the extent we found extends past our range, we + * just split and search again. It'll get split again + * the next time though. + * + * If the extent we found is inside our range, we clear + * the desired bit on it. 
+ */ + + if (state->start < start) { + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, + wake, delete); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + if (wake) + wake_up(&state->wq); + set |= clear_state_bit(tree, prealloc, bits, + wake, delete); + prealloc = NULL; + goto out; + } + + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, wake, delete); + goto search_again; + +out: + write_unlock_irqrestore(&tree->lock, flags); + if (prealloc) + free_extent_state(prealloc); + + return set; + +search_again: + if (start > end) + goto out; + write_unlock_irqrestore(&tree->lock, flags); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} +EXPORT_SYMBOL(clear_extent_bit); + +static int wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + read_unlock_irq(&tree->lock); + schedule(); + read_lock_irq(&tree->lock); + finish_wait(&state->wq, &wait); + return 0; +} + +/* + * waits for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. 
+ * The tree lock is taken by this function + */ +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +{ + struct extent_state *state; + struct rb_node *node; + + read_lock_irq(&tree->lock); +again: + while (1) { + /* + * this search will find all the extents that end after + * our range starts + */ + node = tree_search(&tree->state, start); + if (!node) + break; + + state = rb_entry(node, struct extent_state, rb_node); + + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + atomic_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (need_resched()) { + read_unlock_irq(&tree->lock); + cond_resched(); + read_lock_irq(&tree->lock); + } + } +out: + read_unlock_irq(&tree->lock); + return 0; +} +EXPORT_SYMBOL(wait_extent_bit); + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int bits) +{ + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } + state->state |= bits; +} + +/* + * set some bits on a range in the tree. This may require allocations + * or sleeping, so the gfp mask is used to indicate what is allowed. + * + * If 'exclusive' == 1, this will fail with -EEXIST if some part of the + * range already has the desired bits set. The start of the existing + * range is returned in failed_start in this case. + * + * [start, end] is inclusive + * This takes the tree lock. 
+ */ +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, + int exclusive, u64 *failed_start, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + unsigned long flags; + int err = 0; + int set; + u64 last_start; + u64 last_end; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + write_lock_irqsave(&tree->lock, flags); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(&tree->state, start); + if (!node) { + err = insert_state(tree, prealloc, start, end, bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + + state = rb_entry(node, struct extent_state, rb_node); + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + set = state->state & bits; + if (set && exclusive) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + set_state_bits(tree, state, bits); + start = state->end + 1; + merge_state(tree, state); + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. 
+ */ + if (state->start < start) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits); + start = state->end + 1; + merge_state(tree, state); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start -1; + err = insert_state(tree, prealloc, start, this_end, + bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + if (err) + goto out; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, bits); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + write_unlock_irqrestore(&tree->lock, flags); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + write_unlock_irqrestore(&tree->lock, flags); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} +EXPORT_SYMBOL(set_extent_bit); + +/* wrappers around set/clear extent bit */ +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + mask); +} +EXPORT_SYMBOL(set_extent_dirty); + +int set_extent_bits(struct 
extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, 0, NULL, + mask); +} +EXPORT_SYMBOL(set_extent_bits); + +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, bits, 0, 0, mask); +} +EXPORT_SYMBOL(clear_extent_bits); + +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL, + mask); +} +EXPORT_SYMBOL(set_extent_delalloc); + +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); +} +EXPORT_SYMBOL(clear_extent_dirty); + +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + mask); +} +EXPORT_SYMBOL(set_extent_new); + +int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); +} +EXPORT_SYMBOL(clear_extent_new); + +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, + mask); +} +EXPORT_SYMBOL(set_extent_uptodate); + +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); +} +EXPORT_SYMBOL(clear_extent_uptodate); + +int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, + 0, NULL, mask); +} +EXPORT_SYMBOL(set_extent_writeback); + +int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, 
EXTENT_WRITEBACK, 1, 0, mask); +} +EXPORT_SYMBOL(clear_extent_writeback); + +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); +} +EXPORT_SYMBOL(wait_on_extent_writeback); + +/* + * locks a range in ascending order, waiting for any locked regions + * it hits on the way. [start,end] are inclusive, and this will sleep. + */ +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + int err; + u64 failed_start; + while (1) { + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, + &failed_start, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; + } else { + break; + } + WARN_ON(start > end); + } + return err; +} +EXPORT_SYMBOL(lock_extent); + +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); +} +EXPORT_SYMBOL(unlock_extent); + +/* + * helper function to set pages and extents in the tree dirty + */ +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + set_extent_dirty(tree, start, end, GFP_NOFS); + return 0; +} +EXPORT_SYMBOL(set_range_dirty); + +/* + * helper function to set both pages and extents in the tree writeback + */ +int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + set_page_writeback(page); 
+ page_cache_release(page); + index++; + } + set_extent_writeback(tree, start, end, GFP_NOFS); + return 0; +} +EXPORT_SYMBOL(set_range_writeback); + +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 1; + + read_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(&tree->state, start); + if (!node || IS_ERR(node)) { + goto out; + } + + while(1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + read_unlock_irq(&tree->lock); + return ret; +} +EXPORT_SYMBOL(find_first_extent_bit); + +u64 find_lock_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 found = 0; + u64 total_bytes = 0; + + write_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. 
+ */ +search_again: + node = tree_search(&tree->state, cur_start); + if (!node || IS_ERR(node)) { + *end = (u64)-1; + goto out; + } + + while(1) { + state = rb_entry(node, struct extent_state, rb_node); + if (found && state->start != cur_start) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) { + struct extent_state *prev_state; + struct rb_node *prev_node = node; + while(1) { + prev_node = rb_prev(prev_node); + if (!prev_node) + break; + prev_state = rb_entry(prev_node, + struct extent_state, + rb_node); + if (!(prev_state->state & EXTENT_DELALLOC)) + break; + state = prev_state; + node = prev_node; + } + } + if (state->state & EXTENT_LOCKED) { + DEFINE_WAIT(wait); + atomic_inc(&state->refs); + prepare_to_wait(&state->wq, &wait, + TASK_UNINTERRUPTIBLE); + write_unlock_irq(&tree->lock); + schedule(); + write_lock_irq(&tree->lock); + finish_wait(&state->wq, &wait); + free_extent_state(state); + goto search_again; + } + state->state |= EXTENT_LOCKED; + if (!found) + *start = state->start; + found++; + *end = state->end; + cur_start = state->end + 1; + node = rb_next(node); + if (!node) + break; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + } +out: + write_unlock_irq(&tree->lock); + return found; +} + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + unsigned long bits) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + int found = 0; + + if (search_end <= cur_start) { + printk("search_end %Lu start %Lu\n", search_end, cur_start); + WARN_ON(1); + return 0; + } + + write_lock_irq(&tree->lock); + if (cur_start == 0 && bits == EXTENT_DIRTY) { + total_bytes = tree->dirty_bytes; + goto out; + } + /* + * this search will find all the extents that end after + * our range starts. 
+ */ + node = tree_search(&tree->state, cur_start); + if (!node || IS_ERR(node)) { + goto out; + } + + while(1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > search_end) + break; + if (state->end >= cur_start && (state->state & bits)) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = state->start; + found = 1; + } + } + node = rb_next(node); + if (!node) + break; + } +out: + write_unlock_irq(&tree->lock); + return total_bytes; +} +/* + * helper function to lock both pages and extents in the tree. + * pages must be locked first. + */ +int lock_range(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + int err; + + while (index <= end_index) { + page = grab_cache_page(tree->mapping, index); + if (!page) { + err = -ENOMEM; + goto failed; + } + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto failed; + } + index++; + } + lock_extent(tree, start, end, GFP_NOFS); + return 0; + +failed: + /* + * we failed above in getting the page at 'index', so we undo here + * up to but not including the page at 'index' + */ + end_index = index; + index = start >> PAGE_CACHE_SHIFT; + while (index < end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + return err; +} +EXPORT_SYMBOL(lock_range); + +/* + * helper function to unlock both pages and extents in the tree. 
+ */ +int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + unlock_extent(tree, start, end, GFP_NOFS); + return 0; +} +EXPORT_SYMBOL(unlock_range); + +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + write_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(&tree->state, start); + if (!node || IS_ERR(node)) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + state->private = private; +out: + write_unlock_irq(&tree->lock); + return ret; +} + +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + read_lock_irq(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(&tree->state, start); + if (!node || IS_ERR(node)) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + *private = state->private; +out: + read_unlock_irq(&tree->lock); + return ret; +} + +/* + * searches a range in the state tree for a given mask. + * If 'filled' == 1, this returns 1 only if ever extent in the tree + * has the bits set. Otherwise, 1 is returned if any bit in the + * range is found set. 
+ */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled) +{ + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + unsigned long flags; + + read_lock_irqsave(&tree->lock, flags); + node = tree_search(&tree->state, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + start = state->end + 1; + if (start > end) + break; + node = rb_next(node); + if (!node) { + if (filled) + bitset = 0; + break; + } + } + read_unlock_irqrestore(&tree->lock, flags); + return bitset; +} +EXPORT_SYMBOL(test_range_bit); + +/* + * helper function to set a given page up to date if all the + * extents in the tree for that page are up to date + */ +static int check_page_uptodate(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) + SetPageUptodate(page); + return 0; +} + +/* + * helper function to unlock a page if all the extents in the tree + * for that page are unlocked + */ +static int check_page_locked(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) + unlock_page(page); + return 0; +} + +/* + * helper function to end page writeback if all the extents + * in the tree for that page are done with writeback + */ +static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) + 
end_page_writeback(page); + return 0; +} + +/* lots and lots of room for performance fixes in the end_bio funcs */ + +/* + * after a writepage IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) +static void end_bio_extent_writepage(struct bio *bio, int err) +#else +static int end_bio_extent_writepage(struct bio *bio, + unsigned int bytes_done, int err) +#endif +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + u64 end; + int whole_page; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + if (bio->bi_size) + return 1; +#endif + + do { + struct page *page = bvec->bv_page; + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, GFP_ATOMIC); + ClearPageUptodate(page); + SetPageError(page); + } + clear_extent_writeback(tree, start, end, GFP_ATOMIC); + + if (whole_page) + end_page_writeback(page); + else + check_page_writeback(tree, page); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, end); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + return 0; +#endif +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents 
in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) +static void end_bio_extent_readpage(struct bio *bio, int err) +#else +static int end_bio_extent_readpage(struct bio *bio, + unsigned int bytes_done, int err) +#endif +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + u64 end; + int whole_page; + int ret; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + if (bio->bi_size) + return 1; +#endif + + do { + struct page *page = bvec->bv_page; + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { + ret = tree->ops->readpage_end_io_hook(page, start, end); + if (ret) + uptodate = 0; + } + if (uptodate) { + set_extent_uptodate(tree, start, end, GFP_ATOMIC); + if (whole_page) + SetPageUptodate(page); + else + check_page_uptodate(tree, page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + + unlock_extent(tree, start, end, GFP_ATOMIC); + + if (whole_page) + unlock_page(page); + else + check_page_locked(tree, page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + return 0; +#endif +} + +/* + * IO done from prepare_write is pretty simple, we just unlock + * the structs in the extent tree when done, and set the uptodate bits + * as appropriate. 
+ */ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) +static void end_bio_extent_preparewrite(struct bio *bio, int err) +#else +static int end_bio_extent_preparewrite(struct bio *bio, + unsigned int bytes_done, int err) +#endif +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + u64 end; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + if (bio->bi_size) + return 1; +#endif + + do { + struct page *page = bvec->bv_page; + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + set_extent_uptodate(tree, start, end, GFP_ATOMIC); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + + unlock_extent(tree, start, end, GFP_ATOMIC); + + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + return 0; +#endif +} + +static struct bio * +extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} + +static int submit_one_bio(int rw, struct bio *bio) +{ + u64 maxsector; + int ret = 0; + + bio_get(bio); + + maxsector = bio->bi_bdev->bd_inode->i_size >> 9; + if (maxsector < bio->bi_sector) { + printk("sector too large max %Lu got %llu\n", maxsector, + (unsigned long long)bio->bi_sector); + WARN_ON(1); + } + + submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + bio_put(bio); + return ret; +} + +static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct page *page, sector_t sector, 
+ size_t size, unsigned long offset, + struct block_device *bdev, + struct bio **bio_ret, + unsigned long max_pages, + bio_end_io_t end_io_func) +{ + int ret = 0; + struct bio *bio; + int nr; + + if (bio_ret && *bio_ret) { + bio = *bio_ret; + if (bio->bi_sector + (bio->bi_size >> 9) != sector || + bio_add_page(bio, page, size, offset) < size) { + ret = submit_one_bio(rw, bio); + bio = NULL; + } else { + return 0; + } + } + nr = min_t(int, max_pages, bio_get_nr_vecs(bdev)); + bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + if (!bio) { + printk("failed to allocate bio nr %d\n", nr); + } + bio_add_page(bio, page, size, offset); + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + if (bio_ret) { + *bio_ret = bio; + } else { + ret = submit_one_bio(rw, bio); + } + + return ret; +} + +void set_page_extent_mapped(struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + WARN_ON(!page->mapping->a_ops->invalidatepage); + set_page_private(page, EXTENT_PAGE_PRIVATE); + page_cache_get(page); + } +} + +void set_page_extent_head(struct page *page, unsigned long len) +{ + set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); +} + +/* + * basic readpage implementation. 
Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + */ +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio) +{ + struct inode *inode = page->mapping->host; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 cur_end; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t page_offset = 0; + size_t iosize; + size_t blocksize = inode->i_sb->s_blocksize; + + set_page_extent_mapped(page); + + end = page_end; + lock_extent(tree, start, end, GFP_NOFS); + + while (cur <= end) { + if (cur >= last_byte) { + char *userpage; + iosize = PAGE_CACHE_SIZE - page_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + break; + } + em = get_extent(inode, page, page_offset, cur, + end - cur + 1, 0); + if (IS_ERR(em) || !em) { + SetPageError(page); + unlock_extent(tree, cur, end, GFP_NOFS); + break; + } + + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + cur_end = min(extent_map_end(em) - 1, end); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == EXTENT_MAP_HOLE) { + char *userpage; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + 
flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + + ret = 0; + if (tree->ops && tree->ops->readpage_io_hook) { + ret = tree->ops->readpage_io_hook(page, cur, + cur + iosize - 1); + } + if (!ret) { + unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + nr -= page->index; + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, + bdev, bio, nr, + end_bio_extent_readpage); + } + if (ret) + SetPageError(page); + cur = cur + iosize; + page_offset += iosize; + nr++; + } + if (!nr) { + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + return 0; +} + +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent) +{ + struct bio *bio = NULL; + int ret; + + ret = __extent_read_full_page(tree, page, get_extent, &bio); + if (bio) + submit_one_bio(READ, bio); + return ret; +} +EXPORT_SYMBOL(extent_read_full_page); + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. 
Then the lock bits are removed
 * and the end_io handler clears the writeback ranges
 *
 * @data is a struct extent_page_data carrying the tree, the get_extent
 * callback and the bio being built up across calls.
 *
 * Every exit path returns 0 and unlocks the page.  Failures from
 * get_extent, the writepage_io_hook or submit_extent_page are only
 * recorded on the page with SetPageError().
 */
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
			      void *data)
{
	struct inode *inode = page->mapping->host;
	struct extent_page_data *epd = data;
	struct extent_io_tree *tree = epd->tree;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 delalloc_start;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 iosize;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t blocksize;
	loff_t i_size = i_size_read(inode);
	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
	u64 nr_delalloc;
	u64 delalloc_end;

	WARN_ON(!PageLocked(page));
	/* page lies entirely beyond i_size: nothing to write */
	if (page->index > end_index) {
		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
		unlock_page(page);
		return 0;
	}

	/* page straddles i_size: zero everything after the last file byte */
	if (page->index == end_index) {
		char *userpage;

		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);

		userpage = kmap_atomic(page, KM_USER0);
		memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
		flush_dcache_page(page);
		kunmap_atomic(userpage, KM_USER0);
	}

	set_page_extent_mapped(page);

	/*
	 * hand every delalloc range that starts in this page to the
	 * fill_delalloc hook, then drop its LOCKED and DELALLOC bits.
	 * NOTE(review): tree->ops is dereferenced here without the NULL
	 * check used for the other hooks in this file.
	 */
	delalloc_start = start;
	delalloc_end = 0;
	while(delalloc_end < page_end) {
		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
						       &delalloc_end,
						       128 * 1024 * 1024);
		if (nr_delalloc == 0) {
			delalloc_start = delalloc_end + 1;
			continue;
		}
		tree->ops->fill_delalloc(inode, delalloc_start,
					 delalloc_end);
		clear_extent_bit(tree, delalloc_start,
				 delalloc_end,
				 EXTENT_LOCKED | EXTENT_DELALLOC,
				 1, 0, GFP_NOFS);
		delalloc_start = delalloc_end + 1;
	}
	lock_extent(tree, start, page_end, GFP_NOFS);

	end = page_end;
	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
		printk("found delalloc bits after lock_extent\n");
	}

	if (last_byte <= start) {
		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
		goto done;
	}

	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
	blocksize = inode->i_sb->s_blocksize;

	/* walk the page one extent at a time and queue the mapped ranges */
	while (cur <= end) {
		if (cur >= last_byte) {
			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
			break;
		}
		em = epd->get_extent(inode, page, page_offset, cur,
				     end - cur + 1, 1);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			break;
		}

		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);
		/* clamp the IO to this extent and round up to a full block */
		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		free_extent_map(em);
		em = NULL;

		/* holes and inline extents have no blocks to write */
		if (block_start == EXTENT_MAP_HOLE ||
		    block_start == EXTENT_MAP_INLINE) {
			clear_extent_dirty(tree, cur,
					   cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}

		/* leave this out until we have a page_mkwrite call */
		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
				   EXTENT_DIRTY, 0)) {
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
		if (tree->ops && tree->ops->writepage_io_hook) {
			ret = tree->ops->writepage_io_hook(page, cur,
						cur + iosize - 1);
		} else {
			ret = 0;
		}
		if (ret)
			SetPageError(page);
		else {
			unsigned long max_nr = end_index + 1;
			set_range_writeback(tree, cur, cur + iosize - 1);
			if (!PageWriteback(page)) {
				printk("warning page %lu not writeback, "
				       "cur %llu end %llu\n", page->index,
				       (unsigned long long)cur,
				       (unsigned long long)end);
			}

			ret = submit_extent_page(WRITE, tree, page, sector,
						 iosize, page_offset, bdev,
						 &epd->bio, max_nr,
						 end_bio_extent_writepage);
			if (ret)
				SetPageError(page);
		}
		cur = cur + iosize;
		page_offset += iosize;
		nr++;
	}
done:
	if (nr == 0) {
		/* make sure the mapping tag for page dirty gets cleared */
		set_page_writeback(page);
		end_page_writeback(page);
	}
	unlock_extent(tree, start, page_end, GFP_NOFS);
	unlock_page(page);
	return 0;
}

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)

/* Taken directly from 2.6.23 for 2.6.18 back port */
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                void *data);

/**
 * write_cache_pages - walk the list of dirty pages of the given address space
 * and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 */
static int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;
	int range_whole = 0;

	if (wbc->nonblocking && bdi_write_congested(bdi)) {
		wbc->encountered_congestion = 1;
		return 0;
	}

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		scanned = 1;
	}
retry:
	/* pull dirty-tagged pages out of the mapping one pagevec at a time */
	while (!done && (index <= end) &&
	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
					      PAGECACHE_TAG_DIRTY,
					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
		unsigned i;

		scanned = 1;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point we hold neither mapping->tree_lock nor
			 * lock on the page itself: the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or even
			 * swizzled back from swapper_space to tmpfs file
			 * mapping
			 */
			lock_page(page);

			if (unlikely(page->mapping != mapping)) {
				unlock_page(page);
				continue;
			}

			if (!wbc->range_cyclic && page->index > end) {
				done = 1;
				unlock_page(page);
				continue;
			}

			if (wbc->sync_mode != WB_SYNC_NONE)
				wait_on_page_writeback(page);

			/* skip pages already under IO or no longer dirty */
			if (PageWriteback(page) ||
			    !clear_page_dirty_for_io(page)) {
				unlock_page(page);
				continue;
			}

			ret = (*writepage)(page, wbc, data);

			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
				unlock_page(page);
				ret = 0;
			}
			if (ret || (--(wbc->nr_to_write) <= 0))
				done = 1;
			if (wbc->nonblocking && bdi_write_congested(bdi)) {
				wbc->encountered_congestion = 1;
				done = 1;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = index;
	return ret;
}
#endif

/*
 * Write one page with __extent_writepage(), then use write_cache_pages()
 * to opportunistically write up to 64 more dirty pages that follow it in
 * the same mapping (WB_SYNC_NONE), and finally submit the bio collected
 * along the way.
 *
 * Returns the result of writing the requested page; the result of the
 * follow-on write_cache_pages() call is not reported.
 */
int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
			  get_extent_t *get_extent,
			  struct writeback_control *wbc)
{
	int ret;
	struct address_space *mapping = page->mapping;
	struct extent_page_data epd = {
		.bio = NULL,
		.tree = tree,
		.get_extent = get_extent,
	};
	/* private control block for the pages after the one we were given */
	struct writeback_control wbc_writepages = {
		.bdi		= wbc->bdi,
		.sync_mode	= WB_SYNC_NONE,
		.older_than_this = NULL,
		.nr_to_write	= 64,
		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
		.range_end	= (loff_t)-1,
	};


	ret = __extent_writepage(page, wbc, &epd);

	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
	if (epd.bio) {
		submit_one_bio(WRITE, epd.bio);
	}
	return ret;
}
EXPORT_SYMBOL(extent_write_full_page);


/*
 * writepages implementation: run __extent_writepage() over the dirty
 * pages selected by @wbc and submit the bio left over at the end.
 * Returns whatever write_cache_pages() returned.
 */
int extent_writepages(struct extent_io_tree *tree,
		      struct address_space *mapping,
		      get_extent_t *get_extent,
		      struct writeback_control *wbc)
{
	int ret = 0;
	struct extent_page_data epd = {
		.bio = NULL,
		.tree = tree,
		.get_extent = get_extent,
	};

	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
	if (epd.bio) {
		submit_one_bio(WRITE, epd.bio);
	}
	return ret;
}
EXPORT_SYMBOL(extent_writepages);

/*
 * readpages implementation.  Pages are taken from the tail of @pages,
 * inserted into the page cache and the LRU, and queued for reading with
 * __extent_read_full_page().  Pages that cannot be added to the page
 * cache are simply released.  Always returns 0.
 */
int extent_readpages(struct extent_io_tree *tree,
		     struct address_space *mapping,
		     struct list_head *pages, unsigned nr_pages,
		     get_extent_t get_extent)
{
	struct bio *bio = NULL;
	unsigned page_idx;
	struct pagevec pvec;

	pagevec_init(&pvec, 0);
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_entry(pages->prev, struct page, lru);

		prefetchw(&page->flags);
		list_del(&page->lru);
		/*
		 * what we want to do here is call add_to_page_cache_lru,
		 * but that isn't exported, so we reproduce it here
		 */
		if (!add_to_page_cache(page, mapping,
					page->index, GFP_KERNEL)) {

			/* open coding of lru_cache_add, also not exported */
			page_cache_get(page);
			if (!pagevec_add(&pvec, page))
				__pagevec_lru_add(&pvec);
			__extent_read_full_page(tree, page, get_extent, &bio);
		}
		page_cache_release(page);
	}
	if (pagevec_count(&pvec))
		__pagevec_lru_add(&pvec);
	BUG_ON(!list_empty(pages));
	if (bio)
		submit_one_bio(READ, bio);
	return 0;
}
EXPORT_SYMBOL(extent_readpages);

/*
 * basic invalidatepage code, this waits on any locked or writeback
 * ranges corresponding to the page, and then deletes any extent state
 * records from the tree
 *
 * @offset is rounded up to the next block boundary, so a block that is
 * only partially invalidated keeps its state.  Always returns 0.
 */
int extent_invalidatepage(struct extent_io_tree *tree,
			  struct page *page, unsigned long offset)
{
	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
	u64 end = start + PAGE_CACHE_SIZE - 1;
	size_t blocksize = page->mapping->host->i_sb->s_blocksize;

	start += (offset + blocksize -1) & ~(blocksize - 1);
	if (start > end)
		return 0;

	lock_extent(tree, start, end, GFP_NOFS);
	wait_on_extent_writeback(tree, start, end);
	clear_extent_bit(tree, start, end,
			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
			 1, 1, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(extent_invalidatepage);

/*
 * simple commit_write call, set_range_dirty is used to mark both
 * the pages and the extent records as dirty
 *
 * NOTE(review): the body shown here only calls set_page_dirty(); @tree and
 * @from are not used.  i_size is extended when the write ends past it.
 */
int extent_commit_write(struct extent_io_tree *tree,
			struct inode *inode, struct page *page,
			unsigned from, unsigned to)
{
	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

	set_page_extent_mapped(page);
	set_page_dirty(page);

	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		mark_inode_dirty(inode);
	}
	return 0;
}
EXPORT_SYMBOL(extent_commit_write);

int extent_prepare_write(struct extent_io_tree *tree,
			 struct inode *inode, struct page *page,
			 unsigned
from, unsigned to, get_extent_t *get_extent)
{
	/*
	 * Prepare the byte range [from, to) of the page for a write.  The
	 * range is widened to block boundaries and walked extent by extent:
	 * new blocks get their bytes outside [from, to) zeroed, mapped blocks
	 * that are only partially overwritten and not yet uptodate are read
	 * from disk, and everything else is marked uptodate and unlocked.
	 *
	 * NOTE(review): err is never assigned, so 0 is returned even when
	 * get_extent fails, and that path leaves without calling
	 * unlock_extent() for the remaining range.  The result of
	 * submit_extent_page() stored in ret is never examined either.
	 */
	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
	u64 block_start;
	u64 orig_block_start;
	u64 block_end;
	u64 cur_end;
	struct extent_map *em;
	unsigned blocksize = 1 << inode->i_blkbits;
	size_t page_offset = 0;
	size_t block_off_start;
	size_t block_off_end;
	int err = 0;
	int iocount = 0;
	int ret = 0;
	int isnew;

	set_page_extent_mapped(page);

	/* widen [from, to) to whole blocks */
	block_start = (page_start + from) & ~((u64)blocksize - 1);
	block_end = (page_start + to - 1) | (blocksize - 1);
	orig_block_start = block_start;

	lock_extent(tree, page_start, page_end, GFP_NOFS);
	while(block_start <= block_end) {
		em = get_extent(inode, page, page_offset, block_start,
				block_end - block_start + 1, 1);
		if (IS_ERR(em) || !em) {
			goto err;
		}
		cur_end = min(block_end, extent_map_end(em) - 1);
		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
		block_off_end = block_off_start + blocksize;
		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);

		/* new block: zero the parts the caller is not about to fill */
		if (!PageUptodate(page) && isnew &&
		    (block_off_end > to || block_off_start < from)) {
			void *kaddr;

			kaddr = kmap_atomic(page, KM_USER0);
			if (block_off_end > to)
				memset(kaddr + to, 0, block_off_end - to);
			if (block_off_start < from)
				memset(kaddr + block_off_start, 0,
				       from - block_off_start);
			flush_dcache_page(page);
			kunmap_atomic(kaddr, KM_USER0);
		}
		/* existing on-disk block, partially overwritten: read it in */
		if ((em->block_start != EXTENT_MAP_HOLE &&
		     em->block_start != EXTENT_MAP_INLINE) &&
		    !isnew && !PageUptodate(page) &&
		    (block_off_end > to || block_off_start < from) &&
		    !test_range_bit(tree, block_start, cur_end,
				    EXTENT_UPTODATE, 1)) {
			u64 sector;
			u64 extent_offset = block_start - em->start;
			size_t iosize;
			sector = (em->block_start + extent_offset) >> 9;
			iosize = (cur_end - block_start + blocksize) &
				~((u64)blocksize - 1);
			/*
			 * we've already got the extent locked, but we
			 * need to split the state such that our end_bio
			 * handler can clear the lock.
			 */
			set_extent_bit(tree, block_start,
				       block_start + iosize - 1,
				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
			ret = submit_extent_page(READ, tree, page,
					 sector, iosize, page_offset, em->bdev,
					 NULL, 1,
					 end_bio_extent_preparewrite);
			iocount++;
			block_start = block_start + iosize;
		} else {
			set_extent_uptodate(tree, block_start, cur_end,
					    GFP_NOFS);
			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
			block_start = cur_end + 1;
		}
		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
		free_extent_map(em);
	}
	/* wait for the reads above; their end_io handler drops the lock bit */
	if (iocount) {
		wait_extent_bit(tree, orig_block_start,
				block_end, EXTENT_LOCKED);
	}
	check_page_uptodate(tree, page);
err:
	/* FIXME, zero out newly allocated blocks on error */
	return err;
}
EXPORT_SYMBOL(extent_prepare_write);

/*
 * a helper for releasepage.  As long as there are no locked extents
 * in the range corresponding to the page, both state records and extent
 * map records are removed
 *
 * Returns 1 when the state records for the page were cleared, 0 when some
 * part of the page range is still locked in @tree.
 */
int try_release_extent_mapping(struct extent_map_tree *map,
			       struct extent_io_tree *tree, struct page *page)
{
	struct extent_map *em;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	u64 orig_start = start;
	int ret = 1;

	/* drop every extent mapping under the page that is not locked */
	while (start <= end) {
		spin_lock(&map->lock);
		em = lookup_extent_mapping(map, start, end);
		if (!em || IS_ERR(em)) {
			spin_unlock(&map->lock);
			break;
		}
		if (!test_range_bit(tree, em->start, extent_map_end(em) - 1,
				    EXTENT_LOCKED, 0)) {
			remove_extent_mapping(map, em);
			/* once for the rb tree */
			free_extent_map(em);
		}
		start = extent_map_end(em);
		spin_unlock(&map->lock);

		/* once for us */
		free_extent_map(em);
	}
	if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
		ret = 0;
	else
		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
				 1, 1, GFP_NOFS);
	return ret;
}
EXPORT_SYMBOL(try_release_extent_mapping);

sector_t extent_bmap(struct address_space *mapping, sector_t
iblock, + get_extent_t *get_extent) +{ + struct inode *inode = mapping->host; + u64 start = iblock << inode->i_blkbits; + sector_t sector = 0; + struct extent_map *em; + + em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0); + if (!em || IS_ERR(em)) + return 0; + + if (em->block_start == EXTENT_MAP_INLINE || + em->block_start == EXTENT_MAP_HOLE) + goto out; + + sector = (em->block_start + start - em->start) >> inode->i_blkbits; +printk("bmap finds %Lu %Lu block %Lu\n", em->start, em->len, em->block_start); +out: + free_extent_map(em); + return sector; +} + +static int add_lru(struct extent_io_tree *tree, struct extent_buffer *eb) +{ + if (list_empty(&eb->lru)) { + extent_buffer_get(eb); + list_add(&eb->lru, &tree->buffer_lru); + tree->lru_size++; + if (tree->lru_size >= BUFFER_LRU_MAX) { + struct extent_buffer *rm; + rm = list_entry(tree->buffer_lru.prev, + struct extent_buffer, lru); + tree->lru_size--; + list_del_init(&rm->lru); + free_extent_buffer(rm); + } + } else + list_move(&eb->lru, &tree->buffer_lru); + return 0; +} +static struct extent_buffer *find_lru(struct extent_io_tree *tree, + u64 start, unsigned long len) +{ + struct list_head *lru = &tree->buffer_lru; + struct list_head *cur = lru->next; + struct extent_buffer *eb; + + if (list_empty(lru)) + return NULL; + + do { + eb = list_entry(cur, struct extent_buffer, lru); + if (eb->start == start && eb->len == len) { + extent_buffer_get(eb); + return eb; + } + cur = cur->next; + } while (cur != lru); + return NULL; +} + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + struct page *p; + struct address_space *mapping; + + if (i == 0) + return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + mapping = eb->first_page->mapping; + read_lock_irq(&mapping->tree_lock); + p = 
radix_tree_lookup(&mapping->page_tree, i); + read_unlock_irq(&mapping->tree_lock); + return p; +} + +static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, + unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb = NULL; + + spin_lock(&tree->lru_lock); + eb = find_lru(tree, start, len); + spin_unlock(&tree->lru_lock); + if (eb) { + return eb; + } + + eb = kmem_cache_zalloc(extent_buffer_cache, mask); + INIT_LIST_HEAD(&eb->lru); + eb->start = start; + eb->len = len; + atomic_set(&eb->refs, 1); + + return eb; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ + kmem_cache_free(extent_buffer_cache, eb); +} + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask) +{ + unsigned long num_pages = num_extent_pages(start, len); + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 1; + + eb = __alloc_extent_buffer(tree, start, len, mask); + if (!eb || IS_ERR(eb)) + return NULL; + + if (eb->flags & EXTENT_BUFFER_FILLED) + goto lru_add; + + if (page0) { + eb->first_page = page0; + i = 1; + index++; + page_cache_get(page0); + mark_page_accessed(page0); + set_page_extent_mapped(page0); + WARN_ON(!PageUptodate(page0)); + set_page_extent_head(page0, len); + } else { + i = 0; + } + for (; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); + if (!p) { + WARN_ON(1); + goto fail; + } + set_page_extent_mapped(p); + mark_page_accessed(p); + if (i == 0) { + eb->first_page = p; + set_page_extent_head(p, len); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } + if (!PageUptodate(p)) + uptodate = 0; + unlock_page(p); + } + if (uptodate) + eb->flags |= EXTENT_UPTODATE; + eb->flags |= EXTENT_BUFFER_FILLED; + +lru_add: + spin_lock(&tree->lru_lock); + add_lru(tree, eb); 
+ spin_unlock(&tree->lru_lock); + return eb; + +fail: + spin_lock(&tree->lru_lock); + list_del_init(&eb->lru); + spin_unlock(&tree->lru_lock); + if (!atomic_dec_and_test(&eb->refs)) + return NULL; + for (index = 1; index < i; index++) { + page_cache_release(extent_buffer_page(eb, index)); + } + if (i > 0) + page_cache_release(extent_buffer_page(eb, 0)); + __free_extent_buffer(eb); + return NULL; +} +EXPORT_SYMBOL(alloc_extent_buffer); + +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask) +{ + unsigned long num_pages = num_extent_pages(start, len); + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 1; + + eb = __alloc_extent_buffer(tree, start, len, mask); + if (!eb || IS_ERR(eb)) + return NULL; + + if (eb->flags & EXTENT_BUFFER_FILLED) + goto lru_add; + + for (i = 0; i < num_pages; i++, index++) { + p = find_lock_page(mapping, index); + if (!p) { + goto fail; + } + set_page_extent_mapped(p); + mark_page_accessed(p); + + if (i == 0) { + eb->first_page = p; + set_page_extent_head(p, len); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } + + if (!PageUptodate(p)) + uptodate = 0; + unlock_page(p); + } + if (uptodate) + eb->flags |= EXTENT_UPTODATE; + eb->flags |= EXTENT_BUFFER_FILLED; + +lru_add: + spin_lock(&tree->lru_lock); + add_lru(tree, eb); + spin_unlock(&tree->lru_lock); + return eb; +fail: + spin_lock(&tree->lru_lock); + list_del_init(&eb->lru); + spin_unlock(&tree->lru_lock); + if (!atomic_dec_and_test(&eb->refs)) + return NULL; + for (index = 1; index < i; index++) { + page_cache_release(extent_buffer_page(eb, index)); + } + if (i > 0) + page_cache_release(extent_buffer_page(eb, 0)); + __free_extent_buffer(eb); + return NULL; +} +EXPORT_SYMBOL(find_extent_buffer); + +void free_extent_buffer(struct extent_buffer *eb) +{ + unsigned long i; + unsigned long 
num_pages; + + if (!eb) + return; + + if (!atomic_dec_and_test(&eb->refs)) + return; + + WARN_ON(!list_empty(&eb->lru)); + num_pages = num_extent_pages(eb->start, eb->len); + + for (i = 1; i < num_pages; i++) { + page_cache_release(extent_buffer_page(eb, i)); + } + page_cache_release(extent_buffer_page(eb, 0)); + __free_extent_buffer(eb); +} +EXPORT_SYMBOL(free_extent_buffer); + +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + int set; + unsigned long i; + unsigned long num_pages; + struct page *page; + + u64 start = eb->start; + u64 end = start + eb->len - 1; + + set = clear_extent_dirty(tree, start, end, GFP_NOFS); + num_pages = num_extent_pages(eb->start, eb->len); + + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + lock_page(page); + if (i == 0) + set_page_extent_head(page, eb->len); + else + set_page_private(page, EXTENT_PAGE_PRIVATE); + + /* + * if we're on the last page or the first page and the + * block isn't aligned on a page boundary, do extra checks + * to make sure we don't clean page that is partially dirty + */ + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + start = (u64)page->index << PAGE_CACHE_SHIFT; + end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, + EXTENT_DIRTY, 0)) { + unlock_page(page); + continue; + } + } + clear_page_dirty_for_io(page); + write_lock_irq(&page->mapping->tree_lock); + if (!PageDirty(page)) { + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + } + write_unlock_irq(&page->mapping->tree_lock); + unlock_page(page); + } + return 0; +} +EXPORT_SYMBOL(clear_extent_buffer_dirty); + +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + return wait_on_extent_writeback(tree, eb->start, + eb->start + eb->len - 1); +} +EXPORT_SYMBOL(wait_on_extent_buffer_writeback); + 
+int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *page = extent_buffer_page(eb, i); + /* writepage may need to do something special for the + * first page, we have to make sure page->private is + * properly set. releasepage may drop page->private + * on us if the page isn't already dirty. + */ + if (i == 0) { + lock_page(page); + set_page_extent_head(page, eb->len); + } else if (PagePrivate(page) && + page->private != EXTENT_PAGE_PRIVATE) { + lock_page(page); + set_page_extent_mapped(page); + unlock_page(page); + } + __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + if (i == 0) + unlock_page(page); + } + return set_extent_dirty(tree, eb->start, + eb->start + eb->len - 1, GFP_NOFS); +} +EXPORT_SYMBOL(set_extent_buffer_dirty); + +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } + SetPageUptodate(page); + } + return 0; +} +EXPORT_SYMBOL(set_extent_buffer_uptodate); + +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + if (eb->flags & EXTENT_UPTODATE) + return 1; + return test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1); +} +EXPORT_SYMBOL(extent_buffer_uptodate); + +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, + u64 start, + int wait) +{ + unsigned long i; + unsigned long 
start_i; + struct page *page; + int err; + int ret = 0; + unsigned long num_pages; + + if (eb->flags & EXTENT_UPTODATE) + return 0; + + if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1)) { + return 0; + } + + if (start) { + WARN_ON(start < eb->start); + start_i = (start >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT); + } else { + start_i = 0; + } + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (PageUptodate(page)) { + continue; + } + if (!wait) { + if (TestSetPageLocked(page)) { + continue; + } + } else { + lock_page(page); + } + if (!PageUptodate(page)) { + err = page->mapping->a_ops->readpage(NULL, page); + if (err) { + ret = err; + } + } else { + unlock_page(page); + } + } + + if (ret || !wait) { + return ret; + } + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + wait_on_page_locked(page); + if (!PageUptodate(page)) { + ret = -EIO; + } + } + if (!ret) + eb->flags |= EXTENT_UPTODATE; + return ret; +} +EXPORT_SYMBOL(read_extent_buffer_pages); + +void read_extent_buffer(struct extent_buffer *eb, void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long num_pages = num_extent_pages(eb->start, eb->len); + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while(len > 0) { + page = extent_buffer_page(eb, i); + if (!PageUptodate(page)) { + printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len); + WARN_ON(1); + } + WARN_ON(!PageUptodate(page)); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = kmap_atomic(page, 
KM_USER1); + memcpy(dst, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER1); + + dst += cur; + len -= cur; + offset = 0; + i++; + } +} +EXPORT_SYMBOL(read_extent_buffer); + +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + size_t offset = start & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct page *p; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long end_i = (start_offset + start + min_len - 1) >> + PAGE_CACHE_SHIFT; + + if (i != end_i) + return -EINVAL; + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + } + if (start + min_len > eb->len) { +printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len); + WARN_ON(1); + } + + p = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(p)); + kaddr = kmap_atomic(p, km); + *token = kaddr; + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} +EXPORT_SYMBOL(map_private_extent_buffer); + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + int err; + int save = 0; + if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, km); + eb->map_token = NULL; + save = 1; + } + err = map_private_extent_buffer(eb, start, min_len, token, map, + map_start, map_len, km); + if (!err && save) { + eb->map_token = *token; + eb->kaddr = *map; + eb->map_start = *map_start; + eb->map_len = *map_len; + } + return err; +} +EXPORT_SYMBOL(map_extent_buffer); + +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) +{ + kunmap_atomic(token, km); +} +EXPORT_SYMBOL(unmap_extent_buffer); + +int 
memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while(len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + ret = memcmp(ptr, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER0); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + } + return ret; +} +EXPORT_SYMBOL(memcmp_extent_buffer); + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while(len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(kaddr + offset, src, cur); + kunmap_atomic(kaddr, KM_USER1); + + src += cur; + len -= cur; + offset = 0; + i++; + } +} +EXPORT_SYMBOL(write_extent_buffer); + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) 
>> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while(len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, c, cur); + kunmap_atomic(kaddr, KM_USER0); + + len -= cur; + offset = 0; + i++; + } +} +EXPORT_SYMBOL(memset_extent_buffer); + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while(len > 0) { + page = extent_buffer_page(dst, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + kunmap_atomic(kaddr, KM_USER0); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} +EXPORT_SYMBOL(copy_extent_buffer); + +static void move_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + if (dst_page == src_page) { + memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); + } else { + char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *p = dst_kaddr + dst_off + len; + char *s = src_kaddr + src_off + len; + + while (len--) + *--p = *--s; + + kunmap_atomic(src_kaddr, KM_USER1); + } + kunmap_atomic(dst_kaddr, KM_USER0); +} + +static void copy_pages(struct page *dst_page, struct page 
*src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *src_kaddr; + + if (dst_page != src_page) + src_kaddr = kmap_atomic(src_page, KM_USER1); + else + src_kaddr = dst_kaddr; + + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + kunmap_atomic(dst_kaddr, KM_USER0); + if (dst_page != src_page) + kunmap_atomic(src_kaddr, KM_USER1); +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk("memmove bogus src_offset %lu move len %lu len %lu\n", + src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk("memmove bogus dst_offset %lu move len %lu len %lu\n", + dst_offset, len, dst->len); + BUG_ON(1); + } + + while(len > 0) { + dst_off_in_page = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min_t(unsigned long, cur, + (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + + copy_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} +EXPORT_SYMBOL(memcpy_extent_buffer); + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + 
unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk("memmove bogus src_offset %lu move len %lu len %lu\n", + src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk("memmove bogus dst_offset %lu move len %lu len %lu\n", + dst_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset < src_offset) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while(len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = (start_offset + dst_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + cur = min_t(unsigned long, len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); + move_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur; + src_end -= cur; + len -= cur; + } +} +EXPORT_SYMBOL(memmove_extent_buffer); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h new file mode 100644 index 000000000000..06be1fe84b29 --- /dev/null +++ b/fs/btrfs/extent_io.h @@ -0,0 +1,193 @@ +#ifndef __EXTENTIO__ +#define __EXTENTIO__ + +#include + +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) +#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) +#define EXTENT_DEFRAG (1 << 6) +#define EXTENT_DEFRAG_DONE (1 << 7) +#define EXTENT_BUFFER_FILLED (1 << 8) +#define EXTENT_CSUM (1 << 9) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + +/* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. 
+ */ +#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 + +struct extent_io_ops { + int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); + int (*writepage_io_hook)(struct page *page, u64 start, u64 end); + int (*readpage_io_hook)(struct page *page, u64 start, u64 end); + int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end); + void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end); +}; + +struct extent_io_tree { + struct rb_root state; + struct address_space *mapping; + u64 dirty_bytes; + rwlock_t lock; + struct extent_io_ops *ops; + spinlock_t lru_lock; + struct list_head buffer_lru; + int lru_size; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + int in_tree; + struct rb_node rb_node; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; + + /* for use by the FS */ + u64 private; + + struct list_head list; +}; + +struct extent_buffer { + u64 start; + unsigned long len; + char *map_token; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + struct page *first_page; + struct list_head lru; + atomic_t refs; + int flags; +}; + +struct extent_map_tree; + +typedef struct extent_map *(get_extent_t)(struct inode *inode, + struct page *page, + size_t page_offset, + u64 start, u64 len, + int create); + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask); +void extent_io_tree_empty_lru(struct extent_io_tree *tree); +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent); +int __init extent_io_init(void); +void extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 
search_end, + u64 max_bytes, unsigned long bits); + +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled); +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent); +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent); +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to); +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent); +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +int get_state_private(struct extent_io_tree 
*tree, u64 start, u64 *private); +void set_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask); +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask); +void free_extent_buffer(struct extent_buffer *eb); +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, u64 start, int wait); + +static inline void extent_buffer_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->refs); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len); +void read_extent_buffer(struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +void write_extent_buffer(struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len); +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb); +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, 
char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); +#endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 010a287fbd71..268ad8facf6e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,17 +1,10 @@ -#include -#include -#include -#include +#include #include -#include -#include +#include #include #include -#include -#include #include -#include -#include +#include #include "extent_map.h" /* temporary define until extent_map moves out of btrfs */ @@ -21,27 +14,6 @@ struct kmem_cache *btrfs_cache_create(const char *name, size_t size, unsigned long)); static struct kmem_cache *extent_map_cache; -static struct kmem_cache *extent_state_cache; -static struct kmem_cache *extent_buffer_cache; - -static LIST_HEAD(buffers); -static LIST_HEAD(states); - -static spinlock_t state_lock = SPIN_LOCK_UNLOCKED; -#define BUFFER_LRU_MAX 64 - -struct tree_entry { - u64 start; - u64 end; - int in_tree; - struct rb_node rb_node; -}; - -struct extent_page_data { - struct bio *bio; - struct extent_map_tree *tree; - get_extent_t *get_extent; -}; int __init extent_map_init(void) { @@ -50,72 +22,23 @@ int __init extent_map_init(void) NULL); if (!extent_map_cache) return -ENOMEM; - extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), 0, - NULL); - if (!extent_state_cache) - goto free_map_cache; - extent_buffer_cache = btrfs_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - NULL); - if (!extent_buffer_cache) - goto free_state_cache; return 0; - -free_state_cache: - kmem_cache_destroy(extent_state_cache); -free_map_cache: - kmem_cache_destroy(extent_map_cache); - return -ENOMEM; } void extent_map_exit(void) { - 
struct extent_state *state; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, list); - printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs)); - list_del(&state->list); - kmem_cache_free(extent_state_cache, state); - - } - if (extent_map_cache) kmem_cache_destroy(extent_map_cache); - if (extent_state_cache) - kmem_cache_destroy(extent_state_cache); - if (extent_buffer_cache) - kmem_cache_destroy(extent_buffer_cache); } -void extent_map_tree_init(struct extent_map_tree *tree, - struct address_space *mapping, gfp_t mask) +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) { tree->map.rb_node = NULL; - tree->state.rb_node = NULL; - tree->ops = NULL; - tree->dirty_bytes = 0; - rwlock_init(&tree->lock); - spin_lock_init(&tree->lru_lock); - tree->mapping = mapping; - INIT_LIST_HEAD(&tree->buffer_lru); - tree->lru_size = 0; + tree->last = NULL; + spin_lock_init(&tree->lock); } EXPORT_SYMBOL(extent_map_tree_init); -void extent_map_tree_empty_lru(struct extent_map_tree *tree) -{ - struct extent_buffer *eb; - while(!list_empty(&tree->buffer_lru)) { - eb = list_entry(tree->buffer_lru.next, struct extent_buffer, - lru); - list_del_init(&eb->lru); - free_extent_buffer(eb); - } -} -EXPORT_SYMBOL(extent_map_tree_empty_lru); - struct extent_map *alloc_extent_map(gfp_t mask) { struct extent_map *em; @@ -123,6 +46,7 @@ struct extent_map *alloc_extent_map(gfp_t mask) if (!em || IS_ERR(em)) return em; em->in_tree = 0; + em->flags = 0; atomic_set(&em->refs, 1); return em; } @@ -132,6 +56,7 @@ void free_extent_map(struct extent_map *em) { if (!em) return; + WARN_ON(atomic_read(&em->refs) == 0); if (atomic_dec_and_test(&em->refs)) { WARN_ON(em->in_tree); kmem_cache_free(extent_map_cache, em); @@ -139,64 +64,28 @@ void free_extent_map(struct extent_map *em) } EXPORT_SYMBOL(free_extent_map); - -struct extent_state 
*alloc_extent_state(gfp_t mask) -{ - struct extent_state *state; - unsigned long flags; - - state = kmem_cache_alloc(extent_state_cache, mask); - if (!state || IS_ERR(state)) - return state; - state->state = 0; - state->in_tree = 0; - state->private = 0; - - spin_lock_irqsave(&state_lock, flags); - list_add(&state->list, &states); - spin_unlock_irqrestore(&state_lock, flags); - - atomic_set(&state->refs, 1); - init_waitqueue_head(&state->wq); - return state; -} -EXPORT_SYMBOL(alloc_extent_state); - -void free_extent_state(struct extent_state *state) -{ - unsigned long flags; - if (!state) - return; - if (atomic_dec_and_test(&state->refs)) { - WARN_ON(state->in_tree); - spin_lock_irqsave(&state_lock, flags); - list_del(&state->list); - spin_unlock_irqrestore(&state_lock, flags); - kmem_cache_free(extent_state_cache, state); - } -} -EXPORT_SYMBOL(free_extent_state); - static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) { struct rb_node ** p = &root->rb_node; struct rb_node * parent = NULL; - struct tree_entry *entry; + struct extent_map *entry; while(*p) { parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); + entry = rb_entry(parent, struct extent_map, rb_node); + + WARN_ON(!entry->in_tree); if (offset < entry->start) p = &(*p)->rb_left; - else if (offset > entry->end) + else if (offset >= extent_map_end(entry)) p = &(*p)->rb_right; else return parent; } - entry = rb_entry(node, struct tree_entry, rb_node); + entry = rb_entry(node, struct extent_map, rb_node); entry->in_tree = 1; rb_link_node(node, parent, p); rb_insert_color(node, root); @@ -210,17 +99,19 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct rb_node * n = root->rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev = NULL; - struct tree_entry *entry; - struct tree_entry *prev_entry = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; while(n) { - entry = rb_entry(n, struct tree_entry, 
rb_node); + entry = rb_entry(n, struct extent_map, rb_node); prev = n; prev_entry = entry; + WARN_ON(!entry->in_tree); + if (offset < entry->start) n = n->rb_left; - else if (offset > entry->end) + else if (offset >= extent_map_end(entry)) n = n->rb_right; else return n; @@ -228,19 +119,19 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, if (prev_ret) { orig_prev = prev; - while(prev && offset > prev_entry->end) { + while(prev && offset >= extent_map_end(prev_entry)) { prev = rb_next(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); + prev_entry = rb_entry(prev, struct extent_map, rb_node); } *prev_ret = prev; prev = orig_prev; } if (next_ret) { - prev_entry = rb_entry(prev, struct tree_entry, rb_node); + prev_entry = rb_entry(prev, struct extent_map, rb_node); while(prev && offset < prev_entry->start) { prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); + prev_entry = rb_entry(prev, struct extent_map, rb_node); } *next_ret = prev; } @@ -257,22 +148,26 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) return ret; } -static int tree_delete(struct rb_root *root, u64 offset) +static int mergable_maps(struct extent_map *prev, struct extent_map *next) { - struct rb_node *node; - struct tree_entry *entry; - - node = __tree_search(root, offset, NULL, NULL); - if (!node) - return -ENOENT; - entry = rb_entry(node, struct tree_entry, rb_node); - entry->in_tree = 0; - rb_erase(node, root); + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->bdev == next->bdev && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + 
} return 0; } /* - * add_extent_mapping tries a simple backward merge with existing + * add_extent_mapping tries a simple forward/backward merge with existing * mappings. The extent_map struct passed in will be inserted into * the tree directly (no copies made, just a reference taken). */ @@ -280,13 +175,12 @@ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { int ret = 0; - struct extent_map *prev = NULL; + struct extent_map *merge = NULL; struct rb_node *rb; - write_lock_irq(&tree->lock); - rb = tree_insert(&tree->map, em->end, &em->rb_node); + rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { - prev = rb_entry(rb, struct extent_map, rb_node); + merge = rb_entry(rb, struct extent_map, rb_node); ret = -EEXIST; goto out; } @@ -294,53 +188,60 @@ int add_extent_mapping(struct extent_map_tree *tree, if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) - prev = rb_entry(rb, struct extent_map, rb_node); - if (prev && prev->end + 1 == em->start && - ((em->block_start == EXTENT_MAP_HOLE && - prev->block_start == EXTENT_MAP_HOLE) || - (em->block_start == EXTENT_MAP_INLINE && - prev->block_start == EXTENT_MAP_INLINE) || - (em->block_start == EXTENT_MAP_DELALLOC && - prev->block_start == EXTENT_MAP_DELALLOC) || - (em->block_start < EXTENT_MAP_DELALLOC - 1 && - em->block_start == prev->block_end + 1))) { - em->start = prev->start; - em->block_start = prev->block_start; - rb_erase(&prev->rb_node, &tree->map); - prev->in_tree = 0; - free_extent_map(prev); + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); } } + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + 
merge->in_tree = 0; + free_extent_map(merge); + } + tree->last = em; out: - write_unlock_irq(&tree->lock); return ret; } EXPORT_SYMBOL(add_extent_mapping); +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + /* * lookup_extent_mapping returns the first extent_map struct in the - * tree that intersects the [start, end] (inclusive) range. There may + * tree that intersects the [start, len] range. There may * be additional objects in the tree that intersect, so check the object * returned carefully to make sure you don't need additional lookups. */ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 end) + u64 start, u64 len) { struct extent_map *em; struct rb_node *rb_node; - struct rb_node *prev = NULL; - struct rb_node *next = NULL; + struct rb_node *prev = NULL; struct rb_node *next = NULL; u64 end = range_end(start, len); em = tree->last; if (em && end > em->start && start < extent_map_end(em)) goto found; - read_lock_irq(&tree->lock); rb_node = __tree_search(&tree->map, start, &prev, &next); if (!rb_node && prev) { em = rb_entry(prev, struct extent_map, rb_node); - if (em->start <= end && em->end >= start) + if (end > em->start && start < extent_map_end(em)) goto found; } if (!rb_node && next) { em = rb_entry(next, struct extent_map, rb_node); - if (em->start <= end && em->end >= start) + if (end > em->start && start < extent_map_end(em)) goto found; } if (!rb_node) { @@ -352,14 +253,16 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, goto out; } em = rb_entry(rb_node, struct extent_map, rb_node); - if (em->end < start || em->start > end) { - em = NULL; - goto out; - } + if (end > em->start && start < extent_map_end(em)) + goto found; + + em = NULL; + goto out; + found: atomic_inc(&em->refs); + tree->last = em; out: - read_unlock_irq(&tree->lock); return em; } EXPORT_SYMBOL(lookup_extent_mapping); @@ -370,2866 +273,12 @@ 
EXPORT_SYMBOL(lookup_extent_mapping); */ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { - int ret; + int ret = 0; - write_lock_irq(&tree->lock); - ret = tree_delete(&tree->map, em->end); - write_unlock_irq(&tree->lock); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + if (tree->last == em) + tree->last = NULL; return ret; } EXPORT_SYMBOL(remove_extent_mapping); - -/* - * utility function to look for merge candidates inside a given range. - * Any extents with matching state are merged together into a single - * extent in the tree. Extents with EXTENT_IO in their state field - * are not merged because the end_io handlers need to be able to do - * operations on them without sleeping (or doing allocations/splits). - * - * This should be called with the tree lock held. - */ -static int merge_state(struct extent_map_tree *tree, - struct extent_state *state) -{ - struct extent_state *other; - struct rb_node *other_node; - - if (state->state & EXTENT_IOBITS) - return 0; - - other_node = rb_prev(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->end == state->start - 1 && - other->state == state->state) { - state->start = other->start; - other->in_tree = 0; - rb_erase(&other->rb_node, &tree->state); - free_extent_state(other); - } - } - other_node = rb_next(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->start == state->end + 1 && - other->state == state->state) { - other->start = state->start; - state->in_tree = 0; - rb_erase(&state->rb_node, &tree->state); - free_extent_state(state); - } - } - return 0; -} - -/* - * insert an extent_state struct into the tree. 'bits' are set on the - * struct before it is inserted. - * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. - * - * The tree lock is not taken internally. 
This is a utility function and - * probably isn't what you want to call (see set/clear_extent_bit). - */ -static int insert_state(struct extent_map_tree *tree, - struct extent_state *state, u64 start, u64 end, - int bits) -{ - struct rb_node *node; - - if (end < start) { - printk("end < start %Lu %Lu\n", end, start); - WARN_ON(1); - } - if (bits & EXTENT_DIRTY) - tree->dirty_bytes += end - start + 1; - state->state |= bits; - state->start = start; - state->end = end; - node = tree_insert(&tree->state, end, &state->rb_node); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end); - free_extent_state(state); - return -EEXIST; - } - merge_state(tree, state); - return 0; -} - -/* - * split a given extent state struct in two, inserting the preallocated - * struct 'prealloc' as the newly created second half. 'split' indicates an - * offset inside 'orig' where it should be split. - * - * Before calling, - * the tree has 'orig' at [orig->start, orig->end]. After calling, there - * are two extent state structs in the tree: - * prealloc: [orig->start, split - 1] - * orig: [ split, orig->end ] - * - * The tree locks are not taken by this function. They need to be held - * by the caller. 
- */ -static int split_state(struct extent_map_tree *tree, struct extent_state *orig, - struct extent_state *prealloc, u64 split) -{ - struct rb_node *node; - prealloc->start = orig->start; - prealloc->end = split - 1; - prealloc->state = orig->state; - orig->start = split; - - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end); - free_extent_state(prealloc); - return -EEXIST; - } - return 0; -} - -/* - * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1), or - * forcibly remove the state from the tree (delete == 1). - * - * If no bits are set on the state struct after clearing things, the - * struct is freed and removed from the tree - */ -static int clear_state_bit(struct extent_map_tree *tree, - struct extent_state *state, int bits, int wake, - int delete) -{ - int ret = state->state & bits; - - if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - WARN_ON(range > tree->dirty_bytes); - tree->dirty_bytes -= range; - } - state->state &= ~bits; - if (wake) - wake_up(&state->wq); - if (delete || state->state == 0) { - if (state->in_tree) { - rb_erase(&state->rb_node, &tree->state); - state->in_tree = 0; - free_extent_state(state); - } else { - WARN_ON(1); - } - } else { - merge_state(tree, state); - } - return ret; -} - -/* - * clear some bits on a range in the tree. This may require splitting - * or inserting elements in the tree, so the gfp mask is used to - * indicate which allocations or sleeping are allowed. - * - * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove - * the given range from the tree regardless of state (ie for truncate). - * - * the range [start, end] is inclusive. 
- * - * This takes the tree lock, and returns < 0 on error, > 0 if any of the - * bits were already set, or zero if none of the bits were already set. - */ -int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, gfp_t mask) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - unsigned long flags; - int err; - int set = 0; - -again: - if (!prealloc && (mask & __GFP_WAIT)) { - prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; - } - - write_lock_irqsave(&tree->lock, flags); - /* - * this search will find the extents that end after - * our range starts - */ - node = tree_search(&tree->state, start); - if (!node) - goto out; - state = rb_entry(node, struct extent_state, rb_node); - if (state->start > end) - goto out; - WARN_ON(state->end < start); - - /* - * | ---- desired range ---- | - * | state | or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip - * bits on second half. - * - * If the extent we found extends past our range, we - * just split and search again. It'll get split again - * the next time though. - * - * If the extent we found is inside our range, we clear - * the desired bit on it. 
- */ - - if (state->start < start) { - err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - start = state->end + 1; - set |= clear_state_bit(tree, state, bits, - wake, delete); - } else { - start = state->start; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and clear the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); - - if (wake) - wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, - wake, delete); - prealloc = NULL; - goto out; - } - - start = state->end + 1; - set |= clear_state_bit(tree, state, bits, wake, delete); - goto search_again; - -out: - write_unlock_irqrestore(&tree->lock, flags); - if (prealloc) - free_extent_state(prealloc); - - return set; - -search_again: - if (start > end) - goto out; - write_unlock_irqrestore(&tree->lock, flags); - if (mask & __GFP_WAIT) - cond_resched(); - goto again; -} -EXPORT_SYMBOL(clear_extent_bit); - -static int wait_on_state(struct extent_map_tree *tree, - struct extent_state *state) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - read_unlock_irq(&tree->lock); - schedule(); - read_lock_irq(&tree->lock); - finish_wait(&state->wq, &wait); - return 0; -} - -/* - * waits for one or more bits to clear on a range in the state tree. - * The range [start, end] is inclusive. 
- * The tree lock is taken by this function - */ -int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits) -{ - struct extent_state *state; - struct rb_node *node; - - read_lock_irq(&tree->lock); -again: - while (1) { - /* - * this search will find all the extents that end after - * our range starts - */ - node = tree_search(&tree->state, start); - if (!node) - break; - - state = rb_entry(node, struct extent_state, rb_node); - - if (state->start > end) - goto out; - - if (state->state & bits) { - start = state->start; - atomic_inc(&state->refs); - wait_on_state(tree, state); - free_extent_state(state); - goto again; - } - start = state->end + 1; - - if (start > end) - break; - - if (need_resched()) { - read_unlock_irq(&tree->lock); - cond_resched(); - read_lock_irq(&tree->lock); - } - } -out: - read_unlock_irq(&tree->lock); - return 0; -} -EXPORT_SYMBOL(wait_extent_bit); - -static void set_state_bits(struct extent_map_tree *tree, - struct extent_state *state, - int bits) -{ - if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - tree->dirty_bytes += range; - } - state->state |= bits; -} - -/* - * set some bits on a range in the tree. This may require allocations - * or sleeping, so the gfp mask is used to indicate what is allowed. - * - * If 'exclusive' == 1, this will fail with -EEXIST if some part of the - * range already has the desired bits set. The start of the existing - * range is returned in failed_start in this case. - * - * [start, end] is inclusive - * This takes the tree lock. 
- */ -int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits, - int exclusive, u64 *failed_start, gfp_t mask) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - unsigned long flags; - int err = 0; - int set; - u64 last_start; - u64 last_end; -again: - if (!prealloc && (mask & __GFP_WAIT)) { - prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; - } - - write_lock_irqsave(&tree->lock, flags); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(&tree->state, start); - if (!node) { - err = insert_state(tree, prealloc, start, end, bits); - prealloc = NULL; - BUG_ON(err == -EEXIST); - goto out; - } - - state = rb_entry(node, struct extent_state, rb_node); - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - set = state->state & bits; - if (set && exclusive) { - *failed_start = state->start; - err = -EEXIST; - goto out; - } - set_state_bits(tree, state, bits); - start = state->end + 1; - merge_state(tree, state); - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. 
- */ - if (state->start < start) { - set = state->state & bits; - if (exclusive && set) { - *failed_start = start; - err = -EEXIST; - goto out; - } - err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, bits); - start = state->end + 1; - merge_state(tree, state); - } else { - start = state->start; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start -1; - err = insert_state(tree, prealloc, start, this_end, - bits); - prealloc = NULL; - BUG_ON(err == -EEXIST); - if (err) - goto out; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - set = state->state & bits; - if (exclusive && set) { - *failed_start = start; - err = -EEXIST; - goto out; - } - err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); - - set_state_bits(tree, prealloc, bits); - merge_state(tree, prealloc); - prealloc = NULL; - goto out; - } - - goto search_again; - -out: - write_unlock_irqrestore(&tree->lock, flags); - if (prealloc) - free_extent_state(prealloc); - - return err; - -search_again: - if (start > end) - goto out; - write_unlock_irqrestore(&tree->lock, flags); - if (mask & __GFP_WAIT) - cond_resched(); - goto again; -} -EXPORT_SYMBOL(set_extent_bit); - -/* wrappers around set/clear extent bit */ -int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_dirty); - -int set_extent_bits(struct 
extent_map_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) -{ - return set_extent_bit(tree, start, end, bits, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_bits); - -int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, bits, 0, 0, mask); -} -EXPORT_SYMBOL(clear_extent_bits); - -int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_delalloc); - -int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); -} -EXPORT_SYMBOL(clear_extent_dirty); - -int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_new); - -int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); -} -EXPORT_SYMBOL(clear_extent_new); - -int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - mask); -} -EXPORT_SYMBOL(set_extent_uptodate); - -int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); -} -EXPORT_SYMBOL(clear_extent_uptodate); - -int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, - 0, NULL, mask); -} -EXPORT_SYMBOL(set_extent_writeback); - -int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, 
EXTENT_WRITEBACK, 1, 0, mask); -} -EXPORT_SYMBOL(clear_extent_writeback); - -int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end) -{ - return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); -} -EXPORT_SYMBOL(wait_on_extent_writeback); - -/* - * locks a range in ascending order, waiting for any locked regions - * it hits on the way. [start,end] are inclusive, and this will sleep. - */ -int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask) -{ - int err; - u64 failed_start; - while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, - &failed_start, mask); - if (err == -EEXIST && (mask & __GFP_WAIT)) { - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); - start = failed_start; - } else { - break; - } - WARN_ON(start > end); - } - return err; -} -EXPORT_SYMBOL(lock_extent); - -int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); -} -EXPORT_SYMBOL(unlock_extent); - -/* - * helper function to set pages and extents in the tree dirty - */ -int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); - __set_page_dirty_nobuffers(page); - page_cache_release(page); - index++; - } - set_extent_dirty(tree, start, end, GFP_NOFS); - return 0; -} -EXPORT_SYMBOL(set_range_dirty); - -/* - * helper function to set both pages and extents in the tree writeback - */ -int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); - 
set_page_writeback(page); - page_cache_release(page); - index++; - } - set_extent_writeback(tree, start, end, GFP_NOFS); - return 0; -} -EXPORT_SYMBOL(set_range_writeback); - -int find_first_extent_bit(struct extent_map_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 1; - - read_lock_irq(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(&tree->state, start); - if (!node || IS_ERR(node)) { - goto out; - } - - while(1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) { - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - break; - } - node = rb_next(node); - if (!node) - break; - } -out: - read_unlock_irq(&tree->lock); - return ret; -} -EXPORT_SYMBOL(find_first_extent_bit); - -u64 find_lock_delalloc_range(struct extent_map_tree *tree, - u64 *start, u64 *end, u64 max_bytes) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 found = 0; - u64 total_bytes = 0; - - write_lock_irq(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. 
- */ -search_again: - node = tree_search(&tree->state, cur_start); - if (!node || IS_ERR(node)) { - *end = (u64)-1; - goto out; - } - - while(1) { - state = rb_entry(node, struct extent_state, rb_node); - if (found && state->start != cur_start) { - goto out; - } - if (!(state->state & EXTENT_DELALLOC)) { - if (!found) - *end = state->end; - goto out; - } - if (!found) { - struct extent_state *prev_state; - struct rb_node *prev_node = node; - while(1) { - prev_node = rb_prev(prev_node); - if (!prev_node) - break; - prev_state = rb_entry(prev_node, - struct extent_state, - rb_node); - if (!(prev_state->state & EXTENT_DELALLOC)) - break; - state = prev_state; - node = prev_node; - } - } - if (state->state & EXTENT_LOCKED) { - DEFINE_WAIT(wait); - atomic_inc(&state->refs); - prepare_to_wait(&state->wq, &wait, - TASK_UNINTERRUPTIBLE); - write_unlock_irq(&tree->lock); - schedule(); - write_lock_irq(&tree->lock); - finish_wait(&state->wq, &wait); - free_extent_state(state); - goto search_again; - } - state->state |= EXTENT_LOCKED; - if (!found) - *start = state->start; - found++; - *end = state->end; - cur_start = state->end + 1; - node = rb_next(node); - if (!node) - break; - total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) - break; - } -out: - write_unlock_irq(&tree->lock); - return found; -} - -u64 count_range_bits(struct extent_map_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 total_bytes = 0; - int found = 0; - - if (search_end <= cur_start) { - printk("search_end %Lu start %Lu\n", search_end, cur_start); - WARN_ON(1); - return 0; - } - - write_lock_irq(&tree->lock); - if (cur_start == 0 && bits == EXTENT_DIRTY) { - total_bytes = tree->dirty_bytes; - goto out; - } - /* - * this search will find all the extents that end after - * our range starts. 
- */ - node = tree_search(&tree->state, cur_start); - if (!node || IS_ERR(node)) { - goto out; - } - - while(1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->start > search_end) - break; - if (state->end >= cur_start && (state->state & bits)) { - total_bytes += min(search_end, state->end) + 1 - - max(cur_start, state->start); - if (total_bytes >= max_bytes) - break; - if (!found) { - *start = state->start; - found = 1; - } - } - node = rb_next(node); - if (!node) - break; - } -out: - write_unlock_irq(&tree->lock); - return total_bytes; -} -/* - * helper function to lock both pages and extents in the tree. - * pages must be locked first. - */ -int lock_range(struct extent_map_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - int err; - - while (index <= end_index) { - page = grab_cache_page(tree->mapping, index); - if (!page) { - err = -ENOMEM; - goto failed; - } - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto failed; - } - index++; - } - lock_extent(tree, start, end, GFP_NOFS); - return 0; - -failed: - /* - * we failed above in getting the page at 'index', so we undo here - * up to but not including the page at 'index' - */ - end_index = index; - index = start >> PAGE_CACHE_SHIFT; - while (index < end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - return err; -} -EXPORT_SYMBOL(lock_range); - -/* - * helper function to unlock both pages and extents in the tree. 
- */ -int unlock_range(struct extent_map_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - unlock_extent(tree, start, end, GFP_NOFS); - return 0; -} -EXPORT_SYMBOL(unlock_range); - -int set_state_private(struct extent_map_tree *tree, u64 start, u64 private) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 0; - - write_lock_irq(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(&tree->state, start); - if (!node || IS_ERR(node)) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - state->private = private; -out: - write_unlock_irq(&tree->lock); - return ret; -} - -int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 0; - - read_lock_irq(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(&tree->state, start); - if (!node || IS_ERR(node)) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - *private = state->private; -out: - read_unlock_irq(&tree->lock); - return ret; -} - -/* - * searches a range in the state tree for a given mask. - * If 'filled' == 1, this returns 1 only if ever extent in the tree - * has the bits set. Otherwise, 1 is returned if any bit in the - * range is found set. 
- */ -int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, - int bits, int filled) -{ - struct extent_state *state = NULL; - struct rb_node *node; - int bitset = 0; - - read_lock_irq(&tree->lock); - node = tree_search(&tree->state, start); - while (node && start <= end) { - state = rb_entry(node, struct extent_state, rb_node); - - if (filled && state->start > start) { - bitset = 0; - break; - } - - if (state->start > end) - break; - - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; - break; - } - start = state->end + 1; - if (start > end) - break; - node = rb_next(node); - if (!node) { - if (filled) - bitset = 0; - break; - } - } - read_unlock_irq(&tree->lock); - return bitset; -} -EXPORT_SYMBOL(test_range_bit); - -/* - * helper function to set a given page up to date if all the - * extents in the tree for that page are up to date - */ -static int check_page_uptodate(struct extent_map_tree *tree, - struct page *page) -{ - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) - SetPageUptodate(page); - return 0; -} - -/* - * helper function to unlock a page if all the extents in the tree - * for that page are unlocked - */ -static int check_page_locked(struct extent_map_tree *tree, - struct page *page) -{ - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) - unlock_page(page); - return 0; -} - -/* - * helper function to end page writeback if all the extents - * in the tree for that page are done with writeback - */ -static int check_page_writeback(struct extent_map_tree *tree, - struct page *page) -{ - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) - end_page_writeback(page); - return 0; -} - -/* 
lots and lots of room for performance fixes in the end_bio funcs */ - -/* - * after a writepage IO is done, we need to: - * clear the uptodate bits on error - * clear the writeback bits in the extent tree for this IO - * end_page_writeback if the page has no more pending IO - * - * Scheduling is not allowed, so the extent state tree is expected - * to have one and only one object corresponding to this IO. - */ -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) -static void end_bio_extent_writepage(struct bio *bio, int err) -#else -static int end_bio_extent_writepage(struct bio *bio, - unsigned int bytes_done, int err) -#endif -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_map_tree *tree = bio->bi_private; - u64 start; - u64 end; - int whole_page; - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - if (bio->bi_size) - return 1; -#endif - - do { - struct page *page = bvec->bv_page; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (!uptodate) { - clear_extent_uptodate(tree, start, end, GFP_ATOMIC); - ClearPageUptodate(page); - SetPageError(page); - } - clear_extent_writeback(tree, start, end, GFP_ATOMIC); - - if (whole_page) - end_page_writeback(page); - else - check_page_writeback(tree, page); - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, end); - } while (bvec >= bio->bi_io_vec); - - bio_put(bio); -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - return 0; -#endif -} - -/* - * after a readpage IO is done, we need to: - * clear the uptodate bits on error - * set the uptodate bits if things worked - * set the page up to date if all extents in the tree are uptodate - * clear the lock 
bit in the extent tree - * unlock the page if there are no other extents locked for it - * - * Scheduling is not allowed, so the extent state tree is expected - * to have one and only one object corresponding to this IO. - */ -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) -static void end_bio_extent_readpage(struct bio *bio, int err) -#else -static int end_bio_extent_readpage(struct bio *bio, - unsigned int bytes_done, int err) -#endif -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_map_tree *tree = bio->bi_private; - u64 start; - u64 end; - int whole_page; - int ret; - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - if (bio->bi_size) - return 1; -#endif - - do { - struct page *page = bvec->bv_page; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { - ret = tree->ops->readpage_end_io_hook(page, start, end); - if (ret) - uptodate = 0; - } - if (uptodate) { - set_extent_uptodate(tree, start, end, GFP_ATOMIC); - if (whole_page) - SetPageUptodate(page); - else - check_page_uptodate(tree, page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - - unlock_extent(tree, start, end, GFP_ATOMIC); - - if (whole_page) - unlock_page(page); - else - check_page_locked(tree, page); - } while (bvec >= bio->bi_io_vec); - - bio_put(bio); -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - return 0; -#endif -} - -/* - * IO done from prepare_write is pretty simple, we just unlock - * the structs in the extent tree when done, and set the uptodate bits - * as appropriate. 
- */ -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) -static void end_bio_extent_preparewrite(struct bio *bio, int err) -#else -static int end_bio_extent_preparewrite(struct bio *bio, - unsigned int bytes_done, int err) -#endif -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_map_tree *tree = bio->bi_private; - u64 start; - u64 end; - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - if (bio->bi_size) - return 1; -#endif - - do { - struct page *page = bvec->bv_page; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - set_extent_uptodate(tree, start, end, GFP_ATOMIC); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - - unlock_extent(tree, start, end, GFP_ATOMIC); - - } while (bvec >= bio->bi_io_vec); - - bio_put(bio); -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - return 0; -#endif -} - -static struct bio * -extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) -{ - struct bio *bio; - - bio = bio_alloc(gfp_flags, nr_vecs); - - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - - if (bio) { - bio->bi_bdev = bdev; - bio->bi_sector = first_sector; - } - return bio; -} - -static int submit_one_bio(int rw, struct bio *bio) -{ - u64 maxsector; - int ret = 0; - - bio_get(bio); - - maxsector = bio->bi_bdev->bd_inode->i_size >> 9; - if (maxsector < bio->bi_sector) { - printk("sector too large max %Lu got %llu\n", maxsector, - (unsigned long long)bio->bi_sector); - WARN_ON(1); - } - - submit_bio(rw, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - bio_put(bio); - return ret; -} - -static int submit_extent_page(int rw, struct extent_map_tree *tree, - struct page *page, sector_t 
sector, - size_t size, unsigned long offset, - struct block_device *bdev, - struct bio **bio_ret, - unsigned long max_pages, - bio_end_io_t end_io_func) -{ - int ret = 0; - struct bio *bio; - int nr; - - if (bio_ret && *bio_ret) { - bio = *bio_ret; - if (bio->bi_sector + (bio->bi_size >> 9) != sector || - bio_add_page(bio, page, size, offset) < size) { - ret = submit_one_bio(rw, bio); - bio = NULL; - } else { - return 0; - } - } - nr = min_t(int, max_pages, bio_get_nr_vecs(bdev)); - bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); - if (!bio) { - printk("failed to allocate bio nr %d\n", nr); - } - bio_add_page(bio, page, size, offset); - bio->bi_end_io = end_io_func; - bio->bi_private = tree; - if (bio_ret) { - *bio_ret = bio; - } else { - ret = submit_one_bio(rw, bio); - } - - return ret; -} - -void set_page_extent_mapped(struct page *page) -{ - if (!PagePrivate(page)) { - SetPagePrivate(page); - WARN_ON(!page->mapping->a_ops->invalidatepage); - set_page_private(page, EXTENT_PAGE_PRIVATE); - page_cache_get(page); - } -} - -void set_page_extent_head(struct page *page, unsigned long len) -{ - set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); -} - -/* - * basic readpage implementation. 
Locked extent state structs are inserted - * into the tree that are removed when the IO is done (by the end_io - * handlers) - */ -static int __extent_read_full_page(struct extent_map_tree *tree, - struct page *page, - get_extent_t *get_extent, - struct bio **bio) -{ - struct inode *inode = page->mapping->host; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 page_end = start + PAGE_CACHE_SIZE - 1; - u64 end; - u64 cur = start; - u64 extent_offset; - u64 last_byte = i_size_read(inode); - u64 block_start; - u64 cur_end; - sector_t sector; - struct extent_map *em; - struct block_device *bdev; - int ret; - int nr = 0; - size_t page_offset = 0; - size_t iosize; - size_t blocksize = inode->i_sb->s_blocksize; - - set_page_extent_mapped(page); - - end = page_end; - lock_extent(tree, start, end, GFP_NOFS); - - while (cur <= end) { - if (cur >= last_byte) { - char *userpage; - iosize = PAGE_CACHE_SIZE - page_offset; - userpage = kmap_atomic(page, KM_USER0); - memset(userpage + page_offset, 0, iosize); - flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); - set_extent_uptodate(tree, cur, cur + iosize - 1, - GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); - break; - } - em = get_extent(inode, page, page_offset, cur, end, 0); - if (IS_ERR(em) || !em) { - SetPageError(page); - unlock_extent(tree, cur, end, GFP_NOFS); - break; - } - - extent_offset = cur - em->start; - BUG_ON(em->end < cur); - BUG_ON(end < cur); - - iosize = min(em->end - cur, end - cur) + 1; - cur_end = min(em->end, end); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); - sector = (em->block_start + extent_offset) >> 9; - bdev = em->bdev; - block_start = em->block_start; - free_extent_map(em); - em = NULL; - - /* we've found a hole, just zero and go on */ - if (block_start == EXTENT_MAP_HOLE) { - char *userpage; - userpage = kmap_atomic(page, KM_USER0); - memset(userpage + page_offset, 0, iosize); - flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); - 
- set_extent_uptodate(tree, cur, cur + iosize - 1, - GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); - cur = cur + iosize; - page_offset += iosize; - continue; - } - /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); - cur = cur + iosize; - page_offset += iosize; - continue; - } - - ret = 0; - if (tree->ops && tree->ops->readpage_io_hook) { - ret = tree->ops->readpage_io_hook(page, cur, - cur + iosize - 1); - } - if (!ret) { - unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1; - nr -= page->index; - ret = submit_extent_page(READ, tree, page, - sector, iosize, page_offset, - bdev, bio, nr, - end_bio_extent_readpage); - } - if (ret) - SetPageError(page); - cur = cur + iosize; - page_offset += iosize; - nr++; - } - if (!nr) { - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); - } - return 0; -} - -int extent_read_full_page(struct extent_map_tree *tree, struct page *page, - get_extent_t *get_extent) -{ - struct bio *bio = NULL; - int ret; - - ret = __extent_read_full_page(tree, page, get_extent, &bio); - if (bio) - submit_one_bio(READ, bio); - return ret; -} -EXPORT_SYMBOL(extent_read_full_page); - -/* - * the writepage semantics are similar to regular writepage. extent - * records are inserted to lock ranges in the tree, and as dirty areas - * are found, they are marked writeback. 
Then the lock bits are removed - * and the end_io handler clears the writeback ranges - */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; - struct extent_map_tree *tree = epd->tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 delalloc_start; - u64 page_end = start + PAGE_CACHE_SIZE - 1; - u64 end; - u64 cur = start; - u64 extent_offset; - u64 last_byte = i_size_read(inode); - u64 block_start; - u64 iosize; - sector_t sector; - struct extent_map *em; - struct block_device *bdev; - int ret; - int nr = 0; - size_t page_offset = 0; - size_t blocksize; - loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; - u64 nr_delalloc; - u64 delalloc_end; - - WARN_ON(!PageLocked(page)); - if (page->index > end_index) { - clear_extent_dirty(tree, start, page_end, GFP_NOFS); - unlock_page(page); - return 0; - } - - if (page->index == end_index) { - char *userpage; - - size_t offset = i_size & (PAGE_CACHE_SIZE - 1); - - userpage = kmap_atomic(page, KM_USER0); - memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset); - flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); - } - - set_page_extent_mapped(page); - - delalloc_start = start; - delalloc_end = 0; - while(delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, - &delalloc_end, - 128 * 1024 * 1024); - if (nr_delalloc == 0) { - delalloc_start = delalloc_end + 1; - continue; - } - tree->ops->fill_delalloc(inode, delalloc_start, - delalloc_end); - clear_extent_bit(tree, delalloc_start, - delalloc_end, - EXTENT_LOCKED | EXTENT_DELALLOC, - 1, 0, GFP_NOFS); - delalloc_start = delalloc_end + 1; - } - lock_extent(tree, start, page_end, GFP_NOFS); - - end = page_end; - if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { - printk("found delalloc bits after lock_extent\n"); - } - - if (last_byte <= 
start) { - clear_extent_dirty(tree, start, page_end, GFP_NOFS); - goto done; - } - - set_extent_uptodate(tree, start, page_end, GFP_NOFS); - blocksize = inode->i_sb->s_blocksize; - - while (cur <= end) { - if (cur >= last_byte) { - clear_extent_dirty(tree, cur, page_end, GFP_NOFS); - break; - } - em = epd->get_extent(inode, page, page_offset, cur, end, 1); - if (IS_ERR(em) || !em) { - SetPageError(page); - break; - } - - extent_offset = cur - em->start; - BUG_ON(em->end < cur); - BUG_ON(end < cur); - iosize = min(em->end - cur, end - cur) + 1; - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); - sector = (em->block_start + extent_offset) >> 9; - bdev = em->bdev; - block_start = em->block_start; - free_extent_map(em); - em = NULL; - - if (block_start == EXTENT_MAP_HOLE || - block_start == EXTENT_MAP_INLINE) { - clear_extent_dirty(tree, cur, - cur + iosize - 1, GFP_NOFS); - cur = cur + iosize; - page_offset += iosize; - continue; - } - - /* leave this out until we have a page_mkwrite call */ - if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0)) { - cur = cur + iosize; - page_offset += iosize; - continue; - } - clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); - if (tree->ops && tree->ops->writepage_io_hook) { - ret = tree->ops->writepage_io_hook(page, cur, - cur + iosize - 1); - } else { - ret = 0; - } - if (ret) - SetPageError(page); - else { - unsigned long max_nr = end_index + 1; - set_range_writeback(tree, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - printk("warning page %lu not writeback, " - "cur %llu end %llu\n", page->index, - (unsigned long long)cur, - (unsigned long long)end); - } - - ret = submit_extent_page(WRITE, tree, page, sector, - iosize, page_offset, bdev, - &epd->bio, max_nr, - end_bio_extent_writepage); - if (ret) - SetPageError(page); - } - cur = cur + iosize; - page_offset += iosize; - nr++; - } -done: - if (nr == 0) { - /* make sure the mapping tag for page dirty gets cleared */ - 
set_page_writeback(page); - end_page_writeback(page); - } - unlock_extent(tree, start, page_end, GFP_NOFS); - unlock_page(page); - return 0; -} - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - -/* Taken directly from 2.6.23 for 2.6.18 back port */ -typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, - void *data); - -/** - * write_cache_pages - walk the list of dirty pages of the given address space - * and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * If a page is already under I/O, write_cache_pages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. 
- */ -static int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) -{ - struct backing_dev_info *bdi = mapping->backing_dev_info; - int ret = 0; - int done = 0; - struct pagevec pvec; - int nr_pages; - pgoff_t index; - pgoff_t end; /* Inclusive */ - int scanned = 0; - int range_whole = 0; - - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - - pagevec_init(&pvec, 0); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - scanned = 1; - } -retry: - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - unsigned i; - - scanned = 1; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping - */ - lock_page(page); - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - continue; - } - - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; - } - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; - } - - ret = (*writepage)(page, wbc, data); - - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; - } - if (ret || (--(wbc->nr_to_write) <= 0)) - done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - } - } - 
pagevec_release(&pvec); - cond_resched(); - } - if (!scanned && !done) { - /* - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - scanned = 1; - index = 0; - goto retry; - } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; - return ret; -} -#endif - -int extent_write_full_page(struct extent_map_tree *tree, struct page *page, - get_extent_t *get_extent, - struct writeback_control *wbc) -{ - int ret; - struct address_space *mapping = page->mapping; - struct extent_page_data epd = { - .bio = NULL, - .tree = tree, - .get_extent = get_extent, - }; - struct writeback_control wbc_writepages = { - .bdi = wbc->bdi, - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = 64, - .range_start = page_offset(page) + PAGE_CACHE_SIZE, - .range_end = (loff_t)-1, - }; - - - ret = __extent_writepage(page, wbc, &epd); - - write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd); - if (epd.bio) { - submit_one_bio(WRITE, epd.bio); - } - return ret; -} -EXPORT_SYMBOL(extent_write_full_page); - - -int extent_writepages(struct extent_map_tree *tree, - struct address_space *mapping, - get_extent_t *get_extent, - struct writeback_control *wbc) -{ - int ret = 0; - struct extent_page_data epd = { - .bio = NULL, - .tree = tree, - .get_extent = get_extent, - }; - - ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd); - if (epd.bio) { - submit_one_bio(WRITE, epd.bio); - } - return ret; -} -EXPORT_SYMBOL(extent_writepages); - -int extent_readpages(struct extent_map_tree *tree, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent) -{ - struct bio *bio = NULL; - unsigned page_idx; - struct pagevec pvec; - - pagevec_init(&pvec, 0); - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); - - prefetchw(&page->flags); - list_del(&page->lru); - 
/* - * what we want to do here is call add_to_page_cache_lru, - * but that isn't exported, so we reproduce it here - */ - if (!add_to_page_cache(page, mapping, - page->index, GFP_KERNEL)) { - - /* open coding of lru_cache_add, also not exported */ - page_cache_get(page); - if (!pagevec_add(&pvec, page)) - __pagevec_lru_add(&pvec); - __extent_read_full_page(tree, page, get_extent, &bio); - } - page_cache_release(page); - } - if (pagevec_count(&pvec)) - __pagevec_lru_add(&pvec); - BUG_ON(!list_empty(pages)); - if (bio) - submit_one_bio(READ, bio); - return 0; -} -EXPORT_SYMBOL(extent_readpages); - -/* - * basic invalidatepage code, this waits on any locked or writeback - * ranges corresponding to the page, and then deletes any extent state - * records from the tree - */ -int extent_invalidatepage(struct extent_map_tree *tree, - struct page *page, unsigned long offset) -{ - u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); - u64 end = start + PAGE_CACHE_SIZE - 1; - size_t blocksize = page->mapping->host->i_sb->s_blocksize; - - start += (offset + blocksize -1) & ~(blocksize - 1); - if (start > end) - return 0; - - lock_extent(tree, start, end, GFP_NOFS); - wait_on_extent_writeback(tree, start, end); - clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, - 1, 1, GFP_NOFS); - return 0; -} -EXPORT_SYMBOL(extent_invalidatepage); - -/* - * simple commit_write call, set_range_dirty is used to mark both - * the pages and the extent records as dirty - */ -int extent_commit_write(struct extent_map_tree *tree, - struct inode *inode, struct page *page, - unsigned from, unsigned to) -{ - loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - - set_page_extent_mapped(page); - set_page_dirty(page); - - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - return 0; -} -EXPORT_SYMBOL(extent_commit_write); - -int extent_prepare_write(struct extent_map_tree *tree, - struct inode *inode, struct page *page, - 
unsigned from, unsigned to, get_extent_t *get_extent) -{ - u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 page_end = page_start + PAGE_CACHE_SIZE - 1; - u64 block_start; - u64 orig_block_start; - u64 block_end; - u64 cur_end; - struct extent_map *em; - unsigned blocksize = 1 << inode->i_blkbits; - size_t page_offset = 0; - size_t block_off_start; - size_t block_off_end; - int err = 0; - int iocount = 0; - int ret = 0; - int isnew; - - set_page_extent_mapped(page); - - block_start = (page_start + from) & ~((u64)blocksize - 1); - block_end = (page_start + to - 1) | (blocksize - 1); - orig_block_start = block_start; - - lock_extent(tree, page_start, page_end, GFP_NOFS); - while(block_start <= block_end) { - em = get_extent(inode, page, page_offset, block_start, - block_end, 1); - if (IS_ERR(em) || !em) { - goto err; - } - cur_end = min(block_end, em->end); - block_off_start = block_start & (PAGE_CACHE_SIZE - 1); - block_off_end = block_off_start + blocksize; - isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); - - if (!PageUptodate(page) && isnew && - (block_off_end > to || block_off_start < from)) { - void *kaddr; - - kaddr = kmap_atomic(page, KM_USER0); - if (block_off_end > to) - memset(kaddr + to, 0, block_off_end - to); - if (block_off_start < from) - memset(kaddr + block_off_start, 0, - from - block_off_start); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - } - if ((em->block_start != EXTENT_MAP_HOLE && - em->block_start != EXTENT_MAP_INLINE) && - !isnew && !PageUptodate(page) && - (block_off_end > to || block_off_start < from) && - !test_range_bit(tree, block_start, cur_end, - EXTENT_UPTODATE, 1)) { - u64 sector; - u64 extent_offset = block_start - em->start; - size_t iosize; - sector = (em->block_start + extent_offset) >> 9; - iosize = (cur_end - block_start + blocksize) & - ~((u64)blocksize - 1); - /* - * we've already got the extent locked, but we - * need to split the state such that our end_bio - * handler can 
clear the lock. - */ - set_extent_bit(tree, block_start, - block_start + iosize - 1, - EXTENT_LOCKED, 0, NULL, GFP_NOFS); - ret = submit_extent_page(READ, tree, page, - sector, iosize, page_offset, em->bdev, - NULL, 1, - end_bio_extent_preparewrite); - iocount++; - block_start = block_start + iosize; - } else { - set_extent_uptodate(tree, block_start, cur_end, - GFP_NOFS); - unlock_extent(tree, block_start, cur_end, GFP_NOFS); - block_start = cur_end + 1; - } - page_offset = block_start & (PAGE_CACHE_SIZE - 1); - free_extent_map(em); - } - if (iocount) { - wait_extent_bit(tree, orig_block_start, - block_end, EXTENT_LOCKED); - } - check_page_uptodate(tree, page); -err: - /* FIXME, zero out newly allocated blocks on error */ - return err; -} -EXPORT_SYMBOL(extent_prepare_write); - -/* - * a helper for releasepage. As long as there are no locked extents - * in the range corresponding to the page, both state records and extent - * map records are removed - */ -int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page) -{ - struct extent_map *em; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - u64 orig_start = start; - int ret = 1; - - while (start <= end) { - em = lookup_extent_mapping(tree, start, end); - if (!em || IS_ERR(em)) - break; - if (!test_range_bit(tree, em->start, em->end, - EXTENT_LOCKED, 0)) { - remove_extent_mapping(tree, em); - /* once for the rb tree */ - free_extent_map(em); - } - start = em->end + 1; - /* once for us */ - free_extent_map(em); - } - if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0)) - ret = 0; - else - clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE, - 1, 1, GFP_NOFS); - return ret; -} -EXPORT_SYMBOL(try_release_extent_mapping); - -sector_t extent_bmap(struct address_space *mapping, sector_t iblock, - get_extent_t *get_extent) -{ - struct inode *inode = mapping->host; - u64 start = iblock << inode->i_blkbits; - u64 end = start + (1 << 
inode->i_blkbits) - 1; - sector_t sector = 0; - struct extent_map *em; - - em = get_extent(inode, NULL, 0, start, end, 0); - if (!em || IS_ERR(em)) - return 0; - - if (em->block_start == EXTENT_MAP_INLINE || - em->block_start == EXTENT_MAP_HOLE) - goto out; - - sector = (em->block_start + start - em->start) >> inode->i_blkbits; -out: - free_extent_map(em); - return sector; -} - -static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb) -{ - if (list_empty(&eb->lru)) { - extent_buffer_get(eb); - list_add(&eb->lru, &tree->buffer_lru); - tree->lru_size++; - if (tree->lru_size >= BUFFER_LRU_MAX) { - struct extent_buffer *rm; - rm = list_entry(tree->buffer_lru.prev, - struct extent_buffer, lru); - tree->lru_size--; - list_del_init(&rm->lru); - free_extent_buffer(rm); - } - } else - list_move(&eb->lru, &tree->buffer_lru); - return 0; -} -static struct extent_buffer *find_lru(struct extent_map_tree *tree, - u64 start, unsigned long len) -{ - struct list_head *lru = &tree->buffer_lru; - struct list_head *cur = lru->next; - struct extent_buffer *eb; - - if (list_empty(lru)) - return NULL; - - do { - eb = list_entry(cur, struct extent_buffer, lru); - if (eb->start == start && eb->len == len) { - extent_buffer_get(eb); - return eb; - } - cur = cur->next; - } while (cur != lru); - return NULL; -} - -static inline unsigned long num_extent_pages(u64 start, u64 len) -{ - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); -} - -static inline struct page *extent_buffer_page(struct extent_buffer *eb, - unsigned long i) -{ - struct page *p; - struct address_space *mapping; - - if (i == 0) - return eb->first_page; - i += eb->start >> PAGE_CACHE_SHIFT; - mapping = eb->first_page->mapping; - read_lock_irq(&mapping->tree_lock); - p = radix_tree_lookup(&mapping->page_tree, i); - read_unlock_irq(&mapping->tree_lock); - return p; -} - -static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree, - u64 start, 
- unsigned long len, - gfp_t mask) -{ - struct extent_buffer *eb = NULL; - - spin_lock(&tree->lru_lock); - eb = find_lru(tree, start, len); - spin_unlock(&tree->lru_lock); - if (eb) { - return eb; - } - - eb = kmem_cache_zalloc(extent_buffer_cache, mask); - INIT_LIST_HEAD(&eb->lru); - eb->start = start; - eb->len = len; - atomic_set(&eb->refs, 1); - - return eb; -} - -static void __free_extent_buffer(struct extent_buffer *eb) -{ - kmem_cache_free(extent_buffer_cache, eb); -} - -struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, - u64 start, unsigned long len, - struct page *page0, - gfp_t mask) -{ - unsigned long num_pages = num_extent_pages(start, len); - unsigned long i; - unsigned long index = start >> PAGE_CACHE_SHIFT; - struct extent_buffer *eb; - struct page *p; - struct address_space *mapping = tree->mapping; - int uptodate = 1; - - eb = __alloc_extent_buffer(tree, start, len, mask); - if (!eb || IS_ERR(eb)) - return NULL; - - if (eb->flags & EXTENT_BUFFER_FILLED) - goto lru_add; - - if (page0) { - eb->first_page = page0; - i = 1; - index++; - page_cache_get(page0); - mark_page_accessed(page0); - set_page_extent_mapped(page0); - WARN_ON(!PageUptodate(page0)); - set_page_extent_head(page0, len); - } else { - i = 0; - } - for (; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); - if (!p) { - WARN_ON(1); - goto fail; - } - set_page_extent_mapped(p); - mark_page_accessed(p); - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); - } - if (!PageUptodate(p)) - uptodate = 0; - unlock_page(p); - } - if (uptodate) - eb->flags |= EXTENT_UPTODATE; - eb->flags |= EXTENT_BUFFER_FILLED; - -lru_add: - spin_lock(&tree->lru_lock); - add_lru(tree, eb); - spin_unlock(&tree->lru_lock); - return eb; - -fail: - spin_lock(&tree->lru_lock); - list_del_init(&eb->lru); - spin_unlock(&tree->lru_lock); - if (!atomic_dec_and_test(&eb->refs)) - return 
NULL; - for (index = 1; index < i; index++) { - page_cache_release(extent_buffer_page(eb, index)); - } - if (i > 0) - page_cache_release(extent_buffer_page(eb, 0)); - __free_extent_buffer(eb); - return NULL; -} -EXPORT_SYMBOL(alloc_extent_buffer); - -struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, - u64 start, unsigned long len, - gfp_t mask) -{ - unsigned long num_pages = num_extent_pages(start, len); - unsigned long i; - unsigned long index = start >> PAGE_CACHE_SHIFT; - struct extent_buffer *eb; - struct page *p; - struct address_space *mapping = tree->mapping; - int uptodate = 1; - - eb = __alloc_extent_buffer(tree, start, len, mask); - if (!eb || IS_ERR(eb)) - return NULL; - - if (eb->flags & EXTENT_BUFFER_FILLED) - goto lru_add; - - for (i = 0; i < num_pages; i++, index++) { - p = find_lock_page(mapping, index); - if (!p) { - goto fail; - } - set_page_extent_mapped(p); - mark_page_accessed(p); - - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); - } - - if (!PageUptodate(p)) - uptodate = 0; - unlock_page(p); - } - if (uptodate) - eb->flags |= EXTENT_UPTODATE; - eb->flags |= EXTENT_BUFFER_FILLED; - -lru_add: - spin_lock(&tree->lru_lock); - add_lru(tree, eb); - spin_unlock(&tree->lru_lock); - return eb; -fail: - spin_lock(&tree->lru_lock); - list_del_init(&eb->lru); - spin_unlock(&tree->lru_lock); - if (!atomic_dec_and_test(&eb->refs)) - return NULL; - for (index = 1; index < i; index++) { - page_cache_release(extent_buffer_page(eb, index)); - } - if (i > 0) - page_cache_release(extent_buffer_page(eb, 0)); - __free_extent_buffer(eb); - return NULL; -} -EXPORT_SYMBOL(find_extent_buffer); - -void free_extent_buffer(struct extent_buffer *eb) -{ - unsigned long i; - unsigned long num_pages; - - if (!eb) - return; - - if (!atomic_dec_and_test(&eb->refs)) - return; - - WARN_ON(!list_empty(&eb->lru)); - num_pages = num_extent_pages(eb->start, eb->len); - - for (i = 1; i < 
num_pages; i++) { - page_cache_release(extent_buffer_page(eb, i)); - } - page_cache_release(extent_buffer_page(eb, 0)); - __free_extent_buffer(eb); -} -EXPORT_SYMBOL(free_extent_buffer); - -int clear_extent_buffer_dirty(struct extent_map_tree *tree, - struct extent_buffer *eb) -{ - int set; - unsigned long i; - unsigned long num_pages; - struct page *page; - - u64 start = eb->start; - u64 end = start + eb->len - 1; - - set = clear_extent_dirty(tree, start, end, GFP_NOFS); - num_pages = num_extent_pages(eb->start, eb->len); - - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - lock_page(page); - if (i == 0) - set_page_extent_head(page, eb->len); - else - set_page_private(page, EXTENT_PAGE_PRIVATE); - - /* - * if we're on the last page or the first page and the - * block isn't aligned on a page boundary, do extra checks - * to make sure we don't clean page that is partially dirty - */ - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - start = (u64)page->index << PAGE_CACHE_SHIFT; - end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, - EXTENT_DIRTY, 0)) { - unlock_page(page); - continue; - } - } - clear_page_dirty_for_io(page); - write_lock_irq(&page->mapping->tree_lock); - if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - } - write_unlock_irq(&page->mapping->tree_lock); - unlock_page(page); - } - return 0; -} -EXPORT_SYMBOL(clear_extent_buffer_dirty); - -int wait_on_extent_buffer_writeback(struct extent_map_tree *tree, - struct extent_buffer *eb) -{ - return wait_on_extent_writeback(tree, eb->start, - eb->start + eb->len - 1); -} -EXPORT_SYMBOL(wait_on_extent_buffer_writeback); - -int set_extent_buffer_dirty(struct extent_map_tree *tree, - struct extent_buffer *eb) -{ - unsigned long i; - unsigned long num_pages; - - num_pages = num_extent_pages(eb->start, eb->len); - 
for (i = 0; i < num_pages; i++) { - struct page *page = extent_buffer_page(eb, i); - /* writepage may need to do something special for the - * first page, we have to make sure page->private is - * properly set. releasepage may drop page->private - * on us if the page isn't already dirty. - */ - if (i == 0) { - lock_page(page); - set_page_extent_head(page, eb->len); - } else if (PagePrivate(page) && - page->private != EXTENT_PAGE_PRIVATE) { - lock_page(page); - set_page_extent_mapped(page); - unlock_page(page); - } - __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); - if (i == 0) - unlock_page(page); - } - return set_extent_dirty(tree, eb->start, - eb->start + eb->len - 1, GFP_NOFS); -} -EXPORT_SYMBOL(set_extent_buffer_dirty); - -int set_extent_buffer_uptodate(struct extent_map_tree *tree, - struct extent_buffer *eb) -{ - unsigned long i; - struct page *page; - unsigned long num_pages; - - num_pages = num_extent_pages(eb->start, eb->len); - - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - GFP_NOFS); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - check_page_uptodate(tree, page); - continue; - } - SetPageUptodate(page); - } - return 0; -} -EXPORT_SYMBOL(set_extent_buffer_uptodate); - -int extent_buffer_uptodate(struct extent_map_tree *tree, - struct extent_buffer *eb) -{ - if (eb->flags & EXTENT_UPTODATE) - return 1; - return test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1); -} -EXPORT_SYMBOL(extent_buffer_uptodate); - -int read_extent_buffer_pages(struct extent_map_tree *tree, - struct extent_buffer *eb, - u64 start, - int wait) -{ - unsigned long i; - unsigned long start_i; - struct page *page; - int err; - int ret = 0; - unsigned long num_pages; - - if (eb->flags & EXTENT_UPTODATE) - return 0; - - if (0 && test_range_bit(tree, eb->start, eb->start + 
eb->len - 1, - EXTENT_UPTODATE, 1)) { - return 0; - } - - if (start) { - WARN_ON(start < eb->start); - start_i = (start >> PAGE_CACHE_SHIFT) - - (eb->start >> PAGE_CACHE_SHIFT); - } else { - start_i = 0; - } - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (PageUptodate(page)) { - continue; - } - if (!wait) { - if (TestSetPageLocked(page)) { - continue; - } - } else { - lock_page(page); - } - if (!PageUptodate(page)) { - err = page->mapping->a_ops->readpage(NULL, page); - if (err) { - ret = err; - } - } else { - unlock_page(page); - } - } - - if (ret || !wait) { - return ret; - } - - for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - wait_on_page_locked(page); - if (!PageUptodate(page)) { - ret = -EIO; - } - } - if (!ret) - eb->flags |= EXTENT_UPTODATE; - return ret; -} -EXPORT_SYMBOL(read_extent_buffer_pages); - -void read_extent_buffer(struct extent_buffer *eb, void *dstv, - unsigned long start, - unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - char *dst = (char *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - unsigned long num_pages = num_extent_pages(eb->start, eb->len); - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while(len > 0) { - page = extent_buffer_page(eb, i); - if (!PageUptodate(page)) { - printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len); - WARN_ON(1); - } - WARN_ON(!PageUptodate(page)); - - cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER1); - memcpy(dst, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER1); - - dst += cur; - len -= cur; - offset = 0; - i++; - } -} -EXPORT_SYMBOL(read_extent_buffer); - -int 
map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) -{ - size_t offset = start & (PAGE_CACHE_SIZE - 1); - char *kaddr; - struct page *p; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - unsigned long end_i = (start_offset + start + min_len - 1) >> - PAGE_CACHE_SHIFT; - - if (i != end_i) - return -EINVAL; - - if (i == 0) { - offset = start_offset; - *map_start = 0; - } else { - offset = 0; - *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; - } - if (start + min_len > eb->len) { -printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len); - WARN_ON(1); - } - - p = extent_buffer_page(eb, i); - WARN_ON(!PageUptodate(p)); - kaddr = kmap_atomic(p, km); - *token = kaddr; - *map = kaddr + offset; - *map_len = PAGE_CACHE_SIZE - offset; - return 0; -} -EXPORT_SYMBOL(map_private_extent_buffer); - -int map_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, - char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) -{ - int err; - int save = 0; - if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, km); - eb->map_token = NULL; - save = 1; - } - err = map_private_extent_buffer(eb, start, min_len, token, map, - map_start, map_len, km); - if (!err && save) { - eb->map_token = *token; - eb->kaddr = *map; - eb->map_start = *map_start; - eb->map_len = *map_len; - } - return err; -} -EXPORT_SYMBOL(map_extent_buffer); - -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) -{ - kunmap_atomic(token, km); -} -EXPORT_SYMBOL(unmap_extent_buffer); - -int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, - unsigned long start, - unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - char *ptr 
= (char *)ptrv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - int ret = 0; - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while(len > 0) { - page = extent_buffer_page(eb, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, (PAGE_CACHE_SIZE - offset)); - - kaddr = kmap_atomic(page, KM_USER0); - ret = memcmp(ptr, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER0); - if (ret) - break; - - ptr += cur; - len -= cur; - offset = 0; - i++; - } - return ret; -} -EXPORT_SYMBOL(memcmp_extent_buffer); - -void write_extent_buffer(struct extent_buffer *eb, const void *srcv, - unsigned long start, unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - char *src = (char *)srcv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while(len > 0) { - page = extent_buffer_page(eb, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER1); - memcpy(kaddr + offset, src, cur); - kunmap_atomic(kaddr, KM_USER1); - - src += cur; - len -= cur; - offset = 0; - i++; - } -} -EXPORT_SYMBOL(write_extent_buffer); - -void memset_extent_buffer(struct extent_buffer *eb, char c, - unsigned long start, unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while(len > 0) 
{ - page = extent_buffer_page(eb, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, c, cur); - kunmap_atomic(kaddr, KM_USER0); - - len -= cur; - offset = 0; - i++; - } -} -EXPORT_SYMBOL(memset_extent_buffer); - -void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, - unsigned long dst_offset, unsigned long src_offset, - unsigned long len) -{ - u64 dst_len = dst->len; - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; - - WARN_ON(src->len != dst_len); - - offset = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - - while(len > 0) { - page = extent_buffer_page(dst, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - - kaddr = kmap_atomic(page, KM_USER0); - read_extent_buffer(src, kaddr + offset, src_offset, cur); - kunmap_atomic(kaddr, KM_USER0); - - src_offset += cur; - len -= cur; - offset = 0; - i++; - } -} -EXPORT_SYMBOL(copy_extent_buffer); - -static void move_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); - if (dst_page == src_page) { - memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); - } else { - char *src_kaddr = kmap_atomic(src_page, KM_USER1); - char *p = dst_kaddr + dst_off + len; - char *s = src_kaddr + src_off + len; - - while (len--) - *--p = *--s; - - kunmap_atomic(src_kaddr, KM_USER1); - } - kunmap_atomic(dst_kaddr, KM_USER0); -} - -static void copy_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); - char *src_kaddr; - - if (dst_page != src_page) - src_kaddr 
= kmap_atomic(src_page, KM_USER1); - else - src_kaddr = dst_kaddr; - - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); - kunmap_atomic(dst_kaddr, KM_USER0); - if (dst_page != src_page) - kunmap_atomic(src_kaddr, KM_USER1); -} - -void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len) -{ - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long dst_i; - unsigned long src_i; - - if (src_offset + len > dst->len) { - printk("memmove bogus src_offset %lu move len %lu len %lu\n", - src_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset + len > dst->len) { - printk("memmove bogus dst_offset %lu move len %lu len %lu\n", - dst_offset, len, dst->len); - BUG_ON(1); - } - - while(len > 0) { - dst_off_in_page = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - src_off_in_page = (start_offset + src_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - - dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; - src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; - - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - - src_off_in_page)); - cur = min_t(unsigned long, cur, - (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); - - copy_pages(extent_buffer_page(dst, dst_i), - extent_buffer_page(dst, src_i), - dst_off_in_page, src_off_in_page, cur); - - src_offset += cur; - dst_offset += cur; - len -= cur; - } -} -EXPORT_SYMBOL(memcpy_extent_buffer); - -void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len) -{ - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; - unsigned long dst_end = dst_offset + len - 1; - unsigned long src_end = src_offset + len - 1; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long dst_i; - unsigned long src_i; - - if (src_offset + len > dst->len) { - 
printk("memmove bogus src_offset %lu move len %lu len %lu\n", - src_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset + len > dst->len) { - printk("memmove bogus dst_offset %lu move len %lu len %lu\n", - dst_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset < src_offset) { - memcpy_extent_buffer(dst, dst_offset, src_offset, len); - return; - } - while(len > 0) { - dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; - src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; - - dst_off_in_page = (start_offset + dst_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - src_off_in_page = (start_offset + src_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - - cur = min_t(unsigned long, len, src_off_in_page + 1); - cur = min(cur, dst_off_in_page + 1); - move_pages(extent_buffer_page(dst, dst_i), - extent_buffer_page(dst, src_i), - dst_off_in_page - cur + 1, - src_off_in_page - cur + 1, cur); - - dst_end -= cur; - src_end -= cur; - len -= cur; - } -} -EXPORT_SYMBOL(memmove_extent_buffer); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ea60f5447b5b..56314217cfc0 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,215 +3,53 @@ #include +#define EXTENT_MAP_LAST_BYTE (u64)-4 #define EXTENT_MAP_HOLE (u64)-3 #define EXTENT_MAP_INLINE (u64)-2 #define EXTENT_MAP_DELALLOC (u64)-1 -/* bits for the extent state */ -#define EXTENT_DIRTY 1 -#define EXTENT_WRITEBACK (1 << 1) -#define EXTENT_UPTODATE (1 << 2) -#define EXTENT_LOCKED (1 << 3) -#define EXTENT_NEW (1 << 4) -#define EXTENT_DELALLOC (1 << 5) -#define EXTENT_DEFRAG (1 << 6) -#define EXTENT_DEFRAG_DONE (1 << 7) -#define EXTENT_BUFFER_FILLED (1 << 8) -#define EXTENT_CSUM (1 << 9) -#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) - -/* - * page->private values. Every page that is controlled by the extent - * map has page->private set to one. 
- */ -#define EXTENT_PAGE_PRIVATE 1 -#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 - - -struct extent_map_ops { - int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); - int (*writepage_io_hook)(struct page *page, u64 start, u64 end); - int (*readpage_io_hook)(struct page *page, u64 start, u64 end); - int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end); - void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end); -}; - -struct extent_map_tree { - struct rb_root map; - struct rb_root state; - struct address_space *mapping; - u64 dirty_bytes; - rwlock_t lock; - struct extent_map_ops *ops; - spinlock_t lru_lock; - struct list_head buffer_lru; - int lru_size; -}; - -/* note, this must start with the same fields as fs/extent_map.c:tree_entry */ struct extent_map { - u64 start; - u64 end; /* inclusive */ - int in_tree; struct rb_node rb_node; - /* block_start and block_end are in bytes */ + + /* all of these are in bytes */ + u64 start; + u64 len; u64 block_start; - u64 block_end; /* inclusive */ + unsigned long flags; struct block_device *bdev; atomic_t refs; -}; - -/* note, this must start with the same fields as fs/extent_map.c:tree_entry */ -struct extent_state { - u64 start; - u64 end; /* inclusive */ int in_tree; - struct rb_node rb_node; - wait_queue_head_t wq; - atomic_t refs; - unsigned long state; - - /* for use by the FS */ - u64 private; - - struct list_head list; }; -struct extent_buffer { - u64 start; - unsigned long len; - char *map_token; - char *kaddr; - unsigned long map_start; - unsigned long map_len; - struct page *first_page; - struct list_head lru; - atomic_t refs; - int flags; +struct extent_map_tree { + struct rb_root map; + struct extent_map *last; + spinlock_t lock; }; -typedef struct extent_map *(get_extent_t)(struct inode *inode, - struct page *page, - size_t page_offset, - u64 start, u64 end, - int create); +static inline u64 extent_map_end(struct extent_map *em) +{ + if (em->start + em->len < em->start) + return 
(u64)-1; + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + if (em->block_start + em->len < em->block_start) + return (u64)-1; + return em->block_start + em->len; +} -void extent_map_tree_init(struct extent_map_tree *tree, - struct address_space *mapping, gfp_t mask); -void extent_map_tree_empty_lru(struct extent_map_tree *tree); +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 end); + u64 start, u64 len); int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); -int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page); -int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask); -int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask); + struct extent_map *alloc_extent_map(gfp_t mask); void free_extent_map(struct extent_map *em); -int extent_read_full_page(struct extent_map_tree *tree, struct page *page, - get_extent_t *get_extent); int __init extent_map_init(void); void extent_map_exit(void); - -u64 count_range_bits(struct extent_map_tree *tree, - u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits); - -int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, - int bits, int filled); -int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); -int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); -int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask); -int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask); -int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask); -int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask); 
-int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end, - gfp_t mask); -int find_first_extent_bit(struct extent_map_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits); -int extent_invalidatepage(struct extent_map_tree *tree, - struct page *page, unsigned long offset); -int extent_write_full_page(struct extent_map_tree *tree, struct page *page, - get_extent_t *get_extent, - struct writeback_control *wbc); -int extent_writepages(struct extent_map_tree *tree, - struct address_space *mapping, - get_extent_t *get_extent, - struct writeback_control *wbc); -int extent_readpages(struct extent_map_tree *tree, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent); -int extent_prepare_write(struct extent_map_tree *tree, - struct inode *inode, struct page *page, - unsigned from, unsigned to, get_extent_t *get_extent); -int extent_commit_write(struct extent_map_tree *tree, - struct inode *inode, struct page *page, - unsigned from, unsigned to); -sector_t extent_bmap(struct address_space *mapping, sector_t iblock, - get_extent_t *get_extent); -int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end); -int set_state_private(struct extent_map_tree *tree, u64 start, u64 private); -int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private); -void set_page_extent_mapped(struct page *page); - -struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, - u64 start, unsigned long len, - struct page *page0, - gfp_t mask); -struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, - u64 start, unsigned long len, - gfp_t mask); -void free_extent_buffer(struct extent_buffer *eb); -int read_extent_buffer_pages(struct extent_map_tree *tree, - struct extent_buffer *eb, u64 start, int wait); - -static inline void extent_buffer_get(struct extent_buffer *eb) -{ - atomic_inc(&eb->refs); -} - -int memcmp_extent_buffer(struct extent_buffer *eb, const void 
*ptrv, - unsigned long start, - unsigned long len); -void read_extent_buffer(struct extent_buffer *eb, void *dst, - unsigned long start, - unsigned long len); -void write_extent_buffer(struct extent_buffer *eb, const void *src, - unsigned long start, unsigned long len); -void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, - unsigned long dst_offset, unsigned long src_offset, - unsigned long len); -void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len); -void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len); -void memset_extent_buffer(struct extent_buffer *eb, char c, - unsigned long start, unsigned long len); -int wait_on_extent_buffer_writeback(struct extent_map_tree *tree, - struct extent_buffer *eb); -int clear_extent_buffer_dirty(struct extent_map_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_map_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_map_tree *tree, - struct extent_buffer *eb); -int extent_buffer_uptodate(struct extent_map_tree *tree, - struct extent_buffer *eb); -int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km); -int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km); -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1cd8c908811e..c5bb00f92396 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -233,8 +233,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, int err = 0; int i; struct inode *inode = fdentry(file)->d_inode; - 
struct extent_map *em; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; u64 hint_byte; u64 num_bytes; u64 start_pos; @@ -242,11 +241,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, u64 end_pos = pos + write_bytes; u64 inline_size; loff_t isize = i_size_read(inode); - em = alloc_extent_map(GFP_NOFS); - if (!em) - return -ENOMEM; - - em->bdev = inode->i_sb->s_bdev; start_pos = pos & ~((u64)root->sectorsize - 1); num_bytes = (write_bytes + pos - start_pos + @@ -254,7 +248,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, end_of_last_block = start_pos + num_bytes - 1; - lock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS); + lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); if (!trans) { @@ -268,7 +262,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, if ((end_of_last_block & 4095) == 0) { printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block); } - set_extent_uptodate(em_tree, start_pos, end_of_last_block, GFP_NOFS); + set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); /* FIXME...EIEIO, ENOSPC and more */ @@ -293,6 +287,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, inode->i_ino, last_pos_in_file, 0, 0, hole_size); + btrfs_drop_extent_cache(inode, last_pos_in_file, + last_pos_in_file + hole_size -1); btrfs_check_file(root, inode); } if (err) @@ -320,12 +316,12 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, last_end += PAGE_CACHE_SIZE - 1; if (start_pos < isize) { u64 delalloc_start = start_pos; - existing_delalloc = count_range_bits(em_tree, + existing_delalloc = count_range_bits(io_tree, &delalloc_start, end_of_last_block, (u64)-1, EXTENT_DELALLOC); } - 
set_extent_delalloc(em_tree, start_pos, end_of_last_block, + set_extent_delalloc(io_tree, start_pos, end_of_last_block, GFP_NOFS); spin_lock(&root->fs_info->delalloc_lock); root->fs_info->delalloc_bytes += (end_of_last_block + 1 - @@ -346,6 +342,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, inline_size -= start_pos; err = insert_inline_extent(trans, root, inode, start_pos, inline_size, pages, 0, num_pages); + btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1); BUG_ON(err); } if (end_pos > isize) { @@ -356,8 +353,7 @@ failed: err = btrfs_end_transaction(trans, root); out_unlock: mutex_unlock(&root->fs_info->fs_mutex); - unlock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS); - free_extent_map(em); + unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); return err; } @@ -367,10 +363,15 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end) struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; while(1) { + spin_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, end); - if (!em) + if (!em) { + spin_unlock(&em_tree->lock); break; + } remove_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + /* once for us */ free_extent_map(em); /* once for the tree*/ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 67005480e139..16d3aef45d18 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -53,7 +53,7 @@ static struct inode_operations btrfs_file_inode_operations; static struct address_space_operations btrfs_aops; static struct address_space_operations btrfs_symlink_aops; static struct file_operations btrfs_dir_file_operations; -static struct extent_map_ops btrfs_extent_map_ops; +static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; @@ -104,6 +104,8 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end) u64 num_bytes; u64 cur_alloc_size; u64 blocksize = 
root->sectorsize; + u64 orig_start = start; + u64 orig_num_bytes; struct btrfs_key ins; int ret; @@ -115,6 +117,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end) num_bytes = max(blocksize, num_bytes); ret = btrfs_drop_extents(trans, root, inode, start, start + num_bytes, start, &alloc_hint); + orig_num_bytes = num_bytes; if (alloc_hint == EXTENT_MAP_INLINE) goto out; @@ -138,6 +141,8 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end) alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } + btrfs_drop_extent_cache(inode, orig_start, + orig_start + orig_num_bytes - 1); btrfs_add_ordered_inode(inode); out: btrfs_end_transaction(trans, root); @@ -297,7 +302,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end) int ret = 0; struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_csum_item *item; struct btrfs_path *path = NULL; u32 csum; @@ -317,7 +322,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end) } read_extent_buffer(path->nodes[0], &csum, (unsigned long)item, BTRFS_CRC32_SIZE); - set_state_private(em_tree, start, csum); + set_state_private(io_tree, start, csum); out: if (path) btrfs_free_path(path); @@ -329,17 +334,19 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end) { size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); struct inode *inode = page->mapping->host; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; char *kaddr; u64 private; int ret; struct btrfs_root *root = BTRFS_I(inode)->root; u32 csum = ~(u32)0; unsigned long flags; + if (btrfs_test_opt(root, NODATASUM) || btrfs_test_flag(inode, NODATASUM)) return 0; - ret = get_state_private(em_tree, start, &private); + + ret = 
get_state_private(io_tree, start, &private); local_irq_save(flags); kaddr = kmap_atomic(page, KM_IRQ0); if (ret) { @@ -428,7 +435,7 @@ void btrfs_read_locked_inode(struct inode *inode) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; @@ -873,7 +880,7 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page, size_t zero_start) { char *kaddr; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; u64 page_end = page_start + PAGE_CACHE_SIZE - 1; @@ -884,12 +891,12 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page, WARN_ON(!PageLocked(page)); set_page_extent_mapped(page); - lock_extent(em_tree, page_start, page_end, GFP_NOFS); + lock_extent(io_tree, page_start, page_end, GFP_NOFS); delalloc_start = page_start; - existing_delalloc = count_range_bits(&BTRFS_I(inode)->extent_tree, + existing_delalloc = count_range_bits(&BTRFS_I(inode)->io_tree, &delalloc_start, page_end, PAGE_CACHE_SIZE, EXTENT_DELALLOC); - set_extent_delalloc(&BTRFS_I(inode)->extent_tree, page_start, + set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); spin_lock(&root->fs_info->delalloc_lock); @@ -903,7 +910,7 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page, kunmap(page); } set_page_dirty(page); - unlock_extent(em_tree, page_start, page_end, GFP_NOFS); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); return ret; } @@ -961,7 +968,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { struct btrfs_trans_handle *trans; 
struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; u64 mask = root->sectorsize - 1; u64 pos = (inode->i_size + mask) & ~mask; @@ -986,7 +993,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) btrfs_truncate_page(inode->i_mapping, inode->i_size); - lock_extent(em_tree, pos, block_end, GFP_NOFS); + lock_extent(io_tree, pos, block_end, GFP_NOFS); hole_size = block_end - hole_start; mutex_lock(&root->fs_info->fs_mutex); @@ -1001,11 +1008,13 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) inode->i_ino, hole_start, 0, 0, hole_size); + btrfs_drop_extent_cache(inode, hole_start, + hole_size - 1); btrfs_check_file(root, inode); } btrfs_end_transaction(trans, root); mutex_unlock(&root->fs_info->fs_mutex); - unlock_extent(em_tree, pos, block_end, GFP_NOFS); + unlock_extent(io_tree, pos, block_end, GFP_NOFS); if (err) return err; } @@ -1189,7 +1198,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) struct btrfs_iget_args *args = p; inode->i_ino = args->ino; BTRFS_I(inode)->root = args->root; - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); return 0; } @@ -1485,7 +1495,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (!inode) return ERR_PTR(-ENOMEM); - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); BTRFS_I(inode)->root = root; @@ -1672,9 +1683,10 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_aops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; - 
extent_map_tree_init(&BTRFS_I(inode)->extent_tree, + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); - BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); @@ -1816,7 +1828,7 @@ out_unlock: } struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, - size_t page_offset, u64 start, u64 end, + size_t page_offset, u64 start, u64 len, int create) { int ret; @@ -1826,7 +1838,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, u64 extent_end = 0; u64 objectid = inode->i_ino; u32 found_type; - int failed_insert = 0; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_file_extent_item *item; @@ -1834,6 +1845,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; path = btrfs_alloc_path(); @@ -1841,24 +1853,26 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, mutex_lock(&root->fs_info->fs_mutex); again: - em = lookup_extent_mapping(em_tree, start, end); + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + spin_unlock(&em_tree->lock); + if (em) { if (em->start > start) { - printk("get_extent start %Lu em start %Lu\n", - start, em->start); + printk("get_extent lookup [%Lu %Lu] em [%Lu %Lu]\n", + start, len, em->start, em->len); WARN_ON(1); } goto out; } + em = alloc_extent_map(GFP_NOFS); if (!em) { - em = alloc_extent_map(GFP_NOFS); - if (!em) { - err = -ENOMEM; - goto out; - } - em->start = EXTENT_MAP_HOLE; - em->end = EXTENT_MAP_HOLE; + err = -ENOMEM; + goto out; } + + 
em->start = EXTENT_MAP_HOLE; + em->len = (u64)-1; em->bdev = inode->i_sb->s_bdev; ret = btrfs_lookup_file_extent(trans, root, path, objectid, start, trans != NULL); @@ -1893,28 +1907,25 @@ again: if (start < extent_start || start >= extent_end) { em->start = start; if (start < extent_start) { - if (end < extent_start) + if (start + len <= extent_start) goto not_found; - em->end = extent_end - 1; + em->len = extent_end - extent_start; } else { - em->end = end; + em->len = len; } goto not_found_em; } bytenr = btrfs_file_extent_disk_bytenr(leaf, item); if (bytenr == 0) { em->start = extent_start; - em->end = extent_end - 1; + em->len = extent_end - extent_start; em->block_start = EXTENT_MAP_HOLE; - em->block_end = EXTENT_MAP_HOLE; goto insert; } bytenr += btrfs_file_extent_offset(leaf, item); em->block_start = bytenr; - em->block_end = em->block_start + - btrfs_file_extent_num_bytes(leaf, item) - 1; em->start = extent_start; - em->end = extent_end - 1; + em->len = extent_end - extent_start; goto insert; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { unsigned long ptr; @@ -1925,25 +1936,24 @@ again: size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf, path->slots[0])); - extent_end = (extent_start + size - 1) | - ((u64)root->sectorsize - 1); + extent_end = (extent_start + size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); if (start < extent_start || start >= extent_end) { em->start = start; if (start < extent_start) { - if (end < extent_start) + if (start + len <= extent_start) goto not_found; - em->end = extent_end; + em->len = extent_end - extent_start; } else { - em->end = end; + em->len = len; } goto not_found_em; } em->block_start = EXTENT_MAP_INLINE; - em->block_end = EXTENT_MAP_INLINE; if (!page) { em->start = extent_start; - em->end = extent_start + size - 1; + em->len = size; goto out; } @@ -1952,8 +1962,7 @@ again: copy_size = min_t(u64, PAGE_CACHE_SIZE - page_offset, size - extent_offset); em->start = extent_start + extent_offset; 
- em->end = (em->start + copy_size -1) | - ((u64)root->sectorsize -1); + em->len = copy_size; map = kmap(page); ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { @@ -1974,7 +1983,8 @@ again: btrfs_mark_buffer_dirty(leaf); } kunmap(page); - set_extent_uptodate(em_tree, em->start, em->end, GFP_NOFS); + set_extent_uptodate(io_tree, em->start, + extent_map_end(em) - 1, GFP_NOFS); goto insert; } else { printk("unkknown found_type %d\n", found_type); @@ -1982,33 +1992,29 @@ again: } not_found: em->start = start; - em->end = end; + em->len = len; not_found_em: em->block_start = EXTENT_MAP_HOLE; - em->block_end = EXTENT_MAP_HOLE; insert: btrfs_release_path(root, path); - if (em->start > start || em->end < start) { - printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->end, start, end); + if (em->start > start || extent_map_end(em) <= start) { + printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->len, start, len); err = -EIO; goto out; } + + err = 0; + spin_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); if (ret == -EEXIST) { free_extent_map(em); - em = NULL; - if (0 && failed_insert == 1) { - btrfs_drop_extent_cache(inode, start, end); - } - failed_insert++; - if (failed_insert > 5) { - printk("failing to insert %Lu %Lu\n", start, end); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { err = -EIO; - goto out; + printk("failing to insert %Lu %Lu\n", start, len); } - goto again; } - err = 0; + spin_unlock(&em_tree->lock); out: btrfs_free_path(path); if (trans) { @@ -2032,14 +2038,14 @@ static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) int btrfs_readpage(struct file *file, struct page *page) { - struct extent_map_tree *tree; - tree = &BTRFS_I(page->mapping->host)->extent_tree; + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; return extent_read_full_page(tree, page, btrfs_get_extent); } static int 
btrfs_writepage(struct page *page, struct writeback_control *wbc) { - struct extent_map_tree *tree; + struct extent_io_tree *tree; if (current->flags & PF_MEMALLOC) { @@ -2047,15 +2053,15 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); return 0; } - tree = &BTRFS_I(page->mapping->host)->extent_tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; return extent_write_full_page(tree, page, btrfs_get_extent, wbc); } static int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_map_tree *tree; - tree = &BTRFS_I(mapping->host)->extent_tree; + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; return extent_writepages(tree, mapping, btrfs_get_extent, wbc); } @@ -2063,19 +2069,21 @@ static int btrfs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct extent_map_tree *tree; - tree = &BTRFS_I(mapping->host)->extent_tree; + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; return extent_readpages(tree, mapping, pages, nr_pages, btrfs_get_extent); } static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags) { - struct extent_map_tree *tree; + struct extent_io_tree *tree; + struct extent_map_tree *map; int ret; - tree = &BTRFS_I(page->mapping->host)->extent_tree; - ret = try_release_extent_mapping(tree, page); + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(map, tree, page); if (ret == 1) { ClearPagePrivate(page); set_page_private(page, 0); @@ -2086,9 +2094,9 @@ static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags) static void btrfs_invalidatepage(struct page *page, unsigned long offset) { - struct extent_map_tree *tree; + struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->extent_tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; 
extent_invalidatepage(tree, page, offset); btrfs_releasepage(page, GFP_NOFS); } @@ -2374,7 +2382,7 @@ unsigned long btrfs_force_ra(struct address_space *mapping, int btrfs_defrag_file(struct file *file) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct page *page; unsigned long last_index; unsigned long ra_index = 0; @@ -2414,13 +2422,13 @@ int btrfs_defrag_file(struct file *file) { page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(em_tree, page_start, page_end, GFP_NOFS); + lock_extent(io_tree, page_start, page_end, GFP_NOFS); delalloc_start = page_start; existing_delalloc = - count_range_bits(&BTRFS_I(inode)->extent_tree, + count_range_bits(&BTRFS_I(inode)->io_tree, &delalloc_start, page_end, PAGE_CACHE_SIZE, EXTENT_DELALLOC); - set_extent_delalloc(em_tree, page_start, + set_extent_delalloc(io_tree, page_start, page_end, GFP_NOFS); spin_lock(&root->fs_info->delalloc_lock); @@ -2428,7 +2436,7 @@ int btrfs_defrag_file(struct file *file) { existing_delalloc; spin_unlock(&root->fs_info->delalloc_lock); - unlock_extent(em_tree, page_start, page_end, GFP_NOFS); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); set_page_dirty(page); unlock_page(page); page_cache_release(page); @@ -2842,9 +2850,10 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_aops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); - BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } dir->i_sb->s_dirt = 1; 
btrfs_update_inode_block_group(trans, inode); @@ -2934,7 +2943,7 @@ static struct file_operations btrfs_dir_file_operations = { #endif }; -static struct extent_map_ops btrfs_extent_map_ops = { +static struct extent_io_ops btrfs_extent_io_ops = { .fill_delalloc = run_delalloc_range, .writepage_io_hook = btrfs_writepage_io_hook, .readpage_io_hook = btrfs_readpage_io_hook, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8b52c69fda2e..f8a1016600b1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -468,10 +468,15 @@ static int __init init_btrfs_fs(void) err = btrfs_init_cachep(); if (err) goto free_transaction_sys; - err = extent_map_init(); + + err = extent_io_init(); if (err) goto free_cachep; + err = extent_map_init(); + if (err) + goto free_extent_io; + err = register_filesystem(&btrfs_fs_type); if (err) goto free_extent_map; @@ -479,6 +484,8 @@ static int __init init_btrfs_fs(void) free_extent_map: extent_map_exit(); +free_extent_io: + extent_io_exit(); free_cachep: btrfs_destroy_cachep(); free_transaction_sys: @@ -492,6 +499,7 @@ static void __exit exit_btrfs_fs(void) btrfs_exit_transaction_sys(); btrfs_destroy_cachep(); extent_map_exit(); + extent_io_exit(); unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 163c01a24498..b4a1bc62a784 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -70,7 +70,7 @@ static int join_transaction(struct btrfs_root *root) INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree); - extent_map_tree_init(&cur_trans->dirty_pages, + extent_io_tree_init(&cur_trans->dirty_pages, root->fs_info->btree_inode->i_mapping, GFP_NOFS); } else { @@ -153,7 +153,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, int ret; int err; int werr = 0; - struct extent_map_tree *dirty_pages; + struct extent_io_tree 
*dirty_pages; struct page *page; struct inode *btree_inode = root->fs_info->btree_inode; u64 start; @@ -610,7 +610,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; struct list_head dirty_fs_roots; - struct extent_map_tree *pinned_copy; + struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; @@ -639,7 +639,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (!pinned_copy) return -ENOMEM; - extent_map_tree_init(pinned_copy, + extent_io_tree_init(pinned_copy, root->fs_info->btree_inode->i_mapping, GFP_NOFS); trans->transaction->in_commit = 1; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index fd52e9b23922..c3172ddb3321 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -29,7 +29,7 @@ struct btrfs_transaction { int use_count; int commit_done; struct list_head list; - struct extent_map_tree dirty_pages; + struct extent_io_tree dirty_pages; unsigned long start_time; struct btrfs_ordered_inode_tree ordered_inode_tree; wait_queue_head_t writer_wait; -- cgit v1.2.3 From 0b86a832a1f38abec695864ec2eaedc9d2383f1b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 24 Mar 2008 15:01:56 -0400 Subject: Btrfs: Add support for multiple devices per filesystem Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.c | 38 ++- fs/btrfs/ctree.h | 313 ++++++++++++++++-- fs/btrfs/disk-io.c | 97 +++++- fs/btrfs/disk-io.h | 2 + fs/btrfs/extent-tree.c | 429 +++++++------------------ fs/btrfs/extent_io.c | 6 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 26 +- fs/btrfs/print-tree.c | 48 +++ fs/btrfs/transaction.c | 50 ++- fs/btrfs/volumes.c | 852 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 78 +++++ 13 files changed, 1563 insertions(+), 380 deletions(-) create mode 100644 fs/btrfs/volumes.c create mode 100644 fs/btrfs/volumes.h (limited to 'fs/btrfs/Makefile') diff --git 
a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9c61609f3714..5ac6f2946c21 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ - extent_io.o + extent_io.o volumes.o ifeq ($(CONFIG_FS_POSIX_ACL),y) btrfs-y += acl.o diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ef8809c1e69c..70b6ddfe15a1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -70,6 +70,14 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) memset(p, 0, sizeof(*p)); } +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + if (root->track_dirty && list_empty(&root->dirty_list)) { + list_add(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + } +} + int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -196,6 +204,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans, root_gen, 0, 0, 1); } free_extent_buffer(buf); + add_root_to_dirty_list(root); } else { root_gen = btrfs_header_generation(parent); btrfs_set_node_blockptr(parent, parent_slot, @@ -241,7 +250,7 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } - search_start = buf->start & ~((u64)BTRFS_BLOCK_GROUP_SIZE - 1); + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0); return ret; @@ -724,6 +733,7 @@ static int balance_level(struct btrfs_trans_handle *trans, BUG_ON(ret); root->node = child; + add_root_to_dirty_list(root); path->nodes[level] = NULL; clean_tree_block(trans, root, mid); wait_on_tree_block_writeback(root, mid); @@ -1369,6 +1379,7 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans, /* the super has an extra ref to 
root->node */ free_extent_buffer(root->node); root->node = c; + add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; path->slots[level] = 0; @@ -2777,3 +2788,28 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) } return 0; } + +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + int ret; + + while(1) { + if (path->slots[0] == 0) { + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.type == type) + return 0; + } + return 1; +} + diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 92d892f92075..1453d995fef6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -40,12 +40,44 @@ extern struct kmem_cache *btrfs_path_cachep; #define BTRFS_MAGIC "_B4RfS_M" #define BTRFS_MAX_LEVEL 8 + +/* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ #define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* one per subvolume, storing files and directories */ #define BTRFS_FS_TREE_OBJECTID 3ULL + +/* directory objectid inside the root tree */ #define BTRFS_ROOT_TREE_DIR_OBJECTID 4ULL + + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 5ULL + +/* + * stores information about which areas of a given device are in use. + * one per device. The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 6ULL + +/* + * All files have objectids higher than this. + */ #define BTRFS_FIRST_FREE_OBJECTID 256ULL + +/* + * the device items go into the chunk tree. 
The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + /* * we can actually store much bigger names, but lets not confuse the rest * of linux @@ -95,6 +127,81 @@ struct btrfs_key { u64 offset; } __attribute__ ((__packed__)); +struct btrfs_mapping_tree { + struct extent_map_tree map_tree; +}; + +#define BTRFS_DEV_UUID_SIZE 16 +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* the kernel device number */ + __le64 rdev; + + /* type and info about this device */ + __le64 type; + + /* partition number, 0 for whole dev */ + __le32 partition; + + /* length of the name data at the end of the item */ + __le16 name_len; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_DEV_UUID_SIZE]; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + __le64 owner; + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + BUG_ON(num_stripes == 0); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + #define BTRFS_FSID_SIZE 16 /* * every tree block (leaf or node) starts with this header. 
@@ -119,6 +226,13 @@ struct btrfs_header { sizeof(struct btrfs_item) - \ sizeof(struct btrfs_file_extent_item)) + +/* + * this is a very generous portion of the super block, giving us + * room to translate 14 chunks with 3 stripes each. + */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 + /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc @@ -131,6 +245,7 @@ struct btrfs_super_block { __le64 magic; __le64 generation; __le64 root; + __le64 chunk_root; __le64 total_bytes; __le64 bytes_used; __le64 root_dir_objectid; @@ -138,7 +253,10 @@ struct btrfs_super_block { __le32 nodesize; __le32 leafsize; __le32 stripesize; + __le32 sys_chunk_array_size; u8 root_level; + u8 chunk_root_level; + u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; } __attribute__ ((__packed__)); /* @@ -208,12 +326,22 @@ struct btrfs_extent_ref { __le64 offset; } __attribute__ ((__packed__)); +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent + */ +struct btrfs_dev_extent { + __le64 owner; + __le64 length; +} __attribute__ ((__packed__)); + + struct btrfs_inode_ref { __le16 name_len; /* name goes here */ } __attribute__ ((__packed__)); -struct btrfs_inode_timespec { +struct btrfs_timespec { __le64 sec; __le32 nsec; } __attribute__ ((__packed__)); @@ -231,13 +359,13 @@ struct btrfs_inode_item { __le32 uid; __le32 gid; __le32 mode; - __le32 rdev; + __le64 rdev; __le16 flags; __le16 compat_flags; - struct btrfs_inode_timespec atime; - struct btrfs_inode_timespec ctime; - struct btrfs_inode_timespec mtime; - struct btrfs_inode_timespec otime; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; } __attribute__ ((__packed__)); struct btrfs_dir_item { @@ -290,29 +418,34 @@ struct btrfs_csum_item { u8 csum; } __attribute__ ((__packed__)); -/* tag for the radix tree of block 
groups in ram */ -#define BTRFS_BLOCK_GROUP_SIZE (256 * 1024 * 1024) - +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1 << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) -#define BTRFS_BLOCK_GROUP_DATA 1 -#define BTRFS_BLOCK_GROUP_MIXED 2 struct btrfs_block_group_item { __le64 used; - u8 flags; + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 flags; } __attribute__ ((__packed__)); struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; - int data; - int cached; u64 pinned; + u64 flags; + int cached; }; + +struct btrfs_device; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; struct btrfs_root *extent_root; struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; struct radix_tree_root fs_roots_radix; struct extent_io_tree free_space_cache; @@ -321,6 +454,9 @@ struct btrfs_fs_info { struct extent_io_tree pending_del; struct extent_io_tree extent_ins; + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; + u64 generation; u64 last_trans_committed; unsigned long mount_opt; @@ -330,6 +466,7 @@ struct btrfs_fs_info { struct btrfs_transaction *running_transaction; struct btrfs_super_block super_copy; struct extent_buffer *sb_buffer; + struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; spinlock_t hash_lock; @@ -350,12 +487,17 @@ struct btrfs_fs_info { unsigned long throttles; u64 total_pinned; + struct list_head dirty_cowonly_roots; + + struct list_head devices; + struct list_head *last_device; spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; u64 last_alloc; u64 last_data_alloc; }; + /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. 
@@ -387,14 +529,19 @@ struct btrfs_root { u64 highest_inode; u64 last_inode_alloc; int ref_cows; + int track_dirty; struct btrfs_key defrag_progress; int defrag_running; int defrag_level; char *name; int in_sysfs; + + /* the dirty list is only used by non-reference counted roots */ + struct list_head dirty_list; }; /* + * inode items have the data typically returned from stat and store other * info about object characteristics. There is one for every file and dir in * the FS @@ -439,6 +586,10 @@ struct btrfs_root { */ #define BTRFS_BLOCK_GROUP_ITEM_KEY 50 +#define BTRFS_DEV_EXTENT_KEY 75 +#define BTRFS_DEV_ITEM_KEY 76 +#define BTRFS_CHUNK_ITEM_KEY 77 + /* * string items are for debugging. They just store a short string of * data in the FS @@ -518,13 +669,104 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ s->member = cpu_to_le##bits(val); \ } +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_rdev, struct btrfs_dev_item, rdev, 64); +BTRFS_SETGET_FUNCS(device_partition, struct btrfs_dev_item, partition, 32); +BTRFS_SETGET_FUNCS(device_name_len, struct btrfs_dev_item, name_len, 16); + +static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline char *btrfs_device_name(struct btrfs_dev_item *d) +{ + return (char *)(d + 1); +} + +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); 
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, + int nr) +{ + unsigned long offset = (unsigned long)c; + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + +static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return 
btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + /* struct btrfs_block_group_item */ BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, used, 64); -BTRFS_SETGET_FUNCS(disk_block_group_flags, struct btrfs_block_group_item, - flags, 8); +BTRFS_SETGET_STACK_FUNCS(block_group_chunk_tree, struct btrfs_block_group_item, + chunk_tree, 64); +BTRFS_SETGET_FUNCS(disk_block_group_chunk_tree, struct btrfs_block_group_item, + chunk_tree, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(disk_block_group_chunk_objecitd, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(disk_block_group_flags, + struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_flags, + struct btrfs_block_group_item, flags, 64); /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); @@ -538,49 +780,53 @@ BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 16); BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item, compat_flags, 16); -static inline struct btrfs_inode_timespec * +static inline struct btrfs_timespec * btrfs_inode_atime(struct btrfs_inode_item *inode_item) { unsigned long ptr = (unsigned long)inode_item; ptr += 
offsetof(struct btrfs_inode_item, atime); - return (struct btrfs_inode_timespec *)ptr; + return (struct btrfs_timespec *)ptr; } -static inline struct btrfs_inode_timespec * +static inline struct btrfs_timespec * btrfs_inode_mtime(struct btrfs_inode_item *inode_item) { unsigned long ptr = (unsigned long)inode_item; ptr += offsetof(struct btrfs_inode_item, mtime); - return (struct btrfs_inode_timespec *)ptr; + return (struct btrfs_timespec *)ptr; } -static inline struct btrfs_inode_timespec * +static inline struct btrfs_timespec * btrfs_inode_ctime(struct btrfs_inode_item *inode_item) { unsigned long ptr = (unsigned long)inode_item; ptr += offsetof(struct btrfs_inode_item, ctime); - return (struct btrfs_inode_timespec *)ptr; + return (struct btrfs_timespec *)ptr; } -static inline struct btrfs_inode_timespec * +static inline struct btrfs_timespec * btrfs_inode_otime(struct btrfs_inode_item *inode_item) { unsigned long ptr = (unsigned long)inode_item; ptr += offsetof(struct btrfs_inode_item, otime); - return (struct btrfs_inode_timespec *)ptr; + return (struct btrfs_timespec *)ptr; } -BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_inode_timespec, sec, 64); -BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_inode_timespec, nsec, 32); +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); /* struct btrfs_extent_item */ BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_owner, struct btrfs_dev_extent, owner, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); + /* struct btrfs_extent_ref */ BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); @@ -846,8 +1092,14 @@ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct 
btrfs_super_block, generation, 64); BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 64); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, @@ -1009,7 +1261,14 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_tree, u64 chunk_objectid, + u64 size); /* ctree.c */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type); int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 88e21bdbc478..8e37fa120cc8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -28,6 +28,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" +#include "volumes.h" #include "print-tree.h" #if 0 @@ -234,6 +235,19 @@ static int btree_writepage_io_hook(struct page *page, u64 start, u64 end) return 0; } +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 offset; + offset = bio->bi_sector << 9; + if (offset == BTRFS_SUPER_INFO_OFFSET) { + bio->bi_bdev = root->fs_info->sb->s_bdev; + submit_bio(rw, bio); + 
return 0; + } + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio); +} + static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; @@ -345,6 +359,23 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) return ret; } +static int close_all_devices(struct btrfs_fs_info *fs_info) +{ + struct list_head *list; + struct list_head *next; + struct btrfs_device *device; + + list = &fs_info->devices; + while(!list_empty(list)) { + next = list->next; + list_del(next); + device = list_entry(next, struct btrfs_device, dev_list); + kfree(device->name); + kfree(device); + } + return 0; +} + struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { @@ -420,6 +451,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->leafsize = leafsize; root->stripesize = stripesize; root->ref_cows = 0; + root->track_dirty = 0; + root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; @@ -427,6 +460,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->last_inode_alloc = 0; root->name = NULL; root->in_sysfs = 0; + + INIT_LIST_HEAD(&root->dirty_list); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); @@ -634,6 +669,10 @@ struct btrfs_root *open_ctree(struct super_block *sb) GFP_NOFS); struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info), GFP_NOFS); + struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root), + GFP_NOFS); int ret; int err = -EIO; struct btrfs_super_block *disk_super; @@ -657,6 +696,12 @@ struct btrfs_root *open_ctree(struct super_block *sb) fs_info->last_trans_committed = 0; fs_info->tree_root = tree_root; fs_info->extent_root = extent_root; + fs_info->chunk_root = chunk_root; + fs_info->dev_root 
= dev_root; + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + INIT_LIST_HEAD(&fs_info->devices); + btrfs_mapping_init(&fs_info->mapping_tree); + fs_info->last_device = &fs_info->devices; fs_info->sb = sb; fs_info->throttles = 0; fs_info->mount_opt = 0; @@ -714,12 +759,12 @@ struct btrfs_root *open_ctree(struct super_block *sb) goto fail_iput; } #endif - __setup_root(512, 512, 512, 512, tree_root, + __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); fs_info->sb_buffer = read_tree_block(tree_root, BTRFS_SUPER_INFO_OFFSET, - 512); + 4096); if (!fs_info->sb_buffer) goto fail_iput; @@ -730,6 +775,7 @@ struct btrfs_root *open_ctree(struct super_block *sb) read_extent_buffer(fs_info->sb_buffer, fs_info->fsid, (unsigned long)btrfs_super_fsid(fs_info->sb_buffer), BTRFS_FSID_SIZE); + disk_super = &fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_sb_buffer; @@ -753,23 +799,47 @@ struct btrfs_root *open_ctree(struct super_block *sb) goto fail_sb_buffer; } + mutex_lock(&fs_info->fs_mutex); + ret = btrfs_read_sys_array(tree_root); + BUG_ON(ret); + + blocksize = btrfs_level_size(tree_root, + btrfs_super_chunk_root_level(disk_super)); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + + chunk_root->node = read_tree_block(chunk_root, + btrfs_super_chunk_root(disk_super), + blocksize); + BUG_ON(!chunk_root->node); + + ret = btrfs_read_chunk_tree(chunk_root); + BUG_ON(ret); + blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); + tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), blocksize); if (!tree_root->node) goto fail_sb_buffer; - mutex_lock(&fs_info->fs_mutex); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); - if (ret) { - mutex_unlock(&fs_info->fs_mutex); + if (ret) goto fail_tree_root; - } + extent_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + 
BTRFS_DEV_TREE_OBJECTID, dev_root); + dev_root->track_dirty = 1; + + if (ret) + goto fail_extent_root; btrfs_read_block_groups(extent_root); @@ -777,7 +847,10 @@ struct btrfs_root *open_ctree(struct super_block *sb) mutex_unlock(&fs_info->fs_mutex); return tree_root; +fail_extent_root: + free_extent_buffer(extent_root->node); fail_tree_root: + mutex_unlock(&fs_info->fs_mutex); free_extent_buffer(tree_root->node); fail_sb_buffer: free_extent_buffer(fs_info->sb_buffer); @@ -874,6 +947,12 @@ int close_ctree(struct btrfs_root *root) if (fs_info->tree_root->node) free_extent_buffer(fs_info->tree_root->node); + if (root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->node); + + if (root->fs_info->dev_root->node); + free_extent_buffer(root->fs_info->dev_root->node); + free_extent_buffer(fs_info->sb_buffer); btrfs_free_block_groups(root->fs_info); @@ -901,8 +980,13 @@ int close_ctree(struct btrfs_root *root) kfree(hasher); } #endif + close_all_devices(fs_info); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + kfree(fs_info->extent_root); kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); return 0; } @@ -1016,4 +1100,5 @@ int btrfs_read_buffer(struct extent_buffer *buf) static struct extent_io_ops btree_extent_io_ops = { .writepage_io_hook = btree_writepage_io_hook, + .submit_bio_hook = btree_submit_bio_hook, }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 828f3a2081b6..206cb48638f7 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -20,6 +20,7 @@ #define __DISKIO__ #define BTRFS_SUPER_INFO_OFFSET (16 * 1024) +struct btrfs_device; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); @@ -65,4 +66,5 @@ int btrfs_read_buffer(struct extent_buffer *buf); u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); void btrfs_throttle(struct btrfs_root *root); +int btrfs_open_device(struct 
btrfs_device *dev); #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ebfd304138ca..2cd957d6e8d8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -24,37 +24,19 @@ #include "disk-io.h" #include "print-tree.h" #include "transaction.h" +#include "volumes.h" -#define BLOCK_GROUP_DATA EXTENT_WRITEBACK +#define BLOCK_GROUP_DATA EXTENT_WRITEBACK #define BLOCK_GROUP_METADATA EXTENT_UPTODATE +#define BLOCK_GROUP_SYSTEM EXTENT_NEW + #define BLOCK_GROUP_DIRTY EXTENT_DIRTY static int finish_current_insert(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); static int del_pending_extents(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); -static int find_previous_extent(struct btrfs_root *root, - struct btrfs_path *path) -{ - struct btrfs_key found_key; - struct extent_buffer *leaf; - int ret; - while(1) { - if (path->slots[0] == 0) { - ret = btrfs_prev_leaf(root, path); - if (ret != 0) - return ret; - } else { - path->slots[0]--; - } - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.type == BTRFS_EXTENT_ITEM_KEY) - return 0; - } - return 1; -} static int cache_block_group(struct btrfs_root *root, struct btrfs_block_group_cache *block_group) @@ -91,7 +73,7 @@ static int cache_block_group(struct btrfs_root *root, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; - ret = find_previous_extent(root, path); + ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY); if (ret < 0) return ret; if (ret == 0) { @@ -168,7 +150,8 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct block_group_cache = &info->block_group_cache; ret = find_first_extent_bit(block_group_cache, bytenr, &start, &end, - BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA); + BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA | + BLOCK_GROUP_SYSTEM); if (ret) { return NULL; } @@ -182,23 +165,38 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct 
return block_group; return NULL; } -static u64 noinline find_search_start(struct btrfs_root *root, + +static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +{ + if ((bits & BLOCK_GROUP_DATA) && + (cache->flags & BTRFS_BLOCK_GROUP_DATA)) + return 1; + if ((bits & BLOCK_GROUP_METADATA) && + (cache->flags & BTRFS_BLOCK_GROUP_METADATA)) + return 1; + if ((bits & BLOCK_GROUP_SYSTEM) && + (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) + return 1; + return 0; +} + +static int noinline find_search_start(struct btrfs_root *root, struct btrfs_block_group_cache **cache_ret, - u64 search_start, int num, int data) + u64 *start_ret, int num, int data) { int ret; struct btrfs_block_group_cache *cache = *cache_ret; struct extent_io_tree *free_space_cache; - struct extent_state *state; u64 last; u64 start = 0; + u64 end = 0; u64 cache_miss = 0; u64 total_fs_bytes; + u64 search_start = *start_ret; int wrapped = 0; - if (!cache) { + if (!cache) goto out; - } total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); free_space_cache = &root->fs_info->free_space_cache; @@ -208,6 +206,9 @@ again: goto out; last = max(search_start, cache->key.objectid); + if (!block_group_bits(cache, data)) { + goto new_group; + } while(1) { ret = find_first_extent_bit(&root->fs_info->free_space_cache, @@ -225,22 +226,20 @@ again: cache_miss = start; continue; } - if (data != BTRFS_BLOCK_GROUP_MIXED && - start + num > cache->key.objectid + cache->key.offset) + if (start + num > cache->key.objectid + cache->key.offset) goto new_group; if (start + num > total_fs_bytes) goto new_group; - return start; + *start_ret = start; + return 0; } out: cache = btrfs_lookup_block_group(root->fs_info, search_start); if (!cache) { - printk("Unable to find block group for %Lu\n", - search_start); + printk("Unable to find block group for %Lu\n", search_start); WARN_ON(1); - return search_start; } - return search_start; + return -ENOSPC; new_group: last = cache->key.objectid + 
cache->key.offset; @@ -251,7 +250,6 @@ no_cache: if (!wrapped) { wrapped = 1; last = search_start; - data = BTRFS_BLOCK_GROUP_MIXED; goto wrapped; } goto out; @@ -299,7 +297,6 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, int ret; int full_search = 0; int factor = 8; - int data_swap = 0; block_group_cache = &info->block_group_cache; total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); @@ -307,19 +304,12 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, if (!owner) factor = 8; - if (data == BTRFS_BLOCK_GROUP_MIXED) { - bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA; - factor = 10; - } else if (data) - bit = BLOCK_GROUP_DATA; - else - bit = BLOCK_GROUP_METADATA; + bit = data; if (search_start && search_start < total_fs_bytes) { struct btrfs_block_group_cache *shint; shint = btrfs_lookup_block_group(info, search_start); - if (shint && (shint->data == data || - shint->data == BTRFS_BLOCK_GROUP_MIXED)) { + if (shint && block_group_bits(shint, data)) { used = btrfs_block_group_used(&shint->item); if (used + shint->pinned < div_factor(shint->key.offset, factor)) { @@ -327,8 +317,8 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, } } } - if (hint && hint->key.objectid < total_fs_bytes && - (hint->data == data || hint->data == BTRFS_BLOCK_GROUP_MIXED)) { + if (hint && block_group_bits(hint, data) && + hint->key.objectid < total_fs_bytes) { used = btrfs_block_group_used(&hint->item); if (used + hint->pinned < div_factor(hint->key.offset, factor)) { @@ -379,12 +369,6 @@ again: full_search = 1; goto again; } - if (!data_swap) { - data_swap = 1; - bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA; - last = search_start; - goto again; - } found: return found_group; } @@ -1002,7 +986,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 
num_bytes, int alloc, - int mark_free, int data) + int mark_free) { struct btrfs_block_group_cache *cache; struct btrfs_fs_info *info = root->fs_info; @@ -1027,41 +1011,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { - if (cache->data != data && - old_val < (cache->key.offset >> 1)) { - int bit_to_clear; - int bit_to_set; - cache->data = data; - if (data) { - bit_to_clear = BLOCK_GROUP_METADATA; - bit_to_set = BLOCK_GROUP_DATA; - cache->item.flags &= - ~BTRFS_BLOCK_GROUP_MIXED; - cache->item.flags |= - BTRFS_BLOCK_GROUP_DATA; - } else { - bit_to_clear = BLOCK_GROUP_DATA; - bit_to_set = BLOCK_GROUP_METADATA; - cache->item.flags &= - ~BTRFS_BLOCK_GROUP_MIXED; - cache->item.flags &= - ~BTRFS_BLOCK_GROUP_DATA; - } - clear_extent_bits(&info->block_group_cache, - start, end, bit_to_clear, - GFP_NOFS); - set_extent_bits(&info->block_group_cache, - start, end, bit_to_set, - GFP_NOFS); - } else if (cache->data != data && - cache->data != BTRFS_BLOCK_GROUP_MIXED) { - cache->data = BTRFS_BLOCK_GROUP_MIXED; - set_extent_bits(&info->block_group_cache, - start, end, - BLOCK_GROUP_DATA | - BLOCK_GROUP_METADATA, - GFP_NOFS); - } old_val += num_bytes; } else { old_val -= num_bytes; @@ -1357,7 +1306,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } ret = update_block_group(trans, root, bytenr, num_bytes, 0, - mark_free, 0); + mark_free); BUG_ON(ret); } btrfs_free_path(path); @@ -1450,38 +1399,21 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, u64 exclude_start, u64 exclude_nr, int data) { - struct btrfs_path *path; - struct btrfs_key key; - u64 hole_size = 0; - u64 aligned; int ret; - int slot = 0; - u64 last_byte = 0; - u64 *last_ptr = NULL; u64 orig_search_start = search_start; - int start_found; - struct extent_buffer *l; struct btrfs_root * root = 
orig_root->fs_info->extent_root; struct btrfs_fs_info *info = root->fs_info; u64 total_needed = num_bytes; - int level; struct btrfs_block_group_cache *block_group; int full_scan = 0; int wrapped = 0; - int empty_cluster; - u64 cached_start; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); - level = btrfs_header_level(root->node); - - if (num_bytes >= 32 * 1024 * 1024 && hint_byte) { - data = BTRFS_BLOCK_GROUP_MIXED; - } - if (search_end == (u64)-1) search_end = btrfs_super_total_bytes(&info->super_copy); + if (hint_byte) { block_group = btrfs_lookup_block_group(info, hint_byte); if (!block_group) @@ -1495,7 +1427,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, } total_needed += empty_size; - path = btrfs_alloc_path(); + check_failed: if (!block_group) { block_group = btrfs_lookup_block_group(info, search_start); @@ -1503,135 +1435,49 @@ check_failed: block_group = btrfs_lookup_block_group(info, orig_search_start); } - search_start = find_search_start(root, &block_group, search_start, - total_needed, data); - search_start = stripe_align(root, search_start); - cached_start = search_start; - btrfs_init_path(path); - ins->objectid = search_start; - ins->offset = 0; - start_found = 0; - path->reada = 2; - - ret = btrfs_search_slot(trans, root, ins, path, 0, 0); - if (ret < 0) - goto error; - ret = find_previous_extent(root, path); - if (ret < 0) + ret = find_search_start(root, &block_group, &search_start, + total_needed, data); + if (ret) goto error; - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto error; - search_start = max(search_start, - block_group->key.objectid); - if (!start_found) { - aligned = stripe_align(root, search_start); - ins->objectid = aligned; - if (aligned >= search_end) { - 
ret = -ENOSPC; - goto error; - } - ins->offset = search_end - aligned; - start_found = 1; - goto check_pending; - } - ins->objectid = stripe_align(root, - last_byte > search_start ? - last_byte : search_start); - if (search_end <= ins->objectid) { - ret = -ENOSPC; - goto error; - } - ins->offset = search_end - ins->objectid; - BUG_ON(ins->objectid >= search_end); - goto check_pending; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.objectid >= search_start && key.objectid > last_byte && - start_found) { - if (last_byte < search_start) - last_byte = search_start; - aligned = stripe_align(root, last_byte); - hole_size = key.objectid - aligned; - if (key.objectid > aligned && hole_size >= num_bytes) { - ins->objectid = aligned; - ins->offset = hole_size; - goto check_pending; - } - } - if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) { - if (!start_found && btrfs_key_type(&key) == - BTRFS_BLOCK_GROUP_ITEM_KEY) { - last_byte = key.objectid; - start_found = 1; - } - goto next; - } - - - start_found = 1; - last_byte = key.objectid + key.offset; - - if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED && - last_byte >= block_group->key.objectid + - block_group->key.offset) { - btrfs_release_path(root, path); - search_start = block_group->key.objectid + - block_group->key.offset; - goto new_group; - } -next: - path->slots[0]++; - cond_resched(); - } -check_pending: - /* we have to make sure we didn't find an extent that has already - * been allocated by the map tree or the original allocation - */ - btrfs_release_path(root, path); - BUG_ON(ins->objectid < search_start); + search_start = stripe_align(root, search_start); + ins->objectid = search_start; + ins->offset = num_bytes; if (ins->objectid + num_bytes >= search_end) goto enospc; - if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED && - ins->objectid + num_bytes > block_group-> - key.objectid + block_group->key.offset) { + + if (ins->objectid + num_bytes > + block_group->key.objectid + block_group->key.offset) { 
search_start = block_group->key.objectid + block_group->key.offset; goto new_group; } + if (test_range_bit(&info->extent_ins, ins->objectid, ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) { search_start = ins->objectid + num_bytes; goto new_group; } + if (test_range_bit(&info->pinned_extents, ins->objectid, ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) { search_start = ins->objectid + num_bytes; goto new_group; } + if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start && ins->objectid < exclude_start + exclude_nr)) { search_start = exclude_start + exclude_nr; goto new_group; } - if (!data) { + + if (!(data & BLOCK_GROUP_DATA)) { block_group = btrfs_lookup_block_group(info, ins->objectid); if (block_group) trans->block_group = block_group; } ins->offset = num_bytes; - btrfs_free_path(path); return 0; new_group: @@ -1646,7 +1492,6 @@ enospc: if (!full_scan) total_needed -= empty_size; full_scan = 1; - data = BTRFS_BLOCK_GROUP_MIXED; } else wrapped = 1; } @@ -1657,8 +1502,6 @@ enospc: goto check_failed; error: - btrfs_release_path(root, path); - btrfs_free_path(path); return ret; } /* @@ -1689,6 +1532,13 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_key keys[2]; + if (data) + data = BLOCK_GROUP_DATA; + else if (root == root->fs_info->chunk_root) + data = BLOCK_GROUP_SYSTEM; + else + data = BLOCK_GROUP_METADATA; + new_hint = max(hint_byte, root->fs_info->alloc_start); if (new_hint < btrfs_super_total_bytes(&info->super_copy)) hint_byte = new_hint; @@ -1718,7 +1568,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, set_extent_bits(&root->fs_info->extent_ins, ins->objectid, ins->objectid + ins->offset - 1, EXTENT_LOCKED, GFP_NOFS); - WARN_ON(data == 1); goto update_block; } @@ -1768,8 +1617,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, } update_block: - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0, - data); + ret = update_block_group(trans, root, 
ins->objectid, ins->offset, 1, 0); if (ret) { printk("update block group failed for %Lu %Lu\n", ins->objectid, ins->offset); @@ -2457,7 +2305,7 @@ again: if (ret < 0) goto out; - ret = find_previous_extent(root, path); + ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY); if (ret < 0) goto out; if (ret == 0) { @@ -2604,95 +2452,48 @@ out: int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 new_size) { - struct btrfs_path *path; - u64 nr = 0; - u64 cur_byte; - u64 old_size; - unsigned long rem; - struct btrfs_block_group_cache *cache; - struct btrfs_block_group_item *item; - struct btrfs_fs_info *info = root->fs_info; - struct extent_io_tree *block_group_cache; - struct btrfs_key key; - struct extent_buffer *leaf; - int ret; - int bit; - - old_size = btrfs_super_total_bytes(&info->super_copy); - block_group_cache = &info->block_group_cache; - - root = info->extent_root; - - cache = btrfs_lookup_block_group(root->fs_info, old_size - 1); - - cur_byte = cache->key.objectid + cache->key.offset; - if (cur_byte >= new_size) - goto set_size; - - key.offset = BTRFS_BLOCK_GROUP_SIZE; - btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + btrfs_set_super_total_bytes(&root->fs_info->super_copy, new_size); + return 0; +} - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; +int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key) +{ + int ret; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int slot; - while(cur_byte < new_size) { - key.objectid = cur_byte; - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(struct btrfs_block_group_item)); - BUG_ON(ret); + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + while(1) { + slot = path->slots[0]; leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_block_group_item); - - btrfs_set_disk_block_group_used(leaf, item, 0); - 
div_long_long_rem(nr, 3, &rem); - if (rem) { - btrfs_set_disk_block_group_flags(leaf, item, - BTRFS_BLOCK_GROUP_DATA); - } else { - btrfs_set_disk_block_group_flags(leaf, item, 0); - } - nr++; - - cache = kmalloc(sizeof(*cache), GFP_NOFS); - BUG_ON(!cache); - - read_extent_buffer(leaf, &cache->item, (unsigned long)item, - sizeof(cache->item)); - - memcpy(&cache->key, &key, sizeof(key)); - cache->cached = 0; - cache->pinned = 0; - cur_byte = key.objectid + key.offset; - btrfs_release_path(root, path); - - if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) { - bit = BLOCK_GROUP_DATA; - cache->data = BTRFS_BLOCK_GROUP_DATA; - } else { - bit = BLOCK_GROUP_METADATA; - cache->data = 0; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + break; } + btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* use EXTENT_LOCKED to prevent merging */ - set_extent_bits(block_group_cache, key.objectid, - key.objectid + key.offset - 1, - bit | EXTENT_LOCKED, GFP_NOFS); - set_state_private(block_group_cache, key.objectid, - (unsigned long)cache); + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) + return 0; + path->slots[0]++; } - btrfs_free_path(path); -set_size: - btrfs_set_super_total_bytes(&info->super_copy, new_size); - return 0; + ret = -ENOENT; +error: + return ret; } int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; int ret; - int err = 0; int bit; struct btrfs_block_group_cache *cache; struct btrfs_fs_info *info = root->fs_info; @@ -2702,28 +2503,28 @@ int btrfs_read_block_groups(struct btrfs_root *root) struct extent_buffer *leaf; block_group_cache = &info->block_group_cache; - root = info->extent_root; key.objectid = 0; - key.offset = BTRFS_BLOCK_GROUP_SIZE; + key.offset = 0; btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); - path = btrfs_alloc_path(); if (!path) return -ENOMEM; while(1) { - ret = 
btrfs_search_slot(NULL, info->extent_root, - &key, path, 0, 0); - if (ret != 0) { - err = ret; - break; + ret = find_first_block_group(root, path, &key); + if (ret > 0) { + ret = 0; + goto error; } + if (ret != 0) + goto error; + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); cache = kmalloc(sizeof(*cache), GFP_NOFS); if (!cache) { - err = -1; + ret = -ENOMEM; break; } @@ -2733,18 +2534,17 @@ int btrfs_read_block_groups(struct btrfs_root *root) memcpy(&cache->key, &found_key, sizeof(found_key)); cache->cached = 0; cache->pinned = 0; + key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(root, path); - - if (cache->item.flags & BTRFS_BLOCK_GROUP_MIXED) { - bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA; - cache->data = BTRFS_BLOCK_GROUP_MIXED; - } else if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) { + cache->flags = btrfs_block_group_flags(&cache->item); + bit = 0; + if (cache->flags & BTRFS_BLOCK_GROUP_DATA) { bit = BLOCK_GROUP_DATA; - cache->data = BTRFS_BLOCK_GROUP_DATA; - } else { + } else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + bit = BLOCK_GROUP_SYSTEM; + } else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) { bit = BLOCK_GROUP_METADATA; - cache->data = 0; } /* use EXTENT_LOCKED to prevent merging */ @@ -2758,7 +2558,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) btrfs_super_total_bytes(&info->super_copy)) break; } - + ret = 0; +error: btrfs_free_path(path); - return 0; + return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e8130c876330..7e3a1ebde9fc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1706,9 +1706,9 @@ static int submit_one_bio(int rw, struct bio *bio) WARN_ON(1); } if (tree->ops && tree->ops->submit_bio_hook) - tree->ops->submit_bio_hook(rw, bio); - - submit_bio(rw, bio); + tree->ops->submit_bio_hook(page->mapping->host, rw, bio); + else + submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; bio_put(bio); diff 
--git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9d6654667089..0dca89328f98 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -28,7 +28,7 @@ struct extent_state; struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); int (*writepage_io_hook)(struct page *page, u64 start, u64 end); - int (*submit_bio_hook)(int rw, struct bio *bio); + int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 315dd550d560..17063cd2cb73 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -40,6 +40,7 @@ #include "btrfs_inode.h" #include "ioctl.h" #include "print-tree.h" +#include "volumes.h" struct btrfs_iget_args { u64 ino; @@ -295,20 +296,20 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, return 0; } -int btrfs_submit_bio_hook(int rw, struct bio *bio) +int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; - struct inode *inode = bvec->bv_page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; int ret = 0; - if (rw != WRITE) - return 0; + if (rw != WRITE) { + goto mapit; + } if (btrfs_test_opt(root, NODATASUM) || - btrfs_test_flag(inode, NODATASUM)) - return 0; + btrfs_test_flag(inode, NODATASUM)) { + goto mapit; + } mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); @@ -317,7 +318,8 @@ int btrfs_submit_bio_hook(int rw, struct bio *bio) ret = btrfs_end_transaction(trans, root); BUG_ON(ret); mutex_unlock(&root->fs_info->fs_mutex); - return ret; +mapit: + return btrfs_map_bio(root, rw, bio); } int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end) @@ -406,7 +408,7 @@ void btrfs_read_locked_inode(struct inode *inode) struct btrfs_path *path; 
struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_inode_timespec *tspec; + struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; u64 alloc_group_block; @@ -455,7 +457,8 @@ void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); if (!BTRFS_I(inode)->block_group) { BTRFS_I(inode)->block_group = btrfs_find_block_group(root, - NULL, 0, 0, 0); + NULL, 0, + BTRFS_BLOCK_GROUP_METADATA, 0); } btrfs_free_path(path); inode_item = NULL; @@ -1550,7 +1553,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, owner = 0; else owner = 1; - group = btrfs_find_block_group(root, group, 0, 0, owner); + group = btrfs_find_block_group(root, group, 0, + BTRFS_BLOCK_GROUP_METADATA, owner); BTRFS_I(inode)->block_group = group; BTRFS_I(inode)->flags = 0; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index da0b4dcf3617..9c1335dad40c 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -20,6 +20,40 @@ #include "disk-io.h" #include "print-tree.h" +static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +{ + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int i; + printk("\t\tchunk owner %llu type %llu num_stripes %d\n", + (unsigned long long)btrfs_chunk_owner(eb, chunk), + (unsigned long long)btrfs_chunk_type(eb, chunk), + num_stripes); + for (i = 0 ; i < num_stripes ; i++) { + printk("\t\t\tstripe %d devid %llu offset %llu\n", i, + (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), + (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + } +} +static void print_dev_item(struct extent_buffer *eb, + struct btrfs_dev_item *dev_item) +{ + char *name; + int name_len; + + name_len = btrfs_device_name_len(eb, dev_item); + name = kmalloc(name_len, GFP_NOFS); + if (name) { + read_extent_buffer(eb, name, + (unsigned long)btrfs_device_name(dev_item), + name_len); + } + 
printk("\t\tdev item name %.*s devid %llu " + "total_bytes %llu bytes used %Lu\n", name_len, name, + (unsigned long long)btrfs_device_id(eb, dev_item), + (unsigned long long)btrfs_device_total_bytes(eb, dev_item), + (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); + kfree(name); +} void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; @@ -34,6 +68,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) struct btrfs_key key; struct btrfs_key found_key; struct btrfs_extent_ref *ref; + struct btrfs_dev_extent *dev_extent; u32 type; printk("leaf %llu total ptrs %d free space %d\n", @@ -106,6 +141,19 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) printk("\t\tblock group used %llu\n", (unsigned long long)btrfs_disk_block_group_used(l, bi)); break; + case BTRFS_CHUNK_ITEM_KEY: + print_chunk(l, btrfs_item_ptr(l, i, struct btrfs_chunk)); + break; + case BTRFS_DEV_ITEM_KEY: + print_dev_item(l, btrfs_item_ptr(l, i, + struct btrfs_dev_item)); + break; + case BTRFS_DEV_EXTENT_KEY: + dev_extent = btrfs_item_ptr(l, i, + struct btrfs_dev_extent); + printk("\t\tdev extent owner %llu length %llu\n", + (unsigned long long)btrfs_dev_extent_owner(l, dev_extent), + (unsigned long long)btrfs_dev_extent_length(l, dev_extent)); }; } } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e9a0983897f3..5e9f69244f9f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -198,29 +198,42 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, return werr; } -int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int update_cowonly_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { int ret; - u64 old_extent_block; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *extent_root = fs_info->extent_root; + u64 old_root_bytenr; + struct 
btrfs_root *tree_root = root->fs_info->tree_root; - btrfs_write_dirty_block_groups(trans, extent_root); + btrfs_write_dirty_block_groups(trans, root); while(1) { - old_extent_block = btrfs_root_bytenr(&extent_root->root_item); - if (old_extent_block == extent_root->node->start) + old_root_bytenr = btrfs_root_bytenr(&root->root_item); + if (old_root_bytenr == root->node->start) break; - btrfs_set_root_bytenr(&extent_root->root_item, - extent_root->node->start); - btrfs_set_root_level(&extent_root->root_item, - btrfs_header_level(extent_root->node)); + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); ret = btrfs_update_root(trans, tree_root, - &extent_root->root_key, - &extent_root->root_item); + &root->root_key, + &root->root_item); BUG_ON(ret); - btrfs_write_dirty_block_groups(trans, extent_root); + btrfs_write_dirty_block_groups(trans, root); + } + return 0; +} + +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *next; + + while(!list_empty(&fs_info->dirty_cowonly_roots)) { + next = fs_info->dirty_cowonly_roots.next; + list_del_init(next); + root = list_entry(next, struct btrfs_root, dirty_list); + update_cowonly_root(trans, root); } return 0; } @@ -616,6 +629,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, unsigned long timeout = 1; struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; + struct btrfs_root *chunk_root = root->fs_info->chunk_root; struct list_head dirty_fs_roots; struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); @@ -714,6 +728,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_super_root_level(&root->fs_info->super_copy, btrfs_header_level(root->fs_info->tree_root->node)); + btrfs_set_super_chunk_root(&root->fs_info->super_copy, + chunk_root->node->start); + 
btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, + btrfs_header_level(chunk_root->node)); write_extent_buffer(root->fs_info->sb_buffer, &root->fs_info->super_copy, 0, sizeof(root->fs_info->super_copy)); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c new file mode 100644 index 000000000000..90a8d45dc6d7 --- /dev/null +++ b/fs/btrfs/volumes.c @@ -0,0 +1,852 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA.
+ */ +#include +#include +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" + +struct map_lookup { + struct btrfs_device *dev; + u64 physical; +}; + +/* + * this uses a pretty simple search, the expectation is that it is + * called very infrequently and that a given device has a small number + * of extents + */ +static int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + struct btrfs_path *path, + u64 num_bytes, u64 *start) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + u64 hole_size = 0; + u64 last_byte = 0; + u64 search_start = 0; + u64 search_end = device->total_bytes; + int ret; + int slot = 0; + int start_found; + struct extent_buffer *l; + + start_found = 0; + path->reada = 2; + + /* FIXME use last free of some kind */ + + key.objectid = device->devid; + key.offset = search_start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto error; + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto error; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; +no_more_items: + if (!start_found) { + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + *start = search_start; + start_found = 1; + goto check_pending; + } + *start = last_byte > search_start ? 
+ last_byte : search_start; + if (search_end <= *start) { + ret = -ENOSPC; + goto error; + } + goto check_pending; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + goto no_more_items; + + if (key.offset >= search_start && key.offset > last_byte && + start_found) { + if (last_byte < search_start) + last_byte = search_start; + hole_size = key.offset - last_byte; + if (key.offset > last_byte && + hole_size >= num_bytes) { + *start = last_byte; + goto check_pending; + } + } + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) { + goto next; + } + + start_found = 1; + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); +next: + path->slots[0]++; + cond_resched(); + } +check_pending: + /* we have to make sure we didn't find an extent that has already + * been allocated by the map tree or the original allocation + */ + btrfs_release_path(root, path); + BUG_ON(*start < search_start); + + if (*start + num_bytes >= search_end) { + ret = -ENOSPC; + goto error; + } + /* check for pending inserts here */ + return 0; + +error: + btrfs_release_path(root, path); + return ret; +} + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 owner, u64 num_bytes, u64 *start) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = find_free_dev_extent(trans, device, path, num_bytes, start); + if (ret) + goto err; + + key.objectid = device->devid; + key.offset = *start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + 
btrfs_set_dev_extent_owner(leaf, extent, owner); + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); +err: + btrfs_free_path(path); + return ret; +} + +static int find_next_chunk(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = (u64)-1; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); + if (ret) { + *objectid = 0; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *objectid = found_key.objectid + found_key.offset; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +static struct btrfs_device *next_device(struct list_head *head, + struct list_head *last) +{ + struct list_head *next = last->next; + struct btrfs_device *dev; + + if (list_empty(head)) + return NULL; + + if (next == head) + next = next->next; + + dev = list_entry(next, struct btrfs_device, dev_list); + return dev; +} + +static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path, + u64 *objectid) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + BTRFS_DEV_ITEM_KEY); + if (ret) { + *objectid = 1; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *objectid = found_key.offset + 1; + } + ret = 0; +error: + btrfs_release_path(root, path); + return ret; +} + +/* + * the device information is stored in the chunk root + * the btrfs_device struct 
should be fully filled in + */ +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + u64 free_devid; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = find_next_devid(root, path, &free_devid); + if (ret) + goto out; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = free_devid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*dev_item) + device->name_len); + if (ret) + goto out; + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_rdev(leaf, dev_item, device->rdev); + btrfs_set_device_partition(leaf, dev_item, device->partition); + btrfs_set_device_name_len(leaf, dev_item, device->name_len); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + + ptr = (unsigned long)btrfs_device_name(dev_item); + write_extent_buffer(leaf, device->name, ptr, device->name_len); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + ret = 0; + +out: + btrfs_free_path(path); + return ret; +} +int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_dev_item *dev_item; + struct extent_buffer 
*leaf; + struct btrfs_key key; + + root = device->dev_root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_rdev(leaf, dev_item, device->rdev); + btrfs_set_device_partition(leaf, dev_item, device->partition); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_mark_buffer_dirty(leaf); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_chunk *chunk, int item_size) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key disk_key; + u32 array_size; + u8 *ptr; + + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); + memcpy(ptr, &disk_key, sizeof(disk_key)); + ptr += sizeof(disk_key); + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + return 0; +} + +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 *start, + u64 
*num_bytes, u32 type) +{ + u64 dev_offset; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + struct btrfs_stripe *stripes; + struct btrfs_device *device = NULL; + struct btrfs_chunk *chunk; + struct list_head *dev_list = &extent_root->fs_info->devices; + struct list_head *last_dev = extent_root->fs_info->last_device; + struct extent_map_tree *em_tree; + struct map_lookup *map; + struct extent_map *em; + u64 physical; + u64 calc_size = 1024 * 1024 * 1024; + int num_stripes; + int ret; + int index = 0; + struct btrfs_key key; + + + ret = find_next_chunk(chunk_root, &key.objectid); + if (ret) + return ret; + + num_stripes = 1; + chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS); + if (!chunk) + return -ENOMEM; + + stripes = &chunk->stripe; + + *num_bytes = calc_size; + while(index < num_stripes) { + device = next_device(dev_list, last_dev); + BUG_ON(!device); + last_dev = &device->dev_list; + extent_root->fs_info->last_device = last_dev; + + ret = btrfs_alloc_dev_extent(trans, device, + key.objectid, + calc_size, &dev_offset); + BUG_ON(ret); + + device->bytes_used += calc_size; + ret = btrfs_update_device(trans, device); + BUG_ON(ret); + + btrfs_set_stack_stripe_devid(stripes + index, device->devid); + btrfs_set_stack_stripe_offset(stripes + index, dev_offset); + physical = dev_offset; + index++; + } + + /* key.objectid was set above */ + key.offset = *num_bytes; + key.type = BTRFS_CHUNK_ITEM_KEY; + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, 64 * 1024); + btrfs_set_stack_chunk_type(chunk, type); + btrfs_set_stack_chunk_num_stripes(chunk, num_stripes); + btrfs_set_stack_chunk_io_align(chunk, extent_root->sectorsize); + btrfs_set_stack_chunk_io_width(chunk, extent_root->sectorsize); + btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, + btrfs_chunk_item_size(num_stripes)); + BUG_ON(ret); + *start = 
key.objectid; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + map = kmalloc(sizeof(*map), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + em->bdev = (struct block_device *)map; + em->start = key.objectid; + em->len = key.offset; + em->block_start = 0; + + map->physical = physical; + map->dev = device; + + if (!map->dev) { + kfree(map); + free_extent_map(em); + return -EIO; + } + kfree(chunk); + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + BUG_ON(ret); + spin_unlock(&em_tree->lock); + free_extent_map(em); + return ret; +} + +void btrfs_mapping_init(struct btrfs_mapping_tree *tree) +{ + extent_map_tree_init(&tree->map_tree, GFP_NOFS); +} + +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +{ + struct extent_map *em; + + while(1) { + spin_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); + spin_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } +} + +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 *phys, u64 *length, + struct btrfs_device **dev) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + u64 offset; + + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + offset = logical - em->start; + *phys = map->physical + offset; + *length = em->len - offset; + *dev = map->dev; + free_extent_map(em); + spin_unlock(&em_tree->lock); + return 0; +} + +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio) +{ + struct btrfs_mapping_tree *map_tree; + struct btrfs_device *dev; + 
u64 logical = bio->bi_sector << 9; + u64 physical; + u64 length = 0; + u64 map_length; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + length += bvec->bv_len; + } + map_tree = &root->fs_info->mapping_tree; + map_length = length; + ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev); + BUG_ON(map_length < length); + bio->bi_sector = physical >> 9; + bio->bi_bdev = dev->bdev; + submit_bio(rw, bio); + return 0; +} + +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid) +{ + struct btrfs_device *dev; + struct list_head *cur = root->fs_info->devices.next; + struct list_head *head = &root->fs_info->devices; + + while(cur != head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (dev->devid == devid) + return dev; + cur = cur->next; + } + return NULL; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 devid; + int ret; + + logical = key->objectid; + length = key->offset; + spin_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); + + /* already mapped? 
*/ + if (em && em->start <= logical && em->start + em->len > logical) { + free_extent_map(em); + spin_unlock(&map_tree->map_tree.lock); + return 0; + } else if (em) { + free_extent_map(em); + } + spin_unlock(&map_tree->map_tree.lock); + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return -ENOMEM; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + map = kmalloc(sizeof(*map), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + em->bdev = (struct block_device *)map; + em->start = logical; + em->len = length; + em->block_start = 0; + + map->physical = btrfs_stripe_offset_nr(leaf, chunk, 0); + devid = btrfs_stripe_devid_nr(leaf, chunk, 0); + map->dev = btrfs_find_device(root, devid); + if (!map->dev) { + kfree(map); + free_extent_map(em); + return -EIO; + } + + spin_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); + BUG_ON(ret); + spin_unlock(&map_tree->map_tree.lock); + free_extent_map(em); + + return 0; +} + +static int fill_device_from_item(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item, + struct btrfs_device *device) +{ + unsigned long ptr; + char *name; + + device->devid = btrfs_device_id(leaf, dev_item); + device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->type = btrfs_device_type(leaf, dev_item); + device->io_align = btrfs_device_io_align(leaf, dev_item); + device->io_width = btrfs_device_io_width(leaf, dev_item); + device->sector_size = btrfs_device_sector_size(leaf, dev_item); + device->rdev = btrfs_device_rdev(leaf, dev_item); + device->partition = btrfs_device_partition(leaf, dev_item); + device->name_len = btrfs_device_name_len(leaf, dev_item); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE); + + name = kmalloc(device->name_len + 1, GFP_NOFS); + if (!name) + return -ENOMEM; + device->name = name; + ptr 
= (unsigned long)btrfs_device_name(dev_item); + read_extent_buffer(leaf, name, ptr, device->name_len); + name[device->name_len] = '\0'; + return 0; +} + +static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item) +{ + struct btrfs_device *device; + u64 devid; + int ret; + + devid = btrfs_device_id(leaf, dev_item); + if (btrfs_find_device(root, devid)) + return 0; + + device = kmalloc(sizeof(*device), GFP_NOFS); + if (!device) + return -ENOMEM; + + fill_device_from_item(leaf, dev_item, device); + device->dev_root = root->fs_info->dev_root; + device->bdev = root->fs_info->sb->s_bdev; + list_add(&device->dev_list, &root->fs_info->devices); + memcpy(&device->dev_key, key, sizeof(*key)); + ret = 0; +#if 0 + ret = btrfs_open_device(device); + if (ret) { + kfree(device); + } +#endif + return ret; +} + +int btrfs_read_sys_array(struct btrfs_root *root) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct extent_buffer *sb = root->fs_info->sb_buffer; + struct btrfs_disk_key *disk_key; + struct btrfs_dev_item *dev_item; + struct btrfs_chunk *chunk; + struct btrfs_key key; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u8 *ptr; + unsigned long sb_ptr; + u32 cur; + int ret; + int dev_only = 1; + + array_size = btrfs_super_sys_array_size(super_copy); + + /* + * we do this loop twice, once for the device items and + * once for all of the chunks. 
This way there are device + * structs filled in for every chunk + */ +again: + ptr = super_copy->sys_chunk_array; + sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + ptr += len; + sb_ptr += len; + cur += len; + + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID && + key.type == BTRFS_DEV_ITEM_KEY) { + dev_item = (struct btrfs_dev_item *)sb_ptr; + if (dev_only) { + ret = read_one_dev(root, &key, sb, dev_item); + BUG_ON(ret); + } + len = sizeof(*dev_item); + len += btrfs_device_name_len(sb, dev_item); + } else if (key.type == BTRFS_CHUNK_ITEM_KEY) { + + chunk = (struct btrfs_chunk *)sb_ptr; + if (!dev_only) { + ret = read_one_chunk(root, &key, sb, chunk); + BUG_ON(ret); + } + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + } else { + BUG(); + } + ptr += len; + sb_ptr += len; + cur += len; + } + if (dev_only == 1) { + dev_only = 0; + goto again; + } + return 0; +} + +int btrfs_read_chunk_tree(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + int slot; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first we search for all of the device items, and then we + * read in all of the chunk items. 
This way we can create chunk + * mappings that reference all of the devices that are afound + */ + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = 0; +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + while(1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) + break; + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, + struct btrfs_dev_item); + ret = read_one_dev(root, &found_key, leaf, + dev_item); + BUG_ON(ret); + } + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = read_one_chunk(root, &found_key, leaf, chunk); + } + path->slots[0]++; + } + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + key.objectid = 0; + btrfs_release_path(root, path); + goto again; + } + + btrfs_free_path(path); + ret = 0; +error: + return ret; +} + diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 000000000000..4a47dcb00a73 --- /dev/null +++ b/fs/btrfs/volumes.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_VOLUMES_ +#define __BTRFS_VOLUMES_ +struct btrfs_device { + struct list_head dev_list; + struct btrfs_root *dev_root; + struct btrfs_key dev_key; + + struct block_device *bdev; + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device */ + u64 total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + + /* minimal io size for this device */ + u32 sector_size; + + /* the kernel device number */ + u64 rdev; + + /* type and info about this device */ + u64 type; + + /* partition number, 0 for whole dev */ + int partition; + + /* length of the name data at the end of the item */ + int name_len; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_DEV_UUID_SIZE]; + + char *name; +}; + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 owner, u64 num_bytes, u64 *start); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 *phys, u64 *length, + struct btrfs_device **dev); +int btrfs_read_sys_array(struct btrfs_root *root); +int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 *start, + u64 *num_bytes, u32 type); +void btrfs_mapping_init(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio); +#endif -- cgit v1.2.3 From 95c9eb178b96474e9d896f20f982818c5f2ceaa0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Jun 2008 10:20:57 -0400 Subject: btrfs: tiny makefile cleanup use normal kbuild syntax to build 
acl.o conditionally and remove commented-out lines. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 5ac6f2946c21..e5fc3cfea0ab 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,13 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o -ifeq ($(CONFIG_FS_POSIX_ACL),y) -btrfs-y += acl.o -endif -#btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ -# root-tree.o dir-item.o hash.o file-item.o inode-item.o \ -# inode-map.o \ - +btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else # Normal Makefile -- cgit v1.2.3 From 8b7128429235d9bd72cfd5ed20c77c4f3118f744 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 11 Jun 2008 16:50:36 -0400 Subject: Btrfs: Add async worker threads for pre and post IO checksumming Btrfs has been using workqueues to spread the checksumming load across other CPUs in the system. But, workqueues only schedule work on the same CPU that queued the work, giving them a limited benefit for systems with higher CPU counts. This code adds a generic facility to schedule work with pools of kthreads, and changes the bio submission code to queue bios up. The queueing is important to make sure large numbers of procs on the system don't turn streaming workloads into random workloads by sending IO down concurrently. The end result of all of this is much higher performance (and CPU usage) when doing checksumming on large machines. Two worker pools are created, one for writes and one for endio processing. The two could deadlock if we tried to service both from a single pool.
Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/async-thread.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/async-thread.h | 78 +++++++++++++ fs/btrfs/ctree.h | 14 ++- fs/btrfs/disk-io.c | 200 ++++++++++++++------------------- fs/btrfs/inode.c | 4 +- fs/btrfs/volumes.c | 162 ++++++++++++++++++++++++++- fs/btrfs/volumes.h | 10 +- 8 files changed, 626 insertions(+), 132 deletions(-) create mode 100644 fs/btrfs/async-thread.c create mode 100644 fs/btrfs/async-thread.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index e5fc3cfea0ab..9dcfc2fe3332 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ - extent_io.o volumes.o + extent_io.o volumes.o async-thread.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 000000000000..2911b67bd6f7 --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,288 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include "async-thread.h" + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. + */ +struct btrfs_worker_thread { + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + /* protects the pending list. */ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; +}; + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head *cur; + struct btrfs_work *work; + do { + spin_lock_irq(&worker->lock); + while(!list_empty(&worker->pending)) { + cur = worker->pending.next; + work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(0, &work->flags); + + work->worker = worker; + spin_unlock_irq(&worker->lock); + + work->func(work); + + atomic_dec(&worker->num_pending); + spin_lock_irq(&worker->lock); + } + worker->working = 0; + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&worker->lock); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + + while(!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); + kthread_stop(worker->task); + list_del(&worker->worker_list); + kfree(worker); + } + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void 
btrfs_init_workers(struct btrfs_workers *workers, int max) +{ + workers->num_workers = 0; + INIT_LIST_HEAD(&workers->worker_list); + workers->last = NULL; + spin_lock_init(&workers->lock); + workers->max_workers = max; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + atomic_set(&worker->num_pending, 0); + worker->task = kthread_run(worker_loop, worker, "btrfs"); + if (IS_ERR(worker->task)) { + ret = PTR_ERR(worker->task); + goto fail; + } + + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->worker_list); + workers->last = worker; + workers->num_workers++; + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. + */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + struct list_head *start; + int enforce_min = workers->num_workers < workers->max_workers; + + /* start with the last thread if it isn't busy */ + worker = workers->last; + if (atomic_read(&worker->num_pending) < 64) + goto done; + + next = worker->worker_list.next; + start = &worker->worker_list; + + /* + * check all the workers for someone that is bored. 
FIXME, do + * something smart here + */ + while(next != start) { + if (next == &workers->worker_list) { + next = workers->worker_list.next; + continue; + } + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + if (atomic_read(&worker->num_pending) < 64 || !enforce_min) + goto done; + next = next->next; + } + /* + * nobody was bored, if we're already at the max thread count, + * use the last thread + */ + if (!enforce_min || atomic_read(&workers->last->num_pending) < 64) { + return workers->last; + } + return NULL; +done: + workers->last = worker; + return worker; +} + +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { + spin_lock_irqsave(&workers->lock, flags); + if (workers->num_workers >= workers->max_workers) { + /* + * we have failed to find any workers, just + * return the force one + */ + worker = list_entry(workers->worker_list.next, + struct btrfs_worker_thread, worker_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + btrfs_start_workers(workers, 1); + goto again; + } + } + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others. 
+ */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + + if (test_and_set_bit(0, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + spin_unlock_irqrestore(&worker->lock, flags); +out: + return 0; +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(0, &work->flags)) + goto out; + + worker = find_worker(workers); + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + spin_unlock_irqrestore(&worker->lock, flags); + + if (wake) + wake_up_process(worker->task); +out: + return 0; +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h new file mode 100644 index 000000000000..52fc9da0f9e7 --- /dev/null +++ b/fs/btrfs/async-thread.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ + +struct btrfs_worker_thread; + +/* + * This is similar to a workqueue, but it is meant to spread the operations + * across all available cpus instead of just the CPU that was used to + * queue the work. There is also some batching introduced to try and + * cut down on context switches. + * + * By default threads are added on demand up to 2 * the number of cpus. + * Changing struct btrfs_workers->max_workers is one way to prevent + * demand creation of kthreads. + * + * the basic model of these worker threads is to embed a btrfs_work + * structure in your own data struct, and use container_of in a + * work function to get back to your data struct. + */ +struct btrfs_work { + /* + * only func should be set to the function you want called + * your work struct is passed as the only arg + */ + void (*func)(struct btrfs_work *work); + + /* + * flags should be set to zero. It is used to make sure the + * struct is only inserted once into the list. + */ + unsigned long flags; + + /* don't touch these */ + struct btrfs_worker_thread *worker; + struct list_head list; +}; + +struct btrfs_workers { + /* current number of running workers */ + int num_workers; + + /* max number of workers allowed. 
changed by btrfs_start_workers */ + int max_workers; + + /* list with all the work threads */ + struct list_head worker_list; + + /* the last worker thread to have something queued */ + struct btrfs_worker_thread *last; + + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; +}; + +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_init_workers(struct btrfs_workers *workers, int max); +int btrfs_requeue_work(struct btrfs_work *work); +#endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 49cbc62b42f9..6c91a510c965 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -30,6 +30,7 @@ #include "bit-radix.h" #include "extent_io.h" #include "extent_map.h" +#include "async-thread.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -518,13 +519,20 @@ struct btrfs_fs_info { struct list_head hashers; struct list_head dead_roots; struct list_head end_io_work_list; - struct list_head async_submit_work_list; struct work_struct end_io_work; - struct work_struct async_submit_work; spinlock_t end_io_work_lock; - spinlock_t async_submit_work_lock; atomic_t nr_async_submits; + /* + * there is a pool of worker threads for checksumming during writes + * and a pool for checksumming after reads. This is because readers + * can run with FS locks held, and the writers may be waiting for + * those locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. 
+ */ + struct btrfs_workers workers; + struct btrfs_workers endio_workers; + #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) struct work_struct trans_work; #else diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b9a53646ceb2..98ff4fbcb386 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -31,6 +31,7 @@ #include "btrfs_inode.h" #include "volumes.h" #include "print-tree.h" +#include "async-thread.h" #if 0 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) @@ -46,8 +47,7 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) #endif static struct extent_io_ops btree_extent_io_ops; -static struct workqueue_struct *end_io_workqueue; -static struct workqueue_struct *async_submit_workqueue; +static void end_workqueue_fn(struct btrfs_work *work); struct end_io_wq { struct bio *bio; @@ -57,6 +57,7 @@ struct end_io_wq { int error; int metadata; struct list_head list; + struct btrfs_work work; }; struct async_submit_bio { @@ -66,6 +67,7 @@ struct async_submit_bio { extent_submit_bio_hook_t *submit_bio_hook; int rw; int mirror_num; + struct btrfs_work work; }; struct extent_map *btree_get_extent(struct inode *inode, struct page *page, @@ -389,7 +391,6 @@ static int end_workqueue_bio(struct bio *bio, { struct end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; - unsigned long flags; #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) if (bio->bi_size) @@ -397,11 +398,10 @@ static int end_workqueue_bio(struct bio *bio, #endif fs_info = end_io_wq->info; - spin_lock_irqsave(&fs_info->end_io_work_lock, flags); end_io_wq->error = err; - list_add_tail(&end_io_wq->list, &fs_info->end_io_work_list); - spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags); - queue_work(end_io_workqueue, &fs_info->end_io_work); + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work); #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 
return 0; @@ -428,6 +428,19 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, return 0; } +static void run_one_async_submit(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + atomic_dec(&fs_info->nr_async_submits); + async->submit_bio_hook(async->inode, async->rw, async->bio, + async->mirror_num); + kfree(async); +} + int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, extent_submit_bio_hook_t *submit_bio_hook) @@ -443,13 +456,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_hook = submit_bio_hook; - - spin_lock(&fs_info->async_submit_work_lock); - list_add_tail(&async->list, &fs_info->async_submit_work_list); + async->work.func = run_one_async_submit; + async->work.flags = 0; atomic_inc(&fs_info->nr_async_submits); - spin_unlock(&fs_info->async_submit_work_lock); - - queue_work(async_submit_workqueue, &fs_info->async_submit_work); + btrfs_queue_worker(&fs_info->workers, &async->work); return 0; } @@ -462,19 +472,32 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, offset = bio->bi_sector << 9; + /* + * when we're called for a write, we're already in the async + * submission context. 
Just jump ingo btrfs_map_bio + */ if (rw & (1 << BIO_RW)) { - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num); + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); } + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1); BUG_ON(ret); - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num); + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); } static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int mirror_num) { + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ if (!(rw & (1 << BIO_RW))) { return __btree_submit_bio_hook(inode, rw, bio, mirror_num); } @@ -1036,95 +1059,40 @@ static int bio_ready_for_csum(struct bio *bio) return ret; } -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) -static void btrfs_end_io_csum(void *p) -#else -static void btrfs_end_io_csum(struct work_struct *work) -#endif +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. 
This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) { -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - struct btrfs_fs_info *fs_info = p; -#else - struct btrfs_fs_info *fs_info = container_of(work, - struct btrfs_fs_info, - end_io_work); -#endif - unsigned long flags; - struct end_io_wq *end_io_wq; struct bio *bio; - struct list_head *next; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; int error; - int was_empty; - while(1) { - spin_lock_irqsave(&fs_info->end_io_work_lock, flags); - if (list_empty(&fs_info->end_io_work_list)) { - spin_unlock_irqrestore(&fs_info->end_io_work_lock, - flags); - return; - } - next = fs_info->end_io_work_list.next; - list_del(next); - spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags); - - end_io_wq = list_entry(next, struct end_io_wq, list); - - bio = end_io_wq->bio; - if (end_io_wq->metadata && !bio_ready_for_csum(bio)) { - spin_lock_irqsave(&fs_info->end_io_work_lock, flags); - was_empty = list_empty(&fs_info->end_io_work_list); - list_add_tail(&end_io_wq->list, - &fs_info->end_io_work_list); - spin_unlock_irqrestore(&fs_info->end_io_work_lock, - flags); - if (was_empty) - return; - continue; - } - error = end_io_wq->error; - bio->bi_private = end_io_wq->private; - bio->bi_end_io = end_io_wq->end_io; - kfree(end_io_wq); -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - bio_endio(bio, bio->bi_size, error); -#else - bio_endio(bio, error); -#endif - } -} + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) -static void btrfs_async_submit_work(void *p) -#else -static void btrfs_async_submit_work(struct work_struct *work) -#endif -{ -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - struct btrfs_fs_info *fs_info = p; + /* metadata bios are special because the whole tree block must + * be checksummed at once. 
This makes sure the entire block is in + * ram and up to date before trying to verify things. For + * blocksize <= pagesize, it is basically a noop + */ + if (end_io_wq->metadata && !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + bio_endio(bio, bio->bi_size, error); #else - struct btrfs_fs_info *fs_info = container_of(work, - struct btrfs_fs_info, - async_submit_work); + bio_endio(bio, error); #endif - struct async_submit_bio *async; - struct list_head *next; - - while(1) { - spin_lock(&fs_info->async_submit_work_lock); - if (list_empty(&fs_info->async_submit_work_list)) { - spin_unlock(&fs_info->async_submit_work_lock); - return; - } - next = fs_info->async_submit_work_list.next; - list_del(next); - atomic_dec(&fs_info->nr_async_submits); - spin_unlock(&fs_info->async_submit_work_lock); - - async = list_entry(next, struct async_submit_bio, list); - async->submit_bio_hook(async->inode, async->rw, async->bio, - async->mirror_num); - kfree(async); - } } struct btrfs_root *open_ctree(struct super_block *sb, @@ -1155,19 +1123,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -ENOMEM; goto fail; } - end_io_workqueue = create_workqueue("btrfs-end-io"); - BUG_ON(!end_io_workqueue); - async_submit_workqueue = create_workqueue("btrfs-async-submit"); - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); - INIT_LIST_HEAD(&fs_info->end_io_work_list); - INIT_LIST_HEAD(&fs_info->async_submit_work_list); spin_lock_init(&fs_info->hash_lock); - spin_lock_init(&fs_info->end_io_work_lock); - spin_lock_init(&fs_info->async_submit_work_lock); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); 
@@ -1222,13 +1182,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->do_barriers = 1; #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum, fs_info); - INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work, - fs_info); INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info); #else - INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum); - INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work); INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner); #endif BTRFS_I(fs_info->btree_inode)->root = tree_root; @@ -1240,6 +1195,19 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->fs_mutex); + /* we need to start all the end_io workers up front because the + * queue work function gets called at interrupt time. The endio + * workers don't normally start IO, so some number of them <= the + * number of cpus is fine. They handle checksumming after a read. + * + * The other worker threads do start IO, so the max is larger than + * the number of CPUs. 
FIXME, tune this for huge machines + */ + btrfs_init_workers(&fs_info->workers, num_online_cpus() * 2); + btrfs_init_workers(&fs_info->endio_workers, num_online_cpus()); + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->endio_workers, num_online_cpus()); + #if 0 ret = add_hasher(fs_info, "crc32c"); if (ret) { @@ -1375,6 +1343,8 @@ fail_sb_buffer: extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); fail_iput: iput(fs_info->btree_inode); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); fail: btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -1623,16 +1593,10 @@ int close_ctree(struct btrfs_root *root) extent_io_tree_empty_lru(&fs_info->extent_ins); extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); - flush_workqueue(async_submit_workqueue); - flush_workqueue(end_io_workqueue); - truncate_inode_pages(fs_info->btree_inode->i_mapping, 0); - flush_workqueue(async_submit_workqueue); - destroy_workqueue(async_submit_workqueue); - - flush_workqueue(end_io_workqueue); - destroy_workqueue(end_io_workqueue); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); iput(fs_info->btree_inode); #if 0 diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0f14697becef..7daef8d37006 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -359,7 +359,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, kfree(sums); - return btrfs_map_bio(root, rw, bio, mirror_num); + return btrfs_map_bio(root, rw, bio, mirror_num, 1); } int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, @@ -383,7 +383,7 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, inode, rw, bio, mirror_num, __btrfs_submit_bio_hook); mapit: - return btrfs_map_bio(root, rw, bio, mirror_num); + return btrfs_map_bio(root, rw, bio, mirror_num, 0); } int btrfs_readpage_io_hook(struct page 
*page, u64 start, u64 end) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 722eb4550154..c57458ce6339 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -27,6 +27,7 @@ #include "transaction.h" #include "print-tree.h" #include "volumes.h" +#include "async-thread.h" struct map_lookup { u64 type; @@ -110,6 +111,101 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +/* + * we try to collect pending bios for a device so we don't get a large + * number of procs sending bios down to the same device. This greatly + * improves the scheduler's ability to collect and merge the bios. + * + * But, it also turns into a long list of bios to process and that is sure + * to eventually make the worker thread block. The solution here is to + * make some progress and then put this work struct back at the end of + * the list if the block device is congested. This way, multiple devices + * can make progress from a single worker thread. + */ +int run_scheduled_bios(struct btrfs_device *device) +{ + struct bio *pending; + struct backing_dev_info *bdi; + struct bio *tail; + struct bio *cur; + int again = 0; + unsigned long num_run = 0; + + bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; +loop: + spin_lock(&device->io_lock); + + /* take all the bios off the list at once and process them + * later on (without the lock held). But, remember the + * tail and other pointers so the bios can be properly reinserted + * into the list if we hit congestion + */ + pending = device->pending_bios; + tail = device->pending_bio_tail; + WARN_ON(pending && !tail); + device->pending_bios = NULL; + device->pending_bio_tail = NULL; + + /* + * if pending was null this time around, no bios need processing + * at all and we can stop. Otherwise it'll loop back up again + * and do an additional check so no bios are missed. + * + * device->running_pending is used to synchronize with the + * schedule_bio code.
+ */ + if (pending) { + again = 1; + device->running_pending = 1; + } else { + again = 0; + device->running_pending = 0; + } + spin_unlock(&device->io_lock); + + while(pending) { + cur = pending; + pending = pending->bi_next; + cur->bi_next = NULL; + atomic_dec(&device->dev_root->fs_info->nr_async_submits); + submit_bio(cur->bi_rw, cur); + num_run++; + + /* + * we made progress, there is more work to do and the bdi + * is now congested. Back off and let other work structs + * run instead + */ + if (pending && num_run && bdi_write_congested(bdi)) { + struct bio *old_head; + + spin_lock(&device->io_lock); + old_head = device->pending_bios; + device->pending_bios = pending; + if (device->pending_bio_tail) + tail->bi_next = old_head; + else + device->pending_bio_tail = tail; + + spin_unlock(&device->io_lock); + btrfs_requeue_work(&device->work); + goto done; + } + } + if (again) + goto loop; +done: + return 0; +} + +void pending_bios_fn(struct btrfs_work *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, work); + run_scheduled_bios(device); +} + static int device_list_add(const char *path, struct btrfs_super_block *disk_super, u64 devid, struct btrfs_fs_devices **fs_devices_ret) @@ -141,6 +237,7 @@ static int device_list_add(const char *path, return -ENOMEM; } device->devid = devid; + device->work.func = pending_bios_fn; memcpy(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); device->barriers = 1; @@ -925,6 +1022,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } device->barriers = 1; + device->work.func = pending_bios_fn; generate_random_uuid(device->uuid); spin_lock_init(&device->io_lock); device->name = kstrdup(device_path, GFP_NOFS); @@ -1965,8 +2063,61 @@ static int end_bio_multi_stripe(struct bio *bio, #endif } +struct async_sched { + struct bio *bio; + int rw; + struct btrfs_fs_info *info; + struct btrfs_work work; +}; + +/* + * see run_scheduled_bios for a description of why bios are 
collected for + * async submit. + * + * This will add one bio to the pending list for a device and make sure + * the work struct is scheduled. + */ +int schedule_bio(struct btrfs_root *root, struct btrfs_device *device, + int rw, struct bio *bio) +{ + int should_queue = 1; + + /* don't bother with additional async steps for reads, right now */ + if (!(rw & (1 << BIO_RW))) { + submit_bio(rw, bio); + return 0; + } + + /* + * nr_async_submits allows us to reliably return congestion to the + * higher layers. Otherwise, the async bio makes it appear we have + * made progress against dirty pages when we've really just put it + * on a queue for later + */ + atomic_inc(&root->fs_info->nr_async_submits); + bio->bi_next = NULL; + bio->bi_rw |= rw; + + spin_lock(&device->io_lock); + + if (device->pending_bio_tail) + device->pending_bio_tail->bi_next = bio; + + device->pending_bio_tail = bio; + if (!device->pending_bios) + device->pending_bios = bio; + if (device->running_pending) + should_queue = 0; + + spin_unlock(&device->io_lock); + + if (should_queue) + btrfs_queue_worker(&root->fs_info->workers, &device->work); + return 0; +} + int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, - int mirror_num) + int mirror_num, int async_submit) { struct btrfs_mapping_tree *map_tree; struct btrfs_device *dev; @@ -2012,10 +2163,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, dev = multi->stripes[dev_nr].dev; if (dev && dev->bdev) { bio->bi_bdev = dev->bdev; - spin_lock(&dev->io_lock); - dev->total_ios++; - spin_unlock(&dev->io_lock); - submit_bio(rw, bio); + if (async_submit) + schedule_bio(root, dev, rw, bio); + else + submit_bio(rw, bio); } else { bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; bio->bi_sector = logical >> 9; @@ -2054,6 +2205,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, device->barriers = 1; device->dev_root = root->fs_info->dev_root; device->devid = devid; + device->work.func =
pending_bios_fn; fs_devices->num_devices++; spin_lock_init(&device->io_lock); memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4df6b1608f91..48a44f7a9385 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -20,6 +20,7 @@ #define __BTRFS_VOLUMES_ #include +#include "async-thread.h" struct buffer_head; struct btrfs_device { @@ -27,6 +28,9 @@ struct btrfs_device { struct list_head dev_alloc_list; struct btrfs_root *dev_root; struct buffer_head *pending_io; + struct bio *pending_bios; + struct bio *pending_bio_tail; + int running_pending; u64 generation; int barriers; @@ -36,8 +40,6 @@ struct btrfs_device { struct block_device *bdev; - u64 total_ios; - char *name; /* the internal btrfs device id */ @@ -63,6 +65,8 @@ struct btrfs_device { /* physical drive uuid (or lvm uuid) */ u8 uuid[BTRFS_UUID_SIZE]; + + struct btrfs_work work; }; struct btrfs_fs_devices { @@ -117,7 +121,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, - int mirror_num); + int mirror_num, int async_submit); int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int flags, void *holder); -- cgit v1.2.3 From f46b5a66b3316ef2f8febfe4c56e2d555e2c3979 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jun 2008 21:53:53 -0400 Subject: Btrfs: split out ioctl.c Split the ioctl handling out of inode.c into a file of its own. Also fix up checkpatch.pl warnings for the moved code.
Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 9 +- fs/btrfs/inode.c | 733 +------------------------------------------------- fs/btrfs/ioctl.c | 781 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 796 insertions(+), 729 deletions(-) create mode 100644 fs/btrfs/ioctl.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9dcfc2fe3332..7ed6b39e42d2 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ - extent_io.o volumes.o async-thread.o + extent_io.o volumes.o async-thread.o ioctl.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7ae4666103c0..181c81d29897 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1553,6 +1553,10 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 isize); /* inode.c */ +int btrfs_create_subvol_root(struct btrfs_root *new_root, + struct btrfs_trans_handle *trans, u64 new_dirid, + struct btrfs_block_group_cache *block_group); + void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name, int namelen); @@ -1585,7 +1589,6 @@ void btrfs_destroy_inode(struct inode *inode); int btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); -long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, struct btrfs_root *root); struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, @@ -1598,6 +1601,10 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, int 
btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); + +/* ioctl.c */ +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + /* file.c */ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7daef8d37006..0c79346fd2c9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2667,114 +2667,17 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name, } } -static int noinline create_subvol(struct btrfs_root *root, char *name, - int namelen) +int btrfs_create_subvol_root(struct btrfs_root *new_root, + struct btrfs_trans_handle *trans, u64 new_dirid, + struct btrfs_block_group_cache *block_group) { - struct btrfs_trans_handle *trans; - struct btrfs_key key; - struct btrfs_root_item root_item; - struct btrfs_inode_item *inode_item; - struct extent_buffer *leaf; - struct btrfs_root *new_root = root; struct inode *inode; - struct inode *dir; int ret; - int err; - u64 objectid; - u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; - unsigned long nr = 1; - - mutex_lock(&root->fs_info->fs_mutex); - ret = btrfs_check_free_space(root, 1, 0); - if (ret) - goto fail_commit; - - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, - 0, &objectid); - if (ret) - goto fail; - - leaf = __btrfs_alloc_free_block(trans, root, root->leafsize, - objectid, trans->transid, 0, 0, - 0, 0); - if (IS_ERR(leaf)) - return PTR_ERR(leaf); - - btrfs_set_header_nritems(leaf, 0); - btrfs_set_header_level(leaf, 0); - btrfs_set_header_bytenr(leaf, leaf->start); - btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_owner(leaf, objectid); - - write_extent_buffer(leaf, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(leaf), - BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(leaf); - - inode_item = 
&root_item.inode; - memset(inode_item, 0, sizeof(*inode_item)); - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nblocks = cpu_to_le64(1); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); - - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, 0); - - memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); - root_item.drop_level = 0; - - free_extent_buffer(leaf); - leaf = NULL; - - btrfs_set_root_dirid(&root_item, new_dirid); - - key.objectid = objectid; - key.offset = 1; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - &root_item); - if (ret) - goto fail; - - /* - * insert the directory item - */ - key.offset = (u64)-1; - dir = root->fs_info->sb->s_root->d_inode; - ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root, - name, namelen, dir->i_ino, &key, - BTRFS_FT_DIR); - if (ret) - goto fail; - - ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root, - name, namelen, objectid, - root->fs_info->sb->s_root->d_inode->i_ino); - if (ret) - goto fail; - - ret = btrfs_commit_transaction(trans, root); - if (ret) - goto fail_commit; - - new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen); - BUG_ON(!new_root); - - trans = btrfs_start_transaction(new_root, 1); - BUG_ON(!trans); inode = btrfs_new_inode(trans, new_root, "..", 2, new_dirid, - new_dirid, - BTRFS_I(dir)->block_group, S_IFDIR | 0700); + new_dirid, block_group, S_IFDIR | 0700); if (IS_ERR(inode)) - goto fail; + return PTR_ERR(inode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; new_root->inode = inode; @@ -2783,67 +2686,8 @@ static int noinline create_subvol(struct btrfs_root *root, char *name, new_dirid); inode->i_nlink = 1; inode->i_size = 0; - ret = btrfs_update_inode(trans, 
new_root, inode); - if (ret) - goto fail; - - /* Invalidate existing dcache entry for new subvolume. */ - btrfs_invalidate_dcache_root(root, name, namelen); -fail: - nr = trans->blocks_used; - err = btrfs_commit_transaction(trans, new_root); - if (err && !ret) - ret = err; -fail_commit: - mutex_unlock(&root->fs_info->fs_mutex); - btrfs_btree_balance_dirty(root, nr); - btrfs_throttle(root); - return ret; -} - -static int create_snapshot(struct btrfs_root *root, char *name, int namelen) -{ - struct btrfs_pending_snapshot *pending_snapshot; - struct btrfs_trans_handle *trans; - int ret; - int err; - unsigned long nr = 0; - - if (!root->ref_cows) - return -EINVAL; - - mutex_lock(&root->fs_info->fs_mutex); - ret = btrfs_check_free_space(root, 1, 0); - if (ret) - goto fail_unlock; - - pending_snapshot = kmalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) { - ret = -ENOMEM; - goto fail_unlock; - } - pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); - if (!pending_snapshot->name) { - ret = -ENOMEM; - kfree(pending_snapshot); - goto fail_unlock; - } - memcpy(pending_snapshot->name, name, namelen); - pending_snapshot->name[namelen] = '\0'; - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - pending_snapshot->root = root; - list_add(&pending_snapshot->list, - &trans->transaction->pending_snapshots); - ret = btrfs_update_inode(trans, root, root->inode); - err = btrfs_commit_transaction(trans, root); - -fail_unlock: - mutex_unlock(&root->fs_info->fs_mutex); - btrfs_btree_balance_dirty(root, nr); - btrfs_throttle(root); - return ret; + return btrfs_update_inode(trans, new_root, inode); } unsigned long btrfs_force_ra(struct address_space *mapping, @@ -2861,571 +2705,6 @@ unsigned long btrfs_force_ra(struct address_space *mapping, #endif } -int btrfs_defrag_file(struct file *file) { - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - 
struct page *page; - unsigned long last_index; - unsigned long ra_pages = root->fs_info->bdi.ra_pages; - unsigned long total_read = 0; - u64 page_start; - u64 page_end; - unsigned long i; - int ret; - - mutex_lock(&root->fs_info->fs_mutex); - ret = btrfs_check_free_space(root, inode->i_size, 0); - mutex_unlock(&root->fs_info->fs_mutex); - if (ret) - return -ENOSPC; - - mutex_lock(&inode->i_mutex); - last_index = inode->i_size >> PAGE_CACHE_SHIFT; - for (i = 0; i <= last_index; i++) { - if (total_read % ra_pages == 0) { - btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, - min(last_index, i + ra_pages - 1)); - } - total_read++; - page = grab_cache_page(inode->i_mapping, i); - if (!page) - goto out_unlock; - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - goto out_unlock; - } - } - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - ClearPageDirty(page); -#else - cancel_dirty_page(page, PAGE_CACHE_SIZE); -#endif - wait_on_page_writeback(page); - set_page_extent_mapped(page); - - page_start = (u64)page->index << PAGE_CACHE_SHIFT; - page_end = page_start + PAGE_CACHE_SIZE - 1; - - lock_extent(io_tree, page_start, page_end, GFP_NOFS); - set_extent_delalloc(io_tree, page_start, - page_end, GFP_NOFS); - - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); - } - -out_unlock: - mutex_unlock(&inode->i_mutex); - return 0; -} - -static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) -{ - u64 new_size; - u64 old_size; - u64 devid = 1; - struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; - struct btrfs_device *device = NULL; - char *sizestr; - char *devstr = NULL; - int ret = 0; - int namelen; - int mod = 0; - - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return 
-ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } - namelen = strlen(vol_args->name); - if (namelen > BTRFS_VOL_NAME_MAX) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&root->fs_info->fs_mutex); - sizestr = vol_args->name; - devstr = strchr(sizestr, ':'); - if (devstr) { - char *end; - sizestr = devstr + 1; - *devstr = '\0'; - devstr = vol_args->name; - devid = simple_strtoull(devstr, &end, 10); -printk("resizing devid %Lu\n", devid); - } - device = btrfs_find_device(root, devid, NULL); - if (!device) { - printk("resizer unable to find device %Lu\n", devid); - ret = -EINVAL; - goto out_unlock; - } - if (!strcmp(sizestr, "max")) - new_size = device->bdev->bd_inode->i_size; - else { - if (sizestr[0] == '-') { - mod = -1; - sizestr++; - } else if (sizestr[0] == '+') { - mod = 1; - sizestr++; - } - new_size = btrfs_parse_size(sizestr); - if (new_size == 0) { - ret = -EINVAL; - goto out_unlock; - } - } - - old_size = device->total_bytes; - - if (mod < 0) { - if (new_size > old_size) { - ret = -EINVAL; - goto out_unlock; - } - new_size = old_size - new_size; - } else if (mod > 0) { - new_size = old_size + new_size; - } - - if (new_size < 256 * 1024 * 1024) { - ret = -EINVAL; - goto out_unlock; - } - if (new_size > device->bdev->bd_inode->i_size) { - ret = -EFBIG; - goto out_unlock; - } - - do_div(new_size, root->sectorsize); - new_size *= root->sectorsize; - -printk("new size for %s is %llu\n", device->name, (unsigned long long)new_size); - - if (new_size > old_size) { - trans = btrfs_start_transaction(root, 1); - ret = btrfs_grow_device(trans, device, new_size); - btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_shrink_device(device, new_size); - } - -out_unlock: - mutex_unlock(&root->fs_info->fs_mutex); -out: - kfree(vol_args); - return ret; -} - -static int noinline btrfs_ioctl_snap_create(struct btrfs_root *root, - void __user *arg) -{ - struct btrfs_ioctl_vol_args *vol_args; - struct 
btrfs_dir_item *di; - struct btrfs_path *path; - u64 root_dirid; - int namelen; - int ret; - - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } - - namelen = strlen(vol_args->name); - if (namelen > BTRFS_VOL_NAME_MAX) { - ret = -EINVAL; - goto out; - } - if (strchr(vol_args->name, '/')) { - ret = -EINVAL; - goto out; - } - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, - mutex_lock(&root->fs_info->fs_mutex); - di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, - path, root_dirid, - vol_args->name, namelen, 0); - mutex_unlock(&root->fs_info->fs_mutex); - btrfs_free_path(path); - - if (di && !IS_ERR(di)) { - ret = -EEXIST; - goto out; - } - - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } - - if (root == root->fs_info->tree_root) - ret = create_subvol(root, vol_args->name, namelen); - else - ret = create_snapshot(root, vol_args->name, namelen); -out: - kfree(vol_args); - return ret; -} - -static int btrfs_ioctl_defrag(struct file *file) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - - switch (inode->i_mode & S_IFMT) { - case S_IFDIR: - mutex_lock(&root->fs_info->fs_mutex); - btrfs_defrag_root(root, 0); - btrfs_defrag_root(root->fs_info->extent_root, 0); - mutex_unlock(&root->fs_info->fs_mutex); - break; - case S_IFREG: - btrfs_defrag_file(file); - break; - } - - return 0; -} - -long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_ioctl_vol_args *vol_args; - int ret; - - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } - ret = btrfs_init_new_device(root, vol_args->name); - -out: - kfree(vol_args); - return ret; -} - -long 
btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_ioctl_vol_args *vol_args; - int ret; - - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } - ret = btrfs_rm_device(root, vol_args->name); - -out: - kfree(vol_args); - return ret; -} - -int dup_item_to_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf, - int slot, - struct btrfs_key *key, - u64 destino) -{ - char *dup; - int len = btrfs_item_size_nr(leaf, slot); - struct btrfs_key ckey = *key; - int ret = 0; - - dup = kmalloc(len, GFP_NOFS); - if (!dup) - return -ENOMEM; - - read_extent_buffer(leaf, dup, btrfs_item_ptr_offset(leaf, slot), len); - btrfs_release_path(root, path); - - ckey.objectid = destino; - ret = btrfs_insert_item(trans, root, &ckey, dup, len); - kfree(dup); - return ret; -} - -long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct file *src_file; - struct inode *src; - struct btrfs_trans_handle *trans; - int ret; - u64 pos; - struct btrfs_path *path; - struct btrfs_key key; - struct extent_buffer *leaf; - u32 nritems; - int slot; - - src_file = fget(src_fd); - if (!src_file) - return -EBADF; - src = src_file->f_dentry->d_inode; - - ret = -EXDEV; - if (src->i_sb != inode->i_sb) - goto out_fput; - - if (inode < src) { - mutex_lock(&inode->i_mutex); - mutex_lock(&src->i_mutex); - } else { - mutex_lock(&src->i_mutex); - mutex_lock(&inode->i_mutex); - } - - ret = -ENOTEMPTY; - if (inode->i_size) - goto out_unlock; - - /* do any pending delalloc/csum calc on src, one way or - another, and lock file content */ - while (1) { - filemap_write_and_wait(src->i_mapping); - lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); - if (BTRFS_I(src)->delalloc_bytes 
== 0) - break; - unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); - } - - mutex_lock(&root->fs_info->fs_mutex); - trans = btrfs_start_transaction(root, 0); - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - key.offset = 0; - key.type = BTRFS_EXTENT_DATA_KEY; - key.objectid = src->i_ino; - pos = 0; - path->reada = 2; - - while (1) { - /* - * note the key will change type as we walk through the - * tree. - */ - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - } - leaf = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); - nritems = btrfs_header_nritems(leaf); - - if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY || - key.objectid != src->i_ino) - break; - - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { - struct btrfs_file_extent_item *extent; - int found_type; - pos = key.offset; - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - u64 len = btrfs_file_extent_num_bytes(leaf, - extent); - u64 ds = btrfs_file_extent_disk_bytenr(leaf, - extent); - u64 dl = btrfs_file_extent_disk_num_bytes(leaf, - extent); - u64 off = btrfs_file_extent_offset(leaf, - extent); - btrfs_insert_file_extent(trans, root, - inode->i_ino, pos, - ds, dl, len, off); - /* ds == 0 means there's a hole */ - if (ds != 0) { - btrfs_inc_extent_ref(trans, root, - ds, dl, - root->root_key.objectid, - trans->transid, - inode->i_ino, pos); - } - pos = key.offset + len; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - ret = dup_item_to_inode(trans, root, path, - leaf, slot, &key, - inode->i_ino); - if (ret) - goto out; - pos = key.offset + btrfs_item_size_nr(leaf, - slot); - } - } else if (btrfs_key_type(&key) == 
BTRFS_CSUM_ITEM_KEY) { - ret = dup_item_to_inode(trans, root, path, leaf, - slot, &key, inode->i_ino); - - if (ret) - goto out; - } - key.offset++; - btrfs_release_path(root, path); - } - - ret = 0; -out: - btrfs_free_path(path); - - inode->i_blocks = src->i_blocks; - i_size_write(inode, src->i_size); - btrfs_update_inode(trans, root, inode); - - unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); - - btrfs_end_transaction(trans, root); - mutex_unlock(&root->fs_info->fs_mutex); - -out_unlock: - mutex_unlock(&src->i_mutex); - mutex_unlock(&inode->i_mutex); -out_fput: - fput(src_file); - return ret; -} - -/* - * there are many ways the trans_start and trans_end ioctls can lead - * to deadlocks. They should only be used by applications that - * basically own the machine, and have a very in depth understanding - * of all the possible deadlocks and enospc problems. - */ -long btrfs_ioctl_trans_start(struct file *file) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - int ret = 0; - - mutex_lock(&root->fs_info->fs_mutex); - if (file->private_data) { - ret = -EINPROGRESS; - goto out; - } - trans = btrfs_start_transaction(root, 0); - if (trans) - file->private_data = trans; - else - ret = -ENOMEM; - /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ -out: - mutex_unlock(&root->fs_info->fs_mutex); - return ret; -} - -/* - * there are many ways the trans_start and trans_end ioctls can lead - * to deadlocks. They should only be used by applications that - * basically own the machine, and have a very in depth understanding - * of all the possible deadlocks and enospc problems. 
- */ -long btrfs_ioctl_trans_end(struct file *file) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - int ret = 0; - - mutex_lock(&root->fs_info->fs_mutex); - trans = file->private_data; - if (!trans) { - ret = -EINVAL; - goto out; - } - btrfs_end_transaction(trans, root); - file->private_data = 0; -out: - mutex_unlock(&root->fs_info->fs_mutex); - return ret; -} - -long btrfs_ioctl(struct file *file, unsigned int - cmd, unsigned long arg) -{ - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; - - switch (cmd) { - case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(root, (void __user *)arg); - case BTRFS_IOC_DEFRAG: - return btrfs_ioctl_defrag(file); - case BTRFS_IOC_RESIZE: - return btrfs_ioctl_resize(root, (void __user *)arg); - case BTRFS_IOC_ADD_DEV: - return btrfs_ioctl_add_dev(root, (void __user *)arg); - case BTRFS_IOC_RM_DEV: - return btrfs_ioctl_rm_dev(root, (void __user *)arg); - case BTRFS_IOC_BALANCE: - return btrfs_balance(root->fs_info->dev_root); - case BTRFS_IOC_CLONE: - return btrfs_ioctl_clone(file, arg); - case BTRFS_IOC_TRANS_START: - return btrfs_ioctl_trans_start(file); - case BTRFS_IOC_TRANS_END: - return btrfs_ioctl_trans_end(file); - case BTRFS_IOC_SYNC: - btrfs_sync_fs(file->f_dentry->d_sb, 1); - return 0; - } - - return -ENOTTY; -} - -/* - * Called inside transaction, so use GFP_NOFS - */ struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_inode *ei; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 000000000000..da8de6cfdb5a --- /dev/null +++ b/fs/btrfs/ioctl.c @@ -0,0 +1,781 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" + + + +static noinline int create_subvol(struct btrfs_root *root, char *name, + int namelen) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + struct inode *dir; + int ret; + int err; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + unsigned long nr = 1; + + mutex_lock(&root->fs_info->fs_mutex); + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_commit; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, + 0, &objectid); + if (ret) + goto fail; + + leaf = __btrfs_alloc_free_block(trans, root, root->leafsize, + objectid, trans->transid, 0, 0, + 0, 0); + if (IS_ERR(leaf)) + return PTR_ERR(leaf); + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, objectid); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned 
long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nblocks = cpu_to_le64(1); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_used(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, new_dirid); + + key.objectid = objectid; + key.offset = 1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + &root_item); + if (ret) + goto fail; + + /* + * insert the directory item + */ + key.offset = (u64)-1; + dir = root->fs_info->sb->s_root->d_inode; + ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root, + name, namelen, dir->i_ino, &key, + BTRFS_FT_DIR); + if (ret) + goto fail; + + ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root, + name, namelen, objectid, + root->fs_info->sb->s_root->d_inode->i_ino); + if (ret) + goto fail; + + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto fail_commit; + + new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen); + BUG_ON(!new_root); + + trans = btrfs_start_transaction(new_root, 1); + BUG_ON(!trans); + + ret = btrfs_create_subvol_root(new_root, trans, new_dirid, + BTRFS_I(dir)->block_group); + if (ret) + goto fail; + + /* Invalidate existing dcache entry for new subvolume. 
*/ + btrfs_invalidate_dcache_root(root, name, namelen); + +fail: + nr = trans->blocks_used; + err = btrfs_commit_transaction(trans, new_root); + if (err && !ret) + ret = err; +fail_commit: + mutex_unlock(&root->fs_info->fs_mutex); + btrfs_btree_balance_dirty(root, nr); + btrfs_throttle(root); + return ret; +} + +static int create_snapshot(struct btrfs_root *root, char *name, int namelen) +{ + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; + int ret; + int err; + unsigned long nr = 0; + + if (!root->ref_cows) + return -EINVAL; + + mutex_lock(&root->fs_info->fs_mutex); + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_unlock; + + pending_snapshot = kmalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; + goto fail_unlock; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); + goto fail_unlock; + } + memcpy(pending_snapshot->name, name, namelen); + pending_snapshot->name[namelen] = '\0'; + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + pending_snapshot->root = root; + list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); + ret = btrfs_update_inode(trans, root, root->inode); + err = btrfs_commit_transaction(trans, root); + +fail_unlock: + mutex_unlock(&root->fs_info->fs_mutex); + btrfs_btree_balance_dirty(root, nr); + btrfs_throttle(root); + return ret; +} + +int btrfs_defrag_file(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct page *page; + unsigned long last_index; + unsigned long ra_pages = root->fs_info->bdi.ra_pages; + unsigned long total_read = 0; + u64 page_start; + u64 page_end; + unsigned long i; + int ret; + + mutex_lock(&root->fs_info->fs_mutex); + ret = btrfs_check_free_space(root, inode->i_size, 0); + 
mutex_unlock(&root->fs_info->fs_mutex); + if (ret) + return -ENOSPC; + + mutex_lock(&inode->i_mutex); + last_index = inode->i_size >> PAGE_CACHE_SHIFT; + for (i = 0; i <= last_index; i++) { + if (total_read % ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, + min(last_index, i + ra_pages - 1)); + } + total_read++; + page = grab_cache_page(inode->i_mapping, i); + if (!page) + goto out_unlock; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + goto out_unlock; + } + } + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) + ClearPageDirty(page); +#else + cancel_dirty_page(page, PAGE_CACHE_SIZE); +#endif + wait_on_page_writeback(page); + set_page_extent_mapped(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_extent_delalloc(io_tree, page_start, + page_end, GFP_NOFS); + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); + } + +out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +/* + * Called inside transaction, so use GFP_NOFS + */ + +static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) +{ + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = NULL; + char *sizestr; + char *devstr = NULL; + int ret = 0; + int namelen; + int mod = 0; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + namelen = strlen(vol_args->name); + if (namelen > BTRFS_VOL_NAME_MAX) { + ret = -EINVAL; + goto out; + } + + 
mutex_lock(&root->fs_info->fs_mutex); + sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + char *end; + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + devid = simple_strtoull(devstr, &end, 10); + printk(KERN_INFO "resizing devid %llu\n", devid); + } + device = btrfs_find_device(root, devid, NULL); + if (!device) { + printk(KERN_INFO "resizer unable to find device %llu\n", devid); + ret = -EINVAL; + goto out_unlock; + } + if (!strcmp(sizestr, "max")) + new_size = device->bdev->bd_inode->i_size; + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = btrfs_parse_size(sizestr); + if (new_size == 0) { + ret = -EINVAL; + goto out_unlock; + } + } + + old_size = device->total_bytes; + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_unlock; + } + new_size = old_size - new_size; + } else if (mod > 0) { + new_size = old_size + new_size; + } + + if (new_size < 256 * 1024 * 1024) { + ret = -EINVAL; + goto out_unlock; + } + if (new_size > device->bdev->bd_inode->i_size) { + ret = -EFBIG; + goto out_unlock; + } + + do_div(new_size, root->sectorsize); + new_size *= root->sectorsize; + + printk(KERN_INFO "new size for %s is %llu\n", + device->name, (unsigned long long)new_size); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_shrink_device(device, new_size); + } + +out_unlock: + mutex_unlock(&root->fs_info->fs_mutex); +out: + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_dir_item *di; + struct btrfs_path *path; + u64 root_dirid; + int namelen; + int ret; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if 
(copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + + namelen = strlen(vol_args->name); + if (namelen > BTRFS_VOL_NAME_MAX) { + ret = -EINVAL; + goto out; + } + if (strchr(vol_args->name, '/')) { + ret = -EINVAL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, + mutex_lock(&root->fs_info->fs_mutex); + di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, + path, root_dirid, + vol_args->name, namelen, 0); + mutex_unlock(&root->fs_info->fs_mutex); + btrfs_free_path(path); + + if (di && !IS_ERR(di)) { + ret = -EEXIST; + goto out; + } + + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + if (root == root->fs_info->tree_root) + ret = create_subvol(root, vol_args->name, namelen); + else + ret = create_snapshot(root, vol_args->name, namelen); +out: + kfree(vol_args); + return ret; +} + +static int btrfs_ioctl_defrag(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + mutex_lock(&root->fs_info->fs_mutex); + btrfs_defrag_root(root, 0); + btrfs_defrag_root(root->fs_info->extent_root, 0); + mutex_unlock(&root->fs_info->fs_mutex); + break; + case S_IFREG: + btrfs_defrag_file(file); + break; + } + + return 0; +} + +long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + ret = btrfs_init_new_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if 
(!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + ret = btrfs_rm_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +int dup_item_to_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf, + int slot, + struct btrfs_key *key, + u64 destino) +{ + char *dup; + int len = btrfs_item_size_nr(leaf, slot); + struct btrfs_key ckey = *key; + int ret = 0; + + dup = kmalloc(len, GFP_NOFS); + if (!dup) + return -ENOMEM; + + read_extent_buffer(leaf, dup, btrfs_item_ptr_offset(leaf, slot), len); + btrfs_release_path(root, path); + + ckey.objectid = destino; + ret = btrfs_insert_item(trans, root, &ckey, dup, len); + kfree(dup); + return ret; +} + +long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + int ret; + u64 pos; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + u32 nritems; + int slot; + + src_file = fget(src_fd); + if (!src_file) + return -EBADF; + src = src_file->f_dentry->d_inode; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb) + goto out_fput; + + if (inode < src) { + mutex_lock(&inode->i_mutex); + mutex_lock(&src->i_mutex); + } else { + mutex_lock(&src->i_mutex); + mutex_lock(&inode->i_mutex); + } + + ret = -ENOTEMPTY; + if (inode->i_size) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + filemap_write_and_wait(src->i_mapping); + lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + if (BTRFS_I(src)->delalloc_bytes == 0) + break; + unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + } + + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 0); + path = 
btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + key.offset = 0; + key.type = BTRFS_EXTENT_DATA_KEY; + key.objectid = src->i_ino; + pos = 0; + path->reada = 2; + + while (1) { + /* + * note the key will change type as we walk through the + * tree. + */ + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + } + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + nritems = btrfs_header_nritems(leaf); + + if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY || + key.objectid != src->i_ino) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int found_type; + pos = key.offset; + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + u64 len = btrfs_file_extent_num_bytes(leaf, + extent); + u64 ds = btrfs_file_extent_disk_bytenr(leaf, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + u64 off = btrfs_file_extent_offset(leaf, + extent); + btrfs_insert_file_extent(trans, root, + inode->i_ino, pos, + ds, dl, len, off); + /* ds == 0 means there's a hole */ + if (ds != 0) { + btrfs_inc_extent_ref(trans, root, + ds, dl, + root->root_key.objectid, + trans->transid, + inode->i_ino, pos); + } + pos = key.offset + len; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + ret = dup_item_to_inode(trans, root, path, + leaf, slot, &key, + inode->i_ino); + if (ret) + goto out; + pos = key.offset + btrfs_item_size_nr(leaf, + slot); + } + } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) { + ret = dup_item_to_inode(trans, root, path, leaf, + slot, &key, inode->i_ino); + + if (ret) + goto out; + } + key.offset++; + btrfs_release_path(root, path); + } 
+ + ret = 0; +out: + btrfs_free_path(path); + + inode->i_blocks = src->i_blocks; + i_size_write(inode, src->i_size); + btrfs_update_inode(trans, root, inode); + + unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS); + + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); +out_fput: + fput(src_file); + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + mutex_lock(&root->fs_info->fs_mutex); + if (file->private_data) { + ret = -EINPROGRESS; + goto out; + } + trans = btrfs_start_transaction(root, 0); + if (trans) + file->private_data = trans; + else + ret = -ENOMEM; + /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ +out: + mutex_unlock(&root->fs_info->fs_mutex); + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. 
+ */ +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + mutex_lock(&root->fs_info->fs_mutex); + trans = file->private_data; + if (!trans) { + ret = -EINVAL; + goto out; + } + btrfs_end_transaction(trans, root); + file->private_data = 0; +out: + mutex_unlock(&root->fs_info->fs_mutex); + return ret; +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + + switch (cmd) { + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(root, (void __user *)arg); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(root, (void __user *)arg); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, (void __user *)arg); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(root, (void __user *)arg); + case BTRFS_IOC_BALANCE: + return btrfs_balance(root->fs_info->dev_root); + case BTRFS_IOC_CLONE: + return btrfs_ioctl_clone(file, arg); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + } + + return -ENOTTY; +} -- cgit v1.2.3 From 925baeddc5b0764a53f2214a1253251bab0e0324 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jun 2008 16:01:30 -0400 Subject: Btrfs: Start btree concurrency work. The allocation trees and the chunk trees are serialized via their own dedicated mutexes. This means allocation location is still not very fine grained. The main FS btree is protected by locks on each block in the btree. Locks are taken top / down, and as processing finishes on a given level of the tree, the lock is released after locking the lower level. 
The end result of a search is now a path where only the lowest level is locked. Releasing or freeing the path drops any locks held. Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.c | 386 ++++++++++++++++++++++++++++++++++++------------- fs/btrfs/ctree.h | 23 ++- fs/btrfs/disk-io.c | 13 +- fs/btrfs/extent-tree.c | 252 +++++++++++++++++++++----------- fs/btrfs/extent_io.c | 8 - fs/btrfs/inode.c | 8 +- fs/btrfs/ioctl.c | 8 +- fs/btrfs/locking.c | 50 +++++++ fs/btrfs/locking.h | 26 ++++ fs/btrfs/transaction.c | 15 +- fs/btrfs/tree-defrag.c | 2 + 12 files changed, 579 insertions(+), 214 deletions(-) create mode 100644 fs/btrfs/locking.c create mode 100644 fs/btrfs/locking.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 7ed6b39e42d2..0e2dcc718de2 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ - extent_io.o volumes.o async-thread.o ioctl.o + extent_io.o volumes.o async-thread.o ioctl.o locking.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1d404bde3cfa..75625c68fdd8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -21,6 +21,7 @@ #include "disk-io.h" #include "transaction.h" #include "print-tree.h" +#include "locking.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -64,12 +65,47 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) int i; for (i = 0; i < BTRFS_MAX_LEVEL; i++) { if (!p->nodes[i]) - break; + continue; + if (p->locks[i]) { + btrfs_tree_unlock(p->nodes[i]); + p->locks[i] = 0; + } free_extent_buffer(p->nodes[i]); } memset(p, 0, sizeof(*p)); } +struct extent_buffer 
*btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + spin_lock(&root->node_lock); + eb = root->node; + extent_buffer_get(eb); + spin_unlock(&root->node_lock); + return eb; +} + +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while(1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + + spin_lock(&root->node_lock); + if (eb == root->node) { + spin_unlock(&root->node_lock); + break; + } + spin_unlock(&root->node_lock); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + static void add_root_to_dirty_list(struct btrfs_root *root) { if (root->track_dirty && list_empty(&root->dirty_list)) { @@ -111,7 +147,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, } else { first_key.objectid = 0; } - cow = __btrfs_alloc_free_block(trans, new_root, buf->len, + cow = btrfs_alloc_free_block(trans, new_root, buf->len, new_root_objectid, trans->transid, first_key.objectid, level, buf->start, 0); @@ -151,8 +187,14 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans, int ret = 0; int different_trans = 0; int level; + int unlock_orig = 0; struct btrfs_key first_key; + if (*cow_ret == buf) + unlock_orig = 1; + + WARN_ON(!btrfs_tree_locked(buf)); + if (root->ref_cows) { root_gen = trans->transid; } else { @@ -172,7 +214,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans, } else { first_key.objectid = 0; } - cow = __btrfs_alloc_free_block(trans, root, buf->len, + cow = btrfs_alloc_free_block(trans, root, buf->len, root->root_key.objectid, root_gen, first_key.objectid, level, search_start, empty_size); @@ -196,9 +238,14 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans, } if (buf == root->node) { + WARN_ON(parent && parent != buf); root_gen = btrfs_header_generation(buf); + + spin_lock(&root->node_lock); root->node = cow; extent_buffer_get(cow); + spin_unlock(&root->node_lock); + if (buf != root->commit_root) { btrfs_free_extent(trans, root, buf->start, 
buf->len, root->root_key.objectid, @@ -219,6 +266,8 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_owner(parent), root_gen, 0, 0, 1); } + if (unlock_orig) + btrfs_tree_unlock(buf); free_extent_buffer(buf); btrfs_mark_buffer_dirty(cow); *cow_ret = cow; @@ -316,6 +365,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, int progress_passed = 0; struct btrfs_disk_key disk_key; + /* FIXME this code needs locking */ + return 0; + parent_level = btrfs_header_level(parent); if (cache_only && parent_level != 1) return 0; @@ -729,6 +781,7 @@ static int balance_level(struct btrfs_trans_handle *trans, return 0; mid = path->nodes[level]; + WARN_ON(!path->locks[level]); WARN_ON(btrfs_header_generation(mid) != trans->transid); orig_ptr = btrfs_node_blockptr(mid, orig_slot); @@ -749,14 +802,21 @@ static int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); + btrfs_tree_lock(child); BUG_ON(!child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); BUG_ON(ret); + spin_lock(&root->node_lock); root->node = child; + spin_unlock(&root->node_lock); + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + path->locks[level] = 0; path->nodes[level] = NULL; clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); ret = btrfs_free_extent(trans, root, mid->start, mid->len, @@ -775,6 +835,7 @@ static int balance_level(struct btrfs_trans_handle *trans, left = read_node_slot(root, parent, pslot - 1); if (left) { + btrfs_tree_lock(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left); if (wret) { @@ -784,6 +845,7 @@ static int balance_level(struct btrfs_trans_handle *trans, } right = read_node_slot(root, parent, pslot + 1); if (right) { + btrfs_tree_lock(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right); if (wret) { @@ -815,6 +877,7 @@ static int balance_level(struct 
btrfs_trans_handle *trans, u32 blocksize = right->len; clean_tree_block(trans, root, right); + btrfs_tree_unlock(right); free_extent_buffer(right); right = NULL; wret = del_ptr(trans, root, path, level + 1, pslot + @@ -862,7 +925,9 @@ static int balance_level(struct btrfs_trans_handle *trans, u64 root_gen = btrfs_header_generation(parent); u64 bytenr = mid->start; u32 blocksize = mid->len; + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); free_extent_buffer(mid); mid = NULL; wret = del_ptr(trans, root, path, level + 1, pslot); @@ -885,11 +950,14 @@ static int balance_level(struct btrfs_trans_handle *trans, if (left) { if (btrfs_header_nritems(left) > orig_slot) { extent_buffer_get(left); + /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; - if (mid) + if (mid) { + btrfs_tree_unlock(mid); free_extent_buffer(mid); + } } else { orig_slot -= btrfs_header_nritems(left); path->slots[level] = orig_slot; @@ -901,10 +969,15 @@ static int balance_level(struct btrfs_trans_handle *trans, btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); enospc: - if (right) + if (right) { + btrfs_tree_unlock(right); free_extent_buffer(right); - if (left) + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); free_extent_buffer(left); + } return ret; } @@ -942,6 +1015,8 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans, /* first, try to make some room in the middle buffer */ if (left) { u32 left_nr; + + btrfs_tree_lock(left); left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; @@ -967,24 +1042,28 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans, path->nodes[level] = left; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); free_extent_buffer(mid); } else { orig_slot -= btrfs_header_nritems(left); path->slots[level] = orig_slot; + 
btrfs_tree_unlock(left); free_extent_buffer(left); } return 0; } + btrfs_tree_unlock(left); free_extent_buffer(left); } - right= read_node_slot(root, parent, pslot + 1); + right = read_node_slot(root, parent, pslot + 1); /* * then try to empty the right most buffer into the middle */ if (right) { u32 right_nr; + btrfs_tree_lock(right); right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; @@ -1013,12 +1092,15 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans, path->slots[level + 1] += 1; path->slots[level] = orig_slot - btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); free_extent_buffer(mid); } else { + btrfs_tree_unlock(right); free_extent_buffer(right); } return 0; } + btrfs_tree_unlock(right); free_extent_buffer(right); } return 1; @@ -1050,6 +1132,8 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, return; node = path->nodes[level]; + WARN_ON(!path->skip_locking && !btrfs_tree_locked(node)); + search = btrfs_node_blockptr(node, slot); blocksize = btrfs_level_size(root, level - 1); eb = btrfs_find_tree_block(root, search, blocksize); @@ -1098,6 +1182,39 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, highest_read = search; } } + +static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock) +{ + int i; + int skip_level = level; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + if (path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + if (path->keep_locks) { + u32 nritems; + t = path->nodes[i]; + nritems = btrfs_header_nritems(t); + if (path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + t = path->nodes[i]; + if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + btrfs_tree_unlock(t); + path->locks[i] = 0; + } + } +} + /* * look for key in the tree. 
path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -1120,15 +1237,27 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root int ret; int level; int should_reada = p->reada; + int lowest_unlock = 1; u8 lowest_level = 0; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len); WARN_ON(p->nodes[0] != NULL); - WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex)); + // WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex)); + WARN_ON(root == root->fs_info->extent_root && + !mutex_is_locked(&root->fs_info->alloc_mutex)); + WARN_ON(root == root->fs_info->chunk_root && + !mutex_is_locked(&root->fs_info->chunk_mutex)); + WARN_ON(root == root->fs_info->dev_root && + !mutex_is_locked(&root->fs_info->chunk_mutex)); + if (ins_len < 0) + lowest_unlock = 2; again: - b = root->node; - extent_buffer_get(b); + if (!p->skip_locking) + b = btrfs_lock_root_node(root); + else + b = btrfs_root_node(root); + while (b) { level = btrfs_header_level(b); if (cow) { @@ -1147,9 +1276,12 @@ again: WARN_ON(1); level = btrfs_header_level(b); p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; ret = check_block(root, p, level); if (ret) return -1; + ret = bin_search(b, key, level, &slot); if (level != 0) { if (ret && slot > 0) @@ -1177,14 +1309,19 @@ again: BUG_ON(btrfs_header_nritems(b) == 1); } /* this is only true while dropping a snapshot */ - if (level == lowest_level) + if (level == lowest_level) { + unlock_up(p, level, lowest_unlock); break; + } if (should_reada) reada_for_search(root, p, level, slot, key->objectid); b = read_node_slot(root, b, slot); + if (!p->skip_locking) + btrfs_tree_lock(b); + unlock_up(p, level, lowest_unlock); } else { p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < @@ -1195,6 +1332,7 @@ again: if (sret) return sret; } + unlock_up(p, level, lowest_unlock); return ret; } } @@ -1225,6 +1363,13 @@ static int fixup_low_keys(struct 
btrfs_trans_handle *trans, break; t = path->nodes[i]; btrfs_set_node_key(t, key, tslot); + if (!btrfs_tree_locked(path->nodes[i])) { + int ii; +printk("fixup without lock on level %d\n", btrfs_header_level(path->nodes[i])); + for (ii = 0; ii < BTRFS_MAX_LEVEL; ii++) { +printk("level %d slot %d\n", ii, path->slots[ii]); + } + } btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) break; @@ -1370,6 +1515,7 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans, u64 lower_gen; struct extent_buffer *lower; struct extent_buffer *c; + struct extent_buffer *old; struct btrfs_disk_key lower_key; BUG_ON(path->nodes[level]); @@ -1386,12 +1532,13 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = __btrfs_alloc_free_block(trans, root, root->nodesize, + c = btrfs_alloc_free_block(trans, root, root->nodesize, root->root_key.objectid, root_gen, lower_key.objectid, level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); + memset_extent_buffer(c, 0, 0, root->nodesize); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); @@ -1416,23 +1563,31 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); - /* the super has an extra ref to root->node */ - free_extent_buffer(root->node); + spin_lock(&root->node_lock); + old = root->node; root->node = c; + spin_unlock(&root->node_lock); + + /* the super has an extra ref to root->node */ + free_extent_buffer(old); + add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; + path->locks[level] = 1; path->slots[level] = 0; if (root->ref_cows && lower_gen != trans->transid) { struct btrfs_path *back_path = btrfs_alloc_path(); int ret; + mutex_lock(&root->fs_info->alloc_mutex); ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root, path, lower->start, root->root_key.objectid, trans->transid, 0, 0); BUG_ON(ret); + mutex_unlock(&root->fs_info->alloc_mutex); 
btrfs_free_path(back_path); } return 0; @@ -1521,7 +1676,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root root_gen = 0; btrfs_node_key(c, &disk_key, 0); - split = __btrfs_alloc_free_block(trans, root, root->nodesize, + split = btrfs_alloc_free_block(trans, root, root->nodesize, root->root_key.objectid, root_gen, btrfs_disk_key_objectid(&disk_key), @@ -1564,10 +1719,12 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root if (path->slots[level] >= mid) { path->slots[level] -= mid; + btrfs_tree_unlock(c); free_extent_buffer(c); path->nodes[level] = split; path->slots[level + 1] += 1; } else { + btrfs_tree_unlock(split); free_extent_buffer(split); } return ret; @@ -1648,30 +1805,24 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 1; right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size + sizeof(struct btrfs_item)) { - free_extent_buffer(right); - return 1; - } + if (free_space < data_size + sizeof(struct btrfs_item)) + goto out_unlock; /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, slot + 1, &right); - if (ret) { - free_extent_buffer(right); - return 1; - } + if (ret) + goto out_unlock; + free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size + sizeof(struct btrfs_item)) { - free_extent_buffer(right); - return 1; - } + if (free_space < data_size + sizeof(struct btrfs_item)) + goto out_unlock; left_nritems = btrfs_header_nritems(left); - if (left_nritems == 0) { - free_extent_buffer(right); - return 1; - } + if (left_nritems == 0) + goto out_unlock; if (empty) nr = 0; @@ -1707,10 +1858,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root left->map_token = NULL; } - if (push_items == 0) { - free_extent_buffer(right); - return 1; - } + if (push_items == 0) + goto out_unlock; if (!empty && push_items == 
left_nritems) WARN_ON(1); @@ -1778,14 +1927,24 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; path->slots[1] += 1; } else { + btrfs_tree_unlock(right); free_extent_buffer(right); } return 0; + +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; } + /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise @@ -1823,10 +1982,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root } left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size + sizeof(struct btrfs_item)) { - free_extent_buffer(left); - return 1; + ret = 1; + goto out; } /* cow and double check */ @@ -1834,14 +1994,14 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root path->nodes[1], slot - 1, &left); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ - free_extent_buffer(left); - return 1; + ret = 1; + goto out; } free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size + sizeof(struct btrfs_item)) { - free_extent_buffer(left); - return 1; + ret = 1; + goto out; } if (empty) @@ -1876,8 +2036,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root } if (push_items == 0) { - free_extent_buffer(left); - return 1; + ret = 1; + goto out; } if (!empty && push_items == btrfs_header_nritems(right)) WARN_ON(1); @@ -1975,15 +2135,23 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* then fixup the leaf pointer in the path 
*/ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = left; path->slots[1] -= 1; } else { + btrfs_tree_unlock(left); free_extent_buffer(left); path->slots[0] -= push_items; } BUG_ON(path->slots[0] < 0); return ret; +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; } /* @@ -2052,7 +2220,7 @@ again: btrfs_item_key(l, &disk_key, 0); - right = __btrfs_alloc_free_block(trans, root, root->leafsize, + right = btrfs_alloc_free_block(trans, root, root->leafsize, root->root_key.objectid, root_gen, disk_key.objectid, 0, l->start, 0); @@ -2085,6 +2253,8 @@ again: path->slots[1] + 1, 1); if (wret) ret = wret; + + btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; path->slots[0] = 0; @@ -2111,6 +2281,7 @@ again: path->slots[1], 1); if (wret) ret = wret; + btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; path->slots[0] = 0; @@ -2184,12 +2355,15 @@ again: BUG_ON(path->slots[0] != slot); if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; path->slots[0] -= mid; path->slots[1] += 1; - } else + } else { + btrfs_tree_unlock(right); free_extent_buffer(right); + } BUG_ON(path->slots[0] < 0); @@ -2418,10 +2592,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, total_data += data_size[i]; } - /* create a root if there isn't one */ - if (!root->node) - BUG(); - total_size = total_data + (nr - 1) * sizeof(struct btrfs_item); ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); if (ret == 0) { @@ -2516,7 +2686,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - out: return ret; } @@ -2655,7 +2824,6 @@ int 
btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { u64 root_gen = btrfs_header_generation(path->nodes[1]); - clean_tree_block(trans, root, leaf); wret = del_ptr(trans, root, path, 1, path->slots[1]); if (wret) ret = wret; @@ -2706,8 +2874,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, root_gen = btrfs_header_generation( path->nodes[1]); - clean_tree_block(trans, root, leaf); - wret = del_ptr(trans, root, path, 1, slot); if (wret) ret = wret; @@ -2720,7 +2886,13 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (wret) ret = wret; } else { - btrfs_mark_buffer_dirty(leaf); + /* if we're still in the path, make sure + * we're dirty. Otherwise, one of the + * push_leaf functions must have already + * dirtied this buffer + */ + if (path->nodes[0] == leaf) + btrfs_mark_buffer_dirty(leaf); free_extent_buffer(leaf); } } else { @@ -2731,56 +2903,40 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* - * walk up the tree as far as required to find the previous leaf. + * search the tree again to find a leaf with lesser keys * returns 0 if it found something or 1 if there are no lesser leaves. * returns < 0 on io errors. 
*/ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { - int slot; - int level = 1; - struct extent_buffer *c; - struct extent_buffer *next = NULL; + struct btrfs_key key; + struct btrfs_disk_key found_key; + int ret; - while(level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) - return 1; + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - slot = path->slots[level]; - c = path->nodes[level]; - if (slot == 0) { - level++; - if (level == BTRFS_MAX_LEVEL) - return 1; - continue; - } - slot--; - - if (next) - free_extent_buffer(next); + if (key.offset > 0) + key.offset--; + else if (key.type > 0) + key.type--; + else if (key.objectid > 0) + key.objectid--; + else + return 1; - next = read_node_slot(root, c, slot); - break; - } - path->slots[level] = slot; - while(1) { - level--; - c = path->nodes[level]; - free_extent_buffer(c); - slot = btrfs_header_nritems(next); - if (slot != 0) - slot--; - path->nodes[level] = next; - path->slots[level] = slot; - if (!level) - break; - next = read_node_slot(root, next, slot); - } - return 0; + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + if (ret < 0) + return 0; + return 1; } /* - * walk up the tree as far as required to find the next leaf. + * search the tree again to find a leaf with greater keys * returns 0 if it found something or 1 if there are no greater leaves. * returns < 0 on io errors. 
*/ @@ -2790,6 +2946,28 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) int level = 1; struct extent_buffer *c; struct extent_buffer *next = NULL; + struct btrfs_key key; + u32 nritems; + int ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) { + return 1; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + + path->keep_locks = 1; + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->keep_locks = 0; + + if (ret < 0) + return ret; + + if (path->slots[0] < nritems - 1) { + goto done; + } while(level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) @@ -2799,33 +2977,45 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) c = path->nodes[level]; if (slot >= btrfs_header_nritems(c)) { level++; - if (level == BTRFS_MAX_LEVEL) + if (level == BTRFS_MAX_LEVEL) { return 1; + } continue; } - if (next) + if (next) { + btrfs_tree_unlock(next); free_extent_buffer(next); + } - if (path->reada) + if (level == 1 && path->locks[1] && path->reada) reada_for_search(root, path, level, slot, 0); next = read_node_slot(root, c, slot); + if (!path->skip_locking) + btrfs_tree_lock(next); break; } path->slots[level] = slot; while(1) { level--; c = path->nodes[level]; + if (path->locks[level]) + btrfs_tree_unlock(c); free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; + path->locks[level] = 1; if (!level) break; - if (path->reada) - reada_for_search(root, path, level, 0, 0); + if (level == 1 && path->locks[1] && path->reada) + reada_for_search(root, path, level, slot, 0); next = read_node_slot(root, next, 0); + if (!path->skip_locking) + btrfs_tree_lock(next); } +done: + unlock_up(path, 0, 1); return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dcea9d706d9b..50891b39f366 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -330,8 +330,13 @@ struct btrfs_node { struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; 
int slots[BTRFS_MAX_LEVEL]; + /* if there is real range locking, this locks field will change */ + int locks[BTRFS_MAX_LEVEL]; int reada; + /* keep some upper locks as we walk down */ + int keep_locks; int lowest_level; + int skip_locking; }; /* @@ -515,6 +520,8 @@ struct btrfs_fs_info { spinlock_t hash_lock; struct mutex trans_mutex; struct mutex fs_mutex; + struct mutex alloc_mutex; + struct mutex chunk_mutex; struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -576,6 +583,10 @@ struct btrfs_fs_info { */ struct btrfs_root { struct extent_buffer *node; + + /* the node lock is held while changing the node pointer */ + spinlock_t node_lock; + struct extent_buffer *commit_root; struct btrfs_root_item root_item; struct btrfs_key root_key; @@ -1353,13 +1364,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, struct btrfs_block_group_cache *hint, u64 search_start, int data, int owner); -int btrfs_inc_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 owner_objectid); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 size, - u64 root_objectid, - u64 hint, u64 empty_size); -struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 root_objectid, @@ -1368,8 +1373,6 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans, int level, u64 hint, u64 empty_size); -int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 new_size); int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size); int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1409,6 +1412,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); + +struct extent_buffer 
*btrfs_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); + int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e5c758e306d5..fe40bdd984ff 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -32,6 +32,7 @@ #include "volumes.h" #include "print-tree.h" #include "async-thread.h" +#include "locking.h" #if 0 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) @@ -681,9 +682,11 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == - root->fs_info->running_transaction->transid) + root->fs_info->running_transaction->transid) { + WARN_ON(!btrfs_tree_locked(buf)); clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); + } return 0; } @@ -720,6 +723,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->in_sysfs = 0; INIT_LIST_HEAD(&root->dirty_list); + spin_lock_init(&root->node_lock); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); @@ -1196,6 +1200,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->fs_mutex); + mutex_init(&fs_info->alloc_mutex); + mutex_init(&fs_info->chunk_mutex); #if 0 ret = add_hasher(fs_info, "crc32c"); @@ -1274,7 +1280,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_lock(&fs_info->fs_mutex); + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); if (ret) { printk("btrfs: failed to read the system array on %s\n", sb->s_id); @@ -1296,7 +1304,9 @@ struct btrfs_root *open_ctree(struct super_block 
*sb, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); BUG_ON(ret); btrfs_close_extra_devices(fs_devices); @@ -1654,6 +1664,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; + WARN_ON(!btrfs_tree_locked(buf)); if (transid != root->fs_info->generation) { printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n", (unsigned long long)buf->start, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 41a63462d3eb..7e40c516fe62 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -25,6 +25,7 @@ #include "print-tree.h" #include "transaction.h" #include "volumes.h" +#include "locking.h" #define BLOCK_GROUP_DATA EXTENT_WRITEBACK #define BLOCK_GROUP_METADATA EXTENT_UPTODATE @@ -36,7 +37,28 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); static int del_pending_extents(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); +static struct btrfs_block_group_cache * +__btrfs_find_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *hint, + u64 search_start, int data, int owner); +void maybe_lock_mutex(struct btrfs_root *root) +{ + if (root != root->fs_info->extent_root && + root != root->fs_info->chunk_root && + root != root->fs_info->dev_root) { + mutex_lock(&root->fs_info->alloc_mutex); + } +} + +void maybe_unlock_mutex(struct btrfs_root *root) +{ + if (root != root->fs_info->extent_root && + root != root->fs_info->chunk_root && + root != root->fs_info->dev_root) { + mutex_unlock(&root->fs_info->alloc_mutex); + } +} static int cache_block_group(struct btrfs_root *root, struct btrfs_block_group_cache *block_group) @@ -66,6 +88,7 @@ static int cache_block_group(struct btrfs_root *root, return 
-ENOMEM; path->reada = 2; + path->skip_locking = 1; first_free = block_group->key.objectid; key.objectid = block_group->key.objectid; key.offset = 0; @@ -290,7 +313,7 @@ no_cache: cache = btrfs_lookup_first_block_group(root->fs_info, last); } cache_miss = 0; - cache = btrfs_find_block_group(root, cache, last, data, 0); + cache = __btrfs_find_block_group(root, cache, last, data, 0); if (!cache) goto no_cache; *cache_ret = cache; @@ -318,10 +341,10 @@ static int block_group_state_bits(u64 flags) return bits; } -struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, - struct btrfs_block_group_cache - *hint, u64 search_start, - int data, int owner) +static struct btrfs_block_group_cache * +__btrfs_find_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *hint, + u64 search_start, int data, int owner) { struct btrfs_block_group_cache *cache; struct extent_io_tree *block_group_cache; @@ -411,6 +434,18 @@ found: return found_group; } +struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache + *hint, u64 search_start, + int data, int owner) +{ + + struct btrfs_block_group_cache *ret; + mutex_lock(&root->fs_info->alloc_mutex); + ret = __btrfs_find_block_group(root, hint, search_start, data, owner); + mutex_unlock(&root->fs_info->alloc_mutex); + return ret; +} static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation, u64 owner, u64 owner_offset) { @@ -646,7 +681,7 @@ out: return ret; } -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 root_objectid, u64 ref_generation, @@ -696,6 +731,22 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, return 0; } +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, + u64 root_objectid, u64 ref_generation, + u64 owner, u64 
owner_offset) +{ + int ret; + + mutex_lock(&root->fs_info->alloc_mutex); + ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, + root_objectid, ref_generation, + owner, owner_offset); + mutex_unlock(&root->fs_info->alloc_mutex); + return ret; +} + int btrfs_extent_post_op(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -760,6 +811,10 @@ u32 btrfs_count_snapshots_in_path(struct btrfs_root *root, struct btrfs_extent_ref *ref_item; int level = -1; + /* FIXME, needs locking */ + BUG(); + + mutex_lock(&root->fs_info->alloc_mutex); path = btrfs_alloc_path(); again: if (level == -1) @@ -854,33 +909,9 @@ again: out: btrfs_free_path(path); + mutex_unlock(&root->fs_info->alloc_mutex); return total_count; } -int btrfs_inc_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 owner_objectid) -{ - u64 generation; - u64 key_objectid; - u64 level; - u32 nritems; - struct btrfs_disk_key disk_key; - - level = btrfs_header_level(root->node); - generation = trans->transid; - nritems = btrfs_header_nritems(root->node); - if (nritems > 0) { - if (level == 0) - btrfs_item_key(root->node, &disk_key, 0); - else - btrfs_node_key(root->node, &disk_key, 0); - key_objectid = btrfs_disk_key_objectid(&disk_key); - } else { - key_objectid = 0; - } - return btrfs_inc_extent_ref(trans, root, root->node->start, - root->node->len, owner_objectid, - generation, level, key_objectid); -} int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) @@ -897,6 +928,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (!root->ref_cows) return 0; + mutex_lock(&root->fs_info->alloc_mutex); level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); for (i = 0; i < nritems; i++) { @@ -913,7 +945,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); if (disk_bytenr == 0) continue; - ret = 
btrfs_inc_extent_ref(trans, root, disk_bytenr, + ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr, btrfs_file_extent_disk_num_bytes(buf, fi), root->root_key.objectid, trans->transid, key.objectid, key.offset); @@ -924,7 +956,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else { bytenr = btrfs_node_blockptr(buf, i); btrfs_node_key_to_cpu(buf, &key, i); - ret = btrfs_inc_extent_ref(trans, root, bytenr, + ret = __btrfs_inc_extent_ref(trans, root, bytenr, btrfs_level_size(root, level - 1), root->root_key.objectid, trans->transid, @@ -935,6 +967,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, } } } + mutex_unlock(&root->fs_info->alloc_mutex); return 0; fail: WARN_ON(1); @@ -965,6 +998,7 @@ fail: } } #endif + mutex_unlock(&root->fs_info->alloc_mutex); return ret; } @@ -1019,6 +1053,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + mutex_lock(&root->fs_info->alloc_mutex); while(1) { ret = find_first_extent_bit(block_group_cache, last, &start, &end, BLOCK_GROUP_DIRTY); @@ -1045,6 +1080,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, BLOCK_GROUP_DIRTY, GFP_NOFS); } btrfs_free_path(path); + mutex_unlock(&root->fs_info->alloc_mutex); return werr; } @@ -1162,26 +1198,28 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info->force_alloc = 0; } if (space_info->full) - return 0; + goto out; thresh = div_factor(space_info->total_bytes, 6); if (!force && (space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) < thresh) - return 0; + goto out; + mutex_lock(&extent_root->fs_info->chunk_mutex); ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags); if (ret == -ENOSPC) { printk("space info full %Lu\n", flags); space_info->full = 1; - return 0; + goto out; } BUG_ON(ret); ret = btrfs_make_block_group(trans, extent_root, 0, flags, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes); 
BUG_ON(ret); - + mutex_unlock(&extent_root->fs_info->chunk_mutex); +out: return 0; } @@ -1318,6 +1356,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct extent_io_tree *free_space_cache; free_space_cache = &root->fs_info->free_space_cache; + mutex_lock(&root->fs_info->alloc_mutex); while(1) { ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); @@ -1327,6 +1366,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, clear_extent_dirty(unpin, start, end, GFP_NOFS); set_extent_dirty(free_space_cache, start, end, GFP_NOFS); } + mutex_unlock(&root->fs_info->alloc_mutex); return 0; } @@ -1363,18 +1403,24 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, GFP_NOFS); eb = read_tree_block(extent_root, ins.objectid, ins.offset, trans->transid); + btrfs_tree_lock(eb); level = btrfs_header_level(eb); if (level == 0) { btrfs_item_key(eb, &first, 0); } else { btrfs_node_key(eb, &first, 0); } + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + /* + * the first key is just a hint, so the race we've created + * against reading it is fine + */ err = btrfs_insert_extent_backref(trans, extent_root, path, start, extent_root->root_key.objectid, 0, level, btrfs_disk_key_objectid(&first)); BUG_ON(err); - free_extent_buffer(eb); } btrfs_free_path(path); return 0; @@ -1384,12 +1430,14 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes, int pending) { int err = 0; - struct extent_buffer *buf; if (!pending) { +#if 0 + struct extent_buffer *buf; buf = btrfs_find_tree_block(root, bytenr, num_bytes); if (buf) { - if (btrfs_buffer_uptodate(buf, 0)) { + if (!btrfs_try_tree_lock(buf) && + btrfs_buffer_uptodate(buf, 0)) { u64 transid = root->fs_info->running_transaction->transid; u64 header_transid = @@ -1398,12 +1446,15 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes, !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { clean_tree_block(NULL, root, buf); + 
btrfs_tree_unlock(buf); free_extent_buffer(buf); return 1; } + btrfs_tree_unlock(buf); } free_extent_buffer(buf); } +#endif update_pinned_extents(root, bytenr, num_bytes, 1); } else { set_extent_bits(&root->fs_info->pending_del, @@ -1586,10 +1637,11 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct /* * remove an extent from the root, returns 0 on success */ -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root - *root, u64 bytenr, u64 num_bytes, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, u64 owner_offset, int pin) +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 root_objectid, + u64 ref_generation, u64 owner_objectid, + u64 owner_offset, int pin) { struct btrfs_root *extent_root = root->fs_info->extent_root; int pending_ret; @@ -1610,6 +1662,22 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root return ret ? ret : pending_ret; } +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 root_objectid, + u64 ref_generation, u64 owner_objectid, + u64 owner_offset, int pin) +{ + int ret; + + maybe_lock_mutex(root); + ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, + root_objectid, ref_generation, + owner_objectid, owner_offset, pin); + maybe_unlock_mutex(root); + return ret; +} + static u64 stripe_align(struct btrfs_root *root, u64 val) { u64 mask = ((u64)root->stripesize - 1); @@ -1679,12 +1747,12 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, block_group = btrfs_lookup_first_block_group(info, hint_byte); if (!block_group) hint_byte = search_start; - block_group = btrfs_find_block_group(root, block_group, + block_group = __btrfs_find_block_group(root, block_group, hint_byte, data, 1); if (last_ptr && *last_ptr == 0 && block_group) hint_byte = block_group->key.objectid; } else { - block_group = 
btrfs_find_block_group(root, + block_group = __btrfs_find_block_group(root, trans->block_group, search_start, data, 1); } @@ -1806,7 +1874,7 @@ enospc: } block_group = btrfs_lookup_first_block_group(info, search_start); cond_resched(); - block_group = btrfs_find_block_group(root, block_group, + block_group = __btrfs_find_block_group(root, block_group, search_start, data, 0); goto check_failed; @@ -1843,6 +1911,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_key keys[2]; + maybe_lock_mutex(root); + if (data) { alloc_profile = info->avail_data_alloc_bits & info->data_alloc_profile; @@ -1892,9 +1962,10 @@ again: if (ret) { printk("allocation failed flags %Lu\n", data); } - BUG_ON(ret); - if (ret) - return ret; + if (ret) { + BUG(); + goto out; + } /* block accounting for super block */ super_used = btrfs_super_bytes_used(&info->super_copy); @@ -1953,11 +2024,11 @@ again: finish_current_insert(trans, extent_root); pending_ret = del_pending_extents(trans, extent_root); - if (ret) { - return ret; - } + if (ret) + goto out; if (pending_ret) { - return pending_ret; + ret = pending_ret; + goto out; } update_block: @@ -1967,36 +2038,15 @@ update_block: ins->objectid, ins->offset); BUG(); } - return 0; +out: + maybe_unlock_mutex(root); + return ret; } - /* * helper function to allocate a block for a given tree * returns the tree buffer or NULL. */ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u32 blocksize, - u64 root_objectid, u64 hint, - u64 empty_size) -{ - u64 ref_generation; - - if (root->ref_cows) - ref_generation = trans->transid; - else - ref_generation = 0; - - - return __btrfs_alloc_free_block(trans, root, blocksize, root_objectid, - ref_generation, 0, 0, hint, empty_size); -} - -/* - * helper function to allocate a block for a given tree - * returns the tree buffer or NULL. 
- */ -struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 root_objectid, @@ -2026,6 +2076,7 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOMEM); } btrfs_set_header_generation(buf, trans->transid); + btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); btrfs_set_buffer_uptodate(buf); @@ -2076,7 +2127,7 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans, disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); if (disk_bytenr == 0) continue; - ret = btrfs_free_extent(trans, root, disk_bytenr, + ret = __btrfs_free_extent(trans, root, disk_bytenr, btrfs_file_extent_disk_num_bytes(leaf, fi), leaf_owner, leaf_generation, key.objectid, key.offset, 0); @@ -2151,6 +2202,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, int ret; u32 refs; + mutex_lock(&root->fs_info->alloc_mutex); + WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); ret = lookup_extent_ref(trans, root, @@ -2182,6 +2235,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); blocksize = btrfs_level_size(root, *level - 1); + ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs); BUG_ON(ret); if (refs != 1) { @@ -2189,7 +2243,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); root_gen = btrfs_header_generation(parent); path->slots[*level]++; - ret = btrfs_free_extent(trans, root, bytenr, + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, root_owner, root_gen, 0, 0, 1); BUG_ON(ret); @@ -2201,9 +2255,11 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, reada_walk_down(root, cur, path->slots[*level]); mutex_unlock(&root->fs_info->fs_mutex); + mutex_unlock(&root->fs_info->alloc_mutex); next = 
read_tree_block(root, bytenr, blocksize, ptr_gen); mutex_lock(&root->fs_info->fs_mutex); + mutex_lock(&root->fs_info->alloc_mutex); /* we've dropped the lock, double check */ ret = lookup_extent_ref(trans, root, bytenr, @@ -2216,7 +2272,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; free_extent_buffer(next); - ret = btrfs_free_extent(trans, root, bytenr, + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, root_owner, root_gen, 0, 0, 1); @@ -2244,13 +2300,14 @@ out: } root_gen = btrfs_header_generation(parent); - ret = btrfs_free_extent(trans, root, path->nodes[*level]->start, + ret = __btrfs_free_extent(trans, root, path->nodes[*level]->start, path->nodes[*level]->len, root_owner, root_gen, 0, 0, 1); free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; *level += 1; BUG_ON(ret); + mutex_unlock(&root->fs_info->alloc_mutex); return 0; } @@ -2350,6 +2407,12 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_node_key(node, &found_key, path->slots[level]); WARN_ON(memcmp(&found_key, &root_item->drop_progress, sizeof(found_key))); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (path->nodes[i] && path->locks[i]) { + path->locks[i] = 0; + btrfs_tree_unlock(path->nodes[i]); + } + } } while(1) { wret = walk_down_tree(trans, root, path, &level); @@ -2383,6 +2446,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) u64 end; u64 ptr; int ret; + + mutex_lock(&info->alloc_mutex); while(1) { ret = find_first_extent_bit(&info->block_group_cache, 0, &start, &end, (unsigned int)-1); @@ -2402,6 +2467,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) clear_extent_dirty(&info->free_space_cache, start, end, GFP_NOFS); } + mutex_unlock(&info->alloc_mutex); return 0; } @@ -2678,6 +2744,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root, eb = read_tree_block(found_root, extent_key->objectid, extent_key->offset, 0); + btrfs_tree_lock(eb); 
level = btrfs_header_level(eb); if (level == 0) @@ -2685,6 +2752,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root, else btrfs_node_key_to_cpu(eb, &found_key, 0); + btrfs_tree_unlock(eb); free_extent_buffer(eb); ret = find_root_for_ref(extent_root, path, &found_key, @@ -2888,6 +2956,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start) int ret; int progress; + mutex_lock(&root->fs_info->alloc_mutex); shrink_block_group = btrfs_lookup_block_group(root->fs_info, shrink_start); BUG_ON(!shrink_block_group); @@ -3044,20 +3113,22 @@ next: (unsigned int)-1, GFP_NOFS); out: btrfs_free_path(path); + mutex_unlock(&root->fs_info->alloc_mutex); return ret; } int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) { - int ret; + int ret = 0; struct btrfs_key found_key; struct extent_buffer *leaf; int slot; ret = btrfs_search_slot(NULL, root, key, path, 0, 0); if (ret < 0) - return ret; + goto out; + while(1) { slot = path->slots[0]; leaf = path->nodes[0]; @@ -3066,18 +3137,20 @@ int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, if (ret == 0) continue; if (ret < 0) - goto error; + goto out; break; } btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid >= key->objectid && - found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) - return 0; + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = 0; + goto out; + } path->slots[0]++; } ret = -ENOENT; -error: +out: return ret; } @@ -3103,6 +3176,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) if (!path) return -ENOMEM; + mutex_lock(&root->fs_info->alloc_mutex); while(1) { ret = find_first_block_group(root, path, &key); if (ret > 0) { @@ -3158,6 +3232,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) ret = 0; error: btrfs_free_path(path); + mutex_unlock(&root->fs_info->alloc_mutex); return ret; } @@ -3205,5 +3280,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = 
del_pending_extents(trans, extent_root); BUG_ON(ret); set_avail_alloc_bits(extent_root->fs_info, type); + return 0; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 17c508a941eb..bd15cdcaba95 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2889,7 +2889,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - lock_page(page); if (i == 0) set_page_extent_head(page, eb->len); else @@ -2907,7 +2906,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_DIRTY, 0)) { - unlock_page(page); continue; } } @@ -2919,7 +2917,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, PAGECACHE_TAG_DIRTY); } read_unlock_irq(&page->mapping->tree_lock); - unlock_page(page); } return 0; } @@ -2948,17 +2945,12 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, * on us if the page isn't already dirty. 
*/ if (i == 0) { - lock_page(page); set_page_extent_head(page, eb->len); } else if (PagePrivate(page) && page->private != EXTENT_PAGE_PRIVATE) { - lock_page(page); set_page_extent_mapped(page); - unlock_page(page); } __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); - if (i == 0) - unlock_page(page); } return set_extent_dirty(tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0c79346fd2c9..61bd8953a683 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -115,6 +115,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end) trans = btrfs_start_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + mutex_unlock(&root->fs_info->fs_mutex); num_bytes = (end - start + blocksize) & ~(blocksize - 1); num_bytes = max(blocksize, num_bytes); @@ -159,6 +160,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end) btrfs_add_ordered_inode(inode); btrfs_update_inode(trans, root, inode); out: + mutex_lock(&root->fs_info->fs_mutex); btrfs_end_transaction(trans, root); return ret; } @@ -349,10 +351,12 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, mutex_lock(&root->fs_info->fs_mutex); trans = btrfs_start_transaction(root, 1); + mutex_unlock(&root->fs_info->fs_mutex); btrfs_set_trans_block_group(trans, inode); btrfs_csum_file_blocks(trans, root, inode, bio, sums); + mutex_lock(&root->fs_info->fs_mutex); ret = btrfs_end_transaction(trans, root); BUG_ON(ret); mutex_unlock(&root->fs_info->fs_mutex); @@ -807,6 +811,7 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans, goto err; } ret = btrfs_delete_one_dir_name(trans, root, path, di); + btrfs_release_path(root, path); dentry->d_inode->i_ctime = dir->i_ctime; ret = btrfs_del_inode_ref(trans, root, name, name_len, @@ -881,8 +886,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; unsigned long nr = 0; - if (inode->i_size > 
BTRFS_EMPTY_DIR_SIZE) + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) { return -ENOTEMPTY; + } mutex_lock(&root->fs_info->fs_mutex); ret = btrfs_check_free_space(root, 1, 1); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6fb455802759..3fbf74e93dba 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,7 @@ #include "ioctl.h" #include "print-tree.h" #include "volumes.h" +#include "locking.h" @@ -75,9 +76,9 @@ static noinline int create_subvol(struct btrfs_root *root, char *name, if (ret) goto fail; - leaf = __btrfs_alloc_free_block(trans, root, root->leafsize, - objectid, trans->transid, 0, 0, - 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + objectid, trans->transid, 0, 0, + 0, 0); if (IS_ERR(leaf)) return PTR_ERR(leaf); @@ -108,6 +109,7 @@ static noinline int create_subvol(struct btrfs_root *root, char *name, memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); root_item.drop_level = 0; + btrfs_tree_unlock(leaf); free_extent_buffer(leaf); leaf = NULL; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c new file mode 100644 index 000000000000..80813a307b4b --- /dev/null +++ b/fs/btrfs/locking.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "extent_io.h" +#include "locking.h" + +int btrfs_tree_lock(struct extent_buffer *eb) +{ + lock_page(eb->first_page); + return 0; +} + +int btrfs_try_tree_lock(struct extent_buffer *eb) +{ + return TestSetPageLocked(eb->first_page); +} + +int btrfs_tree_unlock(struct extent_buffer *eb) +{ + WARN_ON(!PageLocked(eb->first_page)); + unlock_page(eb->first_page); + return 0; +} + +int btrfs_tree_locked(struct extent_buffer *eb) +{ + return PageLocked(eb->first_page); +} + diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h new file mode 100644 index 000000000000..2dab96d8280e --- /dev/null +++ b/fs/btrfs/locking.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_LOCKING_ +#define __BTRFS_LOCKING_ + +int btrfs_tree_lock(struct extent_buffer *eb); +int btrfs_tree_unlock(struct extent_buffer *eb); +int btrfs_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_lock(struct extent_buffer *eb); +#endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f04684f7fea3..1ed433a71493 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,6 +23,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "locking.h" static int total_trans = 0; extern struct kmem_cache *btrfs_trans_handle_cachep; @@ -96,8 +97,7 @@ static noinline int record_root_in_trans(struct btrfs_root *root) radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_DEFRAG_TAG); - root->commit_root = root->node; - extent_buffer_get(root->node); + root->commit_root = btrfs_root_node(root); } else { WARN_ON(1); } @@ -559,6 +559,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root = pending->root; struct extent_buffer *tmp; + struct extent_buffer *old; int ret; int namelen; u64 objectid; @@ -578,16 +579,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, key.offset = 1; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - extent_buffer_get(root->node); - btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp); - free_extent_buffer(tmp); + old = btrfs_lock_root_node(root); + btrfs_cow_block(trans, root, old, NULL, 0, &old); - btrfs_copy_root(trans, root, root->node, &tmp, objectid); + btrfs_copy_root(trans, root, old, &tmp, objectid); + btrfs_tree_unlock(old); + free_extent_buffer(old); btrfs_set_root_bytenr(new_root_item, tmp->start); btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, new_root_item); + btrfs_tree_unlock(tmp); 
free_extent_buffer(tmp); if (ret) goto fail; diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 155961c7b4d5..fab851d85383 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -181,6 +181,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, if (root->fs_info->extent_root == root) is_extent = 1; + goto out; + if (root->ref_cows == 0 && !is_extent) goto out; -- cgit v1.2.3 From b3c3da71ed88c06bc18a4d8919c6e4af3b933566 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 23 Jul 2008 12:12:13 -0400 Subject: Btrfs: Add version strings on module load Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 6 +++++- fs/btrfs/super.c | 3 +++ fs/btrfs/version.sh | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/version.sh (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 0e2dcc718de2..f8a38b0da7fb 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -14,8 +14,12 @@ else # Normal Makefile KERNELDIR := /lib/modules/`uname -r`/build -all: +all: version $(MAKE) -C $(KERNELDIR) M=`pwd` modules + +version: + bash version.sh + modules_install: $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install clean: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5e28cf5c2e85..4cb6aac5122e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -45,6 +45,7 @@ #include "print-tree.h" #include "xattr.h" #include "volumes.h" +#include "version.h" #define BTRFS_SUPER_MAGIC 0x9123683E @@ -605,6 +606,8 @@ static int __init init_btrfs_fs(void) err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; + + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); return 0; unregister_ioctl: diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh new file mode 100644 index 000000000000..fd9b53d39860 --- /dev/null +++ b/fs/btrfs/version.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# determine-version -- report a useful version for releases +# +# 
Copyright 2008, Aron Griffis +# Copyright 2008, Oracle +# Released under the GNU GPLv2 + +v="Btrfs v0.15" + +which hg > /dev/null +if [ $? == 0 ]; then + last=$(hg tags | grep -m1 -o '^v[0-9.]\+') + + # now check if the repo has commits since then... + if [[ $(hg id -t) == $last || \ + $(hg di -r "$last:." | awk '/^diff/{print $NF}' | sort -u) == .hgtags ]] + then + # check if it's dirty + if [[ $(hg id | cut -d' ' -f1) == *+ ]]; then + v=$last+ + else + v=$last + fi + else + # includes dirty flag + v=$last+$(hg id -i) + fi +fi + +echo "#ifndef __BUILD_VERSION" > .build-version.h +echo "#define __BUILD_VERSION" >> .build-version.h +echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h +echo "#endif" >> .build-version.h + +diff -q version.h .build-version.h >& /dev/null + +if [ $? == 0 ]; then + rm .build-version.h + exit 0 +fi + +mv .build-version.h version.h -- cgit v1.2.3 From 7b1287662304c3cb05cb38f5e3e2d69f386e8f10 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jul 2008 12:17:14 -0400 Subject: Btrfs: Create orphan inode records to prevent lost files after a crash Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/btrfs_inode.h | 3 + fs/btrfs/ctree.c | 2 +- fs/btrfs/ctree.h | 14 +++ fs/btrfs/disk-io.c | 2 + fs/btrfs/inode.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/orphan.c | 67 ++++++++++++++ 7 files changed, 322 insertions(+), 5 deletions(-) create mode 100644 fs/btrfs/orphan.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index f8a38b0da7fb..5a0fd7b0e3e7 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ - extent_io.o volumes.o async-thread.o ioctl.o locking.o + extent_io.o volumes.o 
async-thread.o ioctl.o locking.o orphan.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9f2a4ef944a7..df624fd735c1 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -39,6 +39,9 @@ struct btrfs_inode { struct posix_acl *i_acl; struct posix_acl *i_default_acl; + /* for keeping track of orphaned inodes */ + struct list_head i_orphan; + /* * transid of the trans_handle that last modified this inode */ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cdc713062b03..f2a94999c371 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2622,7 +2622,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, total_data += data_size[i]; } - total_size = total_data + (nr - 1) * sizeof(struct btrfs_item); + total_size = total_data + (nr * sizeof(struct btrfs_item)); ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); if (ret == 0) { return -EEXIST; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f87d7263f2d7..012ad529cb18 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -74,6 +74,9 @@ struct btrfs_ordered_sum; /* directory objectid inside the root tree */ #define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL +/* orphan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + /* * All files have objectids higher than this. 
*/ @@ -646,6 +649,9 @@ struct btrfs_root { /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; + + spinlock_t orphan_lock; + struct list_head orphan_list; }; /* @@ -657,6 +663,7 @@ struct btrfs_root { #define BTRFS_INODE_ITEM_KEY 1 #define BTRFS_INODE_REF_KEY 2 #define BTRFS_XATTR_ITEM_KEY 8 +#define BTRFS_ORPHAN_ITEM_KEY 9 /* reserve 2-15 close to the inode for later flexibility */ /* @@ -1560,6 +1567,13 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); + +/* orphan.c */ +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + /* inode-map.c */ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, struct btrfs_root *fs_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ec01062eb41d..d60923967347 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -732,7 +732,9 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->in_sysfs = 0; INIT_LIST_HEAD(&root->dirty_list); + INIT_LIST_HEAD(&root->orphan_list); spin_lock_init(&root->node_lock); + spin_lock_init(&root->orphan_lock); mutex_init(&root->objectid_mutex); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2d8853543a71..0c9ec8aa304a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -78,6 +78,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; +static void btrfs_truncate(struct inode *inode); + int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, int for_del) { @@ -826,6 +828,190 @@ zeroit: return -EIO; } +/* + * This creates an orphan entry for the given inode in case something goes 
+ * wrong in the middle of an unlink/truncate. + */ +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->orphan_lock); + + /* already on the orphan list, we're good */ + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->orphan_lock); + return 0; + } + + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + + spin_unlock(&root->orphan_lock); + + /* + * insert an orphan item to track this unlinked/truncated file + */ + ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * We have done the truncate/delete so we can go ahead and remove the orphan + * item for this particular inode. + */ +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->orphan_lock); + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->orphan_lock); + return 0; + } + + list_del_init(&BTRFS_I(inode)->i_orphan); + if (!trans) { + spin_unlock(&root->orphan_lock); + return 0; + } + + spin_unlock(&root->orphan_lock); + + ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * this cleans up any orphans that may be left on the list from the last use + * of this root. + */ +void btrfs_orphan_cleanup(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_key key, found_key; + struct btrfs_trans_handle *trans; + struct inode *inode; + int ret = 0, nr_unlink = 0, nr_truncate = 0; + + /* don't do orphan cleanup if the fs is readonly. 
*/ + if (root->inode->i_sb->s_flags & MS_RDONLY) + return; + + path = btrfs_alloc_path(); + if (!path) + return; + path->reada = -1; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = (u64)-1; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, root->inode); + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + printk(KERN_ERR "Error searching slot for orphan: %d" + "\n", ret); + break; + } + + /* + * if ret == 0 means we found what we were searching for, which + * is weird, but possible, so only screw with path if we didnt + * find the key and see if we have stuff that matches + */ + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + /* pull out the item */ + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* make sure the item matches what we want */ + if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) + break; + if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + break; + + /* release the path since we're done with it */ + btrfs_release_path(root, path); + + /* + * this is where we are basically btrfs_lookup, without the + * crossing root thing. we store the inode number in the + * offset of the orphan item. 
+ */ + inode = btrfs_iget_locked(root->inode->i_sb, + found_key.offset, root); + if (!inode) + break; + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + + /* have to set the location manually */ + BTRFS_I(inode)->location.objectid = inode->i_ino; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->orphan_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->orphan_lock); + + /* + * if this is a bad inode, means we actually succeeded in + * removing the inode, but not the orphan record, which means + * we need to manually delete the orphan since iput will just + * do a destroy_inode + */ + if (is_bad_inode(inode)) { + btrfs_orphan_del(trans, inode); + iput(inode); + continue; + } + + /* if we have links, this was a truncate, lets do that */ + if (inode->i_nlink) { + nr_truncate++; + btrfs_truncate(inode); + } else { + nr_unlink++; + } + + /* this will do delete_inode and everything for us */ + iput(inode); + } + + if (nr_unlink) + printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + if (nr_truncate) + printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + + btrfs_free_path(path); + btrfs_end_transaction(trans, root); +} + void btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; @@ -1067,6 +1253,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { struct btrfs_root *root; struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; int ret; unsigned long nr = 0; @@ -1080,6 +1267,10 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) btrfs_set_trans_block_group(trans, dir); ret = btrfs_unlink_trans(trans, root, dir, dentry); + + if (inode->i_nlink == 0) + ret = btrfs_orphan_add(trans, inode); + 
nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); @@ -1108,12 +1299,17 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); + err = btrfs_orphan_add(trans, inode); + if (err) + goto fail_trans; + /* now the directory is empty */ err = btrfs_unlink_trans(trans, root, dir, dentry); if (!err) { btrfs_i_size_write(inode, 0); } +fail_trans: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); fail: @@ -1131,6 +1327,9 @@ fail: * * csum items that cross the new i_size are truncated to the new size * as well. + * + * min_type is the minimum key type to truncate down to. If set to 0, this + * will kill all the items on this inode, including the INODE_ITEM_KEY. */ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1495,6 +1694,7 @@ void btrfs_delete_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) { + btrfs_orphan_del(NULL, inode); goto no_delete; } btrfs_wait_ordered_range(inode, 0, (u64)-1); @@ -1504,8 +1704,12 @@ void btrfs_delete_inode(struct inode *inode) btrfs_set_trans_block_group(trans, inode); ret = btrfs_truncate_in_trans(trans, root, inode, 0); - if (ret) + if (ret) { + btrfs_orphan_del(NULL, inode); goto no_delete_lock; + } + + btrfs_orphan_del(trans, inode); nr = trans->blocks_used; clear_inode(inode); @@ -1688,7 +1892,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct btrfs_root *root = bi->root; struct btrfs_root *sub_root = root; struct btrfs_key location; - int ret; + int ret, do_orphan = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -1706,6 +1910,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); + inode = btrfs_iget_locked(dir->i_sb, location.objectid, sub_root); if 
(!inode) @@ -1715,6 +1920,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, if (sub_root != root) { igrab(inode); sub_root->inode = inode; + do_orphan = 1; } BTRFS_I(inode)->root = sub_root; memcpy(&BTRFS_I(inode)->location, &location, @@ -1723,6 +1929,10 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unlock_new_inode(inode); } } + + if (unlikely(do_orphan)) + btrfs_orphan_cleanup(sub_root); + return d_splice_alias(inode, dentry); } @@ -2964,12 +3174,19 @@ static void btrfs_truncate(struct inode *inode) btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, inode->i_size); + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; /* FIXME, add redo link to tree so we don't leak on crash */ ret = btrfs_truncate_in_trans(trans, root, inode, BTRFS_EXTENT_DATA_KEY); btrfs_update_inode(trans, root, inode); - nr = trans->blocks_used; + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + +out: + nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); BUG_ON(ret); btrfs_btree_balance_dirty(root, nr); @@ -3046,6 +3263,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) btrfs_ordered_inode_tree_init(&ei->ordered_tree); ei->i_acl = BTRFS_ACL_NOT_CACHED; ei->i_default_acl = BTRFS_ACL_NOT_CACHED; + INIT_LIST_HEAD(&ei->i_orphan); return &ei->vfs_inode; } @@ -3062,6 +3280,14 @@ void btrfs_destroy_inode(struct inode *inode) BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) posix_acl_release(BTRFS_I(inode)->i_default_acl); + spin_lock(&BTRFS_I(inode)->root->orphan_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" + " list\n", inode->i_ino); + dump_stack(); + } + spin_unlock(&BTRFS_I(inode)->root->orphan_lock); + while(1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) @@ -3202,6 +3428,11 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry, 
ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry); if (ret) goto out_fail; + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_inode); + if (ret) + goto out_fail; + } } ret = btrfs_set_inode_index(new_dir, old_inode); if (ret) diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c new file mode 100644 index 000000000000..3c0d52af4f80 --- /dev/null +++ b/fs/btrfs/orphan.c @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_free_path(path); + return ret; +} + +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} -- cgit v1.2.3 From 31153d81284934601d08110ac7698fd9a535e4c0 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Mon, 28 Jul 2008 15:32:19 -0400 Subject: Btrfs: Add a leaf reference cache Much of the IO done while dropping snapshots is done looking up leaves in the filesystem trees to see if they point to any extents and to drop the references on any extents found. This creates a cache so that IO isn't required. 
Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 3 +- fs/btrfs/ctree.c | 4 +- fs/btrfs/ctree.h | 8 +- fs/btrfs/disk-io.c | 14 +++ fs/btrfs/extent-tree.c | 115 ++++++++++++++++++++++--- fs/btrfs/ref-cache.c | 226 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ref-cache.h | 72 ++++++++++++++++ fs/btrfs/transaction.c | 67 +++++++++++---- 8 files changed, 476 insertions(+), 33 deletions(-) create mode 100644 fs/btrfs/ref-cache.c create mode 100644 fs/btrfs/ref-cache.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 5a0fd7b0e3e7..a4b38177abdb 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ - extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + ref-cache.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ab4ac0365c7f..245eb00435dd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -165,7 +165,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); WARN_ON(btrfs_header_generation(buf) > trans->transid); - ret = btrfs_inc_ref(trans, new_root, buf); + ret = btrfs_inc_ref(trans, new_root, buf, 0); kfree(new_root); if (ret) @@ -232,7 +232,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (btrfs_header_generation(buf) != trans->transid) { different_trans = 1; - ret = btrfs_inc_ref(trans, root, buf); + ret = btrfs_inc_ref(trans, root, buf, 1); if (ret) return ret; } else { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 07d321552dbe..34ed23d64eb5 100644 --- a/fs/btrfs/ctree.h +++ 
b/fs/btrfs/ctree.h @@ -592,6 +592,10 @@ struct btrfs_fs_info { u64 last_alloc; u64 last_data_alloc; + spinlock_t ref_cache_lock; + u64 total_ref_cache_size; + u64 running_ref_cache_size; + u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; @@ -613,6 +617,8 @@ struct btrfs_root { spinlock_t node_lock; struct extent_buffer *commit_root; + struct btrfs_leaf_ref_tree *ref_tree; + struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; @@ -1430,7 +1436,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf); + struct extent_buffer *buf, int cache_ref); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 root_objectid, u64 ref_generation, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d60923967347..4f0e1d06c384 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -716,6 +716,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->node = NULL; root->inode = NULL; root->commit_root = NULL; + root->ref_tree = NULL; root->sectorsize = sectorsize; root->nodesize = nodesize; root->leafsize = leafsize; @@ -1165,12 +1166,19 @@ static int transaction_kthread(void *arg) vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); + printk("btrfs: total reference cache size %Lu\n", + root->fs_info->total_ref_cache_size); + mutex_lock(&root->fs_info->trans_mutex); cur = root->fs_info->running_transaction; if (!cur) { mutex_unlock(&root->fs_info->trans_mutex); goto sleep; } + + printk("btrfs: running reference cache size %Lu\n", + root->fs_info->running_ref_cache_size); + now = get_seconds(); if (now < cur->start_time || now - cur->start_time < 30) { mutex_unlock(&root->fs_info->trans_mutex); @@ -1233,6 
+1241,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->hash_lock); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; @@ -1699,6 +1708,11 @@ int close_ctree(struct btrfs_root *root) printk("btrfs: at unmount delalloc count %Lu\n", fs_info->delalloc_bytes); } + if (fs_info->total_ref_cache_size) { + printk("btrfs: at umount reference cache size %Lu\n", + fs_info->total_ref_cache_size); + } + if (fs_info->extent_root->node) free_extent_buffer(fs_info->extent_root->node); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cdfb4ff4b459..7b24f1511654 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -26,6 +26,7 @@ #include "transaction.h" #include "volumes.h" #include "locking.h" +#include "ref-cache.h" #define BLOCK_GROUP_DATA EXTENT_WRITEBACK #define BLOCK_GROUP_METADATA EXTENT_UPTODATE @@ -927,7 +928,7 @@ out: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf) + struct extent_buffer *buf, int cache_ref) { u64 bytenr; u32 nritems; @@ -937,6 +938,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int level; int ret; int faili; + int nr_file_extents = 0; if (!root->ref_cows) return 0; @@ -959,6 +961,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (disk_bytenr == 0) continue; + if (buf != root->commit_root) + nr_file_extents++; + mutex_lock(&root->fs_info->alloc_mutex); ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr, btrfs_file_extent_disk_num_bytes(buf, fi), @@ -988,6 +993,53 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, } } } + /* cache orignal leaf block's references */ + if (level == 0 && cache_ref && buf != root->commit_root) { + struct btrfs_leaf_ref *ref; + struct btrfs_extent_info 
*info; + + ref = btrfs_alloc_leaf_ref(nr_file_extents); + if (!ref) { + WARN_ON(1); + goto out; + } + + btrfs_item_key_to_cpu(buf, &ref->key, 0); + + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + ref->nritems = nr_file_extents; + info = ref->extents; + + for (i = 0; nr_file_extents > 0 && i < nritems; i++) { + u64 disk_bytenr; + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (disk_bytenr == 0) + continue; + + info->bytenr = disk_bytenr; + info->num_bytes = + btrfs_file_extent_disk_num_bytes(buf, fi); + info->objectid = key.objectid; + info->offset = key.offset; + info++; + } + + BUG_ON(!root->ref_tree); + ret = btrfs_add_leaf_ref(root, ref); + WARN_ON(ret); + btrfs_free_leaf_ref(ref); + } +out: return 0; fail: WARN_ON(1); @@ -2215,9 +2267,9 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return buf; } -static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *leaf) +static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *leaf) { u64 leaf_owner; u64 leaf_generation; @@ -2266,6 +2318,30 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } +static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_leaf_ref *ref) +{ + int i; + int ret; + struct btrfs_extent_info *info = ref->extents; + + mutex_unlock(&root->fs_info->alloc_mutex); + for (i = 0; i < ref->nritems; i++) { + mutex_lock(&root->fs_info->alloc_mutex); + ret = __btrfs_free_extent(trans, root, + info->bytenr, info->num_bytes, + ref->owner, 
ref->generation, + info->objectid, info->offset, 0); + mutex_unlock(&root->fs_info->alloc_mutex); + BUG_ON(ret); + info++; + } + mutex_lock(&root->fs_info->alloc_mutex); + + return 0; +} + static void noinline reada_walk_down(struct btrfs_root *root, struct extent_buffer *node, int slot) @@ -2341,6 +2417,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, struct extent_buffer *next; struct extent_buffer *cur; struct extent_buffer *parent; + struct btrfs_leaf_ref *ref; u32 blocksize; int ret; u32 refs; @@ -2370,7 +2447,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, btrfs_header_nritems(cur)) break; if (*level == 0) { - ret = drop_leaf_ref(trans, root, cur); + ret = drop_leaf_ref_no_cache(trans, root, cur); BUG_ON(ret); break; } @@ -2391,6 +2468,21 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, BUG_ON(ret); continue; } + + if (*level == 1) { + struct btrfs_key key; + btrfs_node_key_to_cpu(cur, &key, path->slots[*level]); + ref = btrfs_lookup_leaf_ref(root, &key); + if (ref) { + ret = drop_leaf_ref(trans, root, ref); + BUG_ON(ret); + btrfs_remove_leaf_ref(root, ref); + btrfs_free_leaf_ref(ref); + *level = 0; + break; + } + } + next = btrfs_find_tree_block(root, bytenr, blocksize); if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) { free_extent_buffer(next); @@ -2398,7 +2490,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, if (path->slots[*level] == 0) reada_walk_down(root, cur, path->slots[*level]); - next = read_tree_block(root, bytenr, blocksize, ptr_gen); cond_resched(); @@ -2435,17 +2526,19 @@ out: WARN_ON(*level >= BTRFS_MAX_LEVEL); if (path->nodes[*level] == root->node) { - root_owner = root->root_key.objectid; parent = path->nodes[*level]; + bytenr = path->nodes[*level]->start; } else { parent = path->nodes[*level + 1]; - root_owner = btrfs_header_owner(parent); + bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); } + blocksize = 
btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); root_gen = btrfs_header_generation(parent); - ret = __btrfs_free_extent(trans, root, path->nodes[*level]->start, - path->nodes[*level]->len, - root_owner, root_gen, 0, 0, 1); + + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, + root_owner, root_gen, 0, 0, 1); free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; *level += 1; diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c new file mode 100644 index 000000000000..95a9faeb9dc4 --- /dev/null +++ b/fs/btrfs/ref-cache.c @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include "ctree.h" +#include "ref-cache.h" +#include "transaction.h" + +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents) +{ + struct btrfs_leaf_ref *ref; + + ref = kmalloc(btrfs_leaf_ref_size(nr_extents), GFP_NOFS); + if (ref) { + memset(ref, 0, sizeof(*ref)); + atomic_set(&ref->usage, 1); + } + return ref; +} + +void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref) +{ + if (!ref) + return; + WARN_ON(atomic_read(&ref->usage) == 0); + if (atomic_dec_and_test(&ref->usage)) { + BUG_ON(ref->in_tree); + kfree(ref); + } +} + +static int comp_keys(struct btrfs_key *k1, struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +static struct rb_node *tree_insert(struct rb_root *root, struct btrfs_key *key, + struct rb_node *node) +{ + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct btrfs_leaf_ref *entry; + int ret; + + while(*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); + WARN_ON(!entry->in_tree); + + ret = comp_keys(key, &entry->key); + if (ret < 0) + p = &(*p)->rb_left; + else if (ret > 0) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); + entry->in_tree = 1; + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, struct btrfs_key *key) +{ + struct rb_node * n = root->rb_node; + struct btrfs_leaf_ref *entry; + int ret; + + while(n) { + entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); + WARN_ON(!entry->in_tree); + + ret = comp_keys(key, &entry->key); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return n; + } + return NULL; +} + +int 
btrfs_remove_leaf_refs(struct btrfs_root *root) +{ + struct rb_node *rb; + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (!tree) + return 0; + + spin_lock(&tree->lock); + while(!btrfs_leaf_ref_tree_empty(tree)) { + tree->last = NULL; + rb = rb_first(&tree->root); + ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node); + rb_erase(&ref->rb_node, &tree->root); + ref->in_tree = 0; + + spin_unlock(&tree->lock); + + btrfs_free_leaf_ref(ref); + + cond_resched(); + spin_lock(&tree->lock); + } + spin_unlock(&tree->lock); + return 0; +} + +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + struct btrfs_key *key) +{ + struct rb_node *rb; + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (!tree) + return NULL; + + spin_lock(&tree->lock); + if (tree->last && comp_keys(key, &tree->last->key) == 0) { + ref = tree->last; + } else { + rb = tree_search(&tree->root, key); + if (rb) { + ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node); + tree->last = ref; + } + } + if (ref) + atomic_inc(&ref->usage); + spin_unlock(&tree->lock); + return ref; +} + +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + int ret = 0; + struct rb_node *rb; + size_t size = btrfs_leaf_ref_size(ref->nritems); + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + struct btrfs_transaction *trans = root->fs_info->running_transaction; + + spin_lock(&tree->lock); + rb = tree_insert(&tree->root, &ref->key, &ref->rb_node); + if (rb) { + ret = -EEXIST; + } else { + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size += size; + if (trans && tree->generation == trans->transid) + root->fs_info->running_ref_cache_size += size; + spin_unlock(&root->fs_info->ref_cache_lock); + + tree->last = ref; + atomic_inc(&ref->usage); + } + spin_unlock(&tree->lock); + return ret; +} + +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) 
+{ + size_t size = btrfs_leaf_ref_size(ref->nritems); + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + struct btrfs_transaction *trans = root->fs_info->running_transaction; + + BUG_ON(!ref->in_tree); + spin_lock(&tree->lock); + + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size -= size; + if (trans && tree->generation == trans->transid) + root->fs_info->running_ref_cache_size -= size; + spin_unlock(&root->fs_info->ref_cache_lock); + + if (tree->last == ref) { + struct rb_node *next = rb_next(&ref->rb_node); + if (next) { + tree->last = rb_entry(next, struct btrfs_leaf_ref, + rb_node); + } else + tree->last = NULL; + } + + rb_erase(&ref->rb_node, &tree->root); + ref->in_tree = 0; + + spin_unlock(&tree->lock); + + btrfs_free_leaf_ref(ref); + return 0; +} + diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h new file mode 100644 index 000000000000..79ecc47110f2 --- /dev/null +++ b/fs/btrfs/ref-cache.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +struct btrfs_extent_info { + u64 bytenr; + u64 num_bytes; + u64 objectid; + u64 offset; +}; + +struct btrfs_leaf_ref { + struct rb_node rb_node; + struct btrfs_key key; + int in_tree; + atomic_t usage; + + u64 bytenr; + u64 owner; + u64 generation; + int nritems; + struct btrfs_extent_info extents[]; +}; + +struct btrfs_leaf_ref_tree { + struct rb_root root; + struct btrfs_leaf_ref *last; + u64 generation; + spinlock_t lock; +}; + +static inline size_t btrfs_leaf_ref_size(int nr_extents) +{ + return sizeof(struct btrfs_leaf_ref) + + sizeof(struct btrfs_extent_info) * nr_extents; +} + +static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) +{ + tree->root.rb_node = NULL; + tree->last = NULL; + tree->generation = 0; + spin_lock_init(&tree->lock); +} + +static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree) +{ + return RB_EMPTY_ROOT(&tree->root); +} + +void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree); +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents); +void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref); +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + struct btrfs_key *key); +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); +int btrfs_remove_leaf_refs(struct btrfs_root *root); +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 768b0d223e68..543e5ee4033a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -24,6 +24,7 @@ #include "disk-io.h" #include "transaction.h" #include "locking.h" +#include "ref-cache.h" static int total_trans = 0; extern struct kmem_cache *btrfs_trans_handle_cachep; @@ -31,6 +32,13 @@ extern struct kmem_cache *btrfs_transaction_cachep; #define BTRFS_ROOT_TRANS_TAG 0 +struct dirty_root { + struct list_head list; + struct btrfs_root *root; + struct btrfs_root *latest_root; + struct btrfs_leaf_ref_tree 
ref_tree; +}; + static noinline void put_transaction(struct btrfs_transaction *transaction) { WARN_ON(transaction->use_count == 0); @@ -84,6 +92,7 @@ static noinline int join_transaction(struct btrfs_root *root) static noinline int record_root_in_trans(struct btrfs_root *root) { + struct dirty_root *dirty; u64 running_trans_id = root->fs_info->running_transaction->transid; if (root->ref_cows && root->last_trans < running_trans_id) { WARN_ON(root == root->fs_info->extent_root); @@ -91,7 +100,25 @@ static noinline int record_root_in_trans(struct btrfs_root *root) radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + BUG_ON(!dirty); + dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); + BUG_ON(!dirty->root); + + dirty->latest_root = root; + INIT_LIST_HEAD(&dirty->list); + btrfs_leaf_ref_tree_init(&dirty->ref_tree); + dirty->ref_tree.generation = running_trans_id; + root->commit_root = btrfs_root_node(root); + root->ref_tree = &dirty->ref_tree; + + memcpy(dirty->root, root, sizeof(*root)); + spin_lock_init(&dirty->root->node_lock); + mutex_init(&dirty->root->objectid_mutex); + dirty->root->node = root->commit_root; + dirty->root->commit_root = NULL; } else { WARN_ON(1); } @@ -310,12 +337,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, return 0; } -struct dirty_root { - struct list_head list; - struct btrfs_root *root; - struct btrfs_root *latest_root; -}; - int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest, struct list_head *dead_list) @@ -325,8 +346,10 @@ int btrfs_add_dead_root(struct btrfs_root *root, dirty = kmalloc(sizeof(*dirty), GFP_NOFS); if (!dirty) return -ENOMEM; + btrfs_leaf_ref_tree_init(&dirty->ref_tree); dirty->root = root; dirty->latest_root = latest; + root->ref_tree = NULL; list_add(&dirty->list, dead_list); return 0; } @@ -354,11 +377,23 @@ static noinline int add_dirty_roots(struct 
btrfs_trans_handle *trans, radix_tree_tag_clear(radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + + BUG_ON(!root->ref_tree); + dirty = container_of(root->ref_tree, struct dirty_root, + ref_tree); + if (root->commit_root == root->node) { WARN_ON(root->node->start != btrfs_root_bytenr(&root->root_item)); + + BUG_ON(!btrfs_leaf_ref_tree_empty( + root->ref_tree)); free_extent_buffer(root->commit_root); root->commit_root = NULL; + root->ref_tree = NULL; + + kfree(dirty->root); + kfree(dirty); /* make sure to update the root on disk * so we get any updates to the block used @@ -370,23 +405,12 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, &root->root_item); continue; } - dirty = kmalloc(sizeof(*dirty), GFP_NOFS); - BUG_ON(!dirty); - dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); - BUG_ON(!dirty->root); memset(&root->root_item.drop_progress, 0, sizeof(struct btrfs_disk_key)); root->root_item.drop_level = 0; - - memcpy(dirty->root, root, sizeof(*root)); - dirty->root->node = root->commit_root; - dirty->latest_root = root; - spin_lock_init(&dirty->root->node_lock); - mutex_init(&dirty->root->objectid_mutex); - root->commit_root = NULL; - + root->ref_tree = NULL; root->root_key.offset = root->fs_info->generation; btrfs_set_root_bytenr(&root->root_item, root->node->start); @@ -409,6 +433,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, list_add(&dirty->list, list); } else { WARN_ON(1); + free_extent_buffer(dirty->root->node); kfree(dirty->root); kfree(dirty); } @@ -514,6 +539,8 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, ret = btrfs_end_transaction(trans, tree_root); BUG_ON(ret); + btrfs_remove_leaf_refs(dirty->root); + free_extent_buffer(dirty->root->node); kfree(dirty->root); kfree(dirty); @@ -698,6 +725,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, &dirty_fs_roots); BUG_ON(ret); + spin_lock(&root->fs_info->ref_cache_lock); + 
root->fs_info->running_ref_cache_size = 0; + spin_unlock(&root->fs_info->ref_cache_lock); + ret = btrfs_commit_tree_roots(trans, root); BUG_ON(ret); -- cgit v1.2.3 From be6e8dc0ba84029997075a1ec77b4ddb863cbe15 Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Mon, 21 Jul 2008 02:01:56 +0530 Subject: NFS support for btrfs - v3 Date: Mon, 21 Jul 2008 02:01:56 +0530 Here's an implementation of NFS support for btrfs. It relies on the fixes which are going in to 2.6.28 for the NFS readdir/lookup deadlock. This uses the btrfs_iget helper introduced previously. [dwmw2: Tidy up a little, switch to d_obtain_alias() w/compat routine, change fh_type, store parent's root object ID where needed, fix some get_parent() and fs_to_dentry() bugs] Signed-off-by: Balaji Rao Signed-off-by: David Woodhouse Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/compat.h | 17 +++++ fs/btrfs/export.c | 208 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/export.h | 19 +++++ fs/btrfs/super.c | 2 + 5 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/export.c create mode 100644 fs/btrfs/export.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a4b38177abdb..75f8818cbfee 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,7 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o + ref-cache.o export.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h index b3349a66999d..d45fb37887bc 100644 --- a/fs/btrfs/compat.h +++ b/fs/btrfs/compat.h @@ -5,6 +5,23 @@ #define trylock_page(page) (!TestSetPageLocked(page)) #endif +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) +static inline struct dentry *d_obtain_alias(struct inode *inode) 
+{ + struct dentry *d; + + if (!inode) + return NULL; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + d = d_alloc_anon(inode); + if (!d) + iput(inode); + return d; +} +#endif + /* * Even if AppArmor isn't enabled, it still has different prototypes. * Add more distro/version pairs here to declare which has AppArmor applied. diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c new file mode 100644 index 000000000000..797b4cbc3786 --- /dev/null +++ b/fs/btrfs/export.c @@ -0,0 +1,208 @@ +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "compat.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) +#define FILEID_BTRFS_WITHOUT_PARENT 0x4d +#define FILEID_BTRFS_WITH_PARENT 0x4e +#define FILEID_BTRFS_WITH_PARENT_ROOT 0x4f +#endif + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, parent_objectid)/4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, parent_root_objectid)/4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid)/4) + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type; + + if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || + (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + return 255; + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = BTRFS_I(inode)->location.objectid; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + u64 parent_root_id; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; + + spin_unlock(&dentry->d_lock); + 
+ if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation) +{ + struct btrfs_root *root; + struct inode *inode; + struct dentry *result; + struct btrfs_key key; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + root = btrfs_lookup_fs_root(btrfs_sb(sb)->fs_info, root_objectid); + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) + return (void *)inode; + + if (generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + result = d_obtain_alias(inode); + if (!result) + return ERR_PTR(-ENOMEM); + + return result; +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != 
FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + struct inode *inode; + struct dentry *parent; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 nritems; + int slot; + u64 objectid; + int ret; + + path = btrfs_alloc_path(); + + key.objectid = dir->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + BUG_ON(ret == 0); + ret = 0; + + leaf = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(leaf); + if (slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) { + btrfs_free_path(path); + goto out; + } + leaf = path->nodes[0]; + slot = path->slots[0]; + } + + btrfs_free_path(path); + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) + goto out; + + objectid = key.offset; + + /* Build a new key for the inode item */ + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + + parent = d_obtain_alias(inode); + if (!parent) + parent = ERR_PTR(-ENOMEM); + + return parent; + +out: + btrfs_free_path(path); + return ERR_PTR(-EINVAL); +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, +}; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h new file mode 100644 index 
000000000000..074348a95841 --- /dev/null +++ b/fs/btrfs/export.h @@ -0,0 +1,19 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +#endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index eb4b357d05e1..e830e0ed409a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -46,6 +46,7 @@ #include "xattr.h" #include "volumes.h" #include "version.h" +#include "export.h" #define BTRFS_SUPER_MAGIC 0x9123683E @@ -303,6 +304,7 @@ static int btrfs_fill_super(struct super_block * sb, sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; + sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= MS_POSIXACL; -- cgit v1.2.3 From 615f996fb8185a0bc02812ebd72cb77ded5645f1 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 19 Aug 2008 19:21:57 +0100 Subject: Switch btrfs_name_hash() to crc32c Date: Tue, 19 Aug 2008 19:21:57 +0100 Using a 64-bit hash as the readdir cookie is just asking for trouble. And gets it, when we try to export the file system by NFS. 
Signed-off-by: David Woodhouse Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/hash.c | 112 ------------------------------------------------------ fs/btrfs/hash.h | 7 +++- 3 files changed, 7 insertions(+), 114 deletions(-) delete mode 100644 fs/btrfs/hash.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 75f8818cbfee..8213bba1de9f 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -3,7 +3,7 @@ ifneq ($(KERNELRELEASE),) obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ - hash.o file-item.o inode-item.o inode-map.o disk-io.o \ + file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c deleted file mode 100644 index 21037cfca9bd..000000000000 --- a/fs/btrfs/hash.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -/* - * Original copy from: - * linux/fs/ext3/hash.c - * - * Copyright (C) 2002 by Theodore Ts'o - * - * This file is released under the GPL v2. - * - * This file may be redistributed under the terms of the GNU Public - * License. 
- */ - -#include -#include "hash.h" -#define DELTA 0x9E3779B9 - -static void TEA_transform(__u32 buf[2], __u32 const in[]) -{ - __u32 sum = 0; - __u32 b0 = buf[0], b1 = buf[1]; - __u32 a = in[0], b = in[1], c = in[2], d = in[3]; - int n = 16; - - do { - sum += DELTA; - b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); - b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); - } while(--n); - - buf[0] += b0; - buf[1] += b1; -} - -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) -{ - __u32 pad, val; - int i; - - pad = (__u32)len | ((__u32)len << 8); - pad |= pad << 16; - - val = pad; - if (len > num*4) - len = num * 4; - for (i=0; i < len; i++) { - if ((i % 4) == 0) - val = pad; - val = msg[i] + (val << 8); - if ((i % 4) == 3) { - *buf++ = val; - val = pad; - num--; - } - } - if (--num >= 0) - *buf++ = val; - while (--num >= 0) - *buf++ = pad; -} - -u64 btrfs_name_hash(const char *name, int len) -{ - __u32 hash; - __u32 minor_hash = 0; - const char *p; - __u32 in[8], buf[4]; - u64 hash_result; - - if (len == 1 && *name == '.') { - return 1; - } else if (len == 2 && name[0] == '.' 
&& name[1] == '.') { - return 2; - } - - /* Initialize the default seed for the hash checksum functions */ - buf[0] = 0x67452301; - buf[1] = 0xefcdab89; - buf[2] = 0x98badcfe; - buf[3] = 0x10325476; - - p = name; - while (len > 0) { - str2hashbuf(p, len, in, 4); - TEA_transform(buf, in); - len -= 16; - p += 16; - } - hash = buf[0]; - minor_hash = buf[1]; - hash_result = buf[0]; - hash_result <<= 32; - hash_result |= buf[1]; - return hash_result; -} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 868ee17ca77a..2a020b276768 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -18,5 +18,10 @@ #ifndef __HASH__ #define __HASH__ -u64 btrfs_name_hash(const char *name, int len); + +#include "crc32c.h" +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return btrfs_crc32c((u32)~1, name, len); +} #endif -- cgit v1.2.3 From eab922ec8907b8c506e799785e7e2d16eabe50e4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 28 Aug 2008 06:21:15 -0400 Subject: Btrfs: compile when posix acl's are disabled This patch makes btrfs so it will compile properly when acls are disabled. I tested this and it worked with CONFIG_FS_POSIX_ACL off and on. 
Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 3 +-- fs/btrfs/acl.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 8213bba1de9f..3a01065d4ef1 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,9 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o export.o + ref-cache.o acl.o export.o -btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else # Normal Makefile diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index b192659b4f5f..b95147ef1c77 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -27,6 +27,8 @@ #include "btrfs_inode.h" #include "xattr.h" +#ifdef CONFIG_FS_POSIX_ACL + static void btrfs_update_cached_acl(struct inode *inode, struct posix_acl **p_acl, struct posix_acl *acl) @@ -329,3 +331,22 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { .get = btrfs_xattr_acl_access_get, .set = btrfs_xattr_acl_access_set, }; + +#else /* CONFIG_FS_POSIX_ACL */ + +int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + return 0; +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + return 0; +} + +#endif /* CONFIG_FS_POSIX_ACL */ -- cgit v1.2.3 From e02119d5a7b4396c5a872582fddc8bd6d305a70a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 5 Sep 2008 16:13:11 -0400 Subject: Btrfs: Add a write ahead tree log to optimize synchronous operations File syncs and directory syncs are optimized by copying their items into a special (copy-on-write) log tree. There is one log tree per subvolume and the btrfs super block points to a tree of log tree roots. After a crash, items are copied out of the log tree and back into the subvolume. 
See tree-log.c for all the details. Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 3 +- fs/btrfs/btrfs_inode.h | 8 + fs/btrfs/compat.h | 15 + fs/btrfs/ctree.c | 73 +- fs/btrfs/ctree.h | 72 +- fs/btrfs/dir-item.c | 3 + fs/btrfs/disk-io.c | 138 ++- fs/btrfs/disk-io.h | 8 +- fs/btrfs/extent-tree.c | 93 +- fs/btrfs/file.c | 39 +- fs/btrfs/inode.c | 261 ++--- fs/btrfs/root-tree.c | 5 +- fs/btrfs/transaction.c | 45 +- fs/btrfs/transaction.h | 1 + fs/btrfs/tree-defrag.c | 4 +- fs/btrfs/tree-log.c | 2804 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tree-log.h | 41 + 17 files changed, 3408 insertions(+), 205 deletions(-) create mode 100644 fs/btrfs/tree-log.c create mode 100644 fs/btrfs/tree-log.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3a01065d4ef1..b7addbfd8c22 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,8 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o acl.o export.o - + ref-cache.o export.o tree-log.o acl.o else # Normal Makefile diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 111f90524ae1..fcc8cf27e906 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -33,6 +33,7 @@ struct btrfs_inode { struct extent_io_tree io_failure_tree; struct mutex csum_mutex; struct mutex extent_mutex; + struct mutex log_mutex; struct inode vfs_inode; struct btrfs_ordered_inode_tree ordered_tree; @@ -44,10 +45,17 @@ struct btrfs_inode { struct list_head delalloc_inodes; + /* full 64 bit generation number */ + u64 generation; + /* * transid of the trans_handle that last modified this inode */ u64 last_trans; + /* + * transid that last logged this inode + */ + u64 logged_trans; u64 delalloc_bytes; u64 disk_i_size; u32 flags; diff --git 
a/fs/btrfs/compat.h b/fs/btrfs/compat.h index d45fb37887bc..b0ed1887d9b1 100644 --- a/fs/btrfs/compat.h +++ b/fs/btrfs/compat.h @@ -22,6 +22,21 @@ static inline struct dentry *d_obtain_alias(struct inode *inode) } #endif +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) +static inline void btrfs_drop_nlink(struct inode *inode) +{ + inode->i_nlink--; +} + +static inline void btrfs_inc_nlink(struct inode *inode) +{ + inode->i_nlink++; +} +#else +# define btrfs_drop_nlink(inode) drop_nlink(inode) +# define btrfs_inc_nlink(inode) inc_nlink(inode) +#endif + /* * Even if AppArmor isn't enabled, it still has different prototypes. * Add more distro/version pairs here to declare which has AppArmor applied. diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7114faafa9d4..579124043d9b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -60,7 +60,7 @@ void btrfs_free_path(struct btrfs_path *p) kmem_cache_free(btrfs_path_cachep, p); } -void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) +void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) { int i; @@ -176,7 +176,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } -int __btrfs_cow_block(struct btrfs_trans_handle *trans, +int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, @@ -294,7 +294,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } -int btrfs_cow_block(struct btrfs_trans_handle *trans, +int noinline btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, u64 prealloc_dest) @@ -677,9 +677,10 @@ static int noinline check_block(struct btrfs_root *root, * * slot may point to max if the key is bigger than all of the keys */ -static int generic_bin_search(struct extent_buffer *eb, unsigned long p, - int 
item_size, struct btrfs_key *key, - int max, int *slot) +static noinline int generic_bin_search(struct extent_buffer *eb, + unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) { int low = 0; int high = max; @@ -765,7 +766,7 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, return -1; } -static struct extent_buffer *read_node_slot(struct btrfs_root *root, +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); @@ -781,7 +782,7 @@ static struct extent_buffer *read_node_slot(struct btrfs_root *root, btrfs_node_ptr_generation(parent, slot)); } -static int balance_level(struct btrfs_trans_handle *trans, +static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { @@ -1128,8 +1129,9 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans, /* * readahead one full node of leaves */ -static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot, u64 objectid) +static noinline void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) { struct extent_buffer *node; struct btrfs_disk_key disk_key; @@ -1201,7 +1203,8 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, } } -static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock) +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock) { int i; int skip_level = level; @@ -1759,8 +1762,9 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root * * returns 0 on success and < 0 on failure */ -static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int level) +static noinline int split_node(struct btrfs_trans_handle *trans, + struct btrfs_root 
*root, + struct btrfs_path *path, int level) { u64 root_gen; struct extent_buffer *c; @@ -1874,7 +1878,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf) +int noinline btrfs_leaf_free_space(struct btrfs_root *root, + struct extent_buffer *leaf) { int nritems = btrfs_header_nritems(leaf); int ret; @@ -2283,9 +2288,11 @@ out: * * returns 0 if all went well and < 0 on failure. */ -static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *ins_key, - struct btrfs_path *path, int data_size, int extend) +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) { u64 root_gen; struct extent_buffer *l; @@ -3079,6 +3086,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) * was nothing in the tree that matched the search criteria. 
*/ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, struct btrfs_path *path, int cache_only, u64 min_trans) { @@ -3093,6 +3101,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, again: cur = btrfs_lock_root_node(root); level = btrfs_header_level(cur); + WARN_ON(path->nodes[level]); path->nodes[level] = cur; path->locks[level] = 1; @@ -3107,6 +3116,8 @@ again: /* at level = 0, we're done, setup the path and exit */ if (level == 0) { + if (slot >= nritems) + goto find_next_key; ret = 0; path->slots[level] = slot; btrfs_item_key_to_cpu(cur, &found_key, slot); @@ -3123,6 +3134,8 @@ again: u64 blockptr; u64 gen; struct extent_buffer *tmp; + struct btrfs_disk_key disk_key; + blockptr = btrfs_node_blockptr(cur, slot); gen = btrfs_node_ptr_generation(cur, slot); if (gen < min_trans) { @@ -3132,6 +3145,14 @@ again: if (!cache_only) break; + if (max_key) { + btrfs_node_key(cur, &disk_key, slot); + if (comp_keys(&disk_key, max_key) >= 0) { + ret = 1; + goto out; + } + } + tmp = btrfs_find_tree_block(root, blockptr, btrfs_level_size(root, level - 1)); @@ -3143,14 +3164,16 @@ again: free_extent_buffer(tmp); slot++; } +find_next_key: /* * we didn't find a candidate key in this node, walk forward * and find another one */ if (slot >= nritems) { - ret = btrfs_find_next_key(root, path, min_key, level, + path->slots[level] = slot; + sret = btrfs_find_next_key(root, path, min_key, level, cache_only, min_trans); - if (ret == 0) { + if (sret == 0) { btrfs_release_path(root, path); goto again; } else { @@ -3351,6 +3374,7 @@ int btrfs_previous_item(struct btrfs_root *root, { struct btrfs_key found_key; struct extent_buffer *leaf; + u32 nritems; int ret; while(1) { @@ -3362,9 +3386,20 @@ int btrfs_previous_item(struct btrfs_root *root, path->slots[0]--; } leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + 
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.type == type) return 0; + if (found_key.objectid < min_objectid) + break; + if (found_key.objectid == min_objectid && + found_key.type < type) + break; } return 1; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b305ae7e10b0..6532b60683ef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -77,6 +77,10 @@ struct btrfs_ordered_sum; /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + /* * All files have objectids higher than this. */ @@ -276,6 +280,7 @@ struct btrfs_super_block { __le64 generation; __le64 root; __le64 chunk_root; + __le64 log_root; __le64 total_bytes; __le64 bytes_used; __le64 root_dir_objectid; @@ -287,6 +292,7 @@ struct btrfs_super_block { __le32 sys_chunk_array_size; u8 root_level; u8 chunk_root_level; + u8 log_root_level; struct btrfs_dev_item dev_item; char label[BTRFS_LABEL_SIZE]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; @@ -392,7 +398,10 @@ struct btrfs_timespec { * make a new item type */ struct btrfs_inode_item { + /* nfs style generation number */ __le64 generation; + /* transid that last touched this inode */ + __le64 transid; __le64 size; __le64 nblocks; __le64 block_group; @@ -409,8 +418,13 @@ struct btrfs_inode_item { struct btrfs_timespec otime; } __attribute__ ((__packed__)); +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + struct btrfs_dir_item { struct btrfs_disk_key location; + __le64 transid; __le16 data_len; __le16 name_len; u8 type; @@ -505,6 +519,9 @@ struct btrfs_fs_info { struct btrfs_root *tree_root; struct btrfs_root *chunk_root; struct btrfs_root *dev_root; + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; struct radix_tree_root fs_roots_radix; struct extent_io_tree 
free_space_cache; @@ -518,6 +535,7 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; + u64 last_trans_new_blockgroup; u64 open_ioctl_trans; unsigned long mount_opt; u64 max_extent; @@ -527,6 +545,9 @@ struct btrfs_fs_info { wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; wait_queue_head_t async_submit_wait; + + wait_queue_head_t tree_log_wait; + struct btrfs_super_block super_copy; struct btrfs_super_block super_for_commit; struct block_device *__bdev; @@ -535,6 +556,7 @@ struct btrfs_fs_info { struct backing_dev_info bdi; spinlock_t hash_lock; struct mutex trans_mutex; + struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex alloc_mutex; @@ -544,8 +566,13 @@ struct btrfs_fs_info { struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; + atomic_t nr_async_submits; atomic_t nr_async_bios; + atomic_t tree_log_writers; + atomic_t tree_log_commit; + unsigned long tree_log_batch; + u64 tree_log_transid; /* * this is used by the balancing code to wait for all the pending @@ -583,6 +610,7 @@ struct btrfs_fs_info { struct completion kobj_unregister; int do_barriers; int closing; + int log_root_recovering; atomic_t throttles; atomic_t throttle_gen; @@ -596,6 +624,7 @@ struct btrfs_fs_info { u64 delalloc_bytes; u64 last_alloc; u64 last_data_alloc; + u64 last_log_alloc; spinlock_t ref_cache_lock; u64 total_ref_cache_size; @@ -632,6 +661,7 @@ struct btrfs_root { struct btrfs_leaf_ref_tree *ref_tree; struct btrfs_leaf_ref_tree ref_tree_struct; struct btrfs_dirty_root *dirty_root; + struct btrfs_root *log_root; struct btrfs_root_item root_item; struct btrfs_key root_key; @@ -640,6 +670,7 @@ struct btrfs_root { struct kobject root_kobj; struct completion kobj_unregister; struct mutex objectid_mutex; + struct mutex log_mutex; u64 objectid; u64 last_trans; @@ -692,6 +723,8 @@ struct btrfs_root { * dir items are the name -> inode pointers in a directory. 
There is one * for every name in a directory. */ +#define BTRFS_DIR_LOG_ITEM_KEY 14 +#define BTRFS_DIR_LOG_INDEX_KEY 15 #define BTRFS_DIR_ITEM_KEY 16 #define BTRFS_DIR_INDEX_KEY 17 /* @@ -703,7 +736,8 @@ struct btrfs_root { */ #define BTRFS_CSUM_ITEM_KEY 19 -/* reserve 20-31 for other file stuff */ + +/* reserve 21-31 for other file/dir stuff */ /* * root items point to tree roots. There are typically in the root @@ -938,6 +972,7 @@ BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); /* struct btrfs_inode_item */ BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); BTRFS_SETGET_FUNCS(inode_nblocks, struct btrfs_inode_item, nblocks, 64); BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); @@ -1126,10 +1161,13 @@ static inline void btrfs_set_item_key(struct extent_buffer *eb, write_eb_member(eb, item, struct btrfs_item, key, disk_key); } +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); static inline void btrfs_dir_item_key(struct extent_buffer *eb, struct btrfs_dir_item *item, @@ -1301,7 +1339,11 @@ BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, chunk_root, 64); BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, - chunk_root_level, 64); + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, + log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + 
log_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, @@ -1405,6 +1447,12 @@ static inline struct dentry *fdentry(struct file *file) { } /* extent-tree.c */ +int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 len); +int btrfs_update_pinned_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int pin); +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, u64 bytenr); @@ -1448,6 +1496,11 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 ref_generation, u64 owner, u64 owner_offset, struct btrfs_key *ins); +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 ref_generation, + u64 owner, u64 owner_offset, + struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, @@ -1488,9 +1541,9 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, int cache_only, u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, struct btrfs_path *path, int cache_only, u64 min_trans); - int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, @@ -1656,6 +1709,18 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans, #define PageChecked PageFsMisc #endif +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len); +int btrfs_add_link(struct 
btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, + u32 min_type); + int btrfs_start_delalloc_inodes(struct btrfs_root *root); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); int btrfs_writepages(struct address_space *mapping, @@ -1715,6 +1780,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* file.c */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern struct file_operations btrfs_file_operations; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 125094617fe8..e4f30090d640 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -96,6 +96,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); btrfs_set_dir_data_len(leaf, dir_item, data_len); name_ptr = (unsigned long)(dir_item + 1); data_ptr = (unsigned long)((char *)name_ptr + name_len); @@ -142,6 +143,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_dir_type(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name, name_ptr, name_len); @@ -169,6 +171,7 @@ second_insert: btrfs_set_dir_type(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); 
btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name, name_ptr, name_len); btrfs_mark_buffer_dirty(leaf); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8e7a938bfbc7..a4373db5967a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -41,6 +41,7 @@ #include "async-thread.h" #include "locking.h" #include "ref-cache.h" +#include "tree-log.h" #if 0 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) @@ -694,6 +695,18 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, } +int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1, WB_SYNC_NONE); +} + +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, + buf->start, buf->start + buf->len -1); +} + struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid) { @@ -732,15 +745,6 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, return 0; } -int wait_on_tree_block_writeback(struct btrfs_root *root, - struct extent_buffer *buf) -{ - struct inode *btree_inode = root->fs_info->btree_inode; - wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->io_tree, - buf); - return 0; -} - static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, @@ -771,6 +775,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->node_lock); spin_lock_init(&root->list_lock); mutex_init(&root->objectid_mutex); + mutex_init(&root->log_mutex); btrfs_leaf_ref_tree_init(&root->ref_tree_struct); root->ref_tree = &root->ref_tree_struct; @@ -809,11 +814,74 @@ static int 
find_and_setup_root(struct btrfs_root *tree_root, return 0; } -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info, - struct btrfs_key *location) +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct extent_buffer *eb; + int ret; + + if (!fs_info->log_root_tree) + return 0; + + eb = fs_info->log_root_tree->node; + + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) != 0); + + ret = btrfs_free_extent(trans, fs_info->tree_root, + eb->start, eb->len, + BTRFS_TREE_LOG_OBJECTID, 0, 0, 0, 1); + BUG_ON(ret); + + free_extent_buffer(eb); + kfree(fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + return 0; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { struct btrfs_root *root; struct btrfs_root *tree_root = fs_info->tree_root; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return -ENOMEM; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + root->ref_cows = 0; + + root->node = btrfs_alloc_free_block(trans, root, root->leafsize, + BTRFS_TREE_LOG_OBJECTID, + 0, 0, 0, 0, 0); + + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + btrfs_set_header_bytenr(root->node, root->node->start); + btrfs_set_header_generation(root->node, trans->transid); + btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); + + write_extent_buffer(root->node, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(root->node), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(root->node); + btrfs_tree_unlock(root->node); + fs_info->log_root_tree = root; + return 0; +} + +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 
+ struct btrfs_key *location) +{ + struct btrfs_root *root; + struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; struct extent_buffer *l; u64 highest_inode; @@ -863,11 +931,13 @@ out: blocksize, 0); BUG_ON(!root->node); insert: - root->ref_cows = 1; - ret = btrfs_find_highest_inode(root, &highest_inode); - if (ret == 0) { - root->highest_inode = highest_inode; - root->last_inode_alloc = highest_inode; + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { + root->ref_cows = 1; + ret = btrfs_find_highest_inode(root, &highest_inode); + if (ret == 0) { + root->highest_inode = highest_inode; + root->last_inode_alloc = highest_inode; + } } return root; } @@ -907,7 +977,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, if (root) return root; - root = btrfs_read_fs_root_no_radix(fs_info, location); + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); if (IS_ERR(root)) return root; ret = radix_tree_insert(&fs_info->fs_roots_radix, @@ -1250,16 +1320,18 @@ struct btrfs_root *open_ctree(struct super_block *sb, u32 blocksize; u32 stripesize; struct buffer_head *bh; - struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root), + struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root), + struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), GFP_NOFS); - struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root), + struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root), + struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + struct btrfs_root *log_tree_root; + int ret; int err = -EINVAL; @@ -1343,6 +1415,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, 
GFP_NOFS); mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); mutex_init(&fs_info->alloc_mutex); mutex_init(&fs_info->chunk_mutex); @@ -1352,6 +1425,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->async_submit_wait); + init_waitqueue_head(&fs_info->tree_log_wait); + atomic_set(&fs_info->tree_log_commit, 0); + atomic_set(&fs_info->tree_log_writers, 0); + fs_info->tree_log_transid = 0; #if 0 ret = add_hasher(fs_info, "crc32c"); @@ -1532,7 +1609,26 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!fs_info->transaction_kthread) goto fail_cleaner; + if (btrfs_super_log_root(disk_super) != 0) { + u32 blocksize; + u64 bytenr = btrfs_super_log_root(disk_super); + + blocksize = + btrfs_level_size(tree_root, + btrfs_super_log_root_level(disk_super)); + log_tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + blocksize, 0); + ret = btrfs_recover_log_trees(log_tree_root); + BUG_ON(ret); + } + fs_info->last_trans_committed = btrfs_super_generation(disk_super); return tree_root; fail_cleaner: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2562a273ae18..6b6fdc697f31 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -45,7 +45,7 @@ struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *location, const char *name, int namelen); -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info, +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, struct btrfs_key *location); struct btrfs_root *btrfs_read_fs_root_no_name(struct 
btrfs_fs_info *fs_info, struct btrfs_key *location); @@ -74,4 +74,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, extent_submit_bio_hook_t *submit_bio_hook); int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); +int btrfs_write_tree_block(struct extent_buffer *buf); +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e63b3b4bed7c..646b9148ca21 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -496,6 +496,23 @@ static int match_extent_ref(struct extent_buffer *leaf, return ret == 0; } +/* simple helper to search for an existing extent at a given offset */ +int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 len) +{ + int ret; + struct btrfs_key key; + + maybe_lock_mutex(root); + key.objectid = start; + key.offset = len; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, + 0, 0); + maybe_unlock_mutex(root); + return ret; +} + static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 bytenr, @@ -1409,7 +1426,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) } -static int update_pinned_extents(struct btrfs_root *root, +int btrfs_update_pinned_extents(struct btrfs_root *root, u64 bytenr, u64 num, int pin) { u64 len; @@ -1492,7 +1509,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, EXTENT_DIRTY); if (ret) break; - update_pinned_extents(root, start, end + 1 - start, 0); + btrfs_update_pinned_extents(root, start, end + 1 - start, 0); 
clear_extent_dirty(unpin, start, end, GFP_NOFS); set_extent_dirty(free_space_cache, start, end, GFP_NOFS); if (need_resched()) { @@ -1538,14 +1555,11 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED, GFP_NOFS); - eb = btrfs_find_tree_block(extent_root, ins.objectid, + eb = btrfs_find_create_tree_block(extent_root, ins.objectid, ins.offset); - if (!btrfs_buffer_uptodate(eb, trans->transid)) { - mutex_unlock(&extent_root->fs_info->alloc_mutex); + if (!btrfs_buffer_uptodate(eb, trans->transid)) btrfs_read_buffer(eb, trans->transid); - mutex_lock(&extent_root->fs_info->alloc_mutex); - } btrfs_tree_lock(eb); level = btrfs_header_level(eb); @@ -1585,13 +1599,20 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes, struct extent_buffer *buf; buf = btrfs_find_tree_block(root, bytenr, num_bytes); if (buf) { + /* we can reuse a block if it hasn't been written + * and it is from this transaction. We can't + * reuse anything from the tree log root because + * it has tiny sub-transactions. 
+ */ if (btrfs_buffer_uptodate(buf, 0) && btrfs_try_tree_lock(buf)) { u64 transid = root->fs_info->running_transaction->transid; u64 header_transid = btrfs_header_generation(buf); - if (header_transid == transid && + if (btrfs_header_owner(buf) != + BTRFS_TREE_LOG_OBJECTID && + header_transid == transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { clean_tree_block(NULL, root, buf); @@ -1603,7 +1624,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes, } free_extent_buffer(buf); } - update_pinned_extents(root, bytenr, num_bytes, 1); + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); } else { set_extent_bits(&root->fs_info->pending_del, bytenr, bytenr + num_bytes - 1, @@ -1801,7 +1822,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct GFP_NOFS); if (!test_range_bit(&extent_root->fs_info->extent_ins, start, end, EXTENT_LOCKED, 0)) { - update_pinned_extents(extent_root, start, + btrfs_update_pinned_extents(extent_root, start, end + 1 - start, 1); ret = __free_extent(trans, extent_root, start, end + 1 - start, @@ -1919,6 +1940,12 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { last_ptr = &root->fs_info->last_data_alloc; } + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + last_ptr = &root->fs_info->last_log_alloc; + if (!last_ptr == 0 && root->fs_info->last_alloc) { + *last_ptr = root->fs_info->last_alloc + empty_cluster; + } + } if (last_ptr) { if (*last_ptr) @@ -2268,6 +2295,35 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, maybe_unlock_mutex(root); return ret; } + +/* + * this is used by the tree logging recovery code. 
It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 ref_generation, + u64 owner, u64 owner_offset, + struct btrfs_key *ins) +{ + int ret; + struct btrfs_block_group_cache *block_group; + + maybe_lock_mutex(root); + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + cache_block_group(root, block_group); + + clear_extent_dirty(&root->fs_info->free_space_cache, + ins->objectid, ins->objectid + ins->offset - 1, + GFP_NOFS); + ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid, + ref_generation, owner, + owner_offset, ins); + maybe_unlock_mutex(root); + return ret; +} + /* * finds a free extent and does all the dirty work required for allocation * returns the key for the extent through ins, and a tree buffer for @@ -2350,9 +2406,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return buf; } -static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *leaf) +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf) { u64 leaf_owner; u64 leaf_generation; @@ -2402,9 +2457,9 @@ static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans, return 0; } -static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_leaf_ref *ref) +static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_leaf_ref *ref) { int i; int ret; @@ -2512,7 +2567,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, btrfs_header_nritems(cur)) break; if (*level == 0) { - ret = drop_leaf_ref_no_cache(trans, root, cur); + ret = btrfs_drop_leaf_ref(trans, root, cur); BUG_ON(ret); break; } @@ -2552,7 
+2607,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, btrfs_node_key_to_cpu(cur, &key, path->slots[*level]); ref = btrfs_lookup_leaf_ref(root, bytenr); if (ref) { - ret = drop_leaf_ref(trans, root, ref); + ret = cache_drop_leaf_ref(trans, root, ref); BUG_ON(ret); btrfs_remove_leaf_ref(root, ref); btrfs_free_leaf_ref(root, ref); @@ -3628,6 +3683,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; block_group_cache = &root->fs_info->block_group_cache; + root->fs_info->last_trans_new_blockgroup = trans->transid; + cache = kzalloc(sizeof(*cache), GFP_NOFS); BUG_ON(!cache); cache->key.objectid = chunk_offset; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e9e86fbaa243..84ecf3ab8511 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,6 +36,8 @@ #include "btrfs_inode.h" #include "ioctl.h" #include "print-tree.h" +#include "tree-log.h" +#include "locking.h" #include "compat.h" @@ -988,10 +990,27 @@ out_nolock: *ppos = pos; if (num_written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - err = sync_page_range(inode, inode->i_mapping, - start_pos, num_written); + struct btrfs_trans_handle *trans; + + err = btrfs_fdatawrite_range(inode->i_mapping, start_pos, + start_pos + num_written -1, + WB_SYNC_NONE); + if (err < 0) + num_written = err; + + err = btrfs_wait_on_page_writeback_range(inode->i_mapping, + start_pos, start_pos + num_written - 1); if (err < 0) num_written = err; + + trans = btrfs_start_transaction(root, 1); + ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + if (ret == 0) { + btrfs_sync_log(trans, root); + btrfs_end_transaction(trans, root); + } else { + btrfs_commit_transaction(trans, root); + } } else if (num_written > 0 && (file->f_flags & O_DIRECT)) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) do_sync_file_range(file, start_pos, @@ -1019,8 +1038,7 @@ int btrfs_release_file(struct inode * inode, struct file * filp) return 0; } -static int 
btrfs_sync_file(struct file *file, - struct dentry *dentry, int datasync) +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -1043,6 +1061,8 @@ static int btrfs_sync_file(struct file *file, } mutex_unlock(&root->fs_info->trans_mutex); + filemap_fdatawait(inode->i_mapping); + /* * ok we haven't committed the transaction yet, lets do a commit */ @@ -1054,7 +1074,16 @@ static int btrfs_sync_file(struct file *file, ret = -ENOMEM; goto out; } - ret = btrfs_commit_transaction(trans, root); + + ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); + } else { + btrfs_sync_log(trans, root); + ret = btrfs_end_transaction(trans, root); + } out: return ret > 0 ? EIO : ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 43d3f2649ca3..65df9d830230 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -46,6 +46,8 @@ #include "volumes.h" #include "ordered-data.h" #include "xattr.h" +#include "compat.h" +#include "tree-log.h" struct btrfs_iget_args { u64 ino; @@ -586,6 +588,7 @@ nocow: &ordered_extent->list); btrfs_ordered_update_i_size(inode, ordered_extent); + btrfs_update_inode(trans, root, inode); btrfs_remove_ordered_extent(inode, ordered_extent); /* once for us */ @@ -593,7 +596,6 @@ nocow: /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); return 0; } @@ -1007,7 +1009,8 @@ void btrfs_read_locked_inode(struct inode *inode) inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); inode->i_blocks = btrfs_inode_nblocks(leaf, inode_item); - inode->i_generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; rdev = 
btrfs_inode_rdev(leaf, inode_item); @@ -1056,7 +1059,8 @@ make_bad: make_bad_inode(inode); } -static void fill_inode_item(struct extent_buffer *leaf, +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, struct btrfs_inode_item *item, struct inode *inode) { @@ -1082,7 +1086,8 @@ static void fill_inode_item(struct extent_buffer *leaf, inode->i_ctime.tv_nsec); btrfs_set_inode_nblocks(leaf, item, inode->i_blocks); - btrfs_set_inode_generation(leaf, item, inode->i_generation); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_transid(leaf, item, trans->transid); btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); btrfs_set_inode_block_group(leaf, item, @@ -1112,7 +1117,7 @@ int noinline btrfs_update_inode(struct btrfs_trans_handle *trans, inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - fill_inode_item(leaf, inode_item, inode); + fill_inode_item(trans, leaf, inode_item, inode); btrfs_mark_buffer_dirty(leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; @@ -1122,14 +1127,12 @@ failed: } -static int btrfs_unlink_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, - struct dentry *dentry) +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) { struct btrfs_path *path; - const char *name = dentry->d_name.name; - int name_len = dentry->d_name.len; int ret = 0; struct extent_buffer *leaf; struct btrfs_dir_item *di; @@ -1160,13 +1163,12 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans, btrfs_release_path(root, path); ret = btrfs_del_inode_ref(trans, root, name, name_len, - dentry->d_inode->i_ino, - dentry->d_parent->d_inode->i_ino, &index); + inode->i_ino, + dir->i_ino, &index); if (ret) { printk("failed to delete reference to %.*s, " "inode %lu parent 
%lu\n", name_len, name, - dentry->d_inode->i_ino, - dentry->d_parent->d_inode->i_ino); + inode->i_ino, dir->i_ino); goto err; } @@ -1183,21 +1185,25 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans, ret = btrfs_delete_one_dir_name(trans, root, path, di); btrfs_release_path(root, path); - dentry->d_inode->i_ctime = dir->i_ctime; + ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, + inode, dir->i_ino); + BUG_ON(ret); + + ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, + dir, index); + BUG_ON(ret); err: btrfs_free_path(path); - if (!ret) { - btrfs_i_size_write(dir, dir->i_size - name_len * 2); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - btrfs_update_inode(trans, root, dir); -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - dentry->d_inode->i_nlink--; -#else - drop_nlink(dentry->d_inode); -#endif - ret = btrfs_update_inode(trans, root, dentry->d_inode); - dir->i_sb->s_dirt = 1; - } + if (ret) + goto out; + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + btrfs_update_inode(trans, root, dir); + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + dir->i_sb->s_dirt = 1; +out: return ret; } @@ -1218,7 +1224,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); - ret = btrfs_unlink_trans(trans, root, dir, dentry); + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); if (inode->i_nlink == 0) ret = btrfs_orphan_add(trans, inode); @@ -1256,7 +1263,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) goto fail_trans; /* now the directory is empty */ - err = btrfs_unlink_trans(trans, root, dir, dentry); + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); if (!err) { btrfs_i_size_write(inode, 0); } @@ -1283,10 
+1291,10 @@ fail: * min_type is the minimum key type to truncate down to. If set to 0, this * will kill all the items on this inode, including the INODE_ITEM_KEY. */ -static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, - u32 min_type) +noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) { int ret; struct btrfs_path *path; @@ -1307,7 +1315,9 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans, int extent_type = -1; u64 mask = root->sectorsize - 1; - btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1); + if (root->ref_cows) + btrfs_drop_extent_cache(inode, + new_size & (~mask), (u64)-1); path = btrfs_alloc_path(); path->reada = -1; BUG_ON(!path); @@ -1324,7 +1334,13 @@ search_again: goto error; } if (ret > 0) { - BUG_ON(path->slots[0] == 0); + /* there are no items in the tree for us to truncate, we're + * done + */ + if (path->slots[0] == 0) { + ret = 0; + goto error; + } path->slots[0]--; } @@ -1358,10 +1374,10 @@ search_again: } if (found_type == BTRFS_CSUM_ITEM_KEY) { ret = btrfs_csum_truncate(trans, root, path, - inode->i_size); + new_size); BUG_ON(ret); } - if (item_end < inode->i_size) { + if (item_end < new_size) { if (found_type == BTRFS_DIR_ITEM_KEY) { found_type = BTRFS_INODE_ITEM_KEY; } else if (found_type == BTRFS_EXTENT_ITEM_KEY) { @@ -1378,7 +1394,7 @@ search_again: btrfs_set_key_type(&key, found_type); goto next; } - if (found_key.offset >= inode->i_size) + if (found_key.offset >= new_size) del_item = 1; else del_item = 0; @@ -1394,7 +1410,7 @@ search_again: if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = inode->i_size - + extent_num_bytes = new_size - found_key.offset + root->sectorsize - 1; extent_num_bytes = extent_num_bytes & ~((u64)root->sectorsize - 1); @@ -1402,7 +1418,7 @@ search_again: 
extent_num_bytes); num_dec = (orig_num_bytes - extent_num_bytes); - if (extent_start != 0) + if (root->ref_cows && extent_start != 0) dec_i_blocks(inode, num_dec); btrfs_mark_buffer_dirty(leaf); } else { @@ -1413,22 +1429,29 @@ search_again: num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { found_extent = 1; - dec_i_blocks(inode, num_dec); + if (root->ref_cows) + dec_i_blocks(inode, num_dec); + } + if (root->ref_cows) { + root_gen = + btrfs_header_generation(leaf); } - root_gen = btrfs_header_generation(leaf); root_owner = btrfs_header_owner(leaf); } } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { if (!del_item) { - u32 newsize = inode->i_size - found_key.offset; - dec_i_blocks(inode, item_end + 1 - - found_key.offset - newsize); - newsize = - btrfs_file_extent_calc_inline_size(newsize); + u32 size = new_size - found_key.offset; + + if (root->ref_cows) { + dec_i_blocks(inode, item_end + 1 - + found_key.offset - size); + } + size = + btrfs_file_extent_calc_inline_size(size); ret = btrfs_truncate_item(trans, root, path, - newsize, 1); + size, 1); BUG_ON(ret); - } else { + } else if (root->ref_cows) { dec_i_blocks(inode, item_end + 1 - found_key.offset); } @@ -1666,7 +1689,7 @@ void btrfs_delete_inode(struct inode *inode) trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); - ret = btrfs_truncate_in_trans(trans, root, inode, 0); + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); if (ret) { btrfs_orphan_del(NULL, inode); goto no_delete_lock; @@ -1753,15 +1776,20 @@ static int fixup_tree_root_location(struct btrfs_root *root, return 0; } -static int btrfs_init_locked_inode(struct inode *inode, void *p) +static noinline void init_btrfs_i(struct inode *inode) { - struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; - BTRFS_I(inode)->root = args->root; - BTRFS_I(inode)->delalloc_bytes = 0; - inode->i_mapping->writeback_index = 0; - BTRFS_I(inode)->disk_i_size = 0; - 
BTRFS_I(inode)->index_cnt = (u64)-1; + struct btrfs_inode *bi = BTRFS_I(inode); + + bi->i_acl = NULL; + bi->i_default_acl = NULL; + + bi->generation = 0; + bi->last_trans = 0; + bi->logged_trans = 0; + bi->delalloc_bytes = 0; + bi->disk_i_size = 0; + bi->flags = 0; + bi->index_cnt = (u64)-1; extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); @@ -1771,6 +1799,15 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->csum_mutex); mutex_init(&BTRFS_I(inode)->extent_mutex); + mutex_init(&BTRFS_I(inode)->log_mutex); +} + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ + struct btrfs_iget_args *args = p; + inode->i_ino = args->ino; + init_btrfs_i(inode); + BTRFS_I(inode)->root = args->root; return 0; } @@ -2263,21 +2300,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * btrfs_get_inode_index_count has an explanation for the magic * number */ + init_btrfs_i(inode); BTRFS_I(inode)->index_cnt = 2; - - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, - inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, - inode->i_mapping, GFP_NOFS); - btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); - INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); - mutex_init(&BTRFS_I(inode)->csum_mutex); - mutex_init(&BTRFS_I(inode)->extent_mutex); - BTRFS_I(inode)->delalloc_bytes = 0; - inode->i_mapping->writeback_index = 0; - BTRFS_I(inode)->disk_i_size = 0; BTRFS_I(inode)->root = root; + BTRFS_I(inode)->generation = trans->transid; if (mode & S_IFDIR) owner = 0; @@ -2290,7 +2316,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, new_inode_group = group; } BTRFS_I(inode)->block_group = new_inode_group; - BTRFS_I(inode)->flags = 0; 
key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); @@ -2318,7 +2343,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - fill_inode_item(path->nodes[0], inode_item, inode); + fill_inode_item(trans, path->nodes[0], inode_item, inode); ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_inode_ref); @@ -2349,38 +2374,34 @@ static inline u8 btrfs_inode_type(struct inode *inode) return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; } -static int btrfs_add_link(struct btrfs_trans_handle *trans, - struct dentry *dentry, struct inode *inode, - int add_backref, u64 index) +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) { int ret; struct btrfs_key key; - struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root; - struct inode *parent_inode = dentry->d_parent->d_inode; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; key.objectid = inode->i_ino; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - ret = btrfs_insert_dir_item(trans, root, - dentry->d_name.name, dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode->i_ino, &key, btrfs_inode_type(inode), index); if (ret == 0) { if (add_backref) { ret = btrfs_insert_inode_ref(trans, root, - dentry->d_name.name, - dentry->d_name.len, - inode->i_ino, - parent_inode->i_ino, - index); + name, name_len, + inode->i_ino, + parent_inode->i_ino, + index); } btrfs_i_size_write(parent_inode, parent_inode->i_size + - dentry->d_name.len * 2); + name_len * 2); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, - dentry->d_parent->d_inode); + 
ret = btrfs_update_inode(trans, root, parent_inode); } return ret; } @@ -2389,7 +2410,9 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, struct dentry *dentry, struct inode *inode, int backref, u64 index) { - int err = btrfs_add_link(trans, dentry, inode, backref, index); + int err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, backref, index); if (!err) { d_instantiate(dentry, inode); return 0; @@ -2513,19 +2536,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_mapping->backing_dev_info = &root->fs_info->bdi; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, - inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, - inode->i_mapping, GFP_NOFS); - INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); - mutex_init(&BTRFS_I(inode)->csum_mutex); - mutex_init(&BTRFS_I(inode)->extent_mutex); - BTRFS_I(inode)->delalloc_bytes = 0; - BTRFS_I(inode)->disk_i_size = 0; - inode->i_mapping->writeback_index = 0; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); } dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); @@ -2556,11 +2567,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) return -ENOENT; -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - inode->i_nlink++; -#else - inc_nlink(inode); -#endif + btrfs_inc_nlink(inode); err = btrfs_check_free_space(root, 1, 0); if (err) goto fail; @@ -2650,7 +2657,9 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (err) goto out_fail; - err = btrfs_add_link(trans, dentry, inode, 0, index); + err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, 0, index); if 
(err) goto out_fail; @@ -3221,7 +3230,7 @@ static void btrfs_truncate(struct inode *inode) if (ret) goto out; /* FIXME, add redo link to tree so we don't leak on crash */ - ret = btrfs_truncate_in_trans(trans, root, inode, + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); btrfs_update_inode(trans, root, inode); @@ -3304,6 +3313,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) if (!ei) return NULL; ei->last_trans = 0; + ei->logged_trans = 0; btrfs_ordered_inode_tree_init(&ei->ordered_tree); ei->i_acl = BTRFS_ACL_NOT_CACHED; ei->i_default_acl = BTRFS_ACL_NOT_CACHED; @@ -3463,31 +3473,39 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry, btrfs_set_trans_block_group(trans, new_dir); - old_dentry->d_inode->i_nlink++; + btrfs_inc_nlink(old_dentry->d_inode); old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; - ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry); + ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); if (ret) goto out_fail; if (new_inode) { new_inode->i_ctime = CURRENT_TIME; - ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry); + ret = btrfs_unlink_inode(trans, root, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); if (ret) goto out_fail; if (new_inode->i_nlink == 0) { - ret = btrfs_orphan_add(trans, new_inode); + ret = btrfs_orphan_add(trans, new_dentry->d_inode); if (ret) goto out_fail; } + } ret = btrfs_set_inode_index(new_dir, old_inode, &index); if (ret) goto out_fail; - ret = btrfs_add_link(trans, new_dentry, old_inode, 1, index); + ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, + old_inode, new_dentry->d_name.name, + new_dentry->d_name.len, 1, index); if (ret) goto out_fail; @@ -3577,19 +3595,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 
inode->i_mapping->backing_dev_info = &root->fs_info->bdi; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, - inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, - inode->i_mapping, GFP_NOFS); - INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); - mutex_init(&BTRFS_I(inode)->csum_mutex); - mutex_init(&BTRFS_I(inode)->extent_mutex); - BTRFS_I(inode)->delalloc_bytes = 0; - BTRFS_I(inode)->disk_i_size = 0; - inode->i_mapping->writeback_index = 0; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); } dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); @@ -3691,6 +3697,7 @@ static struct file_operations btrfs_dir_file_operations = { .compat_ioctl = btrfs_ioctl, #endif .release = btrfs_release_file, + .fsync = btrfs_sync_file, }; static struct extent_io_ops btrfs_extent_io_ops = { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 36726696e58b..e3984f902e71 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -202,8 +202,9 @@ again: memcpy(&found_key, &key, sizeof(key)); key.offset++; btrfs_release_path(root, path); - dead_root = btrfs_read_fs_root_no_radix(root->fs_info, - &found_key); + dead_root = + btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &found_key); if (IS_ERR(dead_root)) { ret = PTR_ERR(dead_root); goto err; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index eff3ad72991b..49c4f5b40ed6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -25,6 +25,7 @@ #include "transaction.h" #include "locking.h" #include "ref-cache.h" +#include "tree-log.h" static int total_trans = 0; extern struct kmem_cache *btrfs_trans_handle_cachep; @@ -57,6 +58,7 @@ static noinline int join_transaction(struct btrfs_root *root) root->fs_info->generation++; 
root->fs_info->last_alloc = 0; root->fs_info->last_data_alloc = 0; + root->fs_info->last_log_alloc = 0; cur_trans->num_writers = 1; cur_trans->num_joined = 0; cur_trans->transid = root->fs_info->generation; @@ -83,7 +85,7 @@ static noinline int join_transaction(struct btrfs_root *root) return 0; } -static noinline int record_root_in_trans(struct btrfs_root *root) +noinline int btrfs_record_root_in_trans(struct btrfs_root *root) { struct btrfs_dirty_root *dirty; u64 running_trans_id = root->fs_info->running_transaction->transid; @@ -151,7 +153,7 @@ static void wait_current_trans(struct btrfs_root *root) } } -struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, +static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, int num_blocks, int wait) { struct btrfs_trans_handle *h = @@ -164,7 +166,7 @@ struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, ret = join_transaction(root); BUG_ON(ret); - record_root_in_trans(root); + btrfs_record_root_in_trans(root); h->transid = root->fs_info->running_transaction->transid; h->transaction = root->fs_info->running_transaction; h->blocks_reserved = num_blocks; @@ -456,6 +458,8 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, BUG_ON(!root->ref_tree); dirty = root->dirty_root; + btrfs_free_log(trans, root); + if (root->commit_root == root->node) { WARN_ON(root->node->start != btrfs_root_bytenr(&root->root_item)); @@ -600,7 +604,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, num_bytes -= btrfs_root_used(&dirty->root->root_item); bytes_used = btrfs_root_used(&root->root_item); if (num_bytes) { - record_root_in_trans(root); + btrfs_record_root_in_trans(root); btrfs_set_root_used(&root->root_item, bytes_used - num_bytes); } @@ -745,7 +749,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int ret; INIT_LIST_HEAD(&dirty_fs_roots); - mutex_lock(&root->fs_info->trans_mutex); if (trans->transaction->in_commit) { cur_trans = 
trans->transaction; @@ -821,10 +824,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, WARN_ON(cur_trans != trans->transaction); + /* btrfs_commit_tree_roots is responsible for getting the + * various roots consistent with each other. Every pointer + * in the tree of tree roots has to point to the most up to date + * root for every subvolume and other tree. So, we have to keep + * the tree logging code from jumping in and changing any + * of the trees. + * + * At this point in the commit, there can't be any tree-log + * writers, but a little lower down we drop the trans mutex + * and let new people in. By holding the tree_log_mutex + * from now until after the super is written, we avoid races + * with the tree-log code. + */ + mutex_lock(&root->fs_info->tree_log_mutex); + ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, &dirty_fs_roots); BUG_ON(ret); + /* add_dirty_roots gets rid of all the tree log roots, it is now + * safe to free the root of tree log roots + */ + btrfs_free_log_root_tree(trans, root->fs_info); + ret = btrfs_commit_tree_roots(trans, root); BUG_ON(ret); @@ -843,6 +866,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, chunk_root->node->start); btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, btrfs_header_level(chunk_root->node)); + + if (!root->fs_info->log_root_recovering) { + btrfs_set_super_log_root(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + } + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, sizeof(root->fs_info->super_copy)); @@ -857,6 +886,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, BUG_ON(ret); write_ctree_super(trans, root); + /* + * the super is written, we can safely allow the tree-loggers + * to go about their business + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + btrfs_finish_extent_commit(trans, root, pinned_copy); mutex_lock(&root->fs_info->trans_mutex); diff --git 
a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 598baa312417..cc63650d60d6 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -98,4 +98,5 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_root *root); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index cc2650b06952..b3bb5bbad76e 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -81,12 +81,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, memcpy(&key, &root->defrag_progress, sizeof(key)); } - path->lowest_level = 1; path->keep_locks = 1; if (cache_only) min_trans = root->defrag_trans_start; - ret = btrfs_search_forward(root, &key, path, cache_only, min_trans); + ret = btrfs_search_forward(root, &key, NULL, path, + cache_only, min_trans); if (ret < 0) goto out; if (ret > 0) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 000000000000..d1ce8314b948 --- /dev/null +++ b/fs/btrfs/tree-log.c @@ -0,0 +1,2804 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "compat.h" + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +#define LOG_INODE_ALL 0 +#define LOG_INODE_EXISTS 1 + +/* + * stages for the tree walking. The first + * stage (0) is to only pin down the blocks we find + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. + * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +#define LOG_WALK_PIN_ONLY 0 +#define LOG_WALK_REPLAY_INODES 1 +#define LOG_WALK_REPLAY_ALL 2 + +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree and 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction. Those items are copied into + * a special tree (one per subvolume root), that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree. Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed.
+ * + * The log tree is read three times, once to pin down all the extents it is + * using in ram and once, once to create all the inodes logged in the tree + * and once to do all the other items. + */ + +/* + * btrfs_add_log_tree adds a new per-subvolume log tree into the + * tree of log tree roots. This must be called with a tree log transaction + * running (see start_log_trans). + */ +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + int ret; + u64 objectid = root->root_key.objectid; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + BTRFS_TREE_LOG_OBJECTID, + 0, 0, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + return ret; + } + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nblocks = cpu_to_le64(1); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 0); + btrfs_set_root_used(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, 0); + + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = objectid; + 
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, + &root_item); + if (ret) + goto fail; + + new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, + &key); + BUG_ON(!new_root); + + WARN_ON(root->log_root); + root->log_root = new_root; + + /* + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). + */ + new_root->ref_cows = 0; + new_root->last_trans = trans->transid; +fail: + return ret; +} + +/* + * start a sub transaction and setup the log tree + * this increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); + BUG_ON(ret); + } + if (!root->log_root) { + ret = btrfs_add_log_tree(trans, root); + BUG_ON(ret); + } + atomic_inc(&root->fs_info->tree_log_writers); + root->fs_info->tree_log_batch++; + mutex_unlock(&root->fs_info->tree_log_mutex); + return 0; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + smp_mb(); + if (!root->log_root) + return -ENOENT; + + mutex_lock(&root->fs_info->tree_log_mutex); + if (root->log_root) { + ret = 0; + atomic_inc(&root->fs_info->tree_log_writers); + root->fs_info->tree_log_batch++; + } + mutex_unlock(&root->fs_info->tree_log_mutex); + return ret; +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +static int 
end_log_trans(struct btrfs_root *root) +{ + atomic_dec(&root->fs_info->tree_log_writers); + smp_mb(); + if (waitqueue_active(&root->fs_info->tree_log_wait)) + wake_up(&root->fs_info->tree_log_wait); + return 0; +} + + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree. The stage field tells us which part + * of the log tree processing we are currently doing. The others + * are state fields used for that specific part + */ +struct walk_control { + /* should we free the extent on disk when done? This is used + * at transaction commit time while freeing a log tree + */ + int free; + + /* should we write out the extent buffer? This is used + * while flushing the log tree to disk during a sync + */ + int write; + + /* should we wait for the extent buffer io to finish? Also used + * while flushing the log tree to disk for a sync + */ + int wait; + + /* pin only walk, we record which extents on disk belong to the + * log trees + */ + int pin; + + /* what stage of the replay code we're currently in */ + int stage; + + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + + /* the trans handle for the current replay */ + struct btrfs_trans_handle *trans; + + /* the function that gets used to process blocks we find in the + * tree. 
Note the extent_buffer might not be up to date when it is + * passed in, and it must be checked or read if you need the data + * inside it + */ + int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + if (wc->pin) { + mutex_lock(&log->fs_info->alloc_mutex); + btrfs_update_pinned_extents(log->fs_info->extent_root, + eb->start, eb->len, 1); + mutex_unlock(&log->fs_info->alloc_mutex); + } + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) + btrfs_write_tree_block(eb); + if (wc->wait) + btrfs_wait_tree_block_writeback(eb); + } + return 0; +} + +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. 
+ */ +static noinline int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size; + u64 saved_i_size = 0; + int save_old_i_size = 0; + unsigned long src_ptr; + unsigned long dst_ptr; + int overwrite_root = 0; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + + item_size = btrfs_item_size_nr(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* look for the key in the destination tree */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *src_copy; + char *dst_copy; + u32 dst_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; + + if (item_size == 0) { + btrfs_release_path(root, path); + return 0; + } + dst_copy = kmalloc(item_size, GFP_NOFS); + src_copy = kmalloc(item_size, GFP_NOFS); + + read_extent_buffer(eb, src_copy, src_ptr, item_size); + + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, + item_size); + ret = memcmp(dst_copy, src_copy, item_size); + + kfree(dst_copy); + kfree(src_copy); + /* + * they have the same contents, just return, this saves + * us from cowing blocks in the destination tree and doing + * extra writes that may not have been done by a previous + * sync + */ + if (ret == 0) { + btrfs_release_path(root, path); + return 0; + } + + } +insert: + btrfs_release_path(root, path); + /* try to insert the key into the destination tree */ + ret = btrfs_insert_empty_item(trans, root, path, + key, item_size); + + /* make sure any existing item is the correct size */ + if (ret == -EEXIST) { + u32 found_size; + found_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (found_size > item_size) { + btrfs_truncate_item(trans, root, path, item_size, 1); + } else if (found_size < item_size) { + ret = 
btrfs_del_item(trans, root, + path); + BUG_ON(ret); + + btrfs_release_path(root, path); + ret = btrfs_insert_empty_item(trans, + root, path, key, item_size); + BUG_ON(ret); + } + } else if (ret) { + BUG(); + } + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + + /* don't overwrite an existing inode if the generation number + * was logged as zero. This is done when the tree logging code + * is just logging an inode to make sure it exists after recovery. + * + * Also, don't overwrite i_size on directories during replay. + * log replay inserts and removes directory items based on the + * state of the tree found in the subvolume, and i_size is modified + * as it goes + */ + if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + struct btrfs_inode_item *src_item; + struct btrfs_inode_item *dst_item; + + src_item = (struct btrfs_inode_item *)src_ptr; + dst_item = (struct btrfs_inode_item *)dst_ptr; + + if (btrfs_inode_generation(eb, src_item) == 0) + goto no_copy; + + if (overwrite_root && + S_ISDIR(btrfs_inode_mode(eb, src_item)) && + S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + save_old_i_size = 1; + saved_i_size = btrfs_inode_size(path->nodes[0], + dst_item); + } + } + + copy_extent_buffer(path->nodes[0], eb, dst_ptr, + src_ptr, item_size); + + if (save_old_i_size) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + } + + /* make sure the generation is filled in */ + if (key->type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { + btrfs_set_inode_generation(path->nodes[0], dst_item, + trans->transid); + } + } +no_copy: + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(root, path); + return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be 
called for subvolume roots and not for the log
 *
 * Returns the inode, which the caller drops with iput() when done, or
 * NULL when no inode could be obtained or the inode turned out bad.
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
	/*
	 * NOTE(review): btrfs_iget_locked presumably wraps iget5_locked(),
	 * which hands back NULL when it cannot allocate an inode.  Callers
	 * here already cope with a NULL return, so bail out instead of
	 * dereferencing it.
	 */
	if (!inode)
		return NULL;

	if (inode->i_state & I_NEW) {
		/* freshly allocated: fill in the location and read it */
		BTRFS_I(inode)->root = root;
		BTRFS_I(inode)->location.objectid = objectid;
		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
		BTRFS_I(inode)->location.offset = 0;
		btrfs_read_locked_inode(inode);
		unlock_new_inode(inode);

	}
	if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
+ */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int found_type; + u64 mask = root->sectorsize - 1; + u64 extent_end; + u64 alloc_hint; + u64 start = key->offset; + struct btrfs_file_extent_item *item; + struct inode *inode = NULL; + unsigned long size; + int ret = 0; + + item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + + if (found_type == BTRFS_FILE_EXTENT_REG) + extent_end = start + btrfs_file_extent_num_bytes(eb, item); + else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_inline_len(eb, + btrfs_item_nr(eb, slot)); + extent_end = (start + size + mask) & ~mask; + } else { + ret = 0; + goto out; + } + + inode = read_one_inode(root, key->objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + /* + * first check to see if we already have this extent in the + * file. This must be done before the btrfs_drop_extents run + * so we don't try to drop this extent. 
+ */ + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + start, 0); + + if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) { + struct btrfs_file_extent_item cmp1; + struct btrfs_file_extent_item cmp2; + struct btrfs_file_extent_item *existing; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + existing = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + read_extent_buffer(eb, &cmp1, (unsigned long)item, + sizeof(cmp1)); + read_extent_buffer(leaf, &cmp2, (unsigned long)existing, + sizeof(cmp2)); + + /* + * we already have a pointer to this exact extent, + * we don't have to do anything + */ + if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + btrfs_release_path(root, path); + goto out; + } + } + btrfs_release_path(root, path); + + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, root, inode, + start, extent_end, start, &alloc_hint); + BUG_ON(ret); + + BUG_ON(ret); + if (found_type == BTRFS_FILE_EXTENT_REG) { + struct btrfs_key ins; + + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + + /* insert the extent pointer in the file */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + + /* + * is this extent already allocated in the extent + * allocation tree? 
If so, just add a reference + */ + ret = btrfs_lookup_extent(root, path, ins.objectid, ins.offset); + btrfs_release_path(root, path); + if (ret == 0) { + ret = btrfs_inc_extent_ref(trans, root, + ins.objectid, ins.offset, + root->root_key.objectid, + trans->transid, key->objectid, start); + } else { + /* + * insert the extent pointer in the extent + * allocation tree + */ + ret = btrfs_alloc_logged_extent(trans, root, + root->root_key.objectid, + trans->transid, key->objectid, + start, &ins); + BUG_ON(ret); + } + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + } + /* btrfs_drop_extents changes i_blocks, update it here */ + inode->i_blocks += (extent_end - start) >> 9; + btrfs_update_inode(trans, root, inode); +out: + if (inode) + iput(inode); + return ret; +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. 
+ * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *dir, + struct btrfs_dir_item *di) +{ + struct inode *inode; + char *name; + int name_len; + struct extent_buffer *leaf; + struct btrfs_key location; + int ret; + + leaf = path->nodes[0]; + + btrfs_dir_item_key_to_cpu(leaf, di, &location); + name_len = btrfs_dir_name_len(leaf, di); + name = kmalloc(name_len, GFP_NOFS); + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); + btrfs_release_path(root, path); + + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + btrfs_inc_nlink(inode); + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + kfree(name); + + iput(inode); + return ret; +} + +/* + * helper function to see if a given name and sequence number found + * in an inode back reference are already in a directory and correctly + * point to this inode + */ +static noinline int inode_in_dir(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 objectid, u64 index, + const char *name, int name_len) +{ + struct btrfs_dir_item *di; + struct btrfs_key location; + int match = 0; + + di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, + index, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + match = 1; +out: + btrfs_release_path(root, path); + return match; +} + +/* + * helper function to check a log tree for a named back reference in + * an inode. 
This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, + struct btrfs_key *key, + char *name, int namelen) +{ + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long ptr_end; + unsigned long name_ptr; + int found_name_len; + int item_size; + int ret; + int match = 0; + + path = btrfs_alloc_path(); + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); + if (ret != 0) + goto out; + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + ref = (struct btrfs_inode_ref *)ptr; + found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); + if (found_name_len == namelen) { + name_ptr = (unsigned long)(ref + 1); + ret = memcmp_extent_buffer(path->nodes[0], name, + name_ptr, namelen); + if (ret == 0) { + match = 1; + goto out; + } + } + ptr = (unsigned long)(ref + 1) + found_name_len; + } +out: + btrfs_free_path(path); + return match; +} + + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). 
+ */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct inode *dir; + int ret; + struct btrfs_key location; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *inode; + char *name; + int namelen; + unsigned long ref_ptr; + unsigned long ref_end; + + location.objectid = key->objectid; + location.type = BTRFS_INODE_ITEM_KEY; + location.offset = 0; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, key->offset); + if (!dir) + return -ENOENT; + + inode = read_one_inode(root, key->objectid); + BUG_ON(!dir); + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + +again: + ref = (struct btrfs_inode_ref *)ref_ptr; + + namelen = btrfs_inode_ref_name_len(eb, ref); + name = kmalloc(namelen, GFP_NOFS); + BUG_ON(!name); + + read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + + /* if we already have a perfect match, we're done */ + if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen)) { + goto out; + } + + /* + * look for a conflicting back reference in the metadata. + * if we find one we have to unlink that name of the file + * before we add our new link. Later on, we overwrite any + * existing back reference, and we don't want to create + * dangling pointers in the directory. 
+ */ +conflict_again: + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *victim_name; + int victim_name_len; + struct btrfs_inode_ref *victim_ref; + unsigned long ptr; + unsigned long ptr_end; + struct extent_buffer *leaf = path->nodes[0]; + + /* are we trying to overwrite a back ref for the root directory + * if so, just jump out, we're done + */ + if (key->objectid == key->offset) + goto out_nowrite; + + /* check all the names in this back reference to see + * if they are in the log. if so, we allow them to stay + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + while(ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, + victim_ref); + victim_name = kmalloc(victim_name_len, GFP_NOFS); + BUG_ON(!victim_name); + + read_extent_buffer(leaf, victim_name, + (unsigned long)(victim_ref + 1), + victim_name_len); + + if (!backref_in_log(log, key, victim_name, + victim_name_len)) { + btrfs_inc_nlink(inode); + btrfs_release_path(root, path); + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); + kfree(victim_name); + btrfs_release_path(root, path); + goto conflict_again; + } + kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + } + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + + /* look for a conflicting name */ + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + 
btrfs_release_path(root, path); + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, + btrfs_inode_ref_index(eb, ref)); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + +out: + ref_ptr = (unsigned long)(ref + 1) + namelen; + kfree(name); + if (ref_ptr < ref_end) + goto again; + + /* finally write the back reference in the inode */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + +out_nowrite: + btrfs_release_path(root, path); + iput(dir); + iput(inode); + return 0; +} + +/* + * replay one csum item from the log tree into the subvolume 'root' + * eb, slot and key all refer to the log tree + * path is for temp use by this function and should be released on return + * + * This copies the checksums out of the log tree and inserts them into + * the subvolume. Any existing checksums for this range in the file + * are overwritten, and new items are added where required. + * + * We keep this simple by reusing the btrfs_ordered_sum code from + * the data=ordered mode. This basically means making a copy + * of all the checksums in ram, which we have to do anyway for kmap + * rules. + * + * The copy is then sent down to btrfs_csum_file_blocks, which + * does all the hard work of finding existing items in the file + * or adding new ones. 
+ */ +static noinline int replay_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size = btrfs_item_size_nr(eb, slot); + u64 cur_offset; + unsigned long file_bytes; + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct inode *inode; + unsigned long ptr; + + file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize; + inode = read_one_inode(root, key->objectid); + if (!inode) { + return -EIO; + } + + sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS); + if (!sums) { + iput(inode); + return -ENOMEM; + } + + INIT_LIST_HEAD(&sums->list); + sums->len = file_bytes; + sums->file_offset = key->offset; + + /* + * copy all the sums into the ordered sum struct + */ + sector_sum = sums->sums; + cur_offset = key->offset; + ptr = btrfs_item_ptr_offset(eb, slot); + while(item_size > 0) { + sector_sum->offset = cur_offset; + read_extent_buffer(eb, §or_sum->sum, ptr, BTRFS_CRC32_SIZE); + sector_sum++; + item_size -= BTRFS_CRC32_SIZE; + ptr += BTRFS_CRC32_SIZE; + cur_offset += root->sectorsize; + } + + /* let btrfs_csum_file_blocks add them into the file */ + ret = btrfs_csum_file_blocks(trans, root, inode, sums); + BUG_ON(ret); + kfree(sums); + iput(inode); + + return 0; +} +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. 
+ */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + u64 nlink = 0; + unsigned long ptr; + unsigned long ptr_end; + int name_len; + + key.objectid = inode->i_ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + + while(1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != inode->i_ino || + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + while(ptr < ptr_end) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + name_len = btrfs_inode_ref_name_len(path->nodes[0], + ref); + ptr = (unsigned long)(ref + 1) + name_len; + nlink++; + } + + if (key.offset == 0) + break; + key.offset--; + btrfs_release_path(root, path); + } + btrfs_free_path(path); + if (nlink != inode->i_nlink) { + inode->i_nlink = nlink; + btrfs_update_inode(trans, root, inode); + } + + return 0; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct inode *inode; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + while(1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + break; + + if (ret == 1) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_del_item(trans, root, 
path); + BUG_ON(ret); + + btrfs_release_path(root, path); + inode = read_one_inode(root, key.offset); + BUG_ON(!inode); + + ret = fixup_inode_link_count(trans, root, inode); + BUG_ON(ret); + + iput(inode); + + if (key.offset == 0) + break; + key.offset--; + } + btrfs_release_path(root, path); + return 0; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done. The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_key key; + int ret = 0; + struct inode *inode; + + inode = read_one_inode(root, objectid); + BUG_ON(!inode); + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = objectid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_release_path(root, path); + if (ret == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(trans, root, inode); + } else if (ret == -EEXIST) { + ret = 0; + } else { + BUG(); + } + iput(inode); + + return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist. 
This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 *
 * Returns -ENOENT when the inode named by 'location' cannot be read,
 * -EIO when the directory cannot be read, and otherwise the result of
 * btrfs_add_link.
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *target;
	struct inode *parent;
	int err;

	target = read_one_inode(root, location->objectid);
	if (!target)
		return -ENOENT;

	parent = read_one_inode(root, dirid);
	if (!parent) {
		err = -EIO;
		goto put_target;
	}

	err = btrfs_add_link(trans, parent, target, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(target);
	iput(parent);
	return err;

put_target:
	iput(target);
	return err;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
+ */
+static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct extent_buffer *eb,
+				    struct btrfs_dir_item *di,
+				    struct btrfs_key *key)
+{
+	char *name;
+	int name_len;
+	struct btrfs_dir_item *dst_di;
+	struct btrfs_key found_key;
+	struct btrfs_key log_key;
+	struct inode *dir;
+	struct inode *inode;
+	u8 log_type;
+	int ret;
+
+	/*
+	 * Returns 0 when the name was replayed or skipped, and -ENOMEM
+	 * when the temporary name buffer cannot be allocated.  Every
+	 * other failure still ends in BUG().
+	 */
+	dir = read_one_inode(root, key->objectid);
+	BUG_ON(!dir);
+
+	name_len = btrfs_dir_name_len(eb, di);
+	name = kmalloc(name_len, GFP_NOFS);
+	if (!name) {
+		/* path has not been used yet, only the inode needs dropping */
+		iput(dir);
+		return -ENOMEM;
+	}
+	log_type = btrfs_dir_type(eb, di);
+	/* the name is stored directly after the dir item in the leaf */
+	read_extent_buffer(eb, name, (unsigned long)(di + 1),
+		   name_len);
+
+	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+	if (key->type == BTRFS_DIR_ITEM_KEY) {
+		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+				       name, name_len, 1);
+	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
+		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+						     key->objectid,
+						     key->offset, name,
+						     name_len, 1);
+	} else {
+		BUG();
+	}
+	if (!dst_di || IS_ERR(dst_di)) {
+		/* we need a sequence number to insert, so we only
+		 * do inserts for the BTRFS_DIR_INDEX_KEY types
+		 */
+		if (key->type != BTRFS_DIR_INDEX_KEY)
+			goto out;
+		goto insert;
+	}
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+	/* the existing item matches the logged item */
+	if (found_key.objectid == log_key.objectid &&
+	    found_key.type == log_key.type &&
+	    found_key.offset == log_key.offset &&
+	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+		goto out;
+	}
+
+	/*
+	 * don't drop the conflicting directory entry if the inode
+	 * for the new entry doesn't exist
+	 */
+	inode = read_one_inode(root, log_key.objectid);
+	if (!inode)
+		goto out;
+
+	iput(inode);
+	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
+	BUG_ON(ret);
+
+	if (key->type == BTRFS_DIR_INDEX_KEY)
+		goto insert;
+out:
+	btrfs_release_path(root, path);
+	kfree(name);
+	iput(dir);
+	return 0;
+
+insert:
+	btrfs_release_path(root, path);
+	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
+			      name, name_len, log_type, &log_key);
+
+	/* a missing target inode is not fatal, the name is simply skipped */
+	if (ret && ret != -ENOENT)
+		BUG();
+	goto out;
+}
+
+/*
+ * find all the names in a directory item and reconcile them into
+ * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
+ * one name in a directory item, but the same code gets used for
+ * both directory index types
+ *
+ * Returns 0 on success or the first error from replay_one_name.
+ */
+static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct extent_buffer *eb, int slot,
+					struct btrfs_key *key)
+{
+	int ret;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	struct btrfs_dir_item *di;
+	int name_len;
+	unsigned long ptr;
+	unsigned long ptr_end;
+
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	ptr_end = ptr + item_size;
+	while(ptr < ptr_end) {
+		di = (struct btrfs_dir_item *)ptr;
+		name_len = btrfs_dir_name_len(eb, di);
+		ret = replay_one_name(trans, root, path, eb, di, key);
+		if (ret)
+			return ret;
+		/* step over the dir item header and the name behind it */
+		ptr = (unsigned long)(di + 1);
+		ptr += name_len;
+	}
+	return 0;
+}
+
+/*
+ * directory replay has two parts.  There are the standard directory
+ * items in the log copied from the subvolume, and range items
+ * created in the log while the subvolume was logged.
+ *
+ * The range items tell us which parts of the key space the log
+ * is authoritative for.  During replay, if a key in the subvolume
+ * directory is in a logged range item, but not actually in the log
+ * that means it was deleted from the directory before the fsync
+ * and should be removed.
+ */
+static noinline int find_dir_range(struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   u64 dirid, int key_type,
+				   u64 *start_ret, u64 *end_ret)
+{
+	struct btrfs_key key;
+	u64 found_end;
+	struct btrfs_dir_log_item *item;
+	int ret;
+	int nritems;
+
+	/*
+	 * Returns 0 and fills *start_ret / *end_ret when a range item
+	 * covering or following *start_ret is found, 1 when there is
+	 * none, and a negative value if the tree search fails.  The
+	 * path is always released before returning.
+	 */
+	if (*start_ret == (u64)-1)
+		return 1;
+
+	key.objectid = dirid;
+	key.type = key_type;
+	key.offset = *start_ret;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		/* no exact match, look at the item just before the slot */
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+	}
+	if (ret != 0)
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != key_type || key.objectid != dirid) {
+		ret = 1;
+		goto next;
+	}
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	found_end = btrfs_dir_log_end(path->nodes[0], item);
+
+	if (*start_ret >= key.offset && *start_ret <= found_end) {
+		ret = 0;
+		*start_ret = key.offset;
+		*end_ret = found_end;
+		goto out;
+	}
+	ret = 1;
+next:
+	/*
+	 * check the next slot in the tree to see if it is a valid item.
+	 * The slot is advanced before it is compared with nritems, so
+	 * when we were on the last item of the leaf we move on to the
+	 * next leaf instead of reading a key one past the end.
+	 */
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	path->slots[0]++;
+	if (path->slots[0] >= nritems) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret)
+			goto out;
+	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != key_type || key.objectid != dirid) {
+		ret = 1;
+		goto out;
+	}
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	found_end = btrfs_dir_log_end(path->nodes[0], item);
+	*start_ret = key.offset;
+	*end_ret = found_end;
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/*
+ * this looks for a given directory item in the log.
If the directory
+ * item is not in the log, the item is removed and the inode it points
+ * to is unlinked
+ */
+static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_root *log,
+				      struct btrfs_path *path,
+				      struct btrfs_path *log_path,
+				      struct inode *dir,
+				      struct btrfs_key *dir_key)
+{
+	int ret;
+	struct extent_buffer *eb;
+	int slot;
+	u32 item_size;
+	struct btrfs_dir_item *di;
+	struct btrfs_dir_item *log_di;
+	int name_len;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	char *name;
+	struct inode *inode;
+	struct btrfs_key location;
+	/*
+	 * On entry 'path' points at the subvolume item for dir_key.  Each
+	 * name packed in that item is looked up in 'log'.  Returns 0, or
+	 * -ENOMEM if a name buffer cannot be allocated; both paths are
+	 * released before returning.
+	 *
+	 * NOTE(review): an ERR_PTR from the log lookup is handled the same
+	 * way as "name not in the log", i.e. the entry is unlinked.
+	 * Confirm that this is intended.
+	 */
+again:
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	item_size = btrfs_item_size_nr(eb, slot);
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	ptr_end = ptr + item_size;
+	while(ptr < ptr_end) {
+		di = (struct btrfs_dir_item *)ptr;
+		name_len = btrfs_dir_name_len(eb, di);
+		name = kmalloc(name_len, GFP_NOFS);
+		if (!name) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		read_extent_buffer(eb, name, (unsigned long)(di + 1),
+				  name_len);	/* name follows the dir item */
+		log_di = NULL;
+		if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+			log_di = btrfs_lookup_dir_item(trans, log, log_path,
+						       dir_key->objectid,
+						       name, name_len, 0);
+		} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+			log_di = btrfs_lookup_dir_index_item(trans, log,
+						     log_path,
+						     dir_key->objectid,
+						     dir_key->offset,
+						     name, name_len, 0);
+		}
+		if (!log_di || IS_ERR(log_di)) {
+			btrfs_dir_item_key_to_cpu(eb, di, &location);
+			btrfs_release_path(root, path);	/* eb and di are stale from here on */
+			btrfs_release_path(log, log_path);
+			inode = read_one_inode(root, location.objectid);
+			BUG_ON(!inode);
+
+			ret = link_to_fixup_dir(trans, root,
+						path, location.objectid);
+			BUG_ON(ret);
+			btrfs_inc_nlink(inode);
+			ret = btrfs_unlink_inode(trans, root, dir, inode,
+						 name, name_len);
+			BUG_ON(ret);
+			kfree(name);
+			iput(inode);
+
+			/* there might still be more names under this key
+			 * check and repeat if required.  The path was
+			 * released above, so the item is searched for again
+			 * and the scan restarts from its first name.
+			 */
+			ret = btrfs_search_slot(NULL, root, dir_key, path,
+						0, 0);
+			if (ret == 0)
+				goto again;
+			ret = 0;
+			goto out;
+		}
+		btrfs_release_path(log, log_path);
+		kfree(name);
+
+		ptr = (unsigned long)(di + 1);
+		ptr += name_len;
+	}
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	btrfs_release_path(log, log_path);
+	return ret;
+}
+
+/*
+ * deletion replay happens before we copy any new directory items
+ * out of the log or out of backreferences from inodes.  It
+ * scans the log to find ranges of keys that log is authoritative for,
+ * and then scans the directory to find items in those ranges that are
+ * not present in the log.
+ *
+ * Anything we don't find in the log is unlinked and removed from the
+ * directory.
+ *
+ * The scan runs twice: first with the BTRFS_DIR_LOG_ITEM_KEY ranges
+ * against BTRFS_DIR_ITEM_KEY items, then with the
+ * BTRFS_DIR_LOG_INDEX_KEY ranges against BTRFS_DIR_INDEX_KEY items.
+ *
+ * Returns 0 on success (including when the directory inode cannot be
+ * read), -ENOMEM if the log path cannot be allocated, or a negative
+ * value from btrfs_search_slot.
+ */
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       u64 dirid)
+{
+	u64 range_start;
+	u64 range_end;
+	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
+	int ret = 0;
+	struct btrfs_key dir_key;
+	struct btrfs_key found_key;
+	struct btrfs_path *log_path;
+	struct inode *dir;
+
+	dir_key.objectid = dirid;
+	dir_key.type = BTRFS_DIR_ITEM_KEY;
+	log_path = btrfs_alloc_path();
+	if (!log_path)
+		return -ENOMEM;
+
+	dir = read_one_inode(root, dirid);
+	/* it isn't an error if the inode isn't there, that can happen
+	 * because we replay the deletes before we copy in the inode item
+	 * from the log
+	 */
+	if (!dir) {
+		btrfs_free_path(log_path);
+		return 0;
+	}
+again:
+	range_start = 0;
+	range_end = 0;
+	while(1) {
+		ret = find_dir_range(log, path, dirid, key_type,
+				     &range_start, &range_end);
+		if (ret != 0)
+			break;
+
+		dir_key.offset = range_start;
+		while(1) {
+			int nritems;
+			ret = btrfs_search_slot(NULL, root, &dir_key, path,
+						0, 0);
+			if (ret < 0)
+				goto out;
+
+			nritems = btrfs_header_nritems(path->nodes[0]);
+			if (path->slots[0] >= nritems) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret)
+					break;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+			if (found_key.objectid != dirid || found_key.type != dir_key.type)
+				goto next_type;
+
+			if (found_key.offset > range_end)
+				break;
+
+			ret = check_item_in_log(trans, root, log, path,
+						log_path, dir, &found_key);
+			BUG_ON(ret);
+			if (found_key.offset == (u64)-1)	/* offset + 1 would wrap */
+				break;
+			dir_key.offset = found_key.offset + 1;
+		}
+		btrfs_release_path(root, path);
+		if (range_end == (u64)-1)	/* the range already covers the rest of the key space */
+			break;
+		range_start = range_end + 1;
+	}
+
+next_type:
+	ret = 0;
+	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
+		key_type = BTRFS_DIR_LOG_INDEX_KEY;
+		dir_key.type = BTRFS_DIR_INDEX_KEY;
+		btrfs_release_path(root, path);
+		goto again;
+	}
+out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(log_path);
+	iput(dir);
+	return ret;
+}
+
+/*
+ * the process_func used to replay items from the log tree.  This
+ * gets called in two different stages.  The first stage just looks
+ * for inodes and makes sure they are all copied into the subvolume.
+ *
+ * The second stage copies all the other item types from the log into
+ * the subvolume.  The two stage approach is slower, but gets rid of
+ * lots of complexity around inodes referencing other inodes that exist
+ * only in the log (references come from either directory items or inode
+ * back refs).
+ */
+static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+			     struct walk_control *wc, u64 gen)
+{
+	int nritems;
+	struct btrfs_path *path;
+	struct btrfs_root *root = wc->replay_dest;
+	struct btrfs_key key;
+	u32 item_size;
+	int level;
+	int i;
+	int ret;
+	/*
+	 * Items are replayed into wc->replay_dest.  Only leaves carry
+	 * items, so blocks at any other level are read and then ignored.
+	 * Always returns 0; failures of the helpers end in BUG_ON.
+	 */
+	btrfs_read_buffer(eb, gen);
+
+	level = btrfs_header_level(eb);
+
+	if (level != 0)
+		return 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	nritems = btrfs_header_nritems(eb);
+	for (i = 0; i < nritems; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+		item_size = btrfs_item_size_nr(eb, i);
+
+		/* inode keys are done during the first stage */
+		if (key.type == BTRFS_INODE_ITEM_KEY &&
+		    wc->stage == LOG_WALK_REPLAY_INODES) {
+			struct inode *inode;
+			struct btrfs_inode_item *inode_item;
+			u32 mode;
+
+			inode_item = btrfs_item_ptr(eb, i,
+					    struct btrfs_inode_item);
+			mode = btrfs_inode_mode(eb, inode_item);
+			if (S_ISDIR(mode)) {
+				ret = replay_dir_deletes(wc->trans,
+					 root, log, path, key.objectid);
+				BUG_ON(ret);
+			}
+			ret = overwrite_item(wc->trans, root, path,
+					     eb, i, &key);
+			BUG_ON(ret);
+
+			/* for regular files, truncate away
+			 * extents past the new EOF
+			 */
+			if (S_ISREG(mode)) {
+				inode = read_one_inode(root,
+						       key.objectid);
+				BUG_ON(!inode);
+
+				ret = btrfs_truncate_inode_items(wc->trans,
+					root, inode, inode->i_size,
+					BTRFS_EXTENT_DATA_KEY);
+				BUG_ON(ret);
+				iput(inode);
+			}
+			ret = link_to_fixup_dir(wc->trans, root,
+						path, key.objectid);
+			BUG_ON(ret);
+		}
+		if (wc->stage < LOG_WALK_REPLAY_ALL)
+			continue;
+
+		/* these keys are simply copied */
+		if (key.type == BTRFS_XATTR_ITEM_KEY) {
+			ret = overwrite_item(wc->trans, root, path,
+					     eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_INODE_REF_KEY) {
+			ret = add_inode_ref(wc->trans, root, log, path,
+					    eb, i, &key);
+			BUG_ON(ret && ret != -ENOENT);
+		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+			ret = replay_one_extent(wc->trans, root, path,
+					 eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
+			ret = replay_one_csum(wc->trans, root, path,
+					      eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
+			   key.type == BTRFS_DIR_INDEX_KEY) {
+			ret = replay_one_dir_item(wc->trans, root, path,
+						  eb, i, &key);
+			BUG_ON(ret);
+		}
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+/*
+ * walk down from path->nodes[*level], following path->slots, and call
+ * wc->process_func on every child block that is reached.  Children of
+ * a level 1 node are processed (and freed when wc->free is set) in
+ * place without being added to the path.  Once the loop stops, the
+ * node at *level is processed, freed when wc->free is set, dropped
+ * from the path, and *level is moved up by one.  Always returns 0.
+ */
+static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path, int *level,
+				   struct walk_control *wc)
+{
+	u64 root_owner;
+	u64 root_gen;
+	u64 bytenr;
+	u64 ptr_gen;
+	struct extent_buffer *next;
+	struct extent_buffer *cur;
+	struct extent_buffer *parent;
+	u32 blocksize;
+	int ret = 0;
+
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	while(*level > 0) {
+		WARN_ON(*level < 0);
+		WARN_ON(*level >= BTRFS_MAX_LEVEL);
+		cur = path->nodes[*level];
+
+		if (btrfs_header_level(cur) != *level)
+			WARN_ON(1);
+
+		if (path->slots[*level] >=
+		    btrfs_header_nritems(cur))
+			break;
+
+		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+		blocksize = btrfs_level_size(root, *level - 1);
+
+		parent = path->nodes[*level];
+		root_owner = btrfs_header_owner(parent);
+		root_gen = btrfs_header_generation(parent);
+
+		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+
+		wc->process_func(root, next, wc, ptr_gen);
+
+		if (*level == 1) {
+			path->slots[*level]++;
+			if (wc->free) {
+				btrfs_read_buffer(next, ptr_gen);
+
+				btrfs_tree_lock(next);
+				clean_tree_block(trans, root, next);
+				btrfs_wait_tree_block_writeback(next);
+				btrfs_tree_unlock(next);
+
+				ret = btrfs_drop_leaf_ref(trans, root, next);
+				BUG_ON(ret);
+
+				WARN_ON(root_owner !=
+					BTRFS_TREE_LOG_OBJECTID);
+				ret = btrfs_free_extent(trans, root, bytenr,
+							blocksize, root_owner,
+							root_gen, 0, 0, 1);
+				BUG_ON(ret);
+			}
+			free_extent_buffer(next);
+			continue;
+		}
+		btrfs_read_buffer(next, ptr_gen);
+
+		WARN_ON(*level <= 0);
+		if (path->nodes[*level-1])
+			free_extent_buffer(path->nodes[*level-1]);
+		path->nodes[*level-1] = next;
+		*level = btrfs_header_level(next);
+		path->slots[*level] = 0;
+		cond_resched();
+	}
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	if (path->nodes[*level] == root->node) {
+		parent = path->nodes[*level];
+	} else {
+		parent = path->nodes[*level + 1];
+	}
+	bytenr = path->nodes[*level]->start;
+
+	blocksize = btrfs_level_size(root, *level);
+	root_owner = btrfs_header_owner(parent);
+	root_gen = btrfs_header_generation(parent);
+
+	wc->process_func(root, path->nodes[*level], wc,
+			 btrfs_header_generation(path->nodes[*level]));
+
+	if (wc->free) {
+		next = path->nodes[*level];
+		btrfs_tree_lock(next);
+		clean_tree_block(trans, root, next);
+		btrfs_wait_tree_block_writeback(next);
+		btrfs_tree_unlock(next);
+
+		if (*level == 0) {
+			ret = btrfs_drop_leaf_ref(trans, root, next);
+			BUG_ON(ret);
+		}
+		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+		ret = btrfs_free_extent(trans, root, bytenr, blocksize,
+					root_owner, root_gen, 0, 0, 1);
+		BUG_ON(ret);
+	}
+	free_extent_buffer(path->nodes[*level]);
+	path->nodes[*level] = NULL;
+	*level += 1;
+
+	cond_resched();
+	return 0;
+}
+/*
+ * walk back up the path starting at *level.  If a node still has
+ * slots left to visit, its slot is advanced, *level is set to that
+ * node's level and 0 is returned so the caller can walk down again.
+ * Otherwise wc->process_func is called on path->nodes[*level], the
+ * block is freed when wc->free is set and dropped from the path, and
+ * the search continues one level up.  Returns 1 when no node with
+ * remaining slots is found.
+ */
+static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path, int *level,
+				 struct walk_control *wc)
+{
+	u64 root_owner;
+	u64 root_gen;
+	int i;
+	int slot;
+	int ret;
+
+	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+		slot = path->slots[i];
+		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+			struct extent_buffer *node;
+			node = path->nodes[i];	/* NOTE(review): assigned but never read */
+			path->slots[i]++;
+			*level = i;
+			WARN_ON(*level == 0);
+			return 0;
+		} else {
+			if (path->nodes[*level] == root->node) {
+				root_owner = root->root_key.objectid;
+				root_gen =
+				   btrfs_header_generation(path->nodes[*level]);
+			} else {
+				struct extent_buffer *node;
+				node = path->nodes[*level + 1];
+				root_owner = btrfs_header_owner(node);
+				root_gen = btrfs_header_generation(node);
+			}
+			wc->process_func(root, path->nodes[*level], wc,
+				 btrfs_header_generation(path->nodes[*level]));
+			if (wc->free) {
+				struct extent_buffer *next;
+
+				next = path->nodes[*level];
+
+				btrfs_tree_lock(next);
+				clean_tree_block(trans, root, next);
+				btrfs_wait_tree_block_writeback(next);
+				btrfs_tree_unlock(next);
+
+				if (*level == 0) {
+					ret = btrfs_drop_leaf_ref(trans, root,
+								  next);
+					BUG_ON(ret);
+				}
+
+				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+				ret = btrfs_free_extent(trans, root,
+						path->nodes[*level]->start,
+						path->nodes[*level]->len,
+						root_owner, root_gen, 0, 0, 1);
+				BUG_ON(ret);
+			}
+			free_extent_buffer(path->nodes[*level]);
+			path->nodes[*level] = NULL;
+			*level = i + 1;
+		}
+	}
+	return 1;
+}
+
+/*
+ * walk every block of the tree rooted at 'log', alternating between
+ * walk_down_log_tree and walk_up_log_tree, so that wc->process_func is
+ * called on the blocks of the tree.  When wc->free is set each block
+ * is also cleaned, waited on for writeback and has its extent freed,
+ * and the reference on log->node is dropped at the end.
+ */
+static int walk_log_tree(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *log, struct walk_control *wc)
+{
+	int ret = 0;
+	int wret;
+	int level;
+	struct btrfs_path *path;
+	int i;
+	int orig_level;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	level = btrfs_header_level(log->node);
+	orig_level = level;
+	path->nodes[level] = log->node;
+	extent_buffer_get(log->node);	/* the path holds its own reference on the root block */
+	path->slots[level] = 0;
+
+	while(1) {
+		wret = walk_down_log_tree(trans, log, path, &level, wc);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+
+		wret = walk_up_log_tree(trans, log, path, &level, wc);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+	}
+
+	/* was the root node processed? if not, catch it here */
+	if (path->nodes[orig_level]) {
+		wc->process_func(log, path->nodes[orig_level], wc,
+			 btrfs_header_generation(path->nodes[orig_level]));
+		if (wc->free) {
+			struct extent_buffer *next;
+
+			next = path->nodes[orig_level];
+
+			btrfs_tree_lock(next);
+			clean_tree_block(trans, log, next);
+			btrfs_wait_tree_block_writeback(next);
+			btrfs_tree_unlock(next);
+
+			if (orig_level == 0) {
+				ret = btrfs_drop_leaf_ref(trans, log,
+							  next);
+				BUG_ON(ret);
+			}
+			WARN_ON(log->root_key.objectid !=
+				BTRFS_TREE_LOG_OBJECTID);
+			ret = btrfs_free_extent(trans, log,
+						next->start, next->len,
+						log->root_key.objectid,
+						btrfs_header_generation(next),
+						0, 0, 1);
+			BUG_ON(ret);
+		}
+	}
+
+	for (i = 0; i <= orig_level; i++) {
+		if (path->nodes[i]) {
+			free_extent_buffer(path->nodes[i]);
+			path->nodes[i] = NULL;
+		}
+	}
+	btrfs_free_path(path);
+	if (wc->free)
+		free_extent_buffer(log->node);
+	return ret;
+}
+/*
+ * sleep on tree_log_wait until the log commit that was running when we
+ * were called is over: the loop ends once tree_log_transid has changed
+ * or tree_log_commit reads zero.  tree_log_mutex is dropped around the
+ * sleep and taken again before returning, so the caller presumably
+ * holds it on entry (verify against callers).  Always returns 0.
+ */
+int wait_log_commit(struct btrfs_root *log)
+{
+	DEFINE_WAIT(wait);
+	u64 transid = log->fs_info->tree_log_transid;
+
+	do {
+		prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&log->fs_info->tree_log_mutex);
+		if (atomic_read(&log->fs_info->tree_log_commit))
+			schedule();
+		finish_wait(&log->fs_info->tree_log_wait, &wait);
+		mutex_lock(&log->fs_info->tree_log_mutex);
+	} while(transid == log->fs_info->tree_log_transid &&
+		atomic_read(&log->fs_info->tree_log_commit));
+	return 0;
+}
+
+/*
+ * btrfs_sync_log sends a given tree log down to the disk and
+ * updates the super blocks to record it.
When this call is done, + * you know that any inodes previously logged are safely on disk + */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + unsigned long batch; + struct btrfs_root *log = root->log_root; + struct walk_control wc = { + .write = 1, + .process_func = process_one_buffer + }; + + mutex_lock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_commit)) { + wait_log_commit(log); + goto out; + } + atomic_set(&log->fs_info->tree_log_commit, 1); + + while(1) { + mutex_unlock(&log->fs_info->tree_log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&log->fs_info->tree_log_mutex); + batch = log->fs_info->tree_log_batch; + + while(atomic_read(&log->fs_info->tree_log_writers)) { + DEFINE_WAIT(wait); + prepare_to_wait(&log->fs_info->tree_log_wait, &wait, + TASK_UNINTERRUPTIBLE); + batch = log->fs_info->tree_log_batch; + mutex_unlock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_writers)) + schedule(); + mutex_lock(&log->fs_info->tree_log_mutex); + finish_wait(&log->fs_info->tree_log_wait, &wait); + } + if (batch == log->fs_info->tree_log_batch) + break; + } + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc); + BUG_ON(ret); + + wc.wait = 1; + + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc); + BUG_ON(ret); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log->fs_info->log_root_tree->node->start); + btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_header_level(log->fs_info->log_root_tree->node)); + + write_ctree_super(trans, log->fs_info->tree_root); + log->fs_info->tree_log_transid++; + log->fs_info->tree_log_batch = 0; + atomic_set(&log->fs_info->tree_log_commit, 0); + smp_mb(); + if (waitqueue_active(&log->fs_info->tree_log_wait)) + wake_up(&log->fs_info->tree_log_wait); +out: 
+ mutex_unlock(&log->fs_info->tree_log_mutex); + return 0; + +} + +/* + * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + int ret; + struct btrfs_root *log; + struct key; + struct walk_control wc = { + .free = 1, + .process_func = process_one_buffer + }; + + if (!root->log_root) + return 0; + + log = root->log_root; + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + log = root->log_root; + ret = btrfs_del_root(trans, root->fs_info->log_root_tree, + &log->root_key); + BUG_ON(ret); + root->log_root = NULL; + kfree(root->log_root); + return 0; +} + +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + u64 bytenr = btrfs_root_bytenr(&log->root_item); + int ret; + + if (log->node->start == bytenr) + return 0; + + btrfs_set_root_bytenr(&log->root_item, log->node->start); + btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + BUG_ON(ret); + return ret; +} + +/* + * If both a file and directory are logged, and unlinks or renames are + * mixed in, we have a few interesting corners: + * + * create file X in dir Y + * link file X to X.link in dir Y + * fsync file X + * unlink file X but leave X.link + * fsync dir Y + * + * After a crash we would expect only X.link to exist. But file X + * didn't get fsync'd again so the log has back refs for X and X.link. + * + * We solve this by removing directory entries and inode backrefs from the + * log when a file that was logged in the current transaction is + * unlinked. Any later fsync will include the updated log entries, and + * we'll be able to reconstruct the proper directory items from backrefs. 
+ * + * This optimizations allows us to avoid relogging the entire inode + * or the entire directory. + */ +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index) +{ + struct btrfs_root *log; + struct btrfs_dir_item *di; + struct btrfs_path *path; + int ret; + int bytes_del = 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + + mutex_lock(&BTRFS_I(dir)->log_mutex); + + log = root->log_root; + path = btrfs_alloc_path(); + di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, + name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + btrfs_release_path(log, path); + di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, + index, name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + + /* update the directory size in the log to reflect the names + * we have removed + */ + if (bytes_del) { + struct btrfs_key key; + + key.objectid = dir->i_ino; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; + btrfs_release_path(log, path); + + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret == 0) { + struct btrfs_inode_item *item; + u64 i_size; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + i_size = btrfs_inode_size(path->nodes[0], item); + if (i_size > bytes_del) + i_size -= bytes_del; + else + i_size = 0; + btrfs_set_inode_size(path->nodes[0], item, i_size); + btrfs_mark_buffer_dirty(path->nodes[0]); + } else + ret = 0; + btrfs_release_path(log, path); + } + + btrfs_free_path(path); + mutex_unlock(&BTRFS_I(dir)->log_mutex); + end_log_trans(root); + + return 0; +} + +/* see comments for btrfs_del_dir_entries_in_log */ +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root 
*root, + const char *name, int name_len, + struct inode *inode, u64 dirid) +{ + struct btrfs_root *log; + u64 index; + int ret; + + ret = join_running_log_trans(root); + if (ret) + return 0; + log = root->log_root; + mutex_lock(&BTRFS_I(inode)->log_mutex); + + ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, + dirid, &index); + mutex_unlock(&BTRFS_I(inode)->log_mutex); + end_log_trans(root); + + if (ret == 0 || ret == -ENOENT) + return 0; + return ret; +} + +/* + * creates a range item in the log for 'dirid'. first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. + */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + int key_type, u64 dirid, + u64 first_offset, u64 last_offset) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_log_item *item; + + key.objectid = dirid; + key.offset = first_offset; + if (key_type == BTRFS_DIR_ITEM_KEY) + key.type = BTRFS_DIR_LOG_ITEM_KEY; + else + key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + BUG_ON(ret); + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + btrfs_set_dir_log_end(path->nodes[0], item, last_offset); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(log, path); + return 0; +} + +/* + * log all the items included in the current transaction for a given + * directory. 
This also creates the range items in the log tree required
+ * to replay anything deleted before the fsync
+ */
+static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct inode *inode,
+			  struct btrfs_path *path,
+			  struct btrfs_path *dst_path, int key_type,
+			  u64 min_offset, u64 *last_offset_ret)
+{
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct btrfs_root *log = root->log_root;
+	struct extent_buffer *src;
+	int ret;
+	int i;
+	int nritems;
+	u64 first_offset = min_offset;
+	u64 last_offset = (u64)-1;
+	/*
+	 * One call logs one run of items of 'key_type' starting at
+	 * min_offset and inserts one range item for it.  The end of the
+	 * logged range is handed back in *last_offset_ret; (u64)-1
+	 * means there is nothing left to log for this key type.
+	 * Returns 0, or a negative value if the fallback search fails.
+	 */
+	log = root->log_root;
+	max_key.objectid = inode->i_ino;
+	max_key.offset = (u64)-1;
+	max_key.type = key_type;
+
+	min_key.objectid = inode->i_ino;
+	min_key.type = key_type;
+	min_key.offset = min_offset;
+
+	path->keep_locks = 1;
+
+	ret = btrfs_search_forward(root, &min_key, &max_key,
+				   path, 0, trans->transid);
+
+	/*
+	 * we didn't find anything from this transaction, see if there
+	 * is anything at all
+	 */
+	if (ret != 0 || min_key.objectid != inode->i_ino ||
+	    min_key.type != key_type) {
+		min_key.objectid = inode->i_ino;
+		min_key.type = key_type;
+		min_key.offset = (u64)-1;
+		btrfs_release_path(root, path);
+		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+		if (ret < 0) {
+			btrfs_release_path(root, path);
+			return ret;
+		}
+		ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+
+		/* if ret == 0 there are items for this type,
+		 * create a range to tell us the last key of this type.
+		 * otherwise, there are no items in this directory after
+		 * *min_offset, and we create a range to indicate that.
+		 */
+		if (ret == 0) {
+			struct btrfs_key tmp;
+			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+					      path->slots[0]);
+			if (key_type == tmp.type) {
+				first_offset = max(min_offset, tmp.offset) + 1;
+			}
+		}
+		goto done;
+	}
+
+	/* go backward to find any previous key, and log it as well so the
+	 * range starts at a key that is present in the log.
+	 * NOTE(review): the return value of overwrite_item is overwritten
+	 * by the search below without being checked.
+	 */
+	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+	if (ret == 0) {
+		struct btrfs_key tmp;
+		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+		if (key_type == tmp.type) {
+			first_offset = tmp.offset;
+			ret = overwrite_item(trans, log, dst_path,
+					     path->nodes[0], path->slots[0],
+					     &tmp);
+		}
+	}
+	btrfs_release_path(root, path);
+
+	/* find the first key from this transaction again */
+	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+	if (ret != 0) {
+		WARN_ON(1);
+		goto done;
+	}
+
+	/*
+	 * we have a block from this transaction, log every item in it
+	 * from our directory
+	 */
+	while(1) {
+		struct btrfs_key tmp;
+		src = path->nodes[0];
+		nritems = btrfs_header_nritems(src);
+		for (i = path->slots[0]; i < nritems; i++) {
+			btrfs_item_key_to_cpu(src, &min_key, i);
+
+			if (min_key.objectid != inode->i_ino ||
+			    min_key.type != key_type)
+				goto done;
+			ret = overwrite_item(trans, log, dst_path, src, i,
+					     &min_key);
+			BUG_ON(ret);
+		}
+		path->slots[0] = nritems;
+
+		/*
+		 * look ahead to the next item and see if it is also
+		 * from this directory and from this transaction
+		 */
+		ret = btrfs_next_leaf(root, path);
+		if (ret == 1) {
+			last_offset = (u64)-1;
+			goto done;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+		if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
+			last_offset = (u64)-1;
+			goto done;
+		}
+		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
+			ret = overwrite_item(trans, log, dst_path,
+					     path->nodes[0], path->slots[0],
+					     &tmp);
+
+			BUG_ON(ret);
+			last_offset = tmp.offset;	/* the range ends at this older key */
+			goto done;
+		}
+	}
+done:
+	*last_offset_ret = last_offset;
+	btrfs_release_path(root, path);
+	btrfs_release_path(log, dst_path);
+
+	/* insert the log range keys to indicate where the log is valid */
+	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
+				 first_offset, last_offset);
+	BUG_ON(ret);
+	return 0;
+}
+
+/*
+ * logging directories is very similar to logging inodes, We find all the items
+ * from the current transaction and write them to the log.
+ *
+ * The recovery code scans the directory in the subvolume, and if it finds a
+ * key in the range logged that is not present in the log tree, then it means
+ * that dir entry was unlinked during the transaction.
+ *
+ * In order for that scan to work, we must include one key smaller than
+ * the smallest logged by this transaction and one key larger than the largest
+ * key logged by this transaction.
+ *
+ * log_dir_items is called repeatedly, each call continuing one past the
+ * end of the previous range, until it reports (u64)-1 as the end.  This
+ * is done first for BTRFS_DIR_ITEM_KEY and then for BTRFS_DIR_INDEX_KEY.
+ * Always returns 0.
+ */
+static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct inode *inode,
+			  struct btrfs_path *path,
+			  struct btrfs_path *dst_path)
+{
+	u64 min_key;
+	u64 max_key;
+	int ret;
+	int key_type = BTRFS_DIR_ITEM_KEY;
+
+again:
+	min_key = 0;
+	max_key = 0;
+	while(1) {
+		ret = log_dir_items(trans, root, inode, path,
+				    dst_path, key_type, min_key,
+				    &max_key);
+		BUG_ON(ret);
+		if (max_key == (u64)-1)
+			break;
+		min_key = max_key + 1;
+	}
+
+	if (key_type == BTRFS_DIR_ITEM_KEY) {
+		key_type = BTRFS_DIR_INDEX_KEY;
+		goto again;
+	}
+	return 0;
+}
+
+/*
+ * a helper function to drop items from the log before we relog an
+ * inode.  max_key_type indicates the highest item type to remove.
+ * This cannot be run for file data extents because it does not
+ * free the extents they point to.
+ */
+static int drop_objectid_items(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  u64 objectid, int max_key_type)
+{
+	struct btrfs_key search_key;
+	struct btrfs_key cur_key;
+	int err;
+
+	/* start from the highest possible key of the objectid */
+	search_key.offset = (u64)-1;
+	search_key.type = max_key_type;
+	search_key.objectid = objectid;
+
+	for (;;) {
+		err = btrfs_search_slot(trans, log, &search_key, path, -1, 1);
+
+		/*
+		 * only a "not found" result with an item in front of the
+		 * slot gives us something to delete
+		 */
+		if (err != 1 || path->slots[0] == 0)
+			break;
+
+		/* step back to the last item below the search key */
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(path->nodes[0], &cur_key,
+				      path->slots[0]);
+		if (cur_key.objectid != objectid)
+			break;
+
+		err = btrfs_del_item(trans, log, path);
+		BUG_ON(err);
+		btrfs_release_path(log, path);
+	}
+
+	btrfs_release_path(log, path);
+	return 0;
+}
+
+/* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+ *
+ * Any items from this inode changed by the current transaction are copied
+ * to the log tree.  An extra reference is taken on any extents in this
+ * file, allowing us to avoid a whole pile of corner cases around logging
+ * blocks that have been removed from the tree.
+ *
+ * See LOG_INODE_ALL and related defines for a description of what inode_only
+ * does.
+ *
+ * This handles both files and directories.
+ */
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, struct inode *inode,
+			     int inode_only)
+{
+	struct btrfs_path *path;
+	struct btrfs_path *dst_path;
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct btrfs_root *log = root->log_root;
+	unsigned long src_offset;
+	unsigned long dst_offset;
+	struct extent_buffer *src;
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_inode_item *inode_item;
+	u32 size;
+	int ret;
+
+	/*
+	 * Returns 0 on success and -ENOMEM when the search paths cannot
+	 * be allocated.  Every other failure still ends in BUG().
+	 */
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	dst_path = btrfs_alloc_path();
+	if (!dst_path) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	min_key.objectid = inode->i_ino;
+	min_key.type = BTRFS_INODE_ITEM_KEY;
+	min_key.offset = 0;
+
+	/* directories and "exists only" logging stop at the xattr items */
+	max_key.objectid = inode->i_ino;
+	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+		max_key.type = BTRFS_XATTR_ITEM_KEY;
+	else
+		max_key.type = (u8)-1;
+	max_key.offset = (u64)-1;
+
+	/*
+	 * if this inode has already been logged and we're in inode_only
+	 * mode, we don't want to delete the things that have already
+	 * been written to the log.
+	 *
+	 * But, if the inode has been through an inode_only log,
+	 * the logged_trans field is not set.  This allows us to catch
+	 * any new names for this inode in the backrefs by logging it
+	 * again
+	 */
+	if (inode_only == LOG_INODE_EXISTS &&
+	    BTRFS_I(inode)->logged_trans == trans->transid) {
+		btrfs_free_path(path);
+		btrfs_free_path(dst_path);
+		goto out;
+	}
+	mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+	/*
+	 * a brute force approach to making sure we get the most uptodate
+	 * copies of everything.
+	 */
+	if (S_ISDIR(inode->i_mode)) {
+		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+
+		if (inode_only == LOG_INODE_EXISTS)
+			max_key_type = BTRFS_XATTR_ITEM_KEY;
+		ret = drop_objectid_items(trans, log, path,
+					  inode->i_ino, max_key_type);
+	} else {
+		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+	}
+	BUG_ON(ret);
+	path->keep_locks = 1;
+
+	/* copy every item of this inode changed in this transaction */
+	while(1) {
+		ret = btrfs_search_forward(root, &min_key, &max_key,
+					   path, 0, trans->transid);
+		if (ret != 0)
+			break;
+
+		if (min_key.objectid != inode->i_ino)
+			break;
+		if (min_key.type > max_key.type)
+			break;
+
+		src = path->nodes[0];
+		size = btrfs_item_size_nr(src, path->slots[0]);
+		ret = btrfs_insert_empty_item(trans, log, dst_path, &min_key,
+					      size);
+		if (ret)
+			BUG();
+
+		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+						   dst_path->slots[0]);
+
+		src_offset = btrfs_item_ptr_offset(src, path->slots[0]);
+
+		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+				   src_offset, size);
+
+		if (inode_only == LOG_INODE_EXISTS &&
+		    min_key.type == BTRFS_INODE_ITEM_KEY) {
+			inode_item = btrfs_item_ptr(dst_path->nodes[0],
+						    dst_path->slots[0],
+						    struct btrfs_inode_item);
+			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
+
+			/* set the generation to zero so the recover code
+			 * can tell the difference between an logging
+			 * just to say 'this inode exists' and a logging
+			 * to say 'update this inode with these values'
+			 */
+			btrfs_set_inode_generation(dst_path->nodes[0],
+						   inode_item, 0);
+		}
+		/* take a reference on file data extents so that truncates
+		 * or deletes of this inode don't have to relog the inode
+		 * again
+		 */
+		if (btrfs_key_type(&min_key) == BTRFS_EXTENT_DATA_KEY) {
+			int found_type;
+			extent = btrfs_item_ptr(src, path->slots[0],
+						struct btrfs_file_extent_item);
+
+			found_type = btrfs_file_extent_type(src, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 ds = btrfs_file_extent_disk_bytenr(src,
+								   extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(src,
+								      extent);
+				/* ds == 0 is a hole */
+				if (ds != 0) {
+					ret = btrfs_inc_extent_ref(trans, log,
+						   ds, dl,
+						   log->root_key.objectid,
+						   0,
+						   inode->i_ino,
+						   min_key.offset);
+					BUG_ON(ret);
+				}
+			}
+		}
+
+		btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+		btrfs_release_path(root, path);
+		btrfs_release_path(log, dst_path);
+
+		/* advance the search key to the next possible key */
+		if (min_key.offset < (u64)-1)
+			min_key.offset++;
+		else if (min_key.type < (u8)-1)
+			min_key.type++;
+		else if (min_key.objectid < (u64)-1)
+			min_key.objectid++;
+		else
+			break;
+	}
+	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+		btrfs_release_path(root, path);
+		btrfs_release_path(log, dst_path);
+		ret = log_directory_changes(trans, root, inode, path, dst_path);
+		BUG_ON(ret);
+	}
+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+	btrfs_free_path(path);
+	btrfs_free_path(dst_path);
+
+	/* record the (possibly new) log root block in the log root tree */
+	mutex_lock(&root->fs_info->tree_log_mutex);
+	ret = update_log_root(trans, log);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+out:
+	return 0;
+}
+
+/*
+ * log a single inode inside its own log transaction: start_log_trans
+ * and end_log_trans bracket the call to __btrfs_log_inode, whose
+ * return value is passed back to the caller.
+ */
+int btrfs_log_inode(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct inode *inode,
+		    int inode_only)
+{
+	int ret;
+
+	start_log_trans(trans, root);
+	ret = __btrfs_log_inode(trans, root, inode, inode_only);
+	end_log_trans(root);
+	return ret;
+}
+
+/*
+ * helper function around btrfs_log_inode to make sure newly created
+ * parent directories also end up in the log.
A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */ +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + int inode_only = LOG_INODE_ALL; + struct super_block *sb; + int ret; + + start_log_trans(trans, root); + sb = dentry->d_inode->i_sb; + while(1) { + ret = __btrfs_log_inode(trans, root, dentry->d_inode, + inode_only); + BUG_ON(ret); + inode_only = LOG_INODE_EXISTS; + + dentry = dentry->d_parent; + if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) + break; + + if (BTRFS_I(dentry->d_inode)->generation <= + root->fs_info->last_trans_committed) + break; + } + end_log_trans(root); + return 0; +} + +/* + * it is not safe to log dentry if the chunk root has added new + * chunks. This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk. + */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + u64 gen; + gen = root->fs_info->last_trans_new_blockgroup; + if (gen > root->fs_info->last_trans_committed) + return 1; + else + return btrfs_log_dentry(trans, root, dentry); +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ + int ret; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, + }; + + fs_info->log_root_recovering = 1; + path = btrfs_alloc_path(); + BUG_ON(!path); + + trans = btrfs_start_transaction(fs_info->tree_root, 1); + + wc.trans = trans; + wc.pin = 1; + + walk_log_tree(trans, log_root_tree, 
&wc); + +again: + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = (u64)-1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + while(1) { + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + btrfs_release_path(log_root_tree, path); + if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) + break; + + log = btrfs_read_fs_root_no_radix(log_root_tree, + &found_key); + BUG_ON(!log); + + + tmp_key.objectid = found_key.offset; + tmp_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_key.offset = (u64)-1; + + wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + + BUG_ON(!wc.replay_dest); + + btrfs_record_root_in_trans(wc.replay_dest); + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + if (wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, + path); + BUG_ON(ret); + } + + key.offset = found_key.offset - 1; + free_extent_buffer(log->node); + kfree(log); + + if (found_key.offset == 0) + break; + } + btrfs_release_path(log_root_tree, path); + + /* step one is to pin it all, step two is to replay just inodes */ + if (wc.pin) { + wc.pin = 0; + wc.process_func = replay_one_buffer; + wc.stage = LOG_WALK_REPLAY_INODES; + goto again; + } + /* step three is to replay everything */ + if (wc.stage < LOG_WALK_REPLAY_ALL) { + wc.stage++; + goto again; + } + + btrfs_free_path(path); + + free_extent_buffer(log_root_tree->node); + log_root_tree->log_root = NULL; + fs_info->log_root_recovering = 0; + + /* step 4: commit the transaction, which also unpins the blocks */ + btrfs_commit_transaction(trans, fs_info->tree_root); + + kfree(log_root_tree); + return 0; +} diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 000000000000..b9409b32ed02 --- /dev/null +++ b/fs/btrfs/tree-log.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2008 
Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __TREE_LOG_ +#define __TREE_LOG_ + +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index); +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid); +#endif -- cgit v1.2.3 From 0f9dd46cda36b8de3b9f48bc42bd09d20b9c3b52 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Sep 2008 13:14:11 -0400 Subject: Btrfs: free space accounting redo 1) replace the per fs_info extent_io_tree that tracked free space with two rb-trees per block group to track free space areas via offset and size. 
The reason to do this is that most allocations come with a hint byte where to start, so we can usually find a chunk of free space at that hint byte to satisfy the allocation and get good space packing. If we cannot find free space at or after the given offset we fall back on looking for a chunk of the given size as close to that given offset as possible. When we fall back on the size search we also try to find a slot as close to the size we want as possible, to avoid breaking small chunks off of huge areas if possible. 2) removed the extent_io_tree that tracked the block group cache from fs_info and replaced it with an rb-tree that tracks the block group cache via offset. Also added a per space_info list that tracks the block group cache for the particular space so we can look up related block groups easily. 3) cleaned up the allocation code to make it a little easier to read and a little less complicated. Basically there are 3 steps: first, look from our provided hint. If we couldn't find space from that given hint, start back at our original search start and look for space from there. If that fails, try to allocate space if we can and start looking again. If not we're screwed and need to start over again. 4) small fixes. There were some issues in volumes.c where we wouldn't allocate the rest of the disk. Fixed cow_file_range to actually pass the alloc_hint, which has helped a good bit in making the fs_mark test I run have semi-normal results as we run out of space. Generally with data allocations we don't track where we last allocated from, so every time we did a data allocation we'd search through every block group that we have looking for free space. Now, searching a block group with no free space isn't terribly time consuming, but it was causing a slight degradation as we got more data block groups. The alloc_hint has fixed this slight degradation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when there is definitely plenty of space. This only happens with metadata allocations, and only when we are almost full. So you generally hit the 85% mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm still tracking it down, but until then this seems to be pretty stable and make a significant performance gain. Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.c | 3 +- fs/btrfs/ctree.h | 46 ++- fs/btrfs/disk-io.c | 7 +- fs/btrfs/extent-tree.c | 869 +++++++++++++++++++++++--------------------- fs/btrfs/extent_io.c | 4 + fs/btrfs/free-space-cache.c | 415 +++++++++++++++++++++ fs/btrfs/inode.c | 3 +- fs/btrfs/volumes.c | 11 +- 9 files changed, 925 insertions(+), 435 deletions(-) create mode 100644 fs/btrfs/free-space-cache.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index b7addbfd8c22..eb36ae981bdc 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,7 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o export.o tree-log.o acl.o + ref-cache.o export.o tree-log.o acl.o free-space-cache.o else # Normal Makefile diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 18e84472abb5..6f467901246f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2725,9 +2725,8 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, total_size = total_data + (nr * sizeof(struct btrfs_item)); ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) { + if (ret == 0) return -EEXIST; - } if (ret < 0) goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eb65fd808883..730aae3bc181 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -483,7 
+483,6 @@ struct btrfs_csum_item { #define BTRFS_BLOCK_GROUP_DUP (1 << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) - struct btrfs_block_group_item { __le64 used; __le64 chunk_objectid; @@ -498,17 +497,40 @@ struct btrfs_space_info { int full; int force_alloc; struct list_head list; + + /* for block groups in our same type */ + struct list_head block_groups; + spinlock_t lock; +}; + +struct btrfs_free_space { + struct rb_node bytes_index; + struct rb_node offset_index; + u64 offset; + u64 bytes; }; struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; - struct btrfs_space_info *space_info; spinlock_t lock; u64 pinned; u64 flags; int cached; int ro; + int dirty; + + struct btrfs_space_info *space_info; + + /* free space cache stuff */ + struct rb_root free_space_bytes; + struct rb_root free_space_offset; + + /* block group cache stuff */ + struct rb_node cache_node; + + /* for block groups in the same raid type */ + struct list_head list; }; struct btrfs_device; @@ -525,8 +547,10 @@ struct btrfs_fs_info { struct btrfs_root *log_root_tree; struct radix_tree_root fs_roots_radix; - struct extent_io_tree free_space_cache; - struct extent_io_tree block_group_cache; + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + struct extent_io_tree pinned_extents; struct extent_io_tree pending_del; struct extent_io_tree extent_ins; @@ -1814,4 +1838,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait); int btrfs_check_acl(struct inode *inode, int mask); int btrfs_init_acl(struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); + +/* free-space-cache.c */ +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +struct btrfs_free_space 
*btrfs_find_free_space(struct btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f6f7821d43a5..535bd0fe1a71 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1410,10 +1410,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; - extent_io_tree_init(&fs_info->free_space_cache, - fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&fs_info->block_group_cache, - fs_info->btree_inode->i_mapping, GFP_NOFS); + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree.rb_node = NULL; + extent_io_tree_init(&fs_info->pinned_extents, fs_info->btree_inode->i_mapping, GFP_NOFS); extent_io_tree_init(&fs_info->pending_del, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1c10ffc837c8..813566acc5d3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -29,12 +29,6 @@ #include "locking.h" #include "ref-cache.h" -#define BLOCK_GROUP_DATA EXTENT_WRITEBACK -#define BLOCK_GROUP_METADATA EXTENT_UPTODATE -#define BLOCK_GROUP_SYSTEM EXTENT_NEW - -#define BLOCK_GROUP_DIRTY EXTENT_DIRTY - static int finish_current_insert(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); static int del_pending_extents(struct btrfs_trans_handle *trans, struct @@ -62,6 +56,127 @@ void maybe_unlock_mutex(struct btrfs_root *root) } } +static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +{ + return (cache->flags & bits) == bits; +} + +/* + * this adds the block group to the fs_info rb tree for the block group + * cache + */ +int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node 
*parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache * +block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +/* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet + * since their free space will be released as soon as the transaction commits. 
+ */ +static int add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) +{ + u64 extent_start, extent_end, size; + int ret; + + while (start < end) { + ret = find_first_extent_bit(&info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY); + if (ret) + break; + + if (extent_start == start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); + } + + return 0; +} + static int cache_block_group(struct btrfs_root *root, struct btrfs_block_group_cache *block_group) { @@ -69,10 +184,8 @@ static int cache_block_group(struct btrfs_root *root, int ret = 0; struct btrfs_key key; struct extent_buffer *leaf; - struct extent_io_tree *free_space_cache; int slot; u64 last = 0; - u64 hole_size; u64 first_free; int found = 0; @@ -80,7 +193,6 @@ static int cache_block_group(struct btrfs_root *root, return 0; root = root->fs_info->extent_root; - free_space_cache = &root->fs_info->free_space_cache; if (block_group->cached) return 0; @@ -96,7 +208,8 @@ static int cache_block_group(struct btrfs_root *root, * skip the locking here */ path->skip_locking = 1; - first_free = block_group->key.objectid; + first_free = max_t(u64, block_group->key.objectid, + BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE); key.objectid = block_group->key.objectid; key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); @@ -119,32 +232,28 @@ static int cache_block_group(struct btrfs_root *root, ret = btrfs_next_leaf(root, path); if (ret < 0) goto err; - if (ret == 0) { + if (ret == 0) continue; - } else { + else break; - } } btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid < block_group->key.objectid) { + if 
(key.objectid < block_group->key.objectid) goto next; - } + if (key.objectid >= block_group->key.objectid + - block_group->key.offset) { + block_group->key.offset) break; - } if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { if (!found) { last = first_free; found = 1; } - if (key.objectid > last) { - hole_size = key.objectid - last; - set_extent_dirty(free_space_cache, last, - last + hole_size - 1, - GFP_NOFS); - } + + add_new_free_space(block_group, root->fs_info, last, + key.objectid); + last = key.objectid + key.offset; } next: @@ -153,13 +262,11 @@ next: if (!found) last = first_free; - if (block_group->key.objectid + - block_group->key.offset > last) { - hole_size = block_group->key.objectid + - block_group->key.offset - last; - set_extent_dirty(free_space_cache, last, - last + hole_size - 1, GFP_NOFS); - } + + add_new_free_space(block_group, root->fs_info, last, + block_group->key.objectid + + block_group->key.offset); + block_group->cached = 1; ret = 0; err: @@ -167,166 +274,79 @@ err: return ret; } +/* + * return the block group that starts at or after bytenr + */ struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) { - struct extent_io_tree *block_group_cache; - struct btrfs_block_group_cache *block_group = NULL; - u64 ptr; - u64 start; - u64 end; - int ret; + struct btrfs_block_group_cache *cache; - bytenr = max_t(u64, bytenr, - BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE); - block_group_cache = &info->block_group_cache; - ret = find_first_extent_bit(block_group_cache, - bytenr, &start, &end, - BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA | - BLOCK_GROUP_SYSTEM); - if (ret) { - return NULL; - } - ret = get_state_private(block_group_cache, start, &ptr); - if (ret) - return NULL; + cache = block_group_cache_tree_search(info, bytenr, 0); - block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr; - return block_group; + return cache; } +/* + * return the block group that contains teh given bytenr 
+ */ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct btrfs_fs_info *info, u64 bytenr) { - struct extent_io_tree *block_group_cache; - struct btrfs_block_group_cache *block_group = NULL; - u64 ptr; - u64 start; - u64 end; - int ret; + struct btrfs_block_group_cache *cache; - bytenr = max_t(u64, bytenr, - BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE); - block_group_cache = &info->block_group_cache; - ret = find_first_extent_bit(block_group_cache, - bytenr, &start, &end, - BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA | - BLOCK_GROUP_SYSTEM); - if (ret) { - return NULL; - } - ret = get_state_private(block_group_cache, start, &ptr); - if (ret) - return NULL; + cache = block_group_cache_tree_search(info, bytenr, 1); - block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr; - if (block_group->key.objectid <= bytenr && bytenr < - block_group->key.objectid + block_group->key.offset) - return block_group; - return NULL; + return cache; } -static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) -{ - return (cache->flags & bits) == bits; -} - -static int noinline find_search_start(struct btrfs_root *root, - struct btrfs_block_group_cache **cache_ret, - u64 *start_ret, u64 num, int data) +static int noinline find_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache **cache_ret, + u64 *start_ret, u64 num, int data) { int ret; struct btrfs_block_group_cache *cache = *cache_ret; - struct extent_io_tree *free_space_cache; - struct extent_state *state; + struct btrfs_free_space *info = NULL; u64 last; - u64 start = 0; - u64 cache_miss = 0; u64 total_fs_bytes; u64 search_start = *start_ret; - int wrapped = 0; WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex)); total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); - free_space_cache = &root->fs_info->free_space_cache; if (!cache) goto out; + last = max(search_start, cache->key.objectid); + again: ret = cache_block_group(root, cache); - if (ret) { + if 
(ret) goto out; - } - last = max(search_start, cache->key.objectid); - if (!block_group_bits(cache, data) || cache->ro) + if (cache->ro || !block_group_bits(cache, data)) goto new_group; - spin_lock_irq(&free_space_cache->lock); - state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY); - while(1) { - if (!state) { - if (!cache_miss) - cache_miss = last; - spin_unlock_irq(&free_space_cache->lock); - goto new_group; - } - - start = max(last, state->start); - last = state->end + 1; - if (last - start < num) { - do { - state = extent_state_next(state); - } while(state && !(state->state & EXTENT_DIRTY)); - continue; - } - spin_unlock_irq(&free_space_cache->lock); - if (cache->ro) { - goto new_group; - } - if (start + num > cache->key.objectid + cache->key.offset) - goto new_group; - if (!block_group_bits(cache, data)) { - printk("block group bits don't match %Lu %d\n", cache->flags, data); - } - *start_ret = start; + info = btrfs_find_free_space(cache, last, num); + if (info) { + *start_ret = info->offset; return 0; } -out: - cache = btrfs_lookup_block_group(root->fs_info, search_start); - if (!cache) { - printk("Unable to find block group for %Lu\n", search_start); - WARN_ON(1); - } - return -ENOSPC; new_group: last = cache->key.objectid + cache->key.offset; -wrapped: + cache = btrfs_lookup_first_block_group(root->fs_info, last); - if (!cache || cache->key.objectid >= total_fs_bytes) { -no_cache: - if (!wrapped) { - wrapped = 1; - last = search_start; - goto wrapped; - } + if (!cache || cache->key.objectid >= total_fs_bytes) goto out; - } - if (cache_miss && !cache->cached) { - cache_block_group(root, cache); - last = cache_miss; - cache = btrfs_lookup_first_block_group(root->fs_info, last); - } - cache_miss = 0; - cache = btrfs_find_block_group(root, cache, last, data, 0); - if (!cache) - goto no_cache; + *cache_ret = cache; goto again; + +out: + return -ENOSPC; } static u64 div_factor(u64 num, int factor) @@ -338,16 +358,19 @@ static u64 
div_factor(u64 num, int factor) return num; } -static int block_group_state_bits(u64 flags) +static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) { - int bits = 0; - if (flags & BTRFS_BLOCK_GROUP_DATA) - bits |= BLOCK_GROUP_DATA; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - bits |= BLOCK_GROUP_METADATA; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - bits |= BLOCK_GROUP_SYSTEM; - return bits; + struct list_head *head = &info->space_info; + struct list_head *cur; + struct btrfs_space_info *found; + list_for_each(cur, head) { + found = list_entry(cur, struct btrfs_space_info, list); + if (found->flags == flags) + return found; + } + return NULL; + } static struct btrfs_block_group_cache * @@ -356,28 +379,19 @@ __btrfs_find_block_group(struct btrfs_root *root, u64 search_start, int data, int owner) { struct btrfs_block_group_cache *cache; - struct extent_io_tree *block_group_cache; struct btrfs_block_group_cache *found_group = NULL; struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *sinfo; u64 used; u64 last = 0; - u64 start; - u64 end; u64 free_check; - u64 ptr; - int bit; - int ret; int full_search = 0; int factor = 10; int wrapped = 0; - block_group_cache = &info->block_group_cache; - if (data & BTRFS_BLOCK_GROUP_METADATA) factor = 9; - bit = block_group_state_bits(data); - if (search_start) { struct btrfs_block_group_cache *shint; shint = btrfs_lookup_first_block_group(info, search_start); @@ -408,20 +422,30 @@ __btrfs_find_block_group(struct btrfs_root *root, else last = search_start; } + sinfo = __find_space_info(root->fs_info, data); + if (!sinfo) + goto found; again: while(1) { - ret = find_first_extent_bit(block_group_cache, last, - &start, &end, bit); - if (ret) - break; + struct list_head *l; - ret = get_state_private(block_group_cache, start, &ptr); - if (ret) { - last = end + 1; - continue; + cache = NULL; + + spin_lock(&sinfo->lock); + list_for_each(l, &sinfo->block_groups) { + struct 
btrfs_block_group_cache *entry; + entry = list_entry(l, struct btrfs_block_group_cache, + list); + if ((entry->key.objectid >= last) && + (!cache || (entry->key.objectid < + cache->key.objectid))) + cache = entry; } + spin_unlock(&sinfo->lock); + + if (!cache) + break; - cache = (struct btrfs_block_group_cache *)(unsigned long)ptr; spin_lock(&cache->lock); last = cache->key.objectid + cache->key.offset; used = btrfs_block_group_used(&cache->item); @@ -462,6 +486,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, ret = __btrfs_find_block_group(root, hint, search_start, data, owner); return ret; } + static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation, u64 owner, u64 owner_offset) { @@ -1175,34 +1200,37 @@ fail: int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct extent_io_tree *block_group_cache; - struct btrfs_block_group_cache *cache; - int ret; + struct btrfs_block_group_cache *cache, *entry; + struct rb_node *n; int err = 0; int werr = 0; struct btrfs_path *path; u64 last = 0; - u64 start; - u64 end; - u64 ptr; - block_group_cache = &root->fs_info->block_group_cache; path = btrfs_alloc_path(); if (!path) return -ENOMEM; mutex_lock(&root->fs_info->alloc_mutex); while(1) { - ret = find_first_extent_bit(block_group_cache, last, - &start, &end, BLOCK_GROUP_DIRTY); - if (ret) - break; + cache = NULL; + spin_lock(&root->fs_info->block_group_cache_lock); + for (n = rb_first(&root->fs_info->block_group_cache_tree); + n; n = rb_next(n)) { + entry = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + if (entry->dirty) { + cache = entry; + break; + } + } + spin_unlock(&root->fs_info->block_group_cache_lock); - last = end + 1; - ret = get_state_private(block_group_cache, start, &ptr); - if (ret) + if (!cache) break; - cache = (struct btrfs_block_group_cache *)(unsigned long)ptr; + + last += cache->key.offset; + err = write_one_cache_group(trans, root, path, cache); /* 
@@ -1214,29 +1242,14 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, werr = err; continue; } - clear_extent_bits(block_group_cache, start, end, - BLOCK_GROUP_DIRTY, GFP_NOFS); + + cache->dirty = 0; } btrfs_free_path(path); mutex_unlock(&root->fs_info->alloc_mutex); return werr; } -static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, - u64 flags) -{ - struct list_head *head = &info->space_info; - struct list_head *cur; - struct btrfs_space_info *found; - list_for_each(cur, head) { - found = list_entry(cur, struct btrfs_space_info, list); - if (found->flags == flags) - return found; - } - return NULL; - -} - static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, struct btrfs_space_info **space_info) @@ -1256,6 +1269,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, return -ENOMEM; list_add(&found->list, &info->space_info); + INIT_LIST_HEAD(&found->block_groups); + spin_lock_init(&found->lock); found->flags = flags; found->total_bytes = total_bytes; found->bytes_used = bytes_used; @@ -1318,7 +1333,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 thresh; u64 start; u64 num_bytes; - int ret; + int ret = 0; flags = reduce_alloc_profile(extent_root, flags); @@ -1355,10 +1370,11 @@ printk("space info full %Lu\n", flags); ret = btrfs_make_block_group(trans, extent_root, 0, flags, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes); BUG_ON(ret); + out_unlock: mutex_unlock(&extent_root->fs_info->chunk_mutex); out: - return 0; + return ret; } static int update_block_group(struct btrfs_trans_handle *trans, @@ -1371,8 +1387,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 total = num_bytes; u64 old_val; u64 byte_in_group; - u64 start; - u64 end; WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex)); while(total) { @@ -1382,12 +1396,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, } byte_in_group = 
bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); - start = cache->key.objectid; - end = start + cache->key.offset - 1; - set_extent_bits(&info->block_group_cache, start, end, - BLOCK_GROUP_DIRTY, GFP_NOFS); spin_lock(&cache->lock); + cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { @@ -1401,9 +1412,11 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); spin_unlock(&cache->lock); if (mark_free) { - set_extent_dirty(&info->free_space_cache, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS); + int ret; + ret = btrfs_add_free_space(cache, bytenr, + num_bytes); + if (ret) + return -1; } } total -= num_bytes; @@ -1414,16 +1427,13 @@ static int update_block_group(struct btrfs_trans_handle *trans, static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) { - u64 start; - u64 end; - int ret; - ret = find_first_extent_bit(&root->fs_info->block_group_cache, - search_start, &start, &end, - BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA | - BLOCK_GROUP_SYSTEM); - if (ret) + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); + if (!cache) return 0; - return start; + + return cache->key.objectid; } @@ -1501,8 +1511,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 start; u64 end; int ret; - struct extent_io_tree *free_space_cache; - free_space_cache = &root->fs_info->free_space_cache; + struct btrfs_block_group_cache *cache; mutex_lock(&root->fs_info->alloc_mutex); while(1) { @@ -1512,7 +1521,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, break; btrfs_update_pinned_extents(root, start, end + 1 - start, 0); clear_extent_dirty(unpin, start, end, GFP_NOFS); - set_extent_dirty(free_space_cache, start, end, GFP_NOFS); + cache = btrfs_lookup_block_group(root->fs_info, start); + if (cache->cached) + 
btrfs_add_free_space(cache, start, end - start + 1); if (need_resched()) { mutex_unlock(&root->fs_info->alloc_mutex); cond_resched(); @@ -1875,9 +1886,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* if metadata always pin */ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_block_group_cache *cache; + /* btrfs_free_reserved_extent */ - set_extent_dirty(&root->fs_info->free_space_cache, - bytenr, bytenr + num_bytes - 1, GFP_NOFS); + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + btrfs_add_free_space(cache, bytenr, num_bytes); return 0; } pin = 1; @@ -1942,8 +1956,6 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, u64 total_needed = num_bytes; u64 *last_ptr = NULL; struct btrfs_block_group_cache *block_group; - int full_scan = 0; - int wrapped = 0; int chunk_alloc_done = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; @@ -1959,9 +1971,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 256 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) last_ptr = &root->fs_info->last_data_alloc; - } + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { last_ptr = &root->fs_info->last_log_alloc; if (!last_ptr == 0 && root->fs_info->last_alloc) { @@ -1972,9 +1984,8 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, if (last_ptr) { if (*last_ptr) hint_byte = *last_ptr; - else { + else empty_size += empty_cluster; - } } search_start = max(search_start, first_logical_byte(root, 0)); @@ -1983,145 +1994,172 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, if (search_end == (u64)-1) search_end = btrfs_super_total_bytes(&info->super_copy); - if (hint_byte) { - block_group = btrfs_lookup_first_block_group(info, hint_byte); - if 
(!block_group) - hint_byte = search_start; - block_group = btrfs_find_block_group(root, block_group, - hint_byte, data, 1); - if (last_ptr && *last_ptr == 0 && block_group) - hint_byte = block_group->key.objectid; - } else { - block_group = btrfs_find_block_group(root, - trans->block_group, - search_start, data, 1); - } search_start = max(search_start, hint_byte); - total_needed += empty_size; -check_failed: - if (!block_group) { - block_group = btrfs_lookup_first_block_group(info, - search_start); - if (!block_group) - block_group = btrfs_lookup_first_block_group(info, - orig_search_start); - } - if (full_scan && !chunk_alloc_done) { - if (allowed_chunk_alloc) { - do_chunk_alloc(trans, root, - num_bytes + 2 * 1024 * 1024, data, 1); - allowed_chunk_alloc = 0; - } else if (block_group && block_group_bits(block_group, data)) { - block_group->space_info->force_alloc = 1; +new_group: + block_group = btrfs_lookup_block_group(info, search_start); + + /* + * Ok this looks a little tricky, buts its really simple. First if we + * didn't find a block group obviously we want to start over. + * Secondly, if the block group we found does not match the type we + * need, and we have a last_ptr and its not 0, chances are the last + * allocation we made was at the end of the block group, so lets go + * ahead and skip the looking through the rest of the block groups and + * start at the beginning. This helps with metadata allocations, + * since you are likely to have a bunch of data block groups to search + * through first before you realize that you need to start over, so go + * ahead and start over and save the time. 
+ */ + if (!block_group || (!block_group_bits(block_group, data) && + last_ptr && *last_ptr)) { + if (search_start != orig_search_start) { + if (last_ptr && *last_ptr) + *last_ptr = 0; + search_start = orig_search_start; + goto new_group; + } else if (!chunk_alloc_done && allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, + num_bytes + 2 * 1024 * 1024, + data, 1); + if (ret < 0) { + struct btrfs_space_info *info; + + info = __find_space_info(root->fs_info, data); + goto error; + } + BUG_ON(ret); + chunk_alloc_done = 1; + search_start = orig_search_start; + goto new_group; + } else { + ret = -ENOSPC; + goto error; } - chunk_alloc_done = 1; - } - ret = find_search_start(root, &block_group, &search_start, - total_needed, data); - if (ret == -ENOSPC && last_ptr && *last_ptr) { - *last_ptr = 0; - block_group = btrfs_lookup_first_block_group(info, - orig_search_start); - search_start = orig_search_start; - ret = find_search_start(root, &block_group, &search_start, - total_needed, data); } - if (ret == -ENOSPC) - goto enospc; - if (ret) - goto error; - if (last_ptr && *last_ptr && search_start != *last_ptr) { - *last_ptr = 0; - if (!empty_size) { - empty_size += empty_cluster; - total_needed += empty_size; + /* + * this is going to seach through all of the existing block groups it + * can find, so if we don't find something we need to see if we can + * allocate what we need. 
+ */ + ret = find_free_space(root, &block_group, &search_start, + total_needed, data); + if (ret == -ENOSPC) { + /* + * instead of allocating, start at the original search start + * and see if there is something to be found, if not then we + * allocate + */ + if (search_start != orig_search_start) { + if (last_ptr && *last_ptr) { + *last_ptr = 0; + total_needed += empty_cluster; + } + search_start = orig_search_start; + goto new_group; } - block_group = btrfs_lookup_first_block_group(info, - orig_search_start); - search_start = orig_search_start; - ret = find_search_start(root, &block_group, - &search_start, total_needed, data); - if (ret == -ENOSPC) - goto enospc; - if (ret) + + /* + * we've already allocated, we're pretty screwed + */ + if (chunk_alloc_done) { goto error; + } else if (!allowed_chunk_alloc && block_group && + block_group_bits(block_group, data)) { + block_group->space_info->force_alloc = 1; + goto error; + } else if (!allowed_chunk_alloc) { + goto error; + } + + ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, + data, 1); + if (ret < 0) + goto error; + + BUG_ON(ret); + chunk_alloc_done = 1; + if (block_group) + search_start = block_group->key.objectid + + block_group->key.offset; + else + search_start = orig_search_start; + goto new_group; } + if (ret) + goto error; + search_start = stripe_align(root, search_start); ins->objectid = search_start; ins->offset = num_bytes; - if (ins->objectid + num_bytes >= search_end) - goto enospc; + if (ins->objectid + num_bytes >= search_end) { + search_start = orig_search_start; + if (chunk_alloc_done) { + ret = -ENOSPC; + goto error; + } + goto new_group; + } if (ins->objectid + num_bytes > block_group->key.objectid + block_group->key.offset) { + if (search_start == orig_search_start && chunk_alloc_done) { + ret = -ENOSPC; + goto error; + } search_start = block_group->key.objectid + block_group->key.offset; goto new_group; } - if (test_range_bit(&info->extent_ins, ins->objectid, - ins->objectid + 
num_bytes -1, EXTENT_LOCKED, 0)) { - search_start = ins->objectid + num_bytes; - goto new_group; - } - - if (test_range_bit(&info->pinned_extents, ins->objectid, - ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) { - search_start = ins->objectid + num_bytes; - goto new_group; - } - if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start && ins->objectid < exclude_start + exclude_nr)) { search_start = exclude_start + exclude_nr; goto new_group; } - if (!(data & BTRFS_BLOCK_GROUP_DATA)) { - block_group = btrfs_lookup_block_group(info, ins->objectid); - if (block_group) - trans->block_group = block_group; - } + if (!(data & BTRFS_BLOCK_GROUP_DATA)) + trans->block_group = block_group; + ins->offset = num_bytes; if (last_ptr) { *last_ptr = ins->objectid + ins->offset; if (*last_ptr == - btrfs_super_total_bytes(&root->fs_info->super_copy)) { + btrfs_super_total_bytes(&root->fs_info->super_copy)) *last_ptr = 0; - } - } - return 0; - -new_group: - if (search_start + num_bytes >= search_end) { -enospc: - search_start = orig_search_start; - if (full_scan) { - ret = -ENOSPC; - goto error; - } - if (wrapped) { - if (!full_scan) - total_needed -= empty_size; - full_scan = 1; - } else - wrapped = 1; } - block_group = btrfs_lookup_first_block_group(info, search_start); - cond_resched(); - block_group = btrfs_find_block_group(root, block_group, - search_start, data, 0); - goto check_failed; + ret = 0; error: return ret; } +static void dump_space_info(struct btrfs_space_info *info, u64 bytes) +{ + struct btrfs_block_group_cache *cache; + struct list_head *l; + + printk(KERN_INFO "space_info has %Lu free, is %sfull\n", + info->total_bytes - info->bytes_used - info->bytes_pinned, + (info->full) ? 
"" : "not "); + + spin_lock(&info->lock); + list_for_each(l, &info->block_groups) { + cache = list_entry(l, struct btrfs_block_group_cache, list); + spin_lock(&cache->lock); + printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used " + "%Lu pinned\n", + cache->key.objectid, cache->key.offset, + btrfs_block_group_used(&cache->item), cache->pinned); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + spin_unlock(&info->lock); +} static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, @@ -2133,6 +2171,7 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_start = 0; u64 alloc_profile; struct btrfs_fs_info *info = root->fs_info; + struct btrfs_block_group_cache *cache; if (data) { alloc_profile = info->avail_data_alloc_bits & @@ -2160,11 +2199,9 @@ again: BTRFS_BLOCK_GROUP_METADATA | (info->metadata_alloc_profile & info->avail_metadata_alloc_bits), 0); - BUG_ON(ret); } ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes + 2 * 1024 * 1024, data, 0); - BUG_ON(ret); } WARN_ON(num_bytes < root->sectorsize); @@ -2175,26 +2212,44 @@ again: if (ret == -ENOSPC && num_bytes > min_alloc_size) { num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); num_bytes = max(num_bytes, min_alloc_size); do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes, data, 1); goto again; } if (ret) { - printk("allocation failed flags %Lu\n", data); + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk("allocation failed flags %Lu, wanted %Lu\n", + data, num_bytes); + dump_space_info(sinfo, num_bytes); BUG(); } - clear_extent_dirty(&root->fs_info->free_space_cache, - ins->objectid, ins->objectid + ins->offset - 1, - GFP_NOFS); - return 0; + cache = btrfs_lookup_block_group(root->fs_info, ins->objectid); + if (!cache) { + printk(KERN_ERR "Unable to find block group for %Lu\n", ins->objectid); + 
return -ENOSPC; + } + + ret = btrfs_remove_free_space(cache, ins->objectid, ins->offset); + + return ret; } int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) { + struct btrfs_block_group_cache *cache; + maybe_lock_mutex(root); - set_extent_dirty(&root->fs_info->free_space_cache, - start, start + len - 1, GFP_NOFS); + cache = btrfs_lookup_block_group(root->fs_info, start); + if (!cache) { + printk(KERN_ERR "Unable to find block group for %Lu\n", start); + maybe_unlock_mutex(root); + return -ENOSPC; + } + btrfs_add_free_space(cache, start, len); maybe_unlock_mutex(root); return 0; } @@ -2264,8 +2319,8 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_items(trans, extent_root, path, keys, sizes, 2); - BUG_ON(ret); + extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); btrfs_set_extent_refs(path->nodes[0], extent_item, 1); @@ -2336,9 +2391,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); cache_block_group(root, block_group); - clear_extent_dirty(&root->fs_info->free_space_cache, - ins->objectid, ins->objectid + ins->offset - 1, - GFP_NOFS); + ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset); + BUG_ON(ret); + ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid, ref_generation, owner, owner_offset, ins); @@ -2843,31 +2898,24 @@ out: int btrfs_free_block_groups(struct btrfs_fs_info *info) { - u64 start; - u64 end; - u64 ptr; - int ret; + struct btrfs_block_group_cache *block_group; + struct rb_node *n; mutex_lock(&info->alloc_mutex); - while(1) { - ret = find_first_extent_bit(&info->block_group_cache, 0, - &start, &end, (unsigned int)-1); - if (ret) - break; - ret = get_state_private(&info->block_group_cache, start, &ptr); - if (!ret) - kfree((void *)(unsigned long)ptr); - clear_extent_bits(&info->block_group_cache, start, - end, (unsigned 
int)-1, GFP_NOFS); - } - while(1) { - ret = find_first_extent_bit(&info->free_space_cache, 0, - &start, &end, EXTENT_DIRTY); - if (ret) - break; - clear_extent_dirty(&info->free_space_cache, start, - end, GFP_NOFS); - } + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + + btrfs_remove_free_space_cache(block_group); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + spin_lock(&block_group->space_info->lock); + list_del(&block_group->list); + spin_unlock(&block_group->space_info->lock); + kfree(block_group); + } + spin_unlock(&info->block_group_cache_lock); mutex_unlock(&info->alloc_mutex); return 0; } @@ -3386,7 +3434,6 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start) u64 total_found; u64 shrink_last_byte; struct btrfs_block_group_cache *shrink_block_group; - struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; @@ -3542,15 +3589,17 @@ next: goto out; } - clear_extent_bits(&info->block_group_cache, key.objectid, - key.objectid + key.offset - 1, - (unsigned int)-1, GFP_NOFS); - - - clear_extent_bits(&info->free_space_cache, - key.objectid, key.objectid + key.offset - 1, - (unsigned int)-1, GFP_NOFS); + spin_lock(&root->fs_info->block_group_cache_lock); + rb_erase(&shrink_block_group->cache_node, + &root->fs_info->block_group_cache_tree); + spin_unlock(&root->fs_info->block_group_cache_lock); + ret = btrfs_remove_free_space(shrink_block_group, key.objectid, + key.offset); + if (ret) { + btrfs_end_transaction(trans, root); + goto out; + } /* memset(shrink_block_group, 0, sizeof(*shrink_block_group)); kfree(shrink_block_group); @@ -3566,9 +3615,9 @@ next: /* the code to unpin extents might set a few bits in the free * space cache for this range again */ - clear_extent_bits(&info->free_space_cache, - key.objectid, 
key.objectid + key.offset - 1, - (unsigned int)-1, GFP_NOFS); + /* XXX? */ + ret = btrfs_remove_free_space(shrink_block_group, key.objectid, + key.offset); out: btrfs_free_path(path); mutex_unlock(&root->fs_info->alloc_mutex); @@ -3616,16 +3665,13 @@ int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; int ret; - int bit; struct btrfs_block_group_cache *cache; struct btrfs_fs_info *info = root->fs_info; struct btrfs_space_info *space_info; - struct extent_io_tree *block_group_cache; struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; - block_group_cache = &info->block_group_cache; root = info->extent_root; key.objectid = 0; key.offset = 0; @@ -3653,6 +3699,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) } spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -3661,31 +3708,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(root, path); cache->flags = btrfs_block_group_flags(&cache->item); - bit = 0; - if (cache->flags & BTRFS_BLOCK_GROUP_DATA) { - bit = BLOCK_GROUP_DATA; - } else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { - bit = BLOCK_GROUP_SYSTEM; - } else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) { - bit = BLOCK_GROUP_METADATA; - } - set_avail_alloc_bits(info, cache->flags); ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), &space_info); BUG_ON(ret); cache->space_info = space_info; + spin_lock(&space_info->lock); + list_add(&cache->list, &space_info->block_groups); + spin_unlock(&space_info->lock); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); - /* use EXTENT_LOCKED to prevent merging */ - set_extent_bits(block_group_cache, found_key.objectid, - found_key.objectid + found_key.offset - 1, - EXTENT_LOCKED, GFP_NOFS); - 
set_state_private(block_group_cache, found_key.objectid, - (unsigned long)cache); - set_extent_bits(block_group_cache, found_key.objectid, - found_key.objectid + found_key.offset - 1, - bit | EXTENT_LOCKED, GFP_NOFS); if (key.objectid >= btrfs_super_total_bytes(&info->super_copy)) break; @@ -3703,22 +3738,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size) { int ret; - int bit = 0; struct btrfs_root *extent_root; struct btrfs_block_group_cache *cache; - struct extent_io_tree *block_group_cache; WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex)); extent_root = root->fs_info->extent_root; - block_group_cache = &root->fs_info->block_group_cache; root->fs_info->last_trans_new_blockgroup = trans->transid; cache = kzalloc(sizeof(*cache), GFP_NOFS); - BUG_ON(!cache); + if (!cache) + return -ENOMEM; + cache->key.objectid = chunk_offset; cache->key.offset = size; spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY); btrfs_set_block_group_used(&cache->item, bytes_used); @@ -3729,16 +3764,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); + spin_lock(&cache->space_info->lock); + list_add(&cache->list, &cache->space_info->block_groups); + spin_unlock(&cache->space_info->lock); - bit = block_group_state_bits(type); - set_extent_bits(block_group_cache, chunk_offset, - chunk_offset + size - 1, - EXTENT_LOCKED, GFP_NOFS); - set_state_private(block_group_cache, chunk_offset, - (unsigned long)cache); - set_extent_bits(block_group_cache, chunk_offset, - chunk_offset + size - 1, - bit | EXTENT_LOCKED, GFP_NOFS); + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, sizeof(cache->item)); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 319a0c7a4a58..8624f3e88036 
100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2634,6 +2634,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (eb) { atomic_inc(&eb->refs); spin_unlock(&tree->buffer_lock); + mark_page_accessed(eb->first_page); return eb; } spin_unlock(&tree->buffer_lock); @@ -2713,6 +2714,9 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, atomic_inc(&eb->refs); spin_unlock(&tree->buffer_lock); + if (eb) + mark_page_accessed(eb->first_page); + return eb; } EXPORT_SYMBOL(find_extent_buffer); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c new file mode 100644 index 000000000000..01c26e8ae555 --- /dev/null +++ b/fs/btrfs/free-space-cache.c @@ -0,0 +1,415 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include "ctree.h" + +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, offset_index); + + if (offset < info->offset) + p = &(*p)->rb_left; + else if (offset > info->offset) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +static int tree_insert_bytes(struct rb_root *root, u64 bytes, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, bytes_index); + + if (bytes < info->bytes) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +/* + * searches the tree for the given offset. If contains is set we will return + * the free space that contains the given offset. If contains is not set we + * will return the free space that starts at or after the given offset and is + * at least bytes long. 
+ */ +static struct btrfs_free_space *tree_search_offset(struct rb_root *root, + u64 offset, u64 bytes, + int contains) +{ + struct rb_node *n = root->rb_node; + struct btrfs_free_space *entry, *ret = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_free_space, offset_index); + + if (offset < entry->offset) { + if (!contains && + (!ret || entry->offset < ret->offset) && + (bytes <= entry->bytes)) + ret = entry; + n = n->rb_left; + } else if (offset > entry->offset) { + if (contains && + (entry->offset + entry->bytes - 1) >= offset) { + ret = entry; + break; + } + n = n->rb_right; + } else { + if (bytes > entry->bytes) { + n = n->rb_right; + continue; + } + ret = entry; + break; + } + } + + return ret; +} + +/* + * return a chunk at least bytes size, as close to offset that we can get. + */ +static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, + u64 offset, u64 bytes) +{ + struct rb_node *n = root->rb_node; + struct btrfs_free_space *entry, *ret = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_free_space, bytes_index); + + if (bytes < entry->bytes) { + /* + * We prefer to get a hole size as close to the size we + * are asking for so we don't take small slivers out of + * huge holes, but we also want to get as close to the + * offset as possible so we don't have a whole lot of + * fragmentation. + */ + if (offset <= entry->offset) { + if (!ret) + ret = entry; + else if (entry->bytes < ret->bytes) + ret = entry; + else if (entry->offset < ret->offset) + ret = entry; + } + n = n->rb_left; + } else if (bytes > entry->bytes) { + n = n->rb_right; + } else { + /* + * Ok we may have multiple chunks of the wanted size, + * so we don't want to take the first one we find, we + * want to take the one closest to our given offset, so + * keep searching just in case theres a better match. 
+ */ + n = n->rb_right; + if (offset > entry->offset) + continue; + else if (!ret || entry->offset < ret->offset) + ret = entry; + } + } + + return ret; +} + +static void unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + rb_erase(&info->offset_index, &block_group->free_space_offset); + rb_erase(&info->bytes_index, &block_group->free_space_bytes); +} + +static int link_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + int ret = 0; + + + ret = tree_insert_offset(&block_group->free_space_offset, info->offset, + &info->offset_index); + if (ret) + return ret; + + ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, + &info->bytes_index); + if (ret) + return ret; + + return ret; +} + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *right_info; + struct btrfs_free_space *left_info; + struct btrfs_free_space *info = NULL; + struct btrfs_free_space *alloc_info; + int ret = 0; + + alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!alloc_info) + return -ENOMEM; + + /* + * first we want to see if there is free space adjacent to the range we + * are adding, if there is remove that struct and add a new one to + * cover the entire range + */ + spin_lock(&block_group->lock); + + right_info = tree_search_offset(&block_group->free_space_offset, + offset+bytes, 0, 1); + left_info = tree_search_offset(&block_group->free_space_offset, + offset-1, 0, 1); + + if (right_info && right_info->offset == offset+bytes) { + unlink_free_space(block_group, right_info); + info = right_info; + info->offset = offset; + info->bytes += bytes; + } else if (right_info && right_info->offset != offset+bytes) { + printk(KERN_ERR "adding space in the middle of an existing " + "free space area. existing: offset=%Lu, bytes=%Lu. 
" + "new: offset=%Lu, bytes=%Lu\n", right_info->offset, + right_info->bytes, offset, bytes); + BUG(); + } + + if (left_info) { + unlink_free_space(block_group, left_info); + + if (unlikely((left_info->offset + left_info->bytes) != + offset)) { + printk(KERN_ERR "free space to the left of new free " + "space isn't quite right. existing: offset=%Lu," + " bytes=%Lu. new: offset=%Lu, bytes=%Lu\n", + left_info->offset, left_info->bytes, offset, + bytes); + BUG(); + } + + if (info) { + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); + } else { + info = left_info; + info->bytes += bytes; + } + } + + if (info) { + ret = link_free_space(block_group, info); + if (!ret) + info = NULL; + goto out; + } + + info = alloc_info; + alloc_info = NULL; + info->offset = offset; + info->bytes = bytes; + + ret = link_free_space(block_group, info); + if (ret) + kfree(info); +out: + spin_unlock(&block_group->lock); + if (ret) { + printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); + if (ret == -EEXIST) + BUG(); + } + + if (alloc_info) + kfree(alloc_info); + + return ret; +} + +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + spin_lock(&block_group->lock); + info = tree_search_offset(&block_group->free_space_offset, offset, 0, + 1); + + if (info && info->offset == offset) { + if (info->bytes < bytes) { + printk(KERN_ERR "Found free space at %Lu, size %Lu," + "trying to use %Lu\n", + info->offset, info->bytes, bytes); + WARN_ON(1); + ret = -EINVAL; + goto out; + } + + unlink_free_space(block_group, info); + + if (info->bytes == bytes) { + kfree(info); + goto out; + } + + info->offset += bytes; + info->bytes -= bytes; + + ret = link_free_space(block_group, info); + BUG_ON(ret); + } else { + WARN_ON(1); + } +out: + spin_unlock(&block_group->lock); + return ret; +} + +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + 
u64 bytes) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int count = 0; + + for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes) + count++; + //printk(KERN_INFO "offset=%Lu, bytes=%Lu\n", info->offset, + // info->bytes); + } + printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" + "\n", count); +} + +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *n; + u64 ret = 0; + + for (n = rb_first(&block_group->free_space_offset); n; + n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + ret += info->bytes; + } + + return ret; +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + spin_lock(&block_group->lock); + while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, bytes_index); + unlink_free_space(block_group, info); + kfree(info); + if (need_resched()) { + spin_unlock(&block_group->lock); + cond_resched(); + spin_lock(&block_group->lock); + } + } + spin_unlock(&block_group->lock); +} + +struct btrfs_free_space *btrfs_find_free_space_offset(struct + btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + spin_lock(&block_group->lock); + ret = tree_search_offset(&block_group->free_space_offset, offset, + bytes, 0); + spin_unlock(&block_group->lock); + + return ret; +} + +struct btrfs_free_space *btrfs_find_free_space_bytes(struct + btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + spin_lock(&block_group->lock); + + ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); + spin_unlock(&block_group->lock); + + return ret; +} + +struct btrfs_free_space 
*btrfs_find_free_space(struct btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + spin_lock(&block_group->lock); + ret = tree_search_offset(&block_group->free_space_offset, offset, + bytes, 0); + if (!ret) + ret = tree_search_bytes(&block_group->free_space_bytes, + offset, bytes); + + spin_unlock(&block_group->lock); + + return ret; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 12c1c0530f3d..65b4f864b0df 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -141,7 +141,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end) while(num_bytes > 0) { cur_alloc_size = min(num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, cur_alloc_size, - root->sectorsize, 0, 0, + root->sectorsize, 0, alloc_hint, (u64)-1, &ins, 1); if (ret) { WARN_ON(1); @@ -558,7 +558,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans->transid, inode->i_ino, ordered_extent->file_offset, &ins); BUG_ON(ret); - mutex_lock(&BTRFS_I(inode)->extent_mutex); ret = btrfs_drop_extents(trans, root, inode, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1546fa6f4f7d..b9e5c2d82dde 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -64,8 +64,8 @@ static void lock_chunks(struct btrfs_root *root) static void unlock_chunks(struct btrfs_root *root) { - mutex_unlock(&root->fs_info->alloc_mutex); mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->alloc_mutex); } int btrfs_cleanup_fs_uuids(void) @@ -1668,8 +1668,13 @@ again: else min_free = calc_size; - /* we add 1MB because we never use the first 1MB of the device */ - min_free += 1024 * 1024; + /* + * we add 1MB because we never use the first 1MB of the device, unless + * we've looped, then we are likely allocating the maximum amount of + * space left already + */ + if (!looped) + min_free += 1024 * 1024; /* build a private list of devices we will allocate from */ while(index < 
num_stripes) { -- cgit v1.2.3 From b4f6c45dfbf84f47c21f73f6370ad1292b0627fd Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 24 Sep 2008 11:05:53 -0400 Subject: Update Btrfs files for in-kernel usage btrfs had magic to put the chagneset id into a printk on module load. This removes that from the Makefile and hardcodes the printk to print "Btrfs" Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 7 +------ fs/btrfs/version.h | 4 ++++ 2 files changed, 5 insertions(+), 6 deletions(-) create mode 100644 fs/btrfs/version.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index eb36ae981bdc..d5c28557fba9 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -13,17 +13,12 @@ else # Normal Makefile KERNELDIR := /lib/modules/`uname -r`/build -all: version +all: $(MAKE) -C $(KERNELDIR) M=`pwd` modules -version: - bash version.sh - modules_install: $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install clean: $(MAKE) -C $(KERNELDIR) M=`pwd` clean -tester: - $(MAKE) -C $(KERNELDIR) M=`pwd` tree-defrag.o transaction.o sysfs.o super.o root-tree.o inode-map.o inode-item.o inode.o file-item.o file.o extent_map.o disk-io.o ctree.o dir-item.o extent-tree.o endif diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h new file mode 100644 index 000000000000..9bf3946d5ef2 --- /dev/null +++ b/fs/btrfs/version.h @@ -0,0 +1,4 @@ +#ifndef __BTRFS_VERSION_H +#define __BTRFS_VERSION_H +#define BTRFS_BUILD_VERSION "Btrfs" +#endif -- cgit v1.2.3 From d352ac68148b69937d39ca5d48bcc4478e118dbf Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 29 Sep 2008 15:18:18 -0400 Subject: Btrfs: add and improve comments This improves the comments at the top of many functions. It didn't dive into the guts of functions because I was trying to avoid merging problems with the new allocator and back reference work. extent-tree.c and volumes.c were both skipped, and there is definitely more work todo in cleaning and commenting the code. 
Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/TODO | 20 ----- fs/btrfs/async-thread.c | 10 ++- fs/btrfs/async-thread.h | 7 +- fs/btrfs/bit-radix.c | 130 --------------------------------- fs/btrfs/bit-radix.h | 33 --------- fs/btrfs/btrfs_inode.h | 54 +++++++++++++- fs/btrfs/crc32c.h | 18 +++++ fs/btrfs/ctree.c | 127 +++++++++++++++++++++++++++++--- fs/btrfs/ctree.h | 1 - fs/btrfs/dir-item.c | 41 +++++++++++ fs/btrfs/disk-io.c | 33 ++++++++- fs/btrfs/extent_io.c | 34 ++++++++- fs/btrfs/extent_map.c | 10 +++ fs/btrfs/file.c | 44 ++++++++++- fs/btrfs/inode.c | 189 +++++++++++++++++++++++++++++++----------------- fs/btrfs/locking.c | 13 ++++ fs/btrfs/ordered-data.c | 19 ++++- fs/btrfs/ref-cache.c | 26 +++++++ fs/btrfs/ref-cache.h | 3 + fs/btrfs/root-tree.c | 21 +++++- fs/btrfs/struct-funcs.c | 21 ++++++ fs/btrfs/super.c | 3 + fs/btrfs/transaction.c | 67 ++++++++++++++++- fs/btrfs/tree-defrag.c | 4 + 25 files changed, 653 insertions(+), 277 deletions(-) delete mode 100644 fs/btrfs/TODO delete mode 100644 fs/btrfs/bit-radix.c delete mode 100644 fs/btrfs/bit-radix.h (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d5c28557fba9..48b7909ca8d1 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),) obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o inode-map.o disk-io.o \ - transaction.o bit-radix.o inode.o file.o tree-defrag.o \ + transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO deleted file mode 100644 index d9b6d38c603a..000000000000 --- a/fs/btrfs/TODO +++ /dev/null @@ -1,20 +0,0 @@ -* cleanup, add more error checking, get rid of BUG_ONs -* Fix ENOSPC handling -* Make allocator 
smarter -* add a block group to struct inode -* Do actual block accounting -* Check compat and incompat flags on the inode -* Get rid of struct ctree_path, limiting tree levels held at one time -* Add generation number to key pointer in nodes -* Add generation number to inode -* forbid cross subvolume renames and hardlinks -* Release -* Do real tree locking -* Add extent mirroring (backup copies of blocks) -* Add fancy interface to get access to incremental backups -* Add fancy striped extents to make big reads faster -* Use relocation to try and fix write errors -* Make allocator much smarter -* xattrs (directory streams for regular files) -* Scrub & defrag - diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 4e780b279de6..04fb9702d14c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -231,17 +231,25 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) /* * if we pick a busy task, move the task to the end of the list. - * hopefully this will keep things somewhat evenly balanced + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. */ next = workers->worker_list.next; worker = list_entry(next, struct btrfs_worker_thread, worker_list); atomic_inc(&worker->num_pending); worker->sequence++; + if (worker->sequence % workers->idle_thresh == 0) list_move_tail(next, &workers->worker_list); return worker; } +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. 
+ */ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 43e44d115dd1..4ec9a2ee0f9d 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -63,14 +63,17 @@ struct btrfs_workers { /* once a worker has this many requests or fewer, it is idle */ int idle_thresh; - /* list with all the work threads */ + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. + */ struct list_head worker_list; struct list_head idle_list; /* lock for finding the next worker thread to queue on */ spinlock_t lock; - /* extra name for this worker */ + /* extra name for this worker, used for current->name */ char *name; }; diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c deleted file mode 100644 index e8bf876db393..000000000000 --- a/fs/btrfs/bit-radix.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include "bit-radix.h" - -#define BIT_ARRAY_BYTES 256 -#define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8) - -extern struct kmem_cache *btrfs_bit_radix_cachep; -int set_radix_bit(struct radix_tree_root *radix, unsigned long bit) -{ - unsigned long *bits; - unsigned long slot; - int bit_slot; - int ret; - - slot = bit / BIT_RADIX_BITS_PER_ARRAY; - bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; - - bits = radix_tree_lookup(radix, slot); - if (!bits) { - bits = kmem_cache_alloc(btrfs_bit_radix_cachep, GFP_NOFS); - if (!bits) - return -ENOMEM; - memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long)); - bits[0] = slot; - ret = radix_tree_insert(radix, slot, bits); - if (ret) - return ret; - } - ret = test_and_set_bit(bit_slot, bits + 1); - if (ret < 0) - ret = 1; - return ret; -} - -int test_radix_bit(struct radix_tree_root *radix, unsigned long bit) -{ - unsigned long *bits; - unsigned long slot; - int bit_slot; - - slot = bit / BIT_RADIX_BITS_PER_ARRAY; - bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; - - bits = radix_tree_lookup(radix, slot); - if (!bits) - return 0; - return test_bit(bit_slot, bits + 1); -} - -int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit) -{ - unsigned long *bits; - unsigned long slot; - int bit_slot; - int i; - int empty = 1; - - slot = bit / BIT_RADIX_BITS_PER_ARRAY; - bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; - - bits = radix_tree_lookup(radix, slot); - if (!bits) - return 0; - clear_bit(bit_slot, bits + 1); - for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) { - if (bits[i]) { - empty = 0; - break; - } - } - if (empty) { - bits = radix_tree_delete(radix, slot); - BUG_ON(!bits); - kmem_cache_free(btrfs_bit_radix_cachep, bits); - } - return 0; -} - -int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits, - unsigned long start, int nr) -{ - unsigned long *bits; - unsigned long *gang[4]; - int found; - int ret; - int i; - int total_found = 0; - unsigned 
long slot; - - slot = start / BIT_RADIX_BITS_PER_ARRAY; - ret = radix_tree_gang_lookup(radix, (void **)gang, slot, - ARRAY_SIZE(gang)); - found = start % BIT_RADIX_BITS_PER_ARRAY; - for (i = 0; i < ret && nr > 0; i++) { - bits = gang[i]; - while(nr > 0) { - found = find_next_bit(bits + 1, - BIT_RADIX_BITS_PER_ARRAY, - found); - if (found < BIT_RADIX_BITS_PER_ARRAY) { - *retbits = bits[0] * - BIT_RADIX_BITS_PER_ARRAY + found; - retbits++; - nr--; - total_found++; - found++; - } else - break; - } - found = 0; - } - return total_found; -} diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h deleted file mode 100644 index c100f54d5c32..000000000000 --- a/fs/btrfs/bit-radix.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#ifndef __BIT_RADIX__ -#define __BIT_RADIX__ -#include - -int set_radix_bit(struct radix_tree_root *radix, unsigned long bit); -int test_radix_bit(struct radix_tree_root *radix, unsigned long bit); -int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit); -int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits, - unsigned long start, int nr); - -static inline void init_bit_radix(struct radix_tree_root *radix) -{ - INIT_RADIX_TREE(radix, GFP_NOFS); -} -#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0577fda2168a..0b2e623cf421 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -25,27 +25,58 @@ /* in memory btrfs inode */ struct btrfs_inode { + /* which subvolume this inode belongs to */ struct btrfs_root *root; + + /* the block group preferred for allocations. This pointer is buggy + * and needs to be replaced with a bytenr instead + */ struct btrfs_block_group_cache *block_group; + + /* key used to find this inode on disk. 
This is used by the code + * to read in roots of subvolumes + */ struct btrfs_key location; + + /* the extent_tree has caches of all the extent mappings to disk */ struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ struct extent_io_tree io_failure_tree; + + /* held while inserting checksums to avoid races */ struct mutex csum_mutex; + + /* held while inesrting or deleting extents from files */ struct mutex extent_mutex; + + /* held while logging the inode in tree-log.c */ struct mutex log_mutex; - struct inode vfs_inode; + + /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; + /* standard acl pointers */ struct posix_acl *i_acl; struct posix_acl *i_default_acl; /* for keeping track of orphaned inodes */ struct list_head i_orphan; + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. + */ struct list_head delalloc_inodes; - /* full 64 bit generation number */ + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ u64 generation; /* @@ -57,10 +88,25 @@ struct btrfs_inode { */ u64 logged_trans; - /* trans that last made a change that should be fully fsync'd */ + /* + * trans that last made a change that should be fully fsync'd. This + * gets reset to zero each time the inode is logged + */ u64 log_dirty_trans; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ u64 delalloc_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. 
+ */ u64 disk_i_size; + + /* flags field from the on disk inode */ u32 flags; /* @@ -68,6 +114,8 @@ struct btrfs_inode { * number for new files that are created */ u64 index_cnt; + + struct inode vfs_inode; }; static inline struct btrfs_inode *BTRFS_I(struct inode *inode) diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h index 4f0fefed132a..1eaf11d334fd 100644 --- a/fs/btrfs/crc32c.h +++ b/fs/btrfs/crc32c.h @@ -1,3 +1,21 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + #ifndef __BTRFS_CRC32C__ #define __BTRFS_CRC32C__ #include diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 50e81f43e6d4..ff3261ff2e19 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2007,2008 Oracle. All rights reserved. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -54,12 +54,19 @@ struct btrfs_path *btrfs_alloc_path(void) return path; } +/* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { btrfs_release_path(NULL, p); kmem_cache_free(btrfs_path_cachep, p); } +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) { int i; @@ -77,6 +84,16 @@ void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) } } +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. + */ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; @@ -87,6 +104,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) return eb; } +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; @@ -108,6 +129,10 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) return eb; } +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. 
+ */ static void add_root_to_dirty_list(struct btrfs_root *root) { if (root->track_dirty && list_empty(&root->dirty_list)) { @@ -116,6 +141,11 @@ static void add_root_to_dirty_list(struct btrfs_root *root) } } +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -167,6 +197,22 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } +/* + * does the dirty work in cow of a single block. The parent block + * (if supplied) is updated to point to the new cow copy. The new + * buffer is marked dirty and returned locked. If you modify the block + * it needs to be marked dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in bytes + * the allocator should try to find free next to the block it returns. This is + * just a hint and may be ignored by the allocator. + * + * prealloc_dest -- if you have already reserved a destination for the cow, + * this uses that block instead of allocating a new one. btrfs_alloc_reserved_extent + * is used to finish the allocation. + */ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -311,6 +357,11 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } +/* + * cows a single block, see __btrfs_cow_block for the real work. 
+ * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, @@ -347,6 +398,10 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ static int close_blocks(u64 blocknr, u64 other, u32 blocksize) { if (blocknr < other && other - (blocknr + blocksize) < 32768) @@ -381,6 +436,11 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) } +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, int cache_only, u64 *last_ret, @@ -521,6 +581,10 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root, return btrfs_item_offset_nr(leaf, nr - 1); } +/* + * extra debugging checks to make sure all the items in a key are + * well formed and in the proper order + */ static int check_node(struct btrfs_root *root, struct btrfs_path *path, int level) { @@ -561,6 +625,10 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path, return 0; } +/* + * extra checking to make sure all the items in a leaf are + * well formed and in the proper order + */ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, int level) { @@ -782,6 +850,10 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, return -1; } +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. 
+ */ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) { @@ -798,6 +870,11 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, btrfs_node_ptr_generation(parent, slot)); } +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. + */ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) @@ -1024,7 +1101,10 @@ enospc: return ret; } -/* returns zero if the push worked, non-zero otherwise */ +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. + */ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) @@ -1150,7 +1230,8 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans, } /* - * readahead one full node of leaves + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. */ static noinline void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, @@ -1226,6 +1307,19 @@ static noinline void reada_for_search(struct btrfs_root *root, } } +/* + * when we walk down the tree, it is usually safe to unlock the higher layers in + * the tree. The exceptions are when our path goes through slot 0, because operations + * on the tree might require changing key pointers higher up in the tree. + * + * callers might also have set path->keep_locks, which tells this code to + * keep the lock if the path points to the last slot in the block. This is + * part of walking through the tree, and selecting the next slot in the higher + * block. 
+ * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. + * so if lowest_unlock is 1, level 0 won't be unlocked + */ static noinline void unlock_up(struct btrfs_path *path, int level, int lowest_unlock) { @@ -2705,6 +2799,12 @@ again: return ret; } +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. + */ int btrfs_truncate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -2818,6 +2918,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, return ret; } +/* + * make the item pointed to by the path bigger, data_size is the new size. + */ int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u32 data_size) @@ -2897,7 +3000,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, } /* - * Given a key and some data, insert an item into the tree. + * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, @@ -3046,9 +3149,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root /* * delete the pointer from a given node. * - * If the delete empties a node, the node is removed from the tree, - * continuing all the way the root if required. The root is converted into - * a leaf if all the nodes are emptied. + * the tree should have been previously balanced so the deletion does not + * empty a node. 
*/ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot) @@ -3233,6 +3335,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * search the tree again to find a leaf with lesser keys * returns 0 if it found something or 1 if there are no lesser leaves. * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. */ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { @@ -3265,9 +3370,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) /* * A helper function to walk down the tree starting at min_key, and looking * for nodes or leaves that are either in cache or have a minimum - * transaction id. This is used by the btree defrag code, but could - * also be used to search for blocks that have changed since a given - * transaction id. + * transaction id. This is used by the btree defrag code, and tree logging * * This does not cow, but it does stuff the starting key it finds back * into min_key, so you can call btrfs_search_slot with cow=1 on the @@ -3279,6 +3382,10 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) * This honors path->lowest_level to prevent descent past a given level * of the tree. * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * * returns zero if something useful was found, < 0 on error and 1 if there * was nothing in the tree that matched the search criteria. 
*/ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0079b60b18f3..ded1643c0273 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -27,7 +27,6 @@ #include #include #include -#include "bit-radix.h" #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index e4f30090d640..5040b71f1900 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -21,6 +21,14 @@ #include "hash.h" #include "transaction.h" +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -55,6 +63,10 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle return (struct btrfs_dir_item *)ptr; } +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, u16 name_len, const void *data, u16 data_len, @@ -109,6 +121,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, return ret; } +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). 
+ */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 dir, struct btrfs_key *location, u8 type, u64 index) @@ -184,6 +203,11 @@ out: return 0; } +/* + * lookup a directory item based on name. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, @@ -222,6 +246,14 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +/* + * lookup a directory item based on index. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -282,6 +314,11 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. + */ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, const char *name, int name_len) @@ -313,6 +350,10 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, return NULL; } +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. 
+ */ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 45b4f7285275..5ee10d3136f5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,6 +55,11 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); +/* + * end_io_wq structs are used to do processing in task context when an IO is + * complete. This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */ struct end_io_wq { struct bio *bio; bio_end_io_t *end_io; @@ -66,6 +71,11 @@ struct end_io_wq { struct btrfs_work work; }; +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. They checksum file and metadata bios + * just before they are sent down the IO stack. + */ struct async_submit_bio { struct inode *inode; struct bio *bio; @@ -76,6 +86,10 @@ struct async_submit_bio { struct btrfs_work work; }; +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */ struct extent_map *btree_get_extent(struct inode *inode, struct page *page, size_t page_offset, u64 start, u64 len, int create) @@ -151,6 +165,10 @@ void btrfs_csum_final(u32 crc, char *result) *(__le32 *)result = ~cpu_to_le32(crc); } +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. + */ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { @@ -204,6 +222,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, return 0; } +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. 
This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. + */ static int verify_parent_transid(struct extent_io_tree *io_tree, struct extent_buffer *eb, u64 parent_transid) { @@ -228,9 +252,12 @@ out: unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); return ret; - } +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. + */ static int btree_read_extent_buffer_pages(struct btrfs_root *root, struct extent_buffer *eb, u64 start, u64 parent_transid) @@ -260,6 +287,10 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror return -EIO; } +/* + * checksum a dirty tree block before IO. This has extra checks to make + * sure we only fill in the checksum field in the first page of a multi-page block + */ int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { struct extent_io_tree *tree; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8bd1b402f3fd..563b2d12f4f2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -914,6 +914,10 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) } EXPORT_SYMBOL(wait_on_extent_writeback); +/* + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. + */ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { int err; @@ -982,6 +986,13 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) } EXPORT_SYMBOL(set_range_writeback); +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. 
+ * + * If nothing was found, 1 is returned, < 0 on error + */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, int bits) { @@ -1017,6 +1028,10 @@ out: } EXPORT_SYMBOL(find_first_extent_bit); +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will be returned if + * nothing was found after 'start' + */ struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, int bits) { @@ -1046,8 +1061,14 @@ out: } EXPORT_SYMBOL(find_first_extent_bit_state); -u64 find_lock_delalloc_range(struct extent_io_tree *tree, - u64 *start, u64 *end, u64 max_bytes) +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range. + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes) { struct rb_node *node; struct extent_state *state; @@ -1130,6 +1151,11 @@ out: return found; } +/* + * count the number of bytes in the tree that have a given bit(s) + * set. This can be fairly slow, except for EXTENT_DIRTY which is + * cached. The total number found is returned. + */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, unsigned long bits) @@ -1245,6 +1271,10 @@ int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) } EXPORT_SYMBOL(unlock_range); +/* + * set the private field for a given byte offset in the tree. If there isn't + * an extent_state there already, this does nothing.
+ */ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) { struct rb_node *node; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 78ced11d18c7..74b2a29880d3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -114,6 +114,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return NULL; } +/* + * search through the tree for an extent_map with a given offset. If + * it can't be found, try to find some neighboring extents + */ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct rb_node **prev_ret, struct rb_node **next_ret) @@ -160,6 +164,10 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } +/* + * look for an offset in the tree, and if it can't be found, return + * the first offset we can find smaller than 'offset'. + */ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) { struct rb_node *prev; @@ -170,6 +178,7 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) return ret; } +/* check to see if two extent_map structs are adjacent and safe to merge */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) @@ -250,6 +259,7 @@ out: } EXPORT_SYMBOL(add_extent_mapping); +/* simple helper to do math around the end of an extent, handling wrap */ static u64 range_end(u64 start, u64 len) { if (start + len < start) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1b7e51a9db0f..3088a1184483 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -41,6 +41,9 @@ #include "compat.h" +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. + */ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, struct page **prepared_pages, @@ -72,12 +75,19 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages, return page_fault ? 
-EFAULT : 0; } +/* + * unlocks pages after btrfs_file_write is done with them + */ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages) { size_t i; for (i = 0; i < num_pages; i++) { if (!pages[i]) break; + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here + */ ClearPageChecked(pages[i]); unlock_page(pages[i]); mark_page_accessed(pages[i]); @@ -85,6 +95,10 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages) } } +/* this does all the hard work for inserting an inline extent into + * the btree. Any existing inline extent is extended as required to make room, + * otherwise things are inserted as required into the btree + */ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 offset, size_t size, @@ -228,6 +242,14 @@ fail: return err; } +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. + */ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct file *file, @@ -362,6 +384,10 @@ out_unlock: return err; } +/* + * this drops all the extents in the cache that intersect the range + * [start, end]. Existing extents are split as required. + */ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned) { @@ -536,6 +562,9 @@ out: * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. + * + * inline_limit is used to tell this code which offsets in the file to keep + * if they contain inline extents. 
*/ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, @@ -796,7 +825,9 @@ out: } /* - * this gets pages into the page cache and locks them down + * this gets pages into the page cache and locks them down, it also properly + * waits for data=ordered extents to finish before allowing the pages to be + * modified. */ static int noinline prepare_pages(struct btrfs_root *root, struct file *file, struct page **pages, size_t num_pages, @@ -1034,6 +1065,17 @@ int btrfs_release_file(struct inode * inode, struct file * filp) return 0; } +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. + */ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 404704d26822..f3abecc2d14c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -83,6 +83,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { static void btrfs_truncate(struct inode *inode); +/* + * a very lame attempt at stopping writes when the FS is 85% full. There + * are countless ways this is incorrect, but it is better than nothing. + */ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, int for_del) { @@ -108,6 +112,12 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, return ret; } +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code.
The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + */ static int cow_file_range(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -185,6 +195,13 @@ out: return ret; } +/* + * the nocow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. + * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) { u64 extent_start; @@ -291,6 +308,9 @@ out: return err; } +/* + * extent_io.c call back to do delayed allocation processing + */ static int run_delalloc_range(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -305,6 +325,11 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end) return ret; } +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done.
+ */ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits) { @@ -323,6 +348,9 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, return 0; } +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits) { @@ -349,6 +377,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, return 0; } +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio) { @@ -371,6 +403,14 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, return 0; } +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record + * are inserted into the btree + */ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int mirror_num) { @@ -383,6 +423,10 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(root, rw, bio, mirror_num, 1); } +/* + * extent_io.c submission hook. This does the right thing for csum calculation on write, + * or reading the csums from the tree before a read + */ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int mirror_num) { @@ -408,6 +452,10 @@ mapit: return btrfs_map_bio(root, rw, bio, mirror_num, 0); } +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time.
+ */ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_offset, struct list_head *list) @@ -430,12 +478,12 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) GFP_NOFS); } +/* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; struct btrfs_work work; }; -/* see btrfs_writepage_start_hook for details on why this is required */ void btrfs_writepage_fixup_worker(struct btrfs_work *work) { struct btrfs_writepage_fixup *fixup; @@ -522,6 +570,10 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) return -EAGAIN; } +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. + */ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -631,6 +683,14 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, return btrfs_finish_ordered_io(page->mapping->host, start, end); } +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. 
+ */ struct io_failure_record { struct page *page; u64 start; @@ -725,6 +785,10 @@ int btrfs_io_failed_hook(struct bio *failed_bio, return 0; } +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ int btrfs_clean_io_failures(struct inode *inode, u64 start) { u64 private; @@ -753,6 +817,11 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start) return 0; } +/* + * when reads are done, we need to check csums to verify the data is correct + * if there's a match, we allow the bio to finish. If not, we go through + * the io_failure_record routines to find good copies + */ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -990,6 +1059,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) btrfs_free_path(path); } +/* + * read an inode from the btree into the in-memory inode + */ void btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; @@ -1083,6 +1155,9 @@ make_bad: make_bad_inode(inode); } +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, @@ -1118,6 +1193,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, BTRFS_I(inode)->block_group->key.objectid); } +/* + * copy everything in the in-memory inode into the btree. + */ int noinline btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) @@ -1151,6 +1229,11 @@ failed: } +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. 
It removes a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, struct inode *inode, @@ -1309,7 +1392,7 @@ fail: /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find - * any higher than i_size. + * any higher than new_size * * csum items that cross the new i_size are truncated to the new size * as well. @@ -2123,6 +2206,11 @@ void btrfs_dirty_inode(struct inode *inode) btrfs_end_transaction(trans, root); } +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */ static int btrfs_set_inode_index_count(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2175,6 +2263,10 @@ out: return ret; } +/* + * helper to find a free sequence number in a given directory. This current + * code is very simple, later versions will do smarter things in the btree + */ static int btrfs_set_inode_index(struct inode *dir, struct inode *inode, u64 *index) { @@ -2305,6 +2397,12 @@ static inline u8 btrfs_inode_type(struct inode *inode) return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; } +/* + * utility function to add 'inode' into 'parent_inode' with + * a given name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct inode *parent_inode, struct inode *inode, const char *name, int name_len, int add_backref, u64 index) @@ -2611,6 +2709,10 @@ out_unlock: return err; } +/* helper for btrfs_get_extent. Given an existing extent in the tree, + * and an extent that you want to insert, deal with overlap and insert + * the new extent into the tree.
+ */ static int merge_extent_mapping(struct extent_map_tree *em_tree, struct extent_map *existing, struct extent_map *em, @@ -2627,6 +2729,14 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, return add_extent_mapping(em_tree, em); } +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the + * in-ram representation. This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. + */ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) @@ -2869,76 +2979,11 @@ out: return em; } -#if 0 /* waiting for O_DIRECT reads */ -static int btrfs_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - struct extent_map *em; - u64 start = (u64)iblock << inode->i_blkbits; - struct btrfs_multi_bio *multi = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 len; - u64 logical; - u64 map_length; - int ret = 0; - - em = btrfs_get_extent(inode, NULL, 0, start, bh_result->b_size, 0); - - if (!em || IS_ERR(em)) - goto out; - - if (em->start > start || em->start + em->len <= start) { - goto out; - } - - if (em->block_start == EXTENT_MAP_INLINE) { - ret = -EINVAL; - goto out; - } - - len = em->start + em->len - start; - len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size))); - - if (em->block_start == EXTENT_MAP_HOLE || - em->block_start == EXTENT_MAP_DELALLOC) { - bh_result->b_size = len; - goto out; - } - - logical = start - em->start; - logical = em->block_start + logical; - - map_length = len; - ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - logical, &map_length, &multi, 0); - BUG_ON(ret); - bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits; - bh_result->b_size = min(map_length, 
len); - - bh_result->b_bdev = multi->stripes[0].dev->bdev; - set_buffer_mapped(bh_result); - kfree(multi); -out: - free_extent_map(em); - return ret; -} -#endif - static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { return -EINVAL; -#if 0 - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - - if (rw == WRITE) - return -EINVAL; - - return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, btrfs_get_block, NULL); -#endif } static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) @@ -3202,6 +3247,9 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name, } } +/* + * create a new subvolume directory/inode (helper for the ioctl). + */ int btrfs_create_subvol_root(struct btrfs_root *new_root, struct btrfs_trans_handle *trans, u64 new_dirid, struct btrfs_block_group_cache *block_group) @@ -3223,6 +3271,9 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root, return btrfs_update_inode(trans, new_root, inode); } +/* helper function for file defrag and space balancing. This + * forces readahead on a given range of bytes in an inode + */ unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index) @@ -3424,6 +3475,10 @@ out_unlock: return ret; } +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. + */ int btrfs_start_delalloc_inodes(struct btrfs_root *root) { struct list_head *head = &root->fs_info->delalloc_inodes; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 0cc314c10d66..e30aa6e2958f 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -25,6 +25,15 @@ #include "extent_io.h" #include "locking.h" +/* + * locks the per buffer mutex in an extent buffer. 
This uses adaptive locks + * and the spin is not tuned very extensively. The spinning does make a big + * difference in almost every workload, but spinning for the right amount of + * time needs some help. + * + * In general, we want to spin as long as the lock holder is doing btree searches, + * and we should give up if they are in more expensive code. + */ int btrfs_tree_lock(struct extent_buffer *eb) { int i; @@ -57,6 +66,10 @@ int btrfs_tree_locked(struct extent_buffer *eb) return mutex_is_locked(&eb->mutex); } +/* + * btrfs_search_slot uses this to decide if it should drop its locks + * before doing something expensive like allocating free blocks for cow. + */ int btrfs_path_lock_waiting(struct btrfs_path *path, int level) { int i; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 951eacff2420..dcc1730dd837 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -26,7 +26,6 @@ #include "btrfs_inode.h" #include "extent_io.h" - static u64 entry_end(struct btrfs_ordered_extent *entry) { if (entry->file_offset + entry->len < entry->file_offset) @@ -34,6 +33,9 @@ static u64 entry_end(struct btrfs_ordered_extent *entry) return entry->file_offset + entry->len; } +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, struct rb_node *node) { @@ -58,6 +60,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, return NULL; } +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, struct rb_node **prev_ret) { @@ -108,6 +114,9 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, return NULL; } +/* + * helper to check if a given offset is inside a given entry + */ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) { if 
(file_offset < entry->file_offset || @@ -116,6 +125,10 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) return 1; } +/* + * find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, u64 file_offset) { @@ -305,6 +318,10 @@ int btrfs_remove_ordered_extent(struct inode *inode, return 0; } +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) { struct list_head splice; diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c index 30fcb7aea5b5..a50ebb67055d 100644 --- a/fs/btrfs/ref-cache.c +++ b/fs/btrfs/ref-cache.c @@ -21,6 +21,16 @@ #include "ref-cache.h" #include "transaction.h" +/* + * leaf refs are used to cache the information about which extents + * a given leaf has references on. This allows us to process that leaf + * in btrfs_drop_snapshot without needing to read it back from disk. + */ + +/* + * kmalloc a leaf reference struct and update the counters for the + * total ref cache size + */ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, int nr_extents) { @@ -40,6 +50,10 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, return ref; } +/* + * free a leaf reference struct and update the counters for the + * total ref cache size + */ void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) { if (!ref) @@ -135,6 +149,10 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, return 0; } +/* + * find the leaf ref for a given extent.
This returns the ref struct with + * a usage reference incremented + */ struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, u64 bytenr) { @@ -160,6 +178,10 @@ again: return NULL; } +/* + * add a fully filled in leaf ref struct + * remove all the refs older than a given root generation + */ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, int shared) { @@ -184,6 +206,10 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, return ret; } +/* + * remove a single leaf ref from the tree. This drops the ref held by the tree + * only + */ int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) { struct btrfs_leaf_ref_tree *tree; diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h index 617564787f52..16f3183d7c59 100644 --- a/fs/btrfs/ref-cache.h +++ b/fs/btrfs/ref-cache.h @@ -19,8 +19,11 @@ #define __REFCACHE__ struct btrfs_extent_info { + /* bytenr and num_bytes find the extent in the extent allocation tree */ u64 bytenr; u64 num_bytes; + + /* objectid and offset find the back reference for the file */ u64 objectid; u64 offset; }; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 0091c01abb06..eb7f7655e9d5 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -22,8 +22,10 @@ #include "print-tree.h" /* - * returns 0 on finding something, 1 if no more roots are there - * and < 0 on error + * search forward for a root, starting with objectid 'search_start' + * if a root key is found, the objectid we find is filled into 'found_objectid' + * and 0 is returned. < 0 is returned on error, 1 if there is nothing + * left in the tree. */ int btrfs_search_root(struct btrfs_root *root, u64 search_start, u64 *found_objectid) @@ -66,6 +68,11 @@ out: return ret; } +/* + * lookup the root with the highest offset for a given objectid. The key we do + * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 + * on error. 
+ */ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key) { @@ -104,6 +111,9 @@ out: return ret; } +/* + * copy the data in 'item' into the btree + */ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item) @@ -147,6 +157,12 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } +/* + * at mount time we want to find all the old transaction snapshots that were in + * the process of being deleted if we crashed. This is any root item with an offset + * lower than the latest root. They need to be queued for deletion to finish + * what was happening when we crashed. + */ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, struct btrfs_root *latest) { @@ -227,6 +243,7 @@ err: return ret; } +/* drop the root item for 'key' from 'root' */ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key) { diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index ad03a32d1116..cdedbe144d45 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -17,6 +17,27 @@ */ #include + +/* this is some deeply nasty code. ctree.h has a different + * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef + * + * The end result is that anyone who #includes ctree.h gets a + * declaration for the btrfs_set_foo functions and btrfs_foo functions + * + * This file declares the macros and then #includes ctree.h, which results + * in cpp creating the function here based on the template below. + * + * These setget functions do all the extent_buffer related mapping + * required to efficiently read and write specific fields in the extent + * buffers. Every pointer to metadata items in btrfs is really just + * an unsigned long offset into the extent buffer which has been + * cast to a specific type. This gives us all the gcc type checking. 
+ * + * The extent buffer api is used to do all the kmapping and page + * spanning work required to get extent buffers in highmem and have + * a metadata blocksize different from the page size. + */ + #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, \ type *s) \ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8399d6d05d63..2e6039825b7b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -519,6 +519,9 @@ static struct file_system_type btrfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; +/* + * used by btrfsctl to scan devices when no FS is mounted + */ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 444abe0796ae..11266d68a6c9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -46,6 +46,9 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) } } +/* + * either allocate a new transaction or hop into the existing one + */ static noinline int join_transaction(struct btrfs_root *root) { struct btrfs_transaction *cur_trans; @@ -85,6 +88,12 @@ static noinline int join_transaction(struct btrfs_root *root) return 0; } +/* + * this does all the record keeping required to make sure that a + * reference counted root is properly recorded in a given transaction. + * This is required to make sure the old root from before we joined the transaction + * is deleted when the transaction commits + */ noinline int btrfs_record_root_in_trans(struct btrfs_root *root) { struct btrfs_dirty_root *dirty; @@ -127,6 +136,10 @@ noinline int btrfs_record_root_in_trans(struct btrfs_root *root) return 0; } +/* wait for commit against the current transaction to become unblocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. 
+ */ static void wait_current_trans(struct btrfs_root *root) { struct btrfs_transaction *cur_trans; @@ -198,7 +211,7 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, return start_transaction(r, num_blocks, 2); } - +/* wait for a transaction commit to be fully complete */ static noinline int wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) { @@ -218,6 +231,10 @@ static noinline int wait_for_commit(struct btrfs_root *root, return 0; } +/* + * rate limit against the drop_snapshot code. This helps to slow down new operations + * if the drop_snapshot code isn't able to keep up. + */ static void throttle_on_drops(struct btrfs_root *root) { struct btrfs_fs_info *info = root->fs_info; @@ -302,7 +319,11 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, return __btrfs_end_transaction(trans, root, 1); } - +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages) { @@ -393,6 +414,16 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, &trans->transaction->dirty_pages); } +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. + * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. 
+ */ static int update_cowonly_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -418,6 +449,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return 0; } +/* + * update all the cowonly tree roots on disk + */ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -433,6 +467,11 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, return 0; } +/* + * dead roots are old snapshots that need to be deleted. This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) { struct btrfs_dirty_root *dirty; @@ -449,6 +488,12 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) return 0; } +/* + * at transaction commit time we need to schedule the old roots for + * deletion via btrfs_drop_snapshot. This runs through all the + * reference counted roots that were modified in the current + * transaction and puts them into the drop list + */ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, struct radix_tree_root *radix, struct list_head *list) @@ -541,6 +586,10 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, return err; } +/* + * defrag a given btree. If cacheonly == 1, this won't read from the disk, + * otherwise every leaf in the btree is read and defragged. 
+ */ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) { struct btrfs_fs_info *info = root->fs_info; @@ -570,6 +619,10 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) return 0; } +/* + * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on + * all of them + */ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, struct list_head *list) { @@ -664,6 +717,10 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, return ret; } +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. This does the actual creation + */ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_pending_snapshot *pending) @@ -734,6 +791,9 @@ fail: return ret; } +/* + * create all the snapshots we've scheduled for creation + */ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -944,6 +1004,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } +/* + * interface function to delete all the snapshots we have scheduled for deletion + */ int btrfs_clean_old_snapshots(struct btrfs_root *root) { struct list_head dirty_roots; diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index b3bb5bbad76e..6f57d0889b1e 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -23,6 +23,10 @@ #include "transaction.h" #include "locking.h" +/* defrag all the leaves in a given btree. 
If cache_only == 1, don't read things + * from disk, otherwise read all the leaves and try to get key order to + * better reflect disk order + */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root, int cache_only) { -- cgit v1.2.3 From 61f8c86ee8f9ca55488449db886104a0ab4e1f98 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 9 Oct 2008 11:52:35 -0400 Subject: Btrfs: Fix makefile for building btrfs static This fixes the btrfs makefile for building in the tree and out of the tree both as a module and static. Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/Makefile') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 48b7909ca8d1..7125716e142b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,7 +1,7 @@ ifneq ($(KERNELRELEASE),) # kbuild part of makefile -obj-m := btrfs.o +obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ @@ -14,7 +14,7 @@ else KERNELDIR := /lib/modules/`uname -r`/build all: - $(MAKE) -C $(KERNELDIR) M=`pwd` modules + $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules modules_install: $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install -- cgit v1.2.3 From c8b978188c9a0fd3d535c13debd19d522b726f1f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 29 Oct 2008 14:49:59 -0400 Subject: Btrfs: Add zlib compression support This is a large change for adding compression on reading and writing, both for inline and regular extents. It does some fairly large surgery to the writeback paths. Compression is off by default and enabled by mount -o compress. Even when the -o compress mount option is not used, it is possible to read compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the file is flagged to avoid future compression attempts later. * While finding delalloc extents, the pages are locked before being sent down to the delalloc handler. This allows the delalloc handler to do complex things such as cleaning the pages, marking them writeback and starting IO on their behalf. * Inline extents are inserted at delalloc time now. This allows us to compress the data before inserting the inline extent, and it allows us to insert an inline extent that spans multiple pages. * All of the in-memory extent representations (extent_map.c, ordered-data.c, etc.) are changed to record both an in-memory size and an on-disk size, as well as a flag for compression. From a disk format point of view, the extent pointers in the file are changed to record the on-disk size of a given extent and some encoding flags. Space in the disk format is allocated for compression encoding, as well as encryption and a generic 'other' field. Neither the encryption nor the 'other' field is currently used. In order to limit the amount of data read for a single random read in the file, the size of a compressed extent is limited to 128k. This is a software-only limit; the disk format supports u64-sized compressed extents. In order to limit the RAM consumed while processing extents, the uncompressed size of a compressed extent is limited to 256k. This is a software-only limit and will be subject to tuning later. Checksumming is still done on compressed extents, and it is done on the uncompressed version of the data. This way additional encodings can be layered on without having to figure out which encoding to checksum. Compression happens at delalloc time, which is basically single-threaded because it is usually done by a single pdflush thread. This makes it tricky to spread the compression load across all the CPUs on the box. We'll have to look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely. Signed-off-by: Chris Mason --- fs/Kconfig | 2 + fs/btrfs/Makefile | 3 +- fs/btrfs/compression.c | 454 ++++++++++++++++++++++++++++++++++ fs/btrfs/compression.h | 47 ++++ fs/btrfs/ctree.h | 99 ++++++-- fs/btrfs/disk-io.c | 18 +- fs/btrfs/disk-io.h | 1 + fs/btrfs/extent-tree.c | 27 +- fs/btrfs/extent_io.c | 411 ++++++++++++++++++++++++------- fs/btrfs/extent_io.h | 17 +- fs/btrfs/extent_map.c | 9 + fs/btrfs/extent_map.h | 6 +- fs/btrfs/file-item.c | 75 +++++- fs/btrfs/file.c | 263 +++++--------------- fs/btrfs/inode.c | 584 ++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/ordered-data.c | 9 +- fs/btrfs/ordered-data.h | 10 +- fs/btrfs/print-tree.c | 7 +- fs/btrfs/super.c | 10 +- fs/btrfs/tree-log.c | 3 +- fs/btrfs/volumes.c | 2 + fs/btrfs/zlib.c | 637 ++++++++++++++++++++++++++++++++++++++++++++++++ 22 files changed, 2315 insertions(+), 379 deletions(-) create mode 100644 fs/btrfs/compression.c create mode 100644 fs/btrfs/compression.h create mode 100644 fs/btrfs/zlib.c (limited to 'fs/btrfs/Makefile') diff --git a/fs/Kconfig b/fs/Kconfig index 18f5a85b47c6..31cce5d88b1a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -501,6 +501,8 @@ config BTRFS_FS tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" depends on EXPERIMENTAL select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. 
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 7125716e142b..d2cf5a54a4b8 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o export.o tree-log.o acl.o free-space-cache.o + ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o else # Normal Makefile diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 000000000000..c5470367ca5c --- /dev/null +++ b/fs/btrfs/compression.c @@ -0,0 +1,454 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compat.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; +}; + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + struct bio *bio; + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_byte >> 9; + } + return bio; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). 
+ * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) + bio_io_error(cb->orig_bio); + else + bio_endio(cb->orig_bio, 0); + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline int end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + while(nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min(nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + 
end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ + return 0; +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, 1); + + end_compressed_writeback(inode, cb->start, cb->len); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. 
+ */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int page_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(sizeof(*cb), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + ret = btrfs_csum_file_bytes(root, inode, start, len); + BUG_ON(ret); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + while(bytes_left > 0) { + page = compressed_pages[page_index]; + page->mapping = inode->i_mapping; + if (bio->bi_size) + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + atomic_inc(&cb->pending_bios); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + page_index++; + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + } 
+ bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long page_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_sector << 9; + struct extent_map *em; + int ret; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + cb = kmalloc(sizeof(*cb), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + + cb->start = em->start; + compressed_len = em->block_len; + free_extent_map(em); + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->orig_bio = bio; + + nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + GFP_NOFS); + bdev 
= BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (page_index = 0; page_index < nr_pages; page_index++) { + cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + } + cb->nr_pages = nr_pages; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (page_index = 0; page_index < nr_pages; page_index++) { + page = cb->compressed_pages[page_index]; + page->mapping = inode->i_mapping; + if (comp_bio->bi_size) + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + ret = btrfs_map_bio(root, READ, comp_bio, 0, 0); + BUG_ON(ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + atomic_inc(&cb->pending_bios); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + ret = btrfs_map_bio(root, READ, comp_bio, 0, 0); + BUG_ON(ret); + + bio_put(comp_bio); + return 0; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 000000000000..421f5b4aa715 --- /dev/null +++ b/fs/btrfs/compression.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); +void btrfs_zlib_exit(void); +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +#endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8559f39fd47f..793d8fdda244 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -400,10 +400,18 @@ struct btrfs_timespec { __le32 nsec; } __attribute__ ((__packed__)); -/* - * there is no padding here on purpose. 
If you want to extent the inode, - * make a new item type - */ +typedef enum { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LAST = 2, +} btrfs_compression_type; + +/* we don't understand any encryption methods right now */ +typedef enum { + BTRFS_ENCRYPTION_NONE = 0, + BTRFS_ENCRYPTION_LAST = 1, +} btrfs_encryption_type; + struct btrfs_inode_item { /* nfs style generation number */ __le64 generation; @@ -419,6 +427,7 @@ struct btrfs_inode_item { __le64 rdev; __le16 flags; __le16 compat_flags; + struct btrfs_timespec atime; struct btrfs_timespec ctime; struct btrfs_timespec mtime; @@ -454,8 +463,33 @@ struct btrfs_root_item { #define BTRFS_FILE_EXTENT_INLINE 1 struct btrfs_file_extent_item { + /* + * transaction id that created this extent + */ __le64 generation; + /* + * max number of bytes to hold this extent in ram + * when we split a compressed extent we can't know how big + * each of the resulting pieces will be. So, this is + * an upper limit on the size of the extent in ram instead of + * an exact limit. + */ + __le64 ram_bytes; + + /* + * 32 bits for the various ways we might encode the data, + * including compression and encryption. If any of these + * are set to something a given disk format doesn't understand + * it is treated like an incompat flag for reading and writing, + * but not for stat. + */ + u8 compression; + u8 encryption; + __le16 other_encoding; /* spare for later use */ + + /* are we inline data or a real extent? */ u8 type; + /* * disk space consumed by the extent, checksum blocks are included * in these numbers @@ -471,9 +505,11 @@ struct btrfs_file_extent_item { */ __le64 offset; /* - * the logical number of file blocks (no csums included) + * the logical number of file blocks (no csums included). This + * always reflects the size uncompressed and without encoding. 
*/ __le64 num_bytes; + } __attribute__ ((__packed__)); struct btrfs_csum_item { @@ -814,6 +850,7 @@ struct btrfs_root { #define BTRFS_MOUNT_NOBARRIER (1 << 2) #define BTRFS_MOUNT_SSD (1 << 3) #define BTRFS_MOUNT_DEGRADED (1 << 4) +#define BTRFS_MOUNT_COMPRESS (1 << 5) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -825,6 +862,7 @@ struct btrfs_root { #define BTRFS_INODE_NODATASUM (1 << 0) #define BTRFS_INODE_NODATACOW (1 << 1) #define BTRFS_INODE_READONLY (1 << 2) +#define BTRFS_INODE_NOCOMPRESS (1 << 3) #define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ ~BTRFS_INODE_##flag) #define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ @@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; } -static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, - struct btrfs_item *e) -{ - unsigned long offset; - offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); - return btrfs_item_size(eb, e) - offset; -} - BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, @@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, offset, 64); BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* this returns the number of file bytes represented by the inline item. 
+ * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + struct btrfs_file_extent_item *e) +{ + return btrfs_file_extent_ram_bytes(eb, e); +} + +/* + * this returns the number of bytes used by the item on disk, minus the + * size of any extent headers. If a file is compressed on disk, this is + * the compressed size + */ +static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, + struct btrfs_item *e) +{ + unsigned long offset; + offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); + return btrfs_item_size(eb, e) - offset; +} static inline struct btrfs_root *btrfs_sb(struct super_block *sb) { @@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 pos, u64 disk_offset, - u64 disk_num_bytes, - u64 num_bytes, u64 offset); + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding); int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums); int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio); +int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, + u64 start, unsigned long len); struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name, int namelen); int btrfs_merge_bio_hook(struct page *page, unsigned 
long offset, - size_t size, struct bio *bio); + size_t size, struct bio *bio, unsigned long bio_flags); unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0be044bb6194..dc95f636a11b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -83,6 +83,7 @@ struct async_submit_bio { extent_submit_bio_hook_t *submit_bio_hook; int rw; int mirror_num; + unsigned long bio_flags; struct btrfs_work work; }; @@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page, } em->start = 0; em->len = (u64)-1; + em->block_len = (u64)-1; em->block_start = 0; em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; @@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); async->submit_bio_hook(async->inode, async->rw, async->bio, - async->mirror_num); + async->mirror_num, async->bio_flags); kfree(async); } int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, extent_submit_bio_hook_t *submit_bio_hook) { struct async_submit_bio *async; @@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_hook = submit_bio_hook; async->work.func = run_one_async_submit; async->work.flags = 0; + async->bio_flags = bio_flags; while(atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { @@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio) } static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num) + int mirror_num, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, } static int btree_submit_bio_hook(struct inode 
*inode, int rw, struct bio *bio, - int mirror_num) + int mirror_num, unsigned long bio_flags) { /* * kthread helpers are used to submit writes so that checksumming * can happen in parallel across all CPUs */ if (!(rw & (1 << BIO_RW))) { - return __btree_submit_bio_hook(inode, rw, bio, mirror_num); + return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0); } return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, + inode, rw, bio, mirror_num, 0, __btree_submit_bio_hook); } @@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; + fs_info->thread_pool_size = min(num_online_cpus() + 2, 8); INIT_LIST_HEAD(&fs_info->ordered_extents); @@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, */ btrfs_init_workers(&fs_info->workers, "worker", fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size)); @@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, } fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, + 4 * 1024 * 1024 / PAGE_CACHE_SIZE); nodesize = btrfs_super_nodesize(disk_super); leafsize = btrfs_super_leafsize(disk_super); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index f84f5058dbbb..4eb1f1408d21 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, extent_submit_bio_hook_t *submit_bio_hook); int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); diff --git 
a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 280ac1aa9b6d..bbf04e80a1a3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode, em->start = extent_key->objectid - offset; em->len = extent_key->offset; + em->block_len = extent_key->offset; em->block_start = extent_key->objectid; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); @@ -3314,10 +3315,14 @@ struct btrfs_ref_path { }; struct disk_extent { + u64 ram_bytes; u64 disk_bytenr; u64 disk_num_bytes; u64 offset; u64 num_bytes; + u8 compression; + u8 encryption; + u16 other_encoding; }; static int is_cowonly_root(u64 root_objectid) @@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode, btrfs_file_extent_disk_num_bytes(leaf, fi); exts[nr].offset = btrfs_file_extent_offset(leaf, fi); exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + exts[nr].compression = btrfs_file_extent_compression(leaf, fi); + exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); + exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, + fi); WARN_ON(exts[nr].offset > 0); WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); @@ -3846,6 +3856,8 @@ next: new_extents[0].disk_bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, new_extents[0].disk_num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extents[0].ram_bytes); ext_offset += new_extents[0].offset; btrfs_set_file_extent_offset(leaf, fi, ext_offset); btrfs_mark_buffer_dirty(leaf); @@ -3911,6 +3923,16 @@ next: new_extents[i].disk_bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, new_extents[i].disk_num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extents[i].ram_bytes); + + btrfs_set_file_extent_compression(leaf, fi, + new_extents[i].compression); + btrfs_set_file_extent_encryption(leaf, 
fi, + new_extents[i].encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, + new_extents[i].other_encoding); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_len); ext_offset += new_extents[i].offset; @@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans, ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extent->ram_bytes); btrfs_set_file_extent_disk_bytenr(leaf, fi, new_extent->disk_bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, @@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info, BUG_ON(err); err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0); + group->key.offset, 0, group->key.offset, + 0, 0, 0); BUG_ON(err); inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 563b2d12f4f2..314041fdfa43 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache; static LIST_HEAD(buffers); static LIST_HEAD(states); +#define LEAK_DEBUG 1 #ifdef LEAK_DEBUG static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED; #endif @@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state); * * 1 is returned if we find something, 0 if nothing was in the tree */ -static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, - u64 *start, u64 *end, u64 max_bytes) +static noinline u64 find_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes) { struct rb_node *node; struct extent_state *state; @@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, u64 total_bytes = 0; spin_lock_irq(&tree->lock); + /* * this search will find all the extents that end after * our range starts. 
*/ -search_again: node = tree_search(tree, cur_start); if (!node) { if (!found) @@ -1100,40 +1101,6 @@ search_again: *end = state->end; goto out; } - if (!found && !(state->state & EXTENT_BOUNDARY)) { - struct extent_state *prev_state; - struct rb_node *prev_node = node; - while(1) { - prev_node = rb_prev(prev_node); - if (!prev_node) - break; - prev_state = rb_entry(prev_node, - struct extent_state, - rb_node); - if ((prev_state->end + 1 != state->start) || - !(prev_state->state & EXTENT_DELALLOC)) - break; - if ((cur_start - prev_state->start) * 2 > - max_bytes) - break; - state = prev_state; - node = prev_node; - } - } - if (state->state & EXTENT_LOCKED) { - DEFINE_WAIT(wait); - atomic_inc(&state->refs); - prepare_to_wait(&state->wq, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&tree->lock); - schedule(); - spin_lock_irq(&tree->lock); - finish_wait(&state->wq, &wait); - free_extent_state(state); - goto search_again; - } - set_state_cb(tree, state, EXTENT_LOCKED); - state->state |= EXTENT_LOCKED; if (!found) *start = state->start; found++; @@ -1151,6 +1118,208 @@ out: return found; } +static noinline int __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + if (index == locked_page->index && end_index == index) + return 0; + + while(nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min(nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] != locked_page) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 delalloc_start, + u64 delalloc_end) +{ + unsigned long index = delalloc_start >> 
PAGE_CACHE_SHIFT; + unsigned long start_index = index; + unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long pages_locked = 0; + struct page *pages[16]; + unsigned long nrpages; + int ret; + int i; + + /* the caller is responsible for locking the start index */ + if (index == locked_page->index && index == end_index) + return 0; + + /* skip the page at the start index */ + nrpages = end_index - index + 1; + while(nrpages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min(nrpages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + ret = -EAGAIN; + goto done; + } + /* now we have an array of pages, lock them all */ + for (i = 0; i < ret; i++) { + /* + * the caller is taking responsibility for + * locked_page + */ + if (pages[i] != locked_page) + lock_page(pages[i]); + page_cache_release(pages[i]); + } + pages_locked += ret; + nrpages -= ret; + index += ret; + cond_resched(); + } + ret = 0; +done: + if (ret && pages_locked) { + __unlock_for_delalloc(inode, locked_page, + delalloc_start, + ((u64)(start_index + pages_locked - 1)) << + PAGE_CACHE_SHIFT); + } + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, + u64 *start, u64 *end, + u64 max_bytes) +{ + u64 delalloc_start; + u64 delalloc_end; + u64 found; + int ret; + int loops = 0; + +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes); + if (!found) { + *start = delalloc_start; + *end = delalloc_end; + return found; + } + + /* + * make sure to limit the number of pages we try to lock down + * if we're looping. 
+ */ + if (delalloc_end + 1 - delalloc_start > max_bytes && loops) { + delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) & + ~((u64)PAGE_CACHE_SIZE - 1); + } + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; + loops = 1; + goto again; + } else { + found = 0; + goto out_failed; + } + } + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ + lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1); + if (!ret) { + unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + int clear_dirty, int set_writeback, + int end_writeback) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + + if (clear_dirty) + clear_bits |= EXTENT_DIRTY; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); + + while(nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min(nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; 
+ } + if (clear_dirty) + clear_page_dirty_for_io(pages[i]); + if (set_writeback) + set_page_writeback(pages[i]); + if (end_writeback) + end_page_writeback(pages[i]); + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} +EXPORT_SYMBOL(extent_clear_unlock_delalloc); + /* * count the number of bytes in the tree that have a given bit(s) * set. This can be fairly slow, except for EXTENT_DIRTY which is @@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -static int submit_one_bio(int rw, struct bio *bio, int mirror_num) +static int submit_one_bio(int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; - struct rb_node *node; - struct extent_state *state; u64 start; u64 end; start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; end = start + bvec->bv_len - 1; - spin_lock_irq(&tree->lock); - node = __etree_search(tree, start, NULL, NULL); - BUG_ON(!node); - state = rb_entry(node, struct extent_state, rb_node); - while(state->end < end) { - node = rb_next(node); - state = rb_entry(node, struct extent_state, rb_node); - } - BUG_ON(state->end != end); - spin_unlock_irq(&tree->lock); - bio->bi_private = NULL; bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) tree->ops->submit_bio_hook(page->mapping->host, rw, bio, - mirror_num); + mirror_num, bio_flags); else submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) @@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, struct bio **bio_ret, unsigned long max_pages, bio_end_io_t end_io_func, - int mirror_num) + int mirror_num, + unsigned long prev_bio_flags, + unsigned long bio_flags) { int ret = 0; struct bio *bio; int nr; + int contig = 0; + int this_compressed = 
bio_flags & EXTENT_BIO_COMPRESSED; + int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; + size_t page_size = min(size, PAGE_CACHE_SIZE); if (bio_ret && *bio_ret) { bio = *bio_ret; - if (bio->bi_sector + (bio->bi_size >> 9) != sector || + if (old_compressed) + contig = bio->bi_sector == sector; + else + contig = bio->bi_sector + (bio->bi_size >> 9) == + sector; + + if (prev_bio_flags != bio_flags || !contig || (tree->ops && tree->ops->merge_bio_hook && - tree->ops->merge_bio_hook(page, offset, size, bio)) || - bio_add_page(bio, page, size, offset) < size) { - ret = submit_one_bio(rw, bio, mirror_num); + tree->ops->merge_bio_hook(page, offset, page_size, bio, + bio_flags)) || + bio_add_page(bio, page, page_size, offset) < page_size) { + ret = submit_one_bio(rw, bio, mirror_num, + prev_bio_flags); bio = NULL; } else { return 0; } } - nr = bio_get_nr_vecs(bdev); + if (this_compressed) + nr = BIO_MAX_PAGES; + else + nr = bio_get_nr_vecs(bdev); + bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); if (!bio) { printk("failed to allocate bio nr %d\n", nr); } - - bio_add_page(bio, page, size, offset); + bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; if (bio_ret) { *bio_ret = bio; } else { - ret = submit_one_bio(rw, bio, mirror_num); + ret = submit_one_bio(rw, bio, mirror_num, bio_flags); } return ret; @@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len) static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, - struct bio **bio, int mirror_num) + struct bio **bio, int mirror_num, + unsigned long *bio_flags) { struct inode *inode = page->mapping->host; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; @@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree, int nr = 0; size_t page_offset = 0; size_t iosize; + size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; + unsigned 
long this_bio_flag = 0; set_page_extent_mapped(page); end = page_end; lock_extent(tree, start, end, GFP_NOFS); + if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + char *userpage; + size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + iosize = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } while (cur <= end) { if (cur >= last_byte) { char *userpage; @@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur); } BUG_ON(end < cur); + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = EXTENT_BIO_COMPRESSED; + iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); - sector = (em->block_start + extent_offset) >> 9; + if (this_bio_flag & EXTENT_BIO_COMPRESSED) { + disk_io_size = em->block_len; + sector = em->block_start >> 9; + } else { + sector = (em->block_start + extent_offset) >> 9; + disk_io_size = iosize; + } bdev = em->bdev; block_start = em->block_start; free_extent_map(em); @@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur); unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; pnr -= page->index; ret = submit_extent_page(READ, tree, page, - sector, iosize, page_offset, + sector, disk_io_size, page_offset, bdev, bio, pnr, - end_bio_extent_readpage, mirror_num); + end_bio_extent_readpage, mirror_num, + *bio_flags, + this_bio_flag); nr++; + *bio_flags = this_bio_flag; } if (ret) SetPageError(page); @@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent) { struct bio *bio = NULL; + unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, 0); + ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + &bio_flags); if 
(bio) - submit_one_bio(READ, bio, 0); + submit_one_bio(READ, bio, 0, bio_flags); return ret; } EXPORT_SYMBOL(extent_read_full_page); @@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; u64 nr_delalloc; u64 delalloc_end; + int page_started; + int compressed; WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); @@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_start = start; delalloc_end = 0; + page_started = 0; while(delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, &delalloc_end, 128 * 1024 * 1024); if (nr_delalloc == 0) { delalloc_start = delalloc_end + 1; continue; } - tree->ops->fill_delalloc(inode, delalloc_start, - delalloc_end); - clear_extent_bit(tree, delalloc_start, - delalloc_end, - EXTENT_LOCKED | EXTENT_DELALLOC, - 1, 0, GFP_NOFS); + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started); delalloc_start = delalloc_end + 1; } + + /* did the fill delalloc function already unlock and start the IO? 
*/ + if (page_started) { + return 0; + } + lock_extent(tree, start, page_end, GFP_NOFS); unlock_start = start; if (tree->ops && tree->ops->writepage_start_hook) { - ret = tree->ops->writepage_start_hook(page, start, page_end); + ret = tree->ops->writepage_start_hook(page, start, + page_end); if (ret == -EAGAIN) { unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); @@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; block_start = em->block_start; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); free_extent_map(em); em = NULL; - if (block_start == EXTENT_MAP_HOLE || + /* + * compressed and inline extents are written through other + * paths in the FS + */ + if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); @@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unlock_extent(tree, unlock_start, cur + iosize -1, GFP_NOFS); - if (tree->ops && tree->ops->writepage_end_io_hook) + /* + * end_io notification does not happen here for + * compressed extents + */ + if (!compressed && tree->ops && + tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, cur + iosize - 1, NULL, 1); - cur = cur + iosize; + else if (compressed) { + /* we don't want to end_page_writeback on + * a compressed extent. 
this happens + * elsewhere + */ + nr++; + } + + cur += iosize; pg_offset += iosize; unlock_start = cur; continue; } - /* leave this out until we have a page_mkwrite call */ if (0 && !test_range_bit(tree, cur, cur + iosize - 1, EXTENT_DIRTY, 0)) { @@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset += iosize; continue; } + clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, @@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ret = submit_extent_page(WRITE, tree, page, sector, iosize, pg_offset, bdev, &epd->bio, max_nr, - end_bio_extent_writepage, 0); + end_bio_extent_writepage, + 0, 0, 0); if (ret) SetPageError(page); } @@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, extent_write_cache_pages(tree, mapping, &wbc_writepages, __extent_writepage, &epd); if (epd.bio) { - submit_one_bio(WRITE, epd.bio, 0); + submit_one_bio(WRITE, epd.bio, 0, 0); } return ret; } @@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree, ret = extent_write_cache_pages(tree, mapping, wbc, __extent_writepage, &epd); if (epd.bio) { - submit_one_bio(WRITE, epd.bio, 0); + submit_one_bio(WRITE, epd.bio, 0, 0); } return ret; } @@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned page_idx; struct pagevec pvec; + unsigned long bio_flags = 0; pagevec_init(&pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { @@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree, if (!pagevec_add(&pvec, page)) __pagevec_lru_add(&pvec); __extent_read_full_page(tree, page, get_extent, - &bio, 0); + &bio, 0, &bio_flags); } page_cache_release(page); } @@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree, __pagevec_lru_add(&pvec); 
BUG_ON(!list_empty(pages)); if (bio) - submit_one_bio(READ, bio, 0); + submit_one_bio(READ, bio, 0, bio_flags); return 0; } EXPORT_SYMBOL(extent_readpages); @@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree, ret = submit_extent_page(READ, tree, page, sector, iosize, page_offset, em->bdev, NULL, 1, - end_bio_extent_preparewrite, 0); + end_bio_extent_preparewrite, 0, + 0, 0); iocount++; block_start = block_start + iosize; } else { @@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map, } if (!test_range_bit(tree, em->start, extent_map_end(em) - 1, - EXTENT_LOCKED, 0)) { + EXTENT_LOCKED | EXTENT_WRITEBACK | + EXTENT_ORDERED, + 0)) { remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em); @@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int inc_all_pages = 0; unsigned long num_pages; struct bio *bio = NULL; + unsigned long bio_flags = 0; if (eb->flags & EXTENT_UPTODATE) return 0; @@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, - mirror_num); + mirror_num, &bio_flags); if (err) { ret = err; printk("err %d from __extent_read_full_page\n", ret); @@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (bio) - submit_one_bio(READ, bio, mirror_num); + submit_one_bio(READ, bio, mirror_num, bio_flags); if (ret || !wait) { if (ret) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c9d1908a1ae3..86f859b87a6e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -18,6 +18,9 @@ #define EXTENT_BOUNDARY (1 << 11) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +/* flags for bio submission */ +#define EXTENT_BIO_COMPRESSED 1 + /* * page->private values. Every page that is controlled by the extent * map has page->private set to one. 
@@ -28,14 +31,17 @@ struct extent_state; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, - struct bio *bio, int mirror_num); + struct bio *bio, int mirror_num, + unsigned long bio_flags); struct extent_io_ops { - int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; int (*merge_bio_hook)(struct page *page, unsigned long offset, - size_t size, struct bio *bio); + size_t size, struct bio *bio, + unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, @@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); int release_extent_buffer_tail_pages(struct extent_buffer *eb); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + int clear_dirty, int set_writeback, + int clear_writeback); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 74b2a29880d3..fd3ebfb8c3c5 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) return 0; + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree, if (rb && mergable_maps(merge, em)) { 
em->start = merge->start; em->len += merge->len; + em->block_len += merge->block_len; em->block_start = merge->block_start; merge->in_tree = 0; rb_erase(&merge->rb_node, &tree->map); @@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree, merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(em, merge)) { em->len += merge->len; + em->block_len += merge->len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; free_extent_map(merge); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 26ac6fe0b268..abbcbeb28c79 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -10,6 +10,7 @@ /* bits for the flags field */ #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ +#define EXTENT_FLAG_COMPRESSED 1 struct extent_map { struct rb_node rb_node; @@ -18,6 +19,7 @@ struct extent_map { u64 start; u64 len; u64 block_start; + u64 block_len; unsigned long flags; struct block_device *bdev; atomic_t refs; @@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em) static inline u64 extent_map_block_end(struct extent_map *em) { - if (em->block_start + em->len < em->block_start) + if (em->block_start + em->block_len < em->block_start) return (u64)-1; - return em->block_start + em->len; + return em->block_start + em->block_len; } void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6dbe88b9d7d4..f4d3fa71bc41 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes, u64 offset) + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding) { int ret = 0; struct btrfs_file_extent_item *item; @@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 
btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); btrfs_set_file_extent_offset(leaf, item, offset); btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); btrfs_set_file_extent_generation(leaf, item, trans->transid); btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, compression); + btrfs_set_file_extent_encryption(leaf, item, encryption); + btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); @@ -213,6 +219,73 @@ found: return 0; } +int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, + u64 start, unsigned long len) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + char *data; + struct page *page; + unsigned long total_bytes = 0; + unsigned long this_sum_bytes = 0; + + sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS); + if (!sums) + return -ENOMEM; + + sector_sum = sums->sums; + sums->file_offset = start; + sums->len = len; + INIT_LIST_HEAD(&sums->list); + ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset); + BUG_ON(!ordered); + + while(len > 0) { + if (start >= ordered->file_offset + ordered->len || + start < ordered->file_offset) { + sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + sums = kzalloc(btrfs_ordered_sum_size(root, len), + GFP_NOFS); + BUG_ON(!sums); + sector_sum = sums->sums; + sums->len = len; + sums->file_offset = start; + ordered = btrfs_lookup_ordered_extent(inode, + sums->file_offset); + BUG_ON(!ordered); + } + + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + + data = kmap_atomic(page, KM_USER0); + sector_sum->sum = ~(u32)0; + sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum, + PAGE_CACHE_SIZE); + 
kunmap_atomic(data, KM_USER0); + btrfs_csum_final(sector_sum->sum, + (char *)&sector_sum->sum); + sector_sum->offset = page_offset(page); + page_cache_release(page); + + sector_sum++; + total_bytes += PAGE_CACHE_SIZE; + this_sum_bytes += PAGE_CACHE_SIZE; + start += PAGE_CACHE_SIZE; + + WARN_ON(len < PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + } + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 69abbe19add2..0aa15436590e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages) } } -/* this does all the hard work for inserting an inline extent into - * the btree. Any existing inline extent is extended as required to make room, - * otherwise things are inserted as required into the btree - */ -static int noinline insert_inline_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - u64 offset, size_t size, - struct page **pages, size_t page_offset, - int num_pages) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct extent_buffer *leaf; - char *kaddr; - unsigned long ptr; - struct btrfs_file_extent_item *ei; - struct page *page; - u32 datasize; - int err = 0; - int ret; - int i; - ssize_t cur_size; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - btrfs_set_trans_block_group(trans, inode); - - key.objectid = inode->i_ino; - key.offset = offset; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (ret == 1) { - struct btrfs_key found_key; - - if (path->slots[0] == 0) - goto insert; - - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != 
inode->i_ino) - goto insert; - - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - goto insert; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, ei) != - BTRFS_FILE_EXTENT_INLINE) { - goto insert; - } - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - ret = 0; - } - if (ret == 0) { - u32 found_size; - u64 found_end; - - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, ei) != - BTRFS_FILE_EXTENT_INLINE) { - err = ret; - btrfs_print_leaf(root, leaf); - printk("found wasn't inline offset %Lu inode %lu\n", - offset, inode->i_ino); - goto fail; - } - found_size = btrfs_file_extent_inline_len(leaf, - btrfs_item_nr(leaf, path->slots[0])); - found_end = key.offset + found_size; - - if (found_end < offset + size) { - btrfs_release_path(root, path); - ret = btrfs_search_slot(trans, root, &key, path, - offset + size - found_end, 1); - BUG_ON(ret != 0); - - ret = btrfs_extend_item(trans, root, path, - offset + size - found_end); - if (ret) { - err = ret; - goto fail; - } - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - inode_add_bytes(inode, offset + size - found_end); - } - if (found_end < offset) { - ptr = btrfs_file_extent_inline_start(ei) + found_size; - memset_extent_buffer(leaf, 0, ptr, offset - found_end); - } - } else { -insert: - btrfs_release_path(root, path); - datasize = offset + size - key.offset; - inode_add_bytes(inode, datasize); - datasize = btrfs_file_extent_calc_inline_size(datasize); - ret = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (ret) { - err = ret; - printk("got bad ret %d\n", ret); - goto fail; - } - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, ei, trans->transid); - btrfs_set_file_extent_type(leaf, ei, 
BTRFS_FILE_EXTENT_INLINE); - } - ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset; - - cur_size = size; - i = 0; - while (size > 0) { - page = pages[i]; - kaddr = kmap_atomic(page, KM_USER0); - cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size); - write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size); - kunmap_atomic(kaddr, KM_USER0); - page_offset = 0; - ptr += cur_size; - size -= cur_size; - if (i >= num_pages) { - printk("i %d num_pages %d\n", i, num_pages); - } - i++; - } - btrfs_mark_buffer_dirty(leaf); -fail: - btrfs_free_path(path); - return err; -} - /* * after copy_from_user, pages need to be dirtied and we need to make * sure holes are created between the current EOF and the start of @@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, u64 start_pos; u64 end_of_last_block; u64 end_pos = pos + write_bytes; - u64 inline_size; - int did_inline = 0; loff_t isize = i_size_read(inode); start_pos = pos & ~((u64)root->sectorsize - 1); @@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, err = btrfs_insert_file_extent(trans, root, inode->i_ino, last_pos_in_file, - 0, 0, hole_size, 0); + 0, 0, hole_size, 0, + hole_size, 0, 0, 0); btrfs_drop_extent_cache(inode, last_pos_in_file, last_pos_in_file + hole_size - 1, 0); mutex_unlock(&BTRFS_I(inode)->extent_mutex); @@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, goto failed; } - /* - * either allocate an extent for the new bytes or setup the key - * to show we are doing inline data in the extent + /* check for reserved extents on each page, we don't want + * to reset the delalloc bit on things that already have + * extents reserved. 
*/ - inline_size = end_pos; - if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) || - inline_size > root->fs_info->max_inline || - (inline_size & (root->sectorsize -1)) == 0 || - inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) { - /* check for reserved extents on each page, we don't want - * to reset the delalloc bit on things that already have - * extents reserved. - */ - btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); - for (i = 0; i < num_pages; i++) { - struct page *p = pages[i]; - SetPageUptodate(p); - ClearPageChecked(p); - set_page_dirty(p); - } - } else { - u64 aligned_end; - /* step one, delete the existing extents in this range */ - aligned_end = (pos + write_bytes + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - mutex_lock(&BTRFS_I(inode)->extent_mutex); - err = btrfs_drop_extents(trans, root, inode, start_pos, - aligned_end, aligned_end, &hint_byte); - if (err) - goto failed; - if (isize > inline_size) - inline_size = min_t(u64, isize, aligned_end); - inline_size -= start_pos; - err = insert_inline_extent(trans, root, inode, start_pos, - inline_size, pages, 0, num_pages); - btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0); - BUG_ON(err); - mutex_unlock(&BTRFS_I(inode)->extent_mutex); - - /* - * an ugly way to do all the prop accounting around - * the page bits and mapping tags - */ - set_page_writeback(pages[0]); - end_page_writeback(pages[0]); - did_inline = 1; + btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + ClearPageChecked(p); + set_page_dirty(p); } if (end_pos > isize) { i_size_write(inode, end_pos); - if (did_inline) - BTRFS_I(inode)->disk_i_size = end_pos; btrfs_update_inode(trans, root, inode); } failed: @@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int ret; int testend = 1; unsigned long flags; + int compressed = 0; WARN_ON(end < start); if (end == (u64)-1) { @@ 
-434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(em); continue; } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); remove_extent_mapping(em_tree, em); @@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->start = em->start; split->len = start - em->start; split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->bdev = em->bdev; split->flags = flags; ret = add_extent_mapping(em_tree, split); @@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; - split->block_start = em->block_start + diff; + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + diff; + } ret = add_extent_mapping(em_tree, split); BUG_ON(ret); @@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode) struct btrfs_item *item; item = btrfs_item_nr(leaf, slot); extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, item); + btrfs_file_extent_inline_len(leaf, extent); extent_end = (extent_end + root->sectorsize - 1) & ~((u64)root->sectorsize -1 ); } @@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, u64 extent_end = 0; u64 search_start = start; u64 leaf_start; + u64 ram_bytes = 0; + u8 compression = 0; + u8 encryption = 0; + u16 other_encoding = 0; u64 root_gen; u64 root_owner; struct extent_buffer *leaf; @@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, int recow; int ret; + inline_limit = 0; btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); @@ -637,6 +470,12 @@ next_slot: extent = btrfs_item_ptr(leaf, slot, struct 
btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, extent); + compression = btrfs_file_extent_compression(leaf, + extent); + encryption = btrfs_file_extent_encryption(leaf, + extent); + other_encoding = btrfs_file_extent_other_encoding(leaf, + extent); if (found_type == BTRFS_FILE_EXTENT_REG) { extent_end = btrfs_file_extent_disk_bytenr(leaf, @@ -646,13 +485,13 @@ next_slot: extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, extent); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, + extent); found_extent = 1; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item; - item = btrfs_item_nr(leaf, slot); found_inline = 1; extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, item); + btrfs_file_extent_inline_len(leaf, extent); } } else { extent_end = search_start; @@ -680,10 +519,9 @@ next_slot: search_start = (extent_end + mask) & ~mask; } else search_start = extent_end; - if (end <= extent_end && start >= key.offset && found_inline) { + + if (end <= extent_end && start >= key.offset && found_inline) *hint_byte = EXTENT_MAP_INLINE; - goto out; - } if (found_extent) { read_extent_buffer(leaf, &old, (unsigned long)extent, @@ -770,12 +608,27 @@ next_slot: write_extent_buffer(leaf, &old, (unsigned long)extent, sizeof(old)); + btrfs_set_file_extent_compression(leaf, extent, + compression); + btrfs_set_file_extent_encryption(leaf, extent, + encryption); + btrfs_set_file_extent_other_encoding(leaf, extent, + other_encoding); btrfs_set_file_extent_offset(leaf, extent, le64_to_cpu(old.offset) + end - key.offset); WARN_ON(le64_to_cpu(old.num_bytes) < (extent_end - end)); btrfs_set_file_extent_num_bytes(leaf, extent, extent_end - end); + + /* + * set the ram bytes to the size of the full extent + * before splitting. 
This is a worst case flag, + * but its the best we can do because we don't know + * how splitting affects compression + */ + btrfs_set_file_extent_ram_bytes(leaf, extent, + ram_bytes); btrfs_set_file_extent_type(leaf, extent, BTRFS_FILE_EXTENT_REG); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf4bed6ca4d6..9797592dc86b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -49,6 +49,7 @@ #include "compat.h" #include "tree-log.h" #include "ref-cache.h" +#include "compression.h" struct btrfs_iget_args { u64 ino; @@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { }; static void btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); /* * a very lame attempt at stopping writes when the FS is 85% full. There @@ -113,58 +115,375 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, return ret; } +/* + * this does all the hard work for inserting an inline extent into + * the btree. 
The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static int noinline insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, size_t size, size_t compressed_size, + struct page **compressed_pages) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int err = 0; + int ret; + size_t cur_size = size; + size_t datasize; + unsigned long offset; + int use_compress = 0; + + if (compressed_size && compressed_pages) { + use_compress = 1; + cur_size = compressed_size; + } + + path = btrfs_alloc_path(); if (!path) + return -ENOMEM; + + btrfs_set_trans_block_group(trans, inode); + + key.objectid = inode->i_ino; + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + inode_add_bytes(inode, size); + datasize = btrfs_file_extent_calc_inline_size(cur_size); + + inode_add_bytes(inode, size); + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(ret); + if (ret) { + err = ret; + printk("got bad ret %d\n", ret); + goto fail; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (use_compress) { + struct page *cpage; + int i = 0; + while(compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min(compressed_size, + PAGE_CACHE_SIZE); + + kaddr = kmap(cpage); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap(cpage); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + 
btrfs_set_file_extent_compression(leaf, ei, + BTRFS_COMPRESS_ZLIB); + } else { + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_atomic(page, KM_USER0); + offset = start & (PAGE_CACHE_SIZE - 1); + write_extent_buffer(leaf, kaddr + offset, ptr, size); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + BTRFS_I(inode)->disk_i_size = inode->i_size; + btrfs_update_inode(trans, root, inode); + return 0; +fail: + btrfs_free_path(path); + return err; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. + */ +static int cow_file_range_inline(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end, + size_t compressed_size, + struct page **compressed_pages) +{ + u64 isize = i_size_read(inode); + u64 actual_end = min(end + 1, isize); + u64 inline_len = actual_end - start; + u64 aligned_end = (end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + u64 hint_byte; + u64 data_len = inline_len; + int ret; + + if (compressed_size) + data_len = compressed_size; + + if (start > 0 || + data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + (!compressed_size && + (actual_end & (root->sectorsize - 1)) == 0) || + end + 1 < isize || + data_len > root->fs_info->max_inline) { + return 1; + } + + mutex_lock(&BTRFS_I(inode)->extent_mutex); + ret = btrfs_drop_extents(trans, root, inode, start, + aligned_end, aligned_end, &hint_byte); + BUG_ON(ret); + + if (isize > actual_end) + inline_len = min_t(u64, isize, actual_end); + ret = insert_inline_extent(trans, root, inode, start, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, start, aligned_end, 0); + mutex_unlock(&BTRFS_I(inode)->extent_mutex); + return 0; +} + /* * 
when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. */ -static int cow_file_range(struct inode *inode, u64 start, u64 end) +static int cow_file_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 alloc_hint = 0; u64 num_bytes; + unsigned long ram_size; + u64 orig_start; + u64 disk_num_bytes; u64 cur_alloc_size; u64 blocksize = root->sectorsize; - u64 orig_num_bytes; + u64 actual_end; struct btrfs_key ins; struct extent_map *em; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; + struct page **pages = NULL; + unsigned long nr_pages; + unsigned long nr_pages_ret = 0; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned long max_compressed = 128 * 1024; + unsigned long max_uncompressed = 256 * 1024; + int i; + int will_compress; trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + orig_start = start; + + /* + * compression made this loop a bit ugly, but the basic idea is to + * compress some pages but keep the total size of the compressed + * extent relatively small. If compression is off, this goto target + * is never used. 
+ */ +again: + will_compress = 0; + nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; + nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + actual_end = min_t(u64, i_size_read(inode), end + 1); + total_compressed = actual_end - start; + + /* we want to make sure that amount of ram required to uncompress + * an extent is reasonable, so we limit the total size in ram + * of a compressed extent to 256k + */ + total_compressed = min(total_compressed, max_uncompressed); num_bytes = (end - start + blocksize) & ~(blocksize - 1); num_bytes = max(blocksize, num_bytes); - orig_num_bytes = num_bytes; + disk_num_bytes = num_bytes; + total_in = 0; + ret = 0; - if (alloc_hint == EXTENT_MAP_INLINE) - goto out; + /* we do compression for mount -o compress and when the + * inode has not been flagged as nocompress + */ + if (!btrfs_test_flag(inode, NOCOMPRESS) && + btrfs_test_opt(root, COMPRESS)) { + WARN_ON(pages); + pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + + /* we want to make sure the amount of IO required to satisfy + * a random read is reasonably small, so we limit the size + * of a compressed extent to 128k + */ + ret = btrfs_zlib_compress_pages(inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); + + if (!ret) { + unsigned long offset = total_compressed & + (PAGE_CACHE_SIZE - 1); + struct page *page = pages[nr_pages_ret - 1]; + char *kaddr; + + /* zero the tail end of the last page, we might be + * sending it down to disk + */ + if (offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, + PAGE_CACHE_SIZE - offset); + kunmap_atomic(kaddr, KM_USER0); + } + will_compress = 1; + } + } + if (start == 0) { + /* lets try to make an inline extent */ + if (ret || total_in < (end - start + 1)) { + /* we didn't compress the entire range, try + * to make an uncompressed inline extent. 
This + * is almost sure to fail, but maybe inline sizes + * will get bigger later + */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + } else { + ret = cow_file_range_inline(trans, root, inode, + start, end, + total_compressed, pages); + } + if (ret == 0) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + 1, 1, 1); + *page_started = 1; + ret = 0; + goto free_pages_out; + } + } + + if (will_compress) { + /* + * we aren't doing an inline extent round the compressed size + * up to a block size boundary so the allocator does sane + * things + */ + total_compressed = (total_compressed + blocksize - 1) & + ~(blocksize - 1); + + /* + * one last check to make sure the compression is really a + * win, compare the page count read with the blocks on disk + */ + total_in = (total_in + PAGE_CACHE_SIZE - 1) & + ~(PAGE_CACHE_SIZE - 1); + if (total_compressed >= total_in) { + will_compress = 0; + } else { + disk_num_bytes = total_compressed; + num_bytes = total_in; + } + } + if (!will_compress && pages) { + /* + * the compression code ran but failed to make things smaller, + * free any pages it allocated and our page pointer array + */ + for (i = 0; i < nr_pages_ret; i++) { + page_cache_release(pages[i]); + } + kfree(pages); + pages = NULL; + total_compressed = 0; + nr_pages_ret = 0; + + /* flag the file so we don't compress in the future */ + btrfs_set_flag(inode, NOCOMPRESS); + } + + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); - BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); mutex_lock(&BTRFS_I(inode)->extent_mutex); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); mutex_unlock(&BTRFS_I(inode)->extent_mutex); - while(num_bytes > 0) { - cur_alloc_size = min(num_bytes, root->fs_info->max_extent); + while(disk_num_bytes > 0) { + unsigned long min_bytes; + + /* + * the max size of a compressed extent is pretty small, + * make the code a 
little less complex by forcing + * the allocator to find a whole compressed extent at once + */ + if (will_compress) + min_bytes = disk_num_bytes; + else + min_bytes = root->sectorsize; + + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, cur_alloc_size, - root->sectorsize, 0, alloc_hint, + min_bytes, 0, alloc_hint, (u64)-1, &ins, 1); if (ret) { WARN_ON(1); - goto out; + goto free_pages_out_fail; } em = alloc_extent_map(GFP_NOFS); em->start = start; - em->len = ins.offset; + + if (will_compress) { + ram_size = num_bytes; + em->len = num_bytes; + } else { + /* ramsize == disk size */ + ram_size = ins.offset; + em->len = ins.offset; + } + em->block_start = ins.objectid; + em->block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; + mutex_lock(&BTRFS_I(inode)->extent_mutex); set_bit(EXTENT_FLAG_PINNED, &em->flags); + + if (will_compress) + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + while(1) { spin_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); @@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end) break; } btrfs_drop_extent_cache(inode, start, - start + ins.offset - 1, 0); + start + ram_size - 1, 0); } mutex_unlock(&BTRFS_I(inode)->extent_mutex); cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ins.offset, 0); + ram_size, cur_alloc_size, 0, + will_compress); BUG_ON(ret); - if (num_bytes < cur_alloc_size) { - printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes, + + if (disk_num_bytes < cur_alloc_size) { + printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes, cur_alloc_size); break; } + + if (will_compress) { + /* + * we're doing compression, we and we need to + * submit the compressed extents down to the device. + * + * We lock down all the file pages, clearing their + * dirty bits and setting them writeback. Everyone + * that wants to modify the page will wait on the + * ordered extent above. 
+ * + * The writeback bits on the file pages are + * cleared when the compressed pages are on disk + */ + btrfs_end_transaction(trans, root); + + if (start <= page_offset(locked_page) && + page_offset(locked_page) < start + ram_size) { + *page_started = 1; + } + + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, + start + ram_size - 1, + NULL, 1, 1, 0); + + ret = btrfs_submit_compressed_write(inode, start, + ram_size, ins.objectid, + cur_alloc_size, pages, + nr_pages_ret); + + BUG_ON(ret); + trans = btrfs_join_transaction(root, 1); + if (start + ram_size < end) { + start += ram_size; + alloc_hint = ins.objectid + ins.offset; + /* pages will be freed at end_bio time */ + pages = NULL; + goto again; + } else { + /* we've written everything, time to go */ + break; + } + } + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits + */ + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, + locked_page, 0, 0, 0); + disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } + + ret = 0; out: btrfs_end_transaction(trans, root); + return ret; + +free_pages_out_fail: + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, end, locked_page, 0, 0, 0); +free_pages_out: + for (i = 0; i < nr_pages_ret; i++) + page_cache_release(pages[i]); + if (pages) + kfree(pages); + + goto out; } /* @@ -203,7 +591,8 @@ out: * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) +static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started) { u64 extent_start; u64 extent_end; @@ -260,6 +649,11 @@ again: extent_end = extent_start + extent_num_bytes; err = 0; + if 
(btrfs_file_extent_compression(leaf, item) || + btrfs_file_extent_encryption(leaf,item) || + btrfs_file_extent_other_encoding(leaf, item)) + goto not_found; + if (loops && start != extent_start) goto not_found; @@ -284,7 +678,8 @@ again: bytenr += btrfs_file_extent_offset(leaf, item); extent_num_bytes = min(end + 1, extent_end) - start; ret = btrfs_add_ordered_extent(inode, start, bytenr, - extent_num_bytes, 1); + extent_num_bytes, + extent_num_bytes, 1, 0); if (ret) { err = ret; goto out; @@ -300,7 +695,8 @@ again: not_found: btrfs_end_transaction(trans, root); btrfs_free_path(path); - return cow_file_range(inode, start, end); + return cow_file_range(inode, locked_page, start, end, + page_started); } out: WARN_ON(err); @@ -312,16 +708,19 @@ out: /* * extent_io.c call back to do delayed allocation processing */ -static int run_delalloc_range(struct inode *inode, u64 start, u64 end) +static int run_delalloc_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; if (btrfs_test_opt(root, NODATACOW) || btrfs_test_flag(inode, NODATACOW)) - ret = run_delalloc_nocow(inode, start, end); + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started); else - ret = cow_file_range(inode, start, end); + ret = cow_file_range(inode, locked_page, start, end, + page_started); return ret; } @@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, * we don't create bios that span stripes or chunks */ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio) + size_t size, struct bio *bio, + unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; struct btrfs_mapping_tree *map_tree; @@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * are inserted into the btree */ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - 
int mirror_num) + int mirror_num, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, * or reading the csums from the tree before a read */ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num) + int mirror_num, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (!(rw & (1 << BIO_RW))) { btrfs_lookup_bio_sums(root, inode, bio); + + if (bio_flags & EXTENT_BIO_COMPRESSED) { + return btrfs_submit_compressed_read(inode, bio, + mirror_num, bio_flags); + } + goto mapit; } return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, - __btrfs_submit_bio_hook); + bio_flags, __btrfs_submit_bio_hook); mapit: return btrfs_map_bio(root, rw, bio, mirror_num, 0); } @@ -539,7 +945,7 @@ out_page: * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * - * In our case any range that doesn't have the EXTENT_ORDERED bit set + * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. 
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) btrfs_set_file_extent_disk_bytenr(leaf, extent_item, ordered_extent->start); btrfs_set_file_extent_disk_num_bytes(leaf, extent_item, - ordered_extent->len); + ordered_extent->disk_len); btrfs_set_file_extent_offset(leaf, extent_item, 0); + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + btrfs_set_file_extent_compression(leaf, extent_item, 1); + else + btrfs_set_file_extent_compression(leaf, extent_item, 0); + btrfs_set_file_extent_encryption(leaf, extent_item, 0); + btrfs_set_file_extent_other_encoding(leaf, extent_item, 0); + + /* ram bytes = extent_num_bytes for now */ btrfs_set_file_extent_num_bytes(leaf, extent_item, ordered_extent->len); + btrfs_set_file_extent_ram_bytes(leaf, extent_item, + ordered_extent->len); btrfs_mark_buffer_dirty(leaf); btrfs_drop_extent_cache(inode, ordered_extent->file_offset, @@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) mutex_unlock(&BTRFS_I(inode)->extent_mutex); ins.objectid = ordered_extent->start; - ins.offset = ordered_extent->len; + ins.offset = ordered_extent->disk_len; ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, root->root_key.objectid, @@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio, int ret; int rw; u64 logical; + unsigned long bio_flags = 0; ret = get_state_private(failure_tree, start, &private); if (ret) { @@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio, } logical = start - em->start; logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + bio_flags = EXTENT_BIO_COMPRESSED; failrec->logical = logical; free_extent_map(em); set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | @@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio, rw = READ; BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, - 
failrec->last_mirror); + failrec->last_mirror, + bio_flags); return 0; } @@ -1644,10 +2065,8 @@ search_again: item_end += btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item = btrfs_item_nr(leaf, - path->slots[0]); item_end += btrfs_file_extent_inline_len(leaf, - item); + fi); } item_end--; } @@ -1715,7 +2134,14 @@ search_again: root_owner = btrfs_header_owner(leaf); } } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - if (!del_item) { + /* + * we can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { u32 size = new_size - found_key.offset; if (root->ref_cows) { @@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) err = btrfs_insert_file_extent(trans, root, inode->i_ino, hole_start, 0, 0, - hole_size, 0); + hole_size, 0, hole_size, + 0, 0, 0); btrfs_drop_extent_cache(inode, hole_start, (u64)-1, 0); btrfs_check_file(root, inode); @@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, start_diff = map_start - em->start; em->start = map_start; em->len = map_len; - if (em->block_start < EXTENT_MAP_LAST_BYTE) + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { em->block_start += start_diff; + em->block_len -= start_diff; + } return add_extent_mapping(em_tree, em); } +static noinline int uncompress_inline(struct btrfs_path *path, + struct inode *inode, struct page *page, + size_t pg_offset, u64 extent_offset, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + + WARN_ON(pg_offset != 0); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = 
btrfs_file_extent_inline_item_len(leaf, + btrfs_item_nr(leaf, path->slots[0])); + tmp = kmalloc(inline_size, GFP_NOFS); + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min(PAGE_CACHE_SIZE, max_size); + ret = btrfs_zlib_decompress(tmp, page, extent_offset, + inline_size, max_size); + if (ret) { + char *kaddr = kmap_atomic(page, KM_USER0); + unsigned long copy_size = min_t(u64, + PAGE_CACHE_SIZE - pg_offset, + max_size - extent_offset); + memset(kaddr + pg_offset, 0, copy_size); + kunmap_atomic(kaddr, KM_USER0); + } + kfree(tmp); + return 0; +} + /* * a bit scary, this does extent mapping from logical file offset to the disk. * the ugly parts come from merging extents from the disk with the @@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; + int compressed; again: spin_lock(&em_tree->lock); @@ -2951,6 +3418,7 @@ again: em->bdev = root->fs_info->fs_devices->latest_bdev; em->start = EXTENT_MAP_HOLE; em->len = (u64)-1; + em->block_len = (u64)-1; if (!path) { path = btrfs_alloc_path(); @@ -2983,6 +3451,7 @@ again: found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; + compressed = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); @@ -3005,10 +3474,18 @@ again: em->block_start = EXTENT_MAP_HOLE; goto insert; } - bytenr += btrfs_file_extent_offset(leaf, item); - em->block_start = bytenr; em->start = extent_start; em->len = extent_end - extent_start; + if (compressed) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->block_start = bytenr; + em->block_len = btrfs_file_extent_disk_num_bytes(leaf, + item); + } else { + bytenr += btrfs_file_extent_offset(leaf, item); + 
em->block_start = bytenr; + em->block_len = em->len; + } goto insert; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { u64 page_start; @@ -3018,8 +3495,7 @@ again: size_t extent_offset; size_t copy_size; - size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf, - path->slots[0])); + size = btrfs_file_extent_inline_len(leaf, item); extent_end = (extent_start + size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); if (start < extent_start || start >= extent_end) { @@ -3035,9 +3511,10 @@ again: } em->block_start = EXTENT_MAP_INLINE; - if (!page) { + if (!page || create) { em->start = extent_start; - em->len = size; + em->len = (size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); goto out; } @@ -3048,11 +3525,22 @@ again: em->start = extent_start + extent_offset; em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - map = kmap(page); + if (compressed) + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { - read_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); + if (btrfs_file_extent_compression(leaf, item) == + BTRFS_COMPRESS_ZLIB) { + ret = uncompress_inline(path, inode, page, + pg_offset, + extent_offset, item); + BUG_ON(ret); + } else { + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + } flush_dcache_page(page); } else if (create && PageUptodate(page)) { if (!trans) { @@ -3063,11 +3551,12 @@ again: trans = btrfs_join_transaction(root, 1); goto again; } + map = kmap(page); write_extent_buffer(leaf, map + pg_offset, ptr, copy_size); + kunmap(page); btrfs_mark_buffer_dirty(leaf); } - kunmap(page); set_extent_uptodate(io_tree, em->start, extent_map_end(em) - 1, GFP_NOFS); goto insert; @@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_set_file_extent_generation(leaf, ei, trans->transid); btrfs_set_file_extent_type(leaf, ei, 
BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); btrfs_mark_buffer_dirty(leaf); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2eb6caba57c2..b5745bb96d40 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * inserted. */ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, int nocow) + u64 start, u64 len, u64 disk_len, int nocow, + int compressed) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; + entry->disk_len = disk_len; entry->inode = inode; if (nocow) set_bit(BTRFS_ORDERED_NOCOW, &entry->flags); + if (compressed) + set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags); /* one ref for the tree */ atomic_set(&entry->refs, 1); @@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode, * for pdflush to find them */ btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE); - if (wait) + if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); + } } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f50f8870a144..1ef464145d22 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -66,6 +66,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -73,9 +75,12 @@ 
struct btrfs_ordered_extent { /* disk byte number */ u64 start; - /* length of the extent in bytes */ + /* ram length of the extent in bytes */ u64 len; + /* extent length on disk */ + u64 disk_len; + /* flags (described above) */ unsigned long flags; @@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode, int btrfs_dec_test_ordered_pending(struct inode *inode, u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, int nocow); + u64 start, u64 len, u64 disk_len, int nocow, + int compressed); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index bd9ab3e9a7f2..64725c13aa11 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) if (btrfs_file_extent_type(l, fi) == BTRFS_FILE_EXTENT_INLINE) { printk("\t\tinline extent data size %u\n", - btrfs_file_extent_inline_len(l, item)); + btrfs_file_extent_inline_len(l, fi)); break; } printk("\t\textent data disk bytenr %llu nr %llu\n", (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi), (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi)); - printk("\t\textent data offset %llu nr %llu\n", + printk("\t\textent data offset %llu nr %llu ram %llu\n", (unsigned long long)btrfs_file_extent_offset(l, fi), - (unsigned long long)btrfs_file_extent_num_bytes(l, fi)); + (unsigned long long)btrfs_file_extent_num_bytes(l, fi), + (unsigned long long)btrfs_file_extent_ram_bytes(l, fi)); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2e6039825b7b..431fdf144b58 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -47,6 +47,7 @@ #include "volumes.h" #include "version.h" #include "export.h" +#include "compression.h" #define BTRFS_SUPER_MAGIC 0x9123683E @@ 
-69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_err, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, }; static match_table_t tokens = { @@ -83,6 +84,7 @@ static match_table_t tokens = { {Opt_max_inline, "max_inline=%s"}, {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, + {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, {Opt_noacl, "noacl"}, {Opt_err, NULL}, @@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; + case Opt_compress: + printk(KERN_INFO "btrfs: use compression\n"); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); @@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void) err = btrfs_interface_init(); if (err) goto free_extent_map; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; @@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void) unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); + btrfs_zlib_exit(); } module_init(init_btrfs_fs) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cf618cc8b34a..e6d579053a47 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (found_type == BTRFS_FILE_EXTENT_REG) extent_end = start + btrfs_file_extent_num_bytes(eb, item); else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, - btrfs_item_nr(eb, slot)); + size = btrfs_file_extent_inline_len(eb, item); extent_end = (start + size + mask) & ~mask; } else { ret = 0; diff --git a/fs/btrfs/volumes.c 
b/fs/btrfs/volumes.c index 2eed7f91f51a..7db4cfd03a98 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1816,6 +1816,7 @@ again: em->start = key.offset; em->len = *num_bytes; em->block_start = 0; + em->block_len = em->len; if (type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_add_system_chunk(trans, chunk_root, &key, @@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em->start = logical; em->len = length; em->block_start = 0; + em->block_len = em->len; map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 000000000000..e99309180a11 --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,637 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. + * Created by David Woodhouse + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Plan: call deflate() with avail_in == *sourcelen, + avail_out = *dstlen - 12 and flush == Z_FINISH. + If it doesn't manage to finish, call it again with + avail_in == 0 and avail_out set to the remaining 12 + bytes for it to clean up. + Q: Is 12 bytes sufficient? 
+*/ +#define STREAM_END_SPACE 12 + +struct workspace { + z_stream inf_strm; + z_stream def_strm; + char *buf; + struct list_head list; +}; + +static LIST_HEAD(idle_workspace); +static DEFINE_SPINLOCK(workspace_lock); +static unsigned long num_workspace; +static atomic_t alloc_workspace = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); + +/* + * this finds an available zlib workspace or allocates a new one + * NULL or an ERR_PTR is returned if things go bad. + */ +static struct workspace *find_zlib_workspace(void) +{ + struct workspace *workspace; + int ret; + int cpus = num_online_cpus(); + +again: + spin_lock(&workspace_lock); + if (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + num_workspace--; + spin_unlock(&workspace_lock); + return workspace; + + } + spin_unlock(&workspace_lock); + if (atomic_read(&alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&alloc_workspace) > cpus) + schedule(); + finish_wait(&workspace_wait, &wait); + goto again; + } + atomic_inc(&alloc_workspace); + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) { + ret = -ENOMEM; + goto fail; + } + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); + if (!workspace->def_strm.workspace) { + ret = -ENOMEM; + goto fail; + } + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!workspace->inf_strm.workspace) { + ret = -ENOMEM; + goto fail_inflate; + } + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->buf) { + ret = -ENOMEM; + goto fail_kmalloc; + } + return workspace; + +fail_kmalloc: + vfree(workspace->inf_strm.workspace); +fail_inflate: + vfree(workspace->def_strm.workspace); +fail: + kfree(workspace); + atomic_dec(&alloc_workspace); + wake_up(&workspace_wait); + return ERR_PTR(ret); +} + +/* + * put a workspace struct back 
on the list or free it if we have enough + * idle ones sitting around + */ +static int free_workspace(struct workspace *workspace) +{ + spin_lock(&workspace_lock); + if (num_workspace < num_online_cpus()) { + list_add_tail(&workspace->list, &idle_workspace); + num_workspace++; + spin_unlock(&workspace_lock); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; + } + spin_unlock(&workspace_lock); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + + atomic_dec(&alloc_workspace); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct workspace *workspace; + while(!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + atomic_dec(&alloc_workspace); + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. 
+ * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + int ret; + struct workspace *workspace; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + int out_written = 0; + int in_read = 0; + unsigned long bytes_left; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + workspace = find_zlib_workspace(); + if (!workspace) + return -1; + + if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + ret = -1; + goto out; + } + + workspace->def_strm.total_in = 0; + workspace->def_strm.total_out = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->def_strm.next_in = data_in; + workspace->def_strm.next_out = cpage_out; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + + out_written = 0; + in_read = 0; + + while (workspace->def_strm.total_in < len) { + ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->def_strm); + ret = -1; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->def_strm.total_in > 8192 && + workspace->def_strm.total_in < + workspace->def_strm.total_out) { + ret = -1; + goto out; + } + /* we need another page for writing out. 
Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->def_strm.avail_out == 0) { + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->def_strm.total_in >= len) + break; + + /* we've read in a full page, get a new one */ + if (workspace->def_strm.avail_in == 0) { + if (workspace->def_strm.total_out > max_out) + break; + + bytes_left = len - workspace->def_strm.total_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, + start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + workspace->def_strm.avail_in = min(bytes_left, + PAGE_CACHE_SIZE); + workspace->def_strm.next_in = data_in; + } + } + workspace->def_strm.avail_in = 0; + ret = zlib_deflate(&workspace->def_strm, Z_FINISH); + zlib_deflateEnd(&workspace->def_strm); + + if (ret != Z_STREAM_END) { + ret = -1; + goto out; + } + + if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + ret = -1; + goto out; + } + + ret = 0; + *total_out = workspace->def_strm.total_out; + *total_in = workspace->def_strm.total_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + free_workspace(workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. 
+ * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + char *data_in; + size_t total_out = 0; + unsigned long page_bytes_left; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + struct page *page_out; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + unsigned long start_byte; + unsigned long current_buf_start; + char *kaddr; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE); + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.total_out = 0; + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + page_out = bvec[page_out_index].bv_page; + page_bytes_left = PAGE_CACHE_SIZE; + pg_offset = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + while(workspace->inf_strm.total_in < srclen) { + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) { + break; + } + + /* + * buf start is the byte offset we're of the start of + * our workspace buffer + */ + buf_start = total_out; + + /* total_out is the last byte of the workspace buffer */ + total_out = workspace->inf_strm.total_out; + + working_bytes = total_out - buf_start; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + if (working_bytes == 0) { + /* we didn't make progress in this inflate + * call, we're done + */ + if (ret != Z_STREAM_END) + ret = -1; + break; + } + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) { + goto next; + } + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while(working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, + bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + pg_offset += bytes; + page_bytes_left -= 
bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (page_bytes_left == 0) { + page_out_index++; + if (page_out_index >= vcnt) { + ret = 0; + goto done; + } + page_out = bvec[page_out_index].bv_page; + pg_offset = 0; + page_bytes_left = PAGE_CACHE_SIZE; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) { + goto next; + } + + /* the next page in the biovec might not + * be adjacent to the last page, but it + * might still be found inside this working + * buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + + buf_offset; + } + } + } +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + + if (workspace->inf_strm.avail_in == 0) { + unsigned long tmp; + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + tmp = srclen - workspace->inf_strm.total_in; + workspace->inf_strm.avail_in = min(tmp, + PAGE_CACHE_SIZE); + } + } + if (ret != Z_STREAM_END) { + ret = -1; + } else { + ret = 0; + } +done: + zlib_inflateEnd(&workspace->inf_strm); + if (data_in) + kunmap(pages_in[page_in_index]); +out: + free_workspace(workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. 
+ * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + unsigned long bytes_left = destlen; + unsigned long total_out = 0; + char *kaddr; + + if (destlen > PAGE_CACHE_SIZE) + return -ENOMEM; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = srclen; + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->inf_strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + + while(bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long pg_offset = 0; + + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) { + break; + } + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + if (total_out == buf_start) { + ret = -1; + break; + } + + if (total_out <= start_byte) { + goto next; + } + + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + } else { + buf_offset = 0; + } + + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, bytes_left); + + kaddr = kmap_atomic(dest_page, KM_USER0); + 
memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + } + if (ret != Z_STREAM_END && bytes_left != 0) { + ret = -1; + } else { + ret = 0; + } + zlib_inflateEnd(&workspace->inf_strm); +out: + free_workspace(workspace); + return ret; +} + +void btrfs_zlib_exit(void) +{ + free_workspaces(); +} -- cgit v1.2.3