diff options
Diffstat (limited to 'fs/btrfs')
51 files changed, 4118 insertions, 1362 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 6d1d0b93b1aa..128ce17a80b0 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,11 +9,12 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o + uuid-tree.o props.o hash.o free-space-tree.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ - tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ + tests/free-space-tree-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9a0124a95851..6d263bb1621c 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -37,10 +37,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -48,7 +48,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { - value = kzalloc(size, GFP_NOFS); + value = kzalloc(size, GFP_KERNEL); if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); @@ -81,7 +81,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) @@ -94,7 +94,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, case 
ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? -EINVAL : 0; - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return -EINVAL; @@ -102,7 +102,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, if (acl) { size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); + value = kmalloc(size, GFP_KERNEL); if (!value) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 3e36e4adc4a3..88d9af3d4581 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -97,7 +97,7 @@ static struct __btrfs_workqueue * __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, int thresh) { - struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); + struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; @@ -148,7 +148,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, int limit_active, int thresh) { - struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index d453d62ab0c6..b90cd3776f8e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -520,13 +520,10 @@ static inline int ref_for_same_block(struct __prelim_ref *ref1, static int __add_missing_keys(struct btrfs_fs_info *fs_info, struct list_head *head) { - struct list_head *pos; + struct __prelim_ref *ref; struct extent_buffer *eb; - list_for_each(pos, head) { - struct __prelim_ref *ref; - ref = list_entry(pos, struct __prelim_ref, list); - + list_for_each_entry(ref, head, list) { if (ref->parent) continue; if (ref->key_for_search.type) @@ -563,23 +560,15 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, */ static void __merge_refs(struct list_head *head, int mode) { - struct list_head *pos1; + struct __prelim_ref *pos1; - list_for_each(pos1, head) { 
- struct list_head *n2; - struct list_head *pos2; - struct __prelim_ref *ref1; + list_for_each_entry(pos1, head, list) { + struct __prelim_ref *pos2 = pos1, *tmp; - ref1 = list_entry(pos1, struct __prelim_ref, list); - - for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; - pos2 = n2, n2 = pos2->next) { - struct __prelim_ref *ref2; - struct __prelim_ref *xchg; + list_for_each_entry_safe_continue(pos2, tmp, head, list) { + struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; struct extent_inode_elem *eie; - ref2 = list_entry(pos2, struct __prelim_ref, list); - if (!ref_for_same_block(ref1, ref2)) continue; if (mode == 1) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0ef5cc13fae2..61205e3bbefa 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -192,6 +192,10 @@ struct btrfs_inode { /* File creation time. */ struct timespec i_otime; + /* Hook into fs_info->delayed_iputs */ + struct list_head delayed_iput; + long delayed_iput_count; + struct inode vfs_inode; }; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 0340c57bf377..861d472564c1 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -531,13 +531,9 @@ static struct btrfsic_block *btrfsic_block_hashtable_lookup( (((unsigned int)(dev_bytenr >> 16)) ^ ((unsigned int)((uintptr_t)bdev))) & (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block *const b = - list_entry(elem, struct btrfsic_block, - collision_resolving_node); + struct btrfsic_block *b; + list_for_each_entry(b, h->table + hashval, collision_resolving_node) { if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) return b; } @@ -588,13 +584,9 @@ static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( ((unsigned int)((uintptr_t)bdev_ref_to)) ^ ((unsigned int)((uintptr_t)bdev_ref_from))) & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - 
list_for_each(elem, h->table + hashval) { - struct btrfsic_block_link *const l = - list_entry(elem, struct btrfsic_block_link, - collision_resolving_node); + struct btrfsic_block_link *l; + list_for_each_entry(l, h->table + hashval, collision_resolving_node) { BUG_ON(NULL == l->block_ref_to); BUG_ON(NULL == l->block_ref_from); if (l->block_ref_to->dev_state->bdev == bdev_ref_to && @@ -639,13 +631,9 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( const unsigned int hashval = (((unsigned int)((uintptr_t)bdev)) & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_dev_state *const ds = - list_entry(elem, struct btrfsic_dev_state, - collision_resolving_node); + struct btrfsic_dev_state *ds; + list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { if (ds->bdev == bdev) return ds; } @@ -1720,29 +1708,20 @@ static int btrfsic_read_block(struct btrfsic_state *state, static void btrfsic_dump_database(struct btrfsic_state *state) { - struct list_head *elem_all; + const struct btrfsic_block *b_all; BUG_ON(NULL == state); printk(KERN_INFO "all_blocks_list:\n"); - list_for_each(elem_all, &state->all_blocks_list) { - const struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *elem_ref_from; + list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { + const struct btrfsic_block_link *l; printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", btrfsic_get_block_type(state, b_all), b_all->logical_bytenr, b_all->dev_state->name, b_all->dev_bytenr, b_all->mirror_num); - list_for_each(elem_ref_to, &b_all->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { printk(KERN_INFO " %c @%llu (%s/%llu/%d)" " refers %u* to" " %c 
@%llu (%s/%llu/%d)\n", @@ -1757,12 +1736,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state) l->block_ref_to->mirror_num); } - list_for_each(elem_ref_from, &b_all->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, - struct btrfsic_block_link, - node_ref_from); - + list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { printk(KERN_INFO " %c @%llu (%s/%llu/%d)" " is ref %u* from" " %c @%llu (%s/%llu/%d)\n", @@ -1845,8 +1819,7 @@ again: &state->block_hashtable); if (NULL != block) { u64 bytenr = 0; - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; + struct btrfsic_block_link *l, *tmp; if (block->is_superblock) { bytenr = btrfs_super_bytenr((struct btrfs_super_block *) @@ -1967,13 +1940,8 @@ again: * because it still carries valueable information * like whether it was ever written and IO completed. */ - list_for_each_safe(elem_ref_to, tmp_ref_to, - &block->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry_safe(l, tmp, &block->ref_to_list, + node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) btrfsic_print_rem_link(state, l); l->ref_cnt--; @@ -2436,7 +2404,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, struct btrfsic_block *const block, int recursion_level) { - struct list_head *elem_ref_to; + const struct btrfsic_block_link *l; int ret = 0; if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { @@ -2464,11 +2432,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, * This algorithm is recursive because the amount of used stack * space is very small and the max recursion depth is limited. 
*/ - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &block->ref_to_list, node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "rl=%d, %c @%llu (%s/%llu/%d)" @@ -2561,7 +2525,7 @@ static int btrfsic_is_block_ref_by_superblock( const struct btrfsic_block *block, int recursion_level) { - struct list_head *elem_ref_from; + const struct btrfsic_block_link *l; if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { /* refer to comment at "abort cyclic linkage (case 1)" */ @@ -2576,11 +2540,7 @@ static int btrfsic_is_block_ref_by_superblock( * This algorithm is recursive because the amount of used stack space * is very small and the max recursion depth is limited. */ - list_for_each(elem_ref_from, &block->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, struct btrfsic_block_link, - node_ref_from); - + list_for_each_entry(l, &block->ref_from_list, node_ref_from) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "rl=%d, %c @%llu (%s/%llu/%d)" @@ -2669,7 +2629,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, const struct btrfsic_block *block, int indent_level) { - struct list_head *elem_ref_to; + const struct btrfsic_block_link *l; int indent_add; static char buf[80]; int cursor_position; @@ -2704,11 +2664,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, } cursor_position = indent_level; - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &block->ref_to_list, node_ref_to) { while (cursor_position < indent_level) { printk(" "); cursor_position++; @@ -3165,8 +3121,7 @@ int btrfsic_mount(struct btrfs_root *root, void btrfsic_unmount(struct btrfs_root *root, struct 
btrfs_fs_devices *fs_devices) { - struct list_head *elem_all; - struct list_head *tmp_all; + struct btrfsic_block *b_all, *tmp_all; struct btrfsic_state *state; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; @@ -3206,20 +3161,12 @@ void btrfsic_unmount(struct btrfs_root *root, * just free all memory that was allocated dynamically. * Free the blocks and the block_links. */ - list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { - struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; - - list_for_each_safe(elem_ref_to, tmp_ref_to, - &b_all->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); + list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list, + all_blocks_node) { + struct btrfsic_block_link *l, *tmp; + list_for_each_entry_safe(l, tmp, &b_all->ref_to_list, + node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) btrfsic_print_rem_link(state, l); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5b8e235c4b6d..769e0ff1b4ce 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1555,7 +1555,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } - search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + search_start = buf->start & ~((u64)SZ_1G - 1); if (parent) btrfs_set_lock_blocking(parent); @@ -2248,7 +2248,6 @@ static void reada_for_search(struct btrfs_root *root, u64 target; u64 nread = 0; u64 gen; - int direction = path->reada; struct extent_buffer *eb; u32 nr; u32 blocksize; @@ -2276,16 +2275,16 @@ static void reada_for_search(struct btrfs_root *root, nr = slot; while (1) { - if (direction < 0) { + if (path->reada == READA_BACK) { if (nr == 0) break; nr--; - } else if (direction > 0) { + } else if (path->reada == READA_FORWARD) { nr++; if (nr >= nritems) break; } - if 
(path->reada < 0 && objectid) { + if (path->reada == READA_BACK && objectid) { btrfs_node_key(node, &disk_key, nr); if (btrfs_disk_key_objectid(&disk_key) != objectid) break; @@ -2493,7 +2492,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(p); free_extent_buffer(tmp); - if (p->reada) + if (p->reada != READA_NONE) reada_for_search(root, p, level, slot, key->objectid); btrfs_release_path(p); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 35489e7129a7..bfe4a337fb4d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,6 +35,7 @@ #include <linux/btrfs.h> #include <linux/workqueue.h> #include <linux/security.h> +#include <linux/sizes.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -96,6 +97,9 @@ struct btrfs_ordered_sum; /* for storing items that use the BTRFS_UUID_KEY* types */ #define BTRFS_UUID_TREE_OBJECTID 9ULL +/* tracks free space in block groups. */ +#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL + /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL @@ -174,7 +178,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4 }; +static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 @@ -196,9 +200,9 @@ static int btrfs_csum_sizes[] = { 4 }; /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) -#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) +#define BTRFS_DIRTY_METADATA_THRESH SZ_32M -#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024) +#define BTRFS_MAX_EXTENT_SIZE SZ_128M /* * The key defines the order in the tree, and so it also defines (optimal) @@ -500,6 +504,8 @@ struct btrfs_super_block { * Compat flags that we support. 
If any incompat flags are set other than the * ones specified below then we will fail to mount */ +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) + #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) @@ -526,7 +532,10 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL -#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL + +#define BTRFS_FEATURE_COMPAT_RO_SUPP \ + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE) + #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL @@ -590,14 +599,15 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ +enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; /* if there is real range locking, this locks field will change */ - int locks[BTRFS_MAX_LEVEL]; - int reada; + u8 locks[BTRFS_MAX_LEVEL]; + u8 reada; /* keep some upper locks as we walk down */ - int lowest_level; + u8 lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks @@ -1088,6 +1098,13 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +struct btrfs_free_space_info { + __le32 extent_count; + __le32 flags; +} __attribute__ ((__packed__)); + +#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) + #define BTRFS_QGROUP_LEVEL_SHIFT 48 static inline u64 btrfs_qgroup_level(u64 qgroupid) { @@ -1296,6 +1313,9 @@ struct btrfs_caching_control { atomic_t count; }; +/* Once caching_thread() finds this much free space, it will wake up waiters. 
*/ +#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2) + struct btrfs_io_ctl { void *cur, *orig; struct page *page; @@ -1321,8 +1341,20 @@ struct btrfs_block_group_cache { u64 delalloc_bytes; u64 bytes_super; u64 flags; - u64 sectorsize; u64 cache_generation; + u32 sectorsize; + + /* + * If the free space extent count exceeds this number, convert the block + * group to bitmaps. + */ + u32 bitmap_high_thresh; + + /* + * If the free space extent count drops below this number, convert the + * block group back to extents. + */ + u32 bitmap_low_thresh; /* * It is just used for the delayed data space allocation because @@ -1378,6 +1410,15 @@ struct btrfs_block_group_cache { struct list_head io_list; struct btrfs_io_ctl io_ctl; + + /* Lock for free space tree operations. */ + struct mutex free_space_lock; + + /* + * Does the block group need to be added to the free space tree? + * Protected by free_space_lock. + */ + int needs_free_space; }; /* delayed seq elem */ @@ -1429,6 +1470,7 @@ struct btrfs_fs_info { struct btrfs_root *csum_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; + struct btrfs_root *free_space_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1572,7 +1614,7 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; - struct rw_semaphore delayed_iput_sem; + struct mutex cleaner_delayed_iput_mutex; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -1816,6 +1858,8 @@ struct btrfs_fs_info { * and will be latter freed. Protected by fs_info->chunk_mutex. */ struct list_head pinned_chunks; + + int creating_free_space_tree; }; struct btrfs_subvolume_writers { @@ -2092,6 +2136,27 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_BLOCK_GROUP_ITEM_KEY 192 +/* + * Every block group is represented in the free space tree by a free space info + * item, which stores some accounting information. 
It is keyed on + * (block_group_start, FREE_SPACE_INFO, block_group_length). + */ +#define BTRFS_FREE_SPACE_INFO_KEY 198 + +/* + * A free space extent tracks an extent of space that is free in a block group. + * It is keyed on (start, FREE_SPACE_EXTENT, length). + */ +#define BTRFS_FREE_SPACE_EXTENT_KEY 199 + +/* + * When a block group becomes very fragmented, we convert it to use bitmaps + * instead of extents. A free space bitmap is keyed on + * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with + * (length / sectorsize) bits. + */ +#define BTRFS_FREE_SPACE_BITMAP_KEY 200 + #define BTRFS_DEV_EXTENT_KEY 204 #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 @@ -2184,6 +2249,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) #define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) +#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2506,6 +2572,11 @@ BTRFS_SETGET_FUNCS(disk_block_group_flags, BTRFS_SETGET_STACK_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); +/* struct btrfs_free_space_info */ +BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, + extent_count, 32); +BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); + /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); @@ -3570,9 +3641,13 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, const u64 
type); +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end); + /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3737,6 +3812,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->csum_root); kfree(fs_info->quota_root); kfree(fs_info->uuid_root); + kfree(fs_info->free_space_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); security_free_mnt_opts(&fs_info->security_opts); @@ -3906,7 +3982,6 @@ void btrfs_extent_item_to_extent_map(struct inode *inode, /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; - int wait; int delay_iput; struct completion completion; struct list_head list; @@ -3914,7 +3989,7 @@ struct btrfs_delalloc_work { }; struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int wait, int delay_iput); + int delay_iput); void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, @@ -4024,7 +4099,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, struct btrfs_ioctl_balance_args *bargs); - +ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, + struct file *dst_file, u64 dst_loff); /* file.c */ int btrfs_auto_defrag_init(void); @@ -4055,6 +4131,11 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags); +int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len); 
/* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -4247,16 +4328,98 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, } } +#define btrfs_clear_fs_incompat(__fs_info, opt) \ + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, "clearing %llu feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + #define btrfs_fs_incompat(fs_info, opt) \ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) -static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; disk_super = fs_info->super_copy; return !!(btrfs_super_incompat_flags(disk_super) & flag); } +#define btrfs_set_fs_compat_ro(__fs_info, opt) \ + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, "setting %llu ro feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +#define 
btrfs_clear_fs_compat_ro(__fs_info, opt) \ + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, "clearing %llu ro feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +#define btrfs_fs_compat_ro(fs_info, opt) \ + __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_compat_ro_flags(disk_super) & flag); +} + /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. 
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index e0941fbb913c..0be47e4b8136 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -54,16 +54,11 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; atomic_set(&delayed_node->refs, 0); - delayed_node->count = 0; - delayed_node->flags = 0; delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); - delayed_node->index_cnt = 0; INIT_LIST_HEAD(&delayed_node->n_list); INIT_LIST_HEAD(&delayed_node->p_list); - delayed_node->bytes_reserved = 0; - memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item)); } static inline int btrfs_is_continuous_delayed_item( @@ -132,7 +127,7 @@ again: if (node) return node; - node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); if (!node) return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e06dd75ad13f..914ac13bd92f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -493,12 +493,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, memcpy(&existing_ref->extent_op->key, &ref->extent_op->key, sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = 1; + existing_ref->extent_op->update_key = true; } if (ref->extent_op->update_flags) { existing_ref->extent_op->flags_to_set |= ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = 1; + existing_ref->extent_op->update_flags = true; } btrfs_free_delayed_extent_op(ref->extent_op); } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 00ed02cbf3e9..c24b653c7343 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -75,11 +75,11 @@ struct btrfs_delayed_ref_node { struct btrfs_delayed_extent_op { struct btrfs_disk_key key; + u8 level; + bool update_key; + bool 
update_flags; + bool is_data; u64 flags_to_set; - int level; - unsigned int update_key:1; - unsigned int update_flags:1; - unsigned int is_data:1; }; /* diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1e668fb7dd4c..cbb7dbfb3fff 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -614,7 +614,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( em = lookup_extent_mapping(em_tree, start, (u64)-1); if (!em) break; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 974be09e7556..dd08e29f5117 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" +#include "free-space-tree.h" #include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" @@ -54,6 +55,12 @@ #include <asm/cpufeature.h> #endif +#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ + BTRFS_HEADER_FLAG_RELOC |\ + BTRFS_SUPER_FLAG_ERROR |\ + BTRFS_SUPER_FLAG_SEEDING |\ + BTRFS_SUPER_FLAG_METADUMP) + static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); @@ -362,7 +369,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - 0, &cached_state); + &cached_state); if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; @@ -923,7 +930,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (cpu_has_xmm4_2) + if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) return 0; #endif return 1; @@ -1582,8 +1589,23 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = 
get_anon_bdev(&root->anon_dev); if (ret) goto free_writers; + + mutex_lock(&root->objectid_mutex); + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); + if (ret) { + mutex_unlock(&root->objectid_mutex); + goto free_root_dev; + } + + ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&root->objectid_mutex); + return 0; +free_root_dev: + free_anon_bdev(root->anon_dev); free_writers: btrfs_free_subvolume_writers(root->subv_writers); fail: @@ -1650,6 +1672,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, if (location->objectid == BTRFS_UUID_TREE_OBJECTID) return fs_info->uuid_root ? fs_info->uuid_root : ERR_PTR(-ENOENT); + if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + return fs_info->free_space_root ? fs_info->free_space_root : + ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { @@ -1782,7 +1807,10 @@ static int cleaner_kthread(void *arg) goto sleep; } + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); + again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -2148,6 +2176,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_root_extent_buffers(info->uuid_root); if (chunk_root) free_root_extent_buffers(info->chunk_root); + free_root_extent_buffers(info->free_space_root); } void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) @@ -2448,6 +2477,15 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info, fs_info->uuid_root = root; } + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->free_space_root = root; + } + return 0; } @@ -2542,8 +2580,8 @@ int 
open_ctree(struct super_block *sb, mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->cleaner_delayed_iput_mutex); seqlock_init(&fs_info->profiles_lock); - init_rwsem(&fs_info->delayed_iput_sem); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); @@ -2668,6 +2706,7 @@ int open_ctree(struct super_block *sb, if (btrfs_check_super_csum(bh->b_data)) { printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); err = -EINVAL; + brelse(bh); goto fail_alloc; } @@ -2727,26 +2766,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - /* - * Leafsize and nodesize were always equal, this is only a sanity check. - */ - if (le32_to_cpu(disk_super->__unused_leafsize) != - btrfs_super_nodesize(disk_super)) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksizes don't match. node %d leaf %d\n", - btrfs_super_nodesize(disk_super), - le32_to_cpu(disk_super->__unused_leafsize)); - err = -EINVAL; - goto fail_alloc; - } - if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksize (%d) was too large\n", - btrfs_super_nodesize(disk_super)); - err = -EINVAL; - goto fail_alloc; - } - features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) @@ -2809,7 +2828,7 @@ int open_ctree(struct super_block *sb, fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + SZ_4M / PAGE_CACHE_SIZE); tree_root->nodesize = nodesize; tree_root->sectorsize = sectorsize; @@ -2818,17 +2837,6 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - 
printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); - goto fail_sb_buffer; - } - - if (sectorsize != PAGE_SIZE) { - printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " - "found on %s\n", (unsigned long)sectorsize, sb->s_id); - goto fail_sb_buffer; - } - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); @@ -2900,6 +2908,18 @@ retry_root_backup: tree_root->commit_root = btrfs_root_node(tree_root); btrfs_set_root_refs(&tree_root->root_item, 1); + mutex_lock(&tree_root->objectid_mutex); + ret = btrfs_find_highest_objectid(tree_root, + &tree_root->highest_objectid); + if (ret) { + mutex_unlock(&tree_root->objectid_mutex); + goto recovery_tree_root; + } + + ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&tree_root->objectid_mutex); + ret = btrfs_read_roots(fs_info, tree_root); if (ret) goto recovery_tree_root; @@ -3051,6 +3071,18 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; + if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + pr_info("BTRFS: creating free space tree\n"); + ret = btrfs_create_free_space_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to create free space tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } + down_read(&fs_info->cleanup_work_sem); if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { @@ -3076,6 +3108,18 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); + if (btrfs_test_opt(tree_root, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + pr_info("BTRFS: clearing free space tree\n"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to clear free space tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } + if (!fs_info->uuid_root) { pr_info("BTRFS: creating UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); @@ -3902,11 
+3946,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, return !ret; } -int btrfs_set_buffer_uptodate(struct extent_buffer *buf) -{ - return set_extent_buffer_uptodate(buf); -} - void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_root *root; @@ -3962,7 +4001,6 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, balance_dirty_pages_ratelimited( root->fs_info->btree_inode->i_mapping); } - return; } void btrfs_btree_balance_dirty(struct btrfs_root *root) @@ -3985,8 +4023,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { struct btrfs_super_block *sb = fs_info->super_copy; + u64 nodesize = btrfs_super_nodesize(sb); + u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; + if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + printk(KERN_ERR "BTRFS: no valid FS found\n"); + ret = -EINVAL; + } + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) + printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); @@ -4004,31 +4051,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } /* - * The common minimum, we don't know if we can trust the nodesize/sectorsize - * items yet, they'll be verified later. Issue just a warning. + * Check sectorsize and nodesize first, other check will need it. + * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. 
*/ - if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) + if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize); + ret = -EINVAL; + } + /* Only PAGE SIZE is supported yet */ + if (sectorsize != PAGE_CACHE_SIZE) { + printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n", + sectorsize, PAGE_CACHE_SIZE); + ret = -EINVAL; + } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize); + ret = -EINVAL; + } + if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n", + le32_to_cpu(sb->__unused_leafsize), + nodesize); + ret = -EINVAL; + } + + /* Root alignment check */ + if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", btrfs_super_root(sb)); - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", btrfs_super_chunk_root(sb)); - if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", - btrfs_super_log_root(sb)); - - /* - * Check the lower bound, the alignment and other constraints are - * checked later. 
- */ - if (btrfs_super_nodesize(sb) < 4096) { - printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", - btrfs_super_nodesize(sb)); ret = -EINVAL; } - if (btrfs_super_sectorsize(sb) < 4096) { - printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", - btrfs_super_sectorsize(sb)); + if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", + btrfs_super_log_root(sb)); ret = -EINVAL; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index adeb31830b9c..8e79d0070bcf 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -19,7 +19,7 @@ #ifndef __DISKIO__ #define __DISKIO__ -#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 #define BTRFS_SUPER_MIRROR_MAX 3 @@ -35,7 +35,7 @@ enum btrfs_wq_endio_type { static inline u64 btrfs_sb_offset(int mirror) { - u64 start = 16 * 1024; + u64 start = SZ_16K; if (mirror) return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); return BTRFS_SUPER_INFO_OFFSET; @@ -116,7 +116,6 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root) void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c4661db2b72a..e2287c7c10be 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@ #include "raid56.h" #include "locking.h" #include "free-space-cache.h" +#include "free-space-tree.h" #include "math.h" #include "sysfs.h" #include "qgroup.h" @@ -357,8 +358,8 @@ static void fragment_free_space(struct btrfs_root *root, * we need to check the pinned_extents for any extents that can't be used yet * since 
their free space will be released as soon as the transaction commits. */ -static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_fs_info *info, u64 start, u64 end) +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) { u64 extent_start, extent_end, size, total_added = 0; int ret; @@ -395,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, return total_added; } -static noinline void caching_thread(struct btrfs_work *work) +static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group_cache *block_group; struct btrfs_fs_info *fs_info; - struct btrfs_caching_control *caching_ctl; struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -407,17 +407,16 @@ static noinline void caching_thread(struct btrfs_work *work) u64 total_found = 0; u64 last = 0; u32 nritems; - int ret = -ENOMEM; + int ret; bool wakeup = true; - caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; fs_info = block_group->fs_info; extent_root = fs_info->extent_root; path = btrfs_alloc_path(); if (!path) - goto out; + return -ENOMEM; last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -438,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 1; + path->reada = READA_FORWARD; key.objectid = last; key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; -again: - mutex_lock(&caching_ctl->mutex); - /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->commit_root_sem); next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto err; + goto out; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -477,12 +472,14 @@ next: up_read(&fs_info->commit_root_sem); 
mutex_unlock(&caching_ctl->mutex); cond_resched(); - goto again; + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + goto next; } ret = btrfs_next_leaf(extent_root, path); if (ret < 0) - goto err; + goto out; if (ret) break; leaf = path->nodes[0]; @@ -521,7 +518,7 @@ next: else last = key.objectid + key.offset; - if (total_found > (1024 * 1024 * 2)) { + if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; if (wakeup) wake_up(&caching_ctl->wait); @@ -534,9 +531,37 @@ next: total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); + caching_ctl->progress = (u64)-1; + +out: + btrfs_free_path(path); + return ret; +} + +static noinline void caching_thread(struct btrfs_work *work) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + struct btrfs_root *extent_root; + int ret; + + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + extent_root = fs_info->extent_root; + + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + ret = load_free_space_tree(caching_ctl); + else + ret = load_extent_tree_free(caching_ctl); + spin_lock(&block_group->lock); block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_FINISHED; + block_group->cached = ret ? 
BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); #ifdef CONFIG_BTRFS_DEBUG @@ -555,20 +580,11 @@ next: #endif caching_ctl->progress = (u64)-1; -err: - btrfs_free_path(path); - up_read(&fs_info->commit_root_sem); - - free_excluded_extents(extent_root, block_group); + up_read(&fs_info->commit_root_sem); + free_excluded_extents(fs_info->extent_root, block_group); mutex_unlock(&caching_ctl->mutex); -out: - if (ret) { - spin_lock(&block_group->lock); - block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_ERROR; - spin_unlock(&block_group->lock); - } + wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); @@ -680,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } else { /* - * We are not going to do the fast caching, set cached to the - * appropriate value and wakeup any waiters. + * We're either using the free space tree or no caching at all. + * Set cached to the appropriate value and wakeup any waiters. */ spin_lock(&cache->lock); if (load_cache_only) { @@ -2115,7 +2131,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, @@ -2141,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, @@ -2254,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 1); @@ -2910,6 +2926,9 @@ int 
btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (trans->aborted) return 0; + if (root->fs_info->creating_free_space_tree) + return 0; + if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; @@ -2988,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return -ENOMEM; extent_op->flags_to_set = flags; - extent_op->update_flags = 1; - extent_op->update_key = 0; - extent_op->is_data = is_data ? 1 : 0; + extent_op->update_flags = true; + extent_op->update_key = false; + extent_op->is_data = is_data ? true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, @@ -3328,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, * If this block group is smaller than 100 megs don't bother caching the * block group. */ - if (block_group->key.offset < (100 * 1024 * 1024)) { + if (block_group->key.offset < (100 * SZ_1M)) { spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); @@ -3428,7 +3447,7 @@ again: * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); + num_pages = div_u64(block_group->key.offset, SZ_256M); if (!num_pages) num_pages = 1; @@ -3684,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, return -ENOMEM; /* - * We don't need the lock here since we are protected by the transaction - * commit. We want to do the cache_save_setup first and then run the + * Even though we are in the critical section of the transaction commit, + * we can still have concurrent tasks adding elements to this + * transaction's list of dirty block groups. 
These tasks correspond to + * endio free space workers started when writeback finishes for a + * space cache, which run inode.c:btrfs_finish_ordered_io(), and can + * allocate new block groups as a result of COWing nodes of the root + * tree when updating the free space inode. The writeback for the space + * caches is triggered by an earlier call to + * btrfs_start_dirty_block_groups() and iterations of the following + * loop. + * Also we want to do the cache_save_setup first and then run the * delayed refs to make sure we have the best chance at doing this all * in one shot. */ + spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, @@ -3700,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, * finish and then do it all again */ if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); list_del_init(&cache->io_list); btrfs_wait_cache_io(root, trans, cache, &cache->io_ctl, path, cache->key.objectid); btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); } /* @@ -3712,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, * on any pending IO */ list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); should_put = 1; cache_save_setup(cache, trans, path); @@ -3736,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, } if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + /* + * One of the free space endio workers might have + * created a new block group while updating a free space + * cache's inode (at inode.c:btrfs_finish_ordered_io()) + * and hasn't released its transaction handle yet, in + * which case the new block group is still attached to + * its transaction handle and its creation has not + * finished yet (no block group item in the extent tree + * yet, etc). 
If this is the case, wait for all free + * space endio workers to finish and retry. This is a + * a very rare case so no need for a more efficient and + * complex approach. + */ + if (ret == -ENOENT) { + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = write_one_cache_group(trans, root, path, + cache); + } if (ret) btrfs_abort_transaction(trans, root, ret); } @@ -3743,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); } + spin_unlock(&cur_trans->dirty_bgs_lock); while (!list_empty(io)) { cache = list_first_entry(io, struct btrfs_block_group_cache, @@ -4086,8 +4139,10 @@ commit_trans: !atomic_read(&root->fs_info->open_ioctl_trans)) { need_commit--; - if (need_commit > 0) + if (need_commit > 0) { + btrfs_start_delalloc_roots(fs_info, 0, -1); btrfs_wait_ordered_roots(fs_info, -1); + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -4100,11 +4155,12 @@ commit_trans: if (ret) return ret; /* - * make sure that all running delayed iput are - * done + * The cleaner kthread might still be doing iput + * operations. Wait for it to finish so that + * more space is released. 
*/ - down_write(&root->fs_info->delayed_iput_sem); - up_write(&root->fs_info->delayed_iput_sem); + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); goto again; } else { btrfs_end_transaction(trans, root); @@ -4239,14 +4295,13 @@ static int should_alloc_chunk(struct btrfs_root *root, */ if (force == CHUNK_ALLOC_LIMITED) { thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - thresh = max_t(u64, 64 * 1024 * 1024, - div_factor_fine(thresh, 1)); + thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); if (num_bytes - num_allocated < thresh) return 1; } - if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) + if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) return 0; return 1; } @@ -4446,7 +4501,7 @@ out: * transaction. */ if (trans->can_flush_pending_bgs && - trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + trans->chunk_bytes_reserved >= (u64)SZ_2M) { btrfs_create_pending_block_groups(trans, trans->root); btrfs_trans_release_chunk_metadata(trans); } @@ -4544,7 +4599,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) return nr; } -#define EXTENT_SIZE_PER_ITEM (256 * 1024) +#define EXTENT_SIZE_PER_ITEM SZ_256K /* * shrink metadata reservation for delalloc @@ -4749,8 +4804,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, u64 expected; u64 to_reclaim; - to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, - 16 * 1024 * 1024); + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); spin_lock(&space_info->lock); if (can_overcommit(root, space_info, to_reclaim, BTRFS_RESERVE_FLUSH_ALL)) { @@ -4761,8 +4815,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; - if (can_overcommit(root, space_info, 1024 * 1024, - BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(root, 
space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -5318,7 +5371,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); + block_rsv->size = min_t(u64, num_bytes, SZ_512M); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -6222,11 +6275,11 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, return ret; if (ssd) - *empty_cluster = 2 * 1024 * 1024; + *empty_cluster = SZ_2M; if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { ret = &root->fs_info->meta_alloc_cluster; if (!ssd) - *empty_cluster = 64 * 1024; + *empty_cluster = SZ_64K; } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { ret = &root->fs_info->data_alloc_cluster; } @@ -6438,7 +6491,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; @@ -6661,6 +6714,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + ret = add_to_free_space_tree(trans, root->fs_info, bytenr, + num_bytes); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); @@ -7672,6 +7732,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); + ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + ins->offset); + if (ret) + return ret; + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, 
"update block group failed for %llu %llu", @@ -7752,6 +7817,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + num_bytes); + if (ret) + return ret; + ret = update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7834,7 +7904,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); - btrfs_set_buffer_uptodate(buf); + set_extent_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; @@ -7980,12 +8050,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; - if (skinny_metadata) - extent_op->update_key = 0; - else - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; + extent_op->update_key = skinny_metadata ? 
false : true; + extent_op->update_flags = true; + extent_op->is_data = false; extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, @@ -9124,7 +9191,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) if ((sinfo->flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && !force) - min_allocable_bytes = 1 * 1024 * 1024; + min_allocable_bytes = SZ_1M; else min_allocable_bytes = 0; @@ -9656,6 +9723,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) cache->full_stripe_len = btrfs_full_stripe_len(root, &root->fs_info->mapping_tree, start); + set_free_space_tree_thresholds(cache); + atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); init_rwsem(&cache->data_rwsem); @@ -9667,6 +9736,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); + mutex_init(&cache->free_space_lock); return cache; } @@ -9691,7 +9761,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && @@ -9877,6 +9947,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, key.objectid, key.offset); if (ret) btrfs_abort_transaction(trans, extent_root, ret); + add_block_group_free_space(trans, root->fs_info, block_group); + /* already aborted the transaction if it failed. 
*/ next: list_del_init(&block_group->bg_list); } @@ -9907,6 +9979,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + cache->needs_free_space = 1; ret = exclude_super_stripes(root, cache); if (ret) { /* @@ -10277,6 +10350,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, unlock_chunks(root); + ret = remove_block_group_free_space(trans, root->fs_info, block_group); + if (ret) + goto out; + btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -10325,7 +10402,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). */ - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; num_items = 3 + map->num_stripes; free_extent_map(em); @@ -10512,7 +10589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) - return 1; + return -EINVAL; features = btrfs_super_incompat_flags(disk_super); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) @@ -10742,3 +10819,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root) } return 1; } + +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/fs/btrfs/extent-tree.h +++ /dev/null diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9abe18763a7f..2e7c97a3f344 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ 
-1285,20 +1285,6 @@ search_again: } /* wrappers around set/clear extent bit */ -int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, - NULL, mask); -} - -int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask) -{ - return set_extent_bit(tree, start, end, bits, NULL, - NULL, mask); -} - int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset) @@ -1323,17 +1309,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, cached, mask, NULL); } -int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask) -{ - int wake = 0; - - if (bits & EXTENT_LOCKED) - wake = 1; - - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); -} - int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset) @@ -1348,63 +1323,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, changeset); } -int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); -} - -int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, mask); -} - -int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); -} - -int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return 
set_extent_bit(tree, start, end, EXTENT_NEW, NULL, - NULL, mask); -} - -int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, - cached_state, mask); -} - -int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, mask); -} - /* * either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_state **cached_state) + struct extent_state **cached_state) { int err; u64 failed_start; while (1) { - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, &failed_start, cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { @@ -1417,11 +1347,6 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, return err; } -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return lock_extent_bits(tree, start, end, 0, NULL); -} - int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { int err; @@ -1438,20 +1363,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) return 1; } -int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - mask); -} - -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - GFP_NOFS); -} - -int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long index 
= start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1464,10 +1376,9 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } -int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1481,13 +1392,12 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } /* * helper function to set both pages and extents in the tree writeback */ -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1500,7 +1410,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } /* find the first state struct with 'bits' set after 'start', and @@ -1800,7 +1709,7 @@ again: BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); + lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, @@ -1820,7 +1729,7 @@ out_failed: return found; } -int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, unsigned clear_bits, unsigned long page_ops) @@ -1835,7 +1744,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, clear_extent_bit(tree, start, end, 
clear_bits, 1, 0, NULL, GFP_NOFS); if (page_ops == 0) - return 0; + return; if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) mapping_set_error(inode->i_mapping, -EIO); @@ -1869,7 +1778,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, index += ret; cond_resched(); } - return 0; } /* @@ -2516,7 +2424,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, /* lots and lots of room for performance fixes in the end_bio funcs */ -int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); struct extent_io_tree *tree; @@ -2537,7 +2445,6 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) ret = ret < 0 ? ret : -EIO; mapping_set_error(page->mapping, ret); } - return 0; } /* @@ -2579,9 +2486,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (end_extent_writepage(page, bio->bi_error, start, end)) - continue; - + end_extent_writepage(page, bio->bi_error, start, end); end_page_writeback(page); } @@ -4326,7 +4231,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, 0, &cached_state); + lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -4387,7 +4292,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 end = start + PAGE_CACHE_SIZE - 1; if (gfpflags_allow_blocking(mask) && - page->mapping->host->i_size > 16 * 1024 * 1024) { + page->mapping->host->i_size > SZ_16M) { u64 len; while (start <= end) { len = end - start + 1; @@ -4536,7 +4441,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + 
len - 1, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, @@ -4797,24 +4702,14 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) return new; } -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) +struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) { struct extent_buffer *eb; - unsigned long len; unsigned long num_pages; unsigned long i; - if (!fs_info) { - /* - * Called only from tests that don't always have a fs_info - * available, but we know that nodesize is 4096 - */ - len = 4096; - } else { - len = fs_info->tree_root->nodesize; - } - num_pages = num_extent_pages(0, len); + num_pages = num_extent_pages(start, len); eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) @@ -4837,6 +4732,24 @@ err: return NULL; } +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + unsigned long len; + + if (!fs_info) { + /* + * Called only from tests that don't always have a fs_info + * available, but we know that nodesize is 4096 + */ + len = 4096; + } else { + len = fs_info->tree_root->nodesize; + } + + return __alloc_dummy_extent_buffer(fs_info, start, len); +} + static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; @@ -5227,7 +5140,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) return was_dirty; } -int clear_extent_buffer_uptodate(struct extent_buffer *eb) +void clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; @@ -5240,10 +5153,9 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb) if (page) ClearPageUptodate(page); } - return 0; } -int set_extent_buffer_uptodate(struct extent_buffer *eb) +void set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; @@ -5255,7 +5167,6 @@ int 
set_extent_buffer_uptodate(struct extent_buffer *eb) page = eb->pages[i]; SetPageUptodate(page); } - return 0; } int extent_buffer_uptodate(struct extent_buffer *eb) @@ -5594,6 +5505,155 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } +/* + * The extent buffer bitmap operations are done with byte granularity because + * bitmap items are not guaranteed to be aligned to a word and therefore a + * single word in a bitmap may straddle two pages in the extent buffer. + */ +#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) +#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BITMAP_FIRST_BYTE_MASK(start) \ + ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) +#define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +/* + * eb_bitmap_offset() - calculate the page and offset of the byte containing the + * given bit number + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number + * @page_index: return index of the page in the extent buffer that contains the + * given bit number + * @page_offset: return offset into the page given by page_index + * + * This helper hides the ugliness of finding the byte in an extent buffer which + * contains a given bit. + */ +static inline void eb_bitmap_offset(struct extent_buffer *eb, + unsigned long start, unsigned long nr, + unsigned long *page_index, + size_t *page_offset) +{ + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + size_t byte_offset = BIT_BYTE(nr); + size_t offset; + + /* + * The byte we want is the offset of the extent buffer + the offset of + * the bitmap item in the extent buffer + the offset of the byte in the + * bitmap item. 
+ */ + offset = start_offset + start + byte_offset; + + *page_index = offset >> PAGE_CACHE_SHIFT; + *page_offset = offset & (PAGE_CACHE_SIZE - 1); +} + +/** + * extent_buffer_test_bit - determine whether a bit in a bitmap item is set + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number to test + */ +int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, + unsigned long nr) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + + eb_bitmap_offset(eb, start, nr, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); +} + +/** + * extent_buffer_bitmap_set - set an area of a bitmap + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to set + */ +void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + const unsigned int size = pos + len; + int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); + unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); + + eb_bitmap_offset(eb, start, pos, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + + while (len >= bits_to_set) { + kaddr[offset] |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~0U; + if (++offset >= PAGE_CACHE_SIZE && len > 0) { + offset = 0; + page = eb->pages[++i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + } + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + kaddr[offset] |= mask_to_set; + } +} + + +/** + * extent_buffer_bitmap_clear - clear an area of a bitmap + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number 
of the first bit + * @len: number of bits to clear + */ +void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + const unsigned int size = pos + len; + int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); + unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); + + eb_bitmap_offset(eb, start, pos, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + + while (len >= bits_to_clear) { + kaddr[offset] &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_BYTE; + mask_to_clear = ~0U; + if (++offset >= PAGE_CACHE_SIZE && len > 0) { + offset = 0; + page = eb->pages[++i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + } + } + if (len) { + mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); + kaddr[offset] &= ~mask_to_clear; + } +} + static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) { unsigned long distance = (src > dst) ? 
src - dst : dst - src; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f4c1ae11855f..0377413bd4b9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -199,12 +199,14 @@ int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_state **cached); -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); -int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached, gfp_t mask); + struct extent_state **cached); + +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return lock_extent_bits(tree, start, end, NULL); +} + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); @@ -221,39 +223,105 @@ void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int filled, struct extent_state *cached_state); -int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); -int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask); + +static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + GFP_NOFS); +} + +static inline int 
unlock_extent_cached(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + +static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits, gfp_t mask) +{ + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); +} + int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); -int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); + +static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); +} + +static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, mask); +} + +static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, + NULL, mask); +} + +static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return 
clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); +} + int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask); -int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); + +static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE, + NULL, cached_state, mask); +} + +static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, mask); +} + +static inline int set_extent_new(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); +} + +static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, + cached_state, mask); +} + int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state); @@ -282,8 +350,10 @@ void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); +struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start); struct 
extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -328,19 +398,25 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); +int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, + unsigned long pos); +void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len); +void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len); void clear_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_buffer *eb); +void set_extent_buffer_uptodate(struct extent_buffer *eb); +void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, unsigned long *map_len); -int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); -int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); -int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); +void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); @@ -357,7 +433,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 
logical, int mirror_num); int clean_io_failure(struct inode *inode, u64 start, struct page *page, unsigned int pg_offset); -int end_extent_writepage(struct page *page, int err, u64 start, u64 end); +void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6a98bddd8f33..84fb56d5c018 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em) WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->bdev); + kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index b2991fd8583e..eb8b8fae036b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -32,7 +32,15 @@ struct extent_map { u64 block_len; u64 generation; unsigned long flags; - struct block_device *bdev; + union { + struct block_device *bdev; + + /* + * used for chunk mappings + * flags & EXTENT_FLAG_FS_MAPPING must be set + */ + struct map_lookup *map_lookup; + }; atomic_t refs; unsigned int compress_type; struct list_head list; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 58ece6558430..a67e1c828d0f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -202,7 +202,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, } if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) - path->reada = 2; + path->reada = READA_FORWARD; WARN_ON(bio->bi_vcnt <= 0); @@ -328,7 +328,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (search_commit) { path->skip_locking = 1; - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0f09526aa7d9..098bb8f690c9 100644 --- a/fs/btrfs/file.c +++ 
b/fs/btrfs/file.c @@ -406,8 +406,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ -static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, - size_t write_bytes, +static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, struct page **prepared_pages, struct iov_iter *i) { @@ -1394,7 +1393,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos, 0, cached_state); + start_pos, last_pos, cached_state); ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && @@ -1588,8 +1587,7 @@ again: ret = 0; } - copied = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, i); + copied = btrfs_copy_from_user(pos, write_bytes, pages, i); /* * if we have trouble faulting in the pages, fall @@ -1764,17 +1762,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t pos; size_t count; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = generic_write_checks(iocb, from); if (err <= 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -1785,7 +1783,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * to stop this write operation to ensure FS consistency. 
*/ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EROFS; goto out; } @@ -1806,7 +1804,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -1822,7 +1820,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, iocb->ki_pos = pos + num_written; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * We also have to set last_sub_trans to the current log transid, @@ -1911,7 +1909,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -1963,7 +1961,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = start_ordered_ops(inode, start, end); } if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } atomic_inc(&root->log_batch); @@ -2009,7 +2007,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -2033,7 +2031,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } trans->sync = true; @@ -2056,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. 
*/ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If any of the ordered extents had an error, just return it to user @@ -2305,7 +2303,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); ret = find_first_non_hole(inode, &offset, &len); if (ret < 0) @@ -2345,7 +2343,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2398,7 +2396,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache_range(inode, lockstart, lockend); lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, &cached_state); + &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, lockend); /* @@ -2421,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = btrfs_wait_ordered_range(inode, lockstart, lockend - lockstart + 1); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2576,7 +2574,7 @@ out_only_mutex: ret = btrfs_end_transaction(trans, root); } } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret && !err) err = ret; return err; @@ -2660,7 +2658,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = inode_newsize_ok(inode, alloc_end); if (ret) goto out; @@ -2705,7 +2703,7 @@ static long btrfs_fallocate(struct file *file, int mode, * transaction */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state); + locked_end, &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -2818,7 +2816,7 @@ out: * So this is completely used as 
cleanup. */ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_start, alloc_end - alloc_start); @@ -2852,7 +2850,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) lockend--; len = lockend - lockstart + 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); while (start < inode->i_size) { @@ -2894,7 +2892,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: case SEEK_CUR: @@ -2903,20 +2901,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: if (offset >= i_size_read(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } ret = find_desired_extent(inode, &offset, whence); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -2934,6 +2932,9 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, #endif + .copy_file_range = btrfs_copy_file_range, + .clone_file_range = btrfs_clone_file_range, + .dedupe_file_range = btrfs_dedupe_file_range, }; void btrfs_auto_defrag_exit(void) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cfe99bec49de..8f835bfa1bdd 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -30,7 +30,7 @@ #include "volumes.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) -#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) +#define MAX_CACHE_BYTES_PER_GIG SZ_32K 
struct btrfs_trim_range { u64 start; @@ -1086,14 +1086,11 @@ write_pinned_extent_entries(struct btrfs_root *root, static noinline_for_stack int write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) { - struct list_head *pos, *n; + struct btrfs_free_space *entry, *next; int ret; /* Write out the bitmaps */ - list_for_each_safe(pos, n, bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); - + list_for_each_entry_safe(entry, next, bitmap_list, list) { ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); if (ret) return -ENOSPC; @@ -1119,13 +1116,10 @@ static int flush_dirty_cache(struct inode *inode) static void noinline_for_stack cleanup_bitmap_list(struct list_head *bitmap_list) { - struct list_head *pos, *n; + struct btrfs_free_space *entry, *next; - list_for_each_safe(pos, n, bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); + list_for_each_entry_safe(entry, next, bitmap_list, list) list_del_init(&entry->list); - } } static void noinline_for_stack @@ -1261,7 +1255,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, goto out; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state); + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1656,11 +1650,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) * at or below 32k, so we need to adjust how much memory we allow to be * used by extent based free space tracking */ - if (size < 1024 * 1024 * 1024) + if (size < SZ_1G) max_bytes = MAX_CACHE_BYTES_PER_GIG; else - max_bytes = MAX_CACHE_BYTES_PER_GIG * - div_u64(size, 1024 * 1024 * 1024); + max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); /* * we want to account for 1 more bitmap than what we have so we can make @@ -2016,7 +2009,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } -static struct btrfs_free_space_op 
free_space_op = { +static const struct btrfs_free_space_op free_space_op = { .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -2489,8 +2482,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) * track of free space, and if we pass 1/2 of that we want to * start converting things over to using bitmaps */ - ctl->extents_thresh = ((1024 * 32) / 2) / - sizeof(struct btrfs_free_space); + ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); } /* diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index f251865eb6f3..33178c490ace 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -37,7 +37,7 @@ struct btrfs_free_space_ctl { int total_bitmaps; int unit; u64 start; - struct btrfs_free_space_op *op; + const struct btrfs_free_space_op *op; void *private; struct mutex cache_writeout_mutex; struct list_head trimming_ranges; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c new file mode 100644 index 000000000000..393e36bd5845 --- /dev/null +++ b/fs/btrfs/free-space-tree.c @@ -0,0 +1,1591 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ */ + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include "ctree.h" +#include "disk-io.h" +#include "locking.h" +#include "free-space-tree.h" +#include "transaction.h" + +static int __add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) +{ + u32 bitmap_range; + size_t bitmap_size; + u64 num_bitmaps, total_bitmap_size; + + /* + * We convert to bitmaps when the disk space required for using extents + * exceeds that required for using bitmaps. + */ + bitmap_range = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1, + bitmap_range); + bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE; + total_bitmap_size = num_bitmaps * bitmap_size; + cache->bitmap_high_thresh = div_u64(total_bitmap_size, + sizeof(struct btrfs_item)); + + /* + * We allow for a small buffer between the high threshold and low + * threshold to avoid thrashing back and forth between the two formats. 
+ */ + if (cache->bitmap_high_thresh > 100) + cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100; + else + cache->bitmap_low_thresh = 0; +} + +static int add_new_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key; + struct extent_buffer *leaf; + int ret; + + key.objectid = block_group->key.objectid; + key.type = BTRFS_FREE_SPACE_INFO_KEY; + key.offset = block_group->key.offset; + + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); + if (ret) + goto out; + + leaf = path->nodes[0]; + info = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_info); + btrfs_set_free_space_extent_count(leaf, info, 0); + btrfs_set_free_space_flags(leaf, info, 0); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + +struct btrfs_free_space_info * +search_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + int ret; + + key.objectid = block_group->key.objectid; + key.type = BTRFS_FREE_SPACE_INFO_KEY; + key.offset = block_group->key.offset; + + ret = btrfs_search_slot(trans, root, &key, path, 0, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret != 0) { + btrfs_warn(fs_info, "missing free space info for %llu\n", + block_group->key.objectid); + ASSERT(0); + return ERR_PTR(-ENOENT); + } + + return btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_free_space_info); +} + +/* + * btrfs_search_slot() but we're looking for the greatest key less than the + * passed key. 
+ */ +static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int ins_len, int cow) +{ + int ret; + + ret = btrfs_search_slot(trans, root, key, p, ins_len, cow); + if (ret < 0) + return ret; + + if (ret == 0) { + ASSERT(0); + return -EIO; + } + + if (p->slots[0] == 0) { + ASSERT(0); + return -EIO; + } + p->slots[0]--; + + return 0; +} + +static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) +{ + return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); +} + +static unsigned long *alloc_bitmap(u32 bitmap_size) +{ + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} + +int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + unsigned long *bitmap; + char *bitmap_cursor; + u64 start, end; + u64 bitmap_range, i; + u32 bitmap_size, flags, expected_extent_count; + u32 extent_count = 0; + int done = 0, nr; + int ret; + + bitmap_size = free_space_bitmap_size(block_group->key.offset, + block_group->sectorsize); + bitmap = alloc_bitmap(bitmap_size); + if (!bitmap) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + 
ASSERT(found_key.offset == block_group->key.offset); + done = 1; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) { + u64 first, last; + + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + + first = div_u64(found_key.objectid - start, + block_group->sectorsize); + last = div_u64(found_key.objectid + found_key.offset - start, + block_group->sectorsize); + bitmap_set(bitmap, first, last - first); + + extent_count++; + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + leaf = path->nodes[0]; + flags = btrfs_free_space_flags(leaf, info); + flags |= BTRFS_FREE_SPACE_USING_BITMAPS; + btrfs_set_free_space_flags(leaf, info, flags); + expected_extent_count = btrfs_free_space_extent_count(leaf, info); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + bitmap_cursor = (char *)bitmap; + bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + i = start; + while (i < end) { + unsigned long ptr; + u64 extent_size; + u32 data_size; + + extent_size = min(end - i, bitmap_range); + data_size = free_space_bitmap_size(extent_size, + block_group->sectorsize); + + key.objectid = i; + key.type = BTRFS_FREE_SPACE_BITMAP_KEY; + key.offset = extent_size; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + data_size); + if (ret) + goto out; + + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, 
bitmap_cursor, ptr, + data_size); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + i += extent_size; + bitmap_cursor += data_size; + } + + ret = 0; +out: + vfree(bitmap); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +int convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + unsigned long *bitmap; + u64 start, end; + /* Initialize to silence GCC. */ + u64 extent_start = 0; + u64 offset; + u32 bitmap_size, flags, expected_extent_count; + int prev_bit = 0, bit, bitnr; + u32 extent_count = 0; + int done = 0, nr; + int ret; + + bitmap_size = free_space_bitmap_size(block_group->key.offset, + block_group->sectorsize); + bitmap = alloc_bitmap(bitmap_size); + if (!bitmap) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + unsigned long ptr; + char *bitmap_cursor; + u32 bitmap_pos, data_size; + + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + + bitmap_pos = div_u64(found_key.objectid - start, + block_group->sectorsize 
* + BITS_PER_BYTE); + bitmap_cursor = ((char *)bitmap) + bitmap_pos; + data_size = free_space_bitmap_size(found_key.offset, + block_group->sectorsize); + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); + read_extent_buffer(leaf, bitmap_cursor, ptr, + data_size); + + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + leaf = path->nodes[0]; + flags = btrfs_free_space_flags(leaf, info); + flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; + btrfs_set_free_space_flags(leaf, info, flags); + expected_extent_count = btrfs_free_space_extent_count(leaf, info); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + offset = start; + bitnr = 0; + while (offset < end) { + bit = !!test_bit(bitnr, bitmap); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + key.objectid = extent_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = offset - extent_start; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + btrfs_release_path(path); + + extent_count++; + } + prev_bit = bit; + offset += block_group->sectorsize; + bitnr++; + } + if (prev_bit == 1) { + key.objectid = extent_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = end - extent_start; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + btrfs_release_path(path); + + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + ret = 0; +out: + vfree(bitmap); + if (ret) + btrfs_abort_transaction(trans, root, ret); 
+ return ret; +} + +static int update_free_space_extent_count(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + int new_extents) +{ + struct btrfs_free_space_info *info; + u32 flags; + u32 extent_count; + int ret = 0; + + if (new_extents == 0) + return 0; + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + flags = btrfs_free_space_flags(path->nodes[0], info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + + extent_count += new_extents; + btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(path); + + if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && + extent_count > block_group->bitmap_high_thresh) { + ret = convert_free_space_to_bitmaps(trans, fs_info, block_group, + path); + } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && + extent_count < block_group->bitmap_low_thresh) { + ret = convert_free_space_to_extents(trans, fs_info, block_group, + path); + } + +out: + return ret; +} + +int free_space_test_bit(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 offset) +{ + struct extent_buffer *leaf; + struct btrfs_key key; + u64 found_start, found_end; + unsigned long ptr, i; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(offset >= found_start && offset < found_end); + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + i = div_u64(offset - found_start, block_group->sectorsize); + return !!extent_buffer_test_bit(leaf, ptr, i); +} + +static void free_space_set_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 *start, u64 *size, + int bit) +{ + struct extent_buffer 
*leaf; + struct btrfs_key key; + u64 end = *start + *size; + u64 found_start, found_end; + unsigned long ptr, first, last; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(*start >= found_start && *start < found_end); + ASSERT(end > found_start); + + if (end > found_end) + end = found_end; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + first = div_u64(*start - found_start, block_group->sectorsize); + last = div_u64(end - found_start, block_group->sectorsize); + if (bit) + extent_buffer_bitmap_set(leaf, ptr, first, last - first); + else + extent_buffer_bitmap_clear(leaf, ptr, first, last - first); + btrfs_mark_buffer_dirty(leaf); + + *size -= end - *start; + *start = end; +} + +/* + * We can't use btrfs_next_item() in modify_free_space_bitmap() because + * btrfs_next_leaf() doesn't get the path for writing. We can forgo the fancy + * tree walking in btrfs_next_leaf() anyways because we know exactly what we're + * looking for. + */ +static int free_space_next_bitmap(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p) +{ + struct btrfs_key key; + + if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) { + p->slots[0]++; + return 0; + } + + btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]); + btrfs_release_path(p); + + key.objectid += key.offset; + key.type = (u8)-1; + key.offset = (u64)-1; + + return btrfs_search_prev_slot(trans, root, &key, p, 0, 1); +} + +/* + * If remove is 1, then we are removing free space, thus clearing bits in the + * bitmap. If remove is 0, then we are adding free space, thus setting bits in + * the bitmap. 
+ */ +static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size, int remove) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + u64 end = start + size; + u64 cur_start, cur_size; + int prev_bit, next_bit; + int new_extents; + int ret; + + /* + * Read the bit for the block immediately before the extent of space if + * that block is within the block group. + */ + if (start > block_group->key.objectid) { + u64 prev_block = start - block_group->sectorsize; + + key.objectid = prev_block; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); + if (ret) + goto out; + + prev_bit = free_space_test_bit(block_group, path, prev_block); + + /* The previous block may have been in the previous bitmap. */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (start >= key.objectid + key.offset) { + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + } else { + key.objectid = start; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); + if (ret) + goto out; + + prev_bit = -1; + } + + /* + * Iterate over all of the bitmaps overlapped by the extent of space, + * clearing/setting bits as required. + */ + cur_start = start; + cur_size = size; + while (1) { + free_space_set_bits(block_group, path, &cur_start, &cur_size, + !remove); + if (cur_size == 0) + break; + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + + /* + * Read the bit for the block immediately after the extent of space if + * that block is within the block group. + */ + if (end < block_group->key.objectid + block_group->key.offset) { + /* The next block may be in the next bitmap. 
*/ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (end >= key.objectid + key.offset) { + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + + next_bit = free_space_test_bit(block_group, path, end); + } else { + next_bit = -1; + } + + if (remove) { + new_extents = -1; + if (prev_bit == 1) { + /* Leftover on the left. */ + new_extents++; + } + if (next_bit == 1) { + /* Leftover on the right. */ + new_extents++; + } + } else { + new_extents = 1; + if (prev_bit == 1) { + /* Merging with neighbor on the left. */ + new_extents--; + } + if (next_bit == 1) { + /* Merging with neighbor on the right. */ + new_extents--; + } + } + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +static int remove_free_space_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + u64 found_start, found_end; + u64 end = start + size; + int new_extents = -1; + int ret; + + key.objectid = start; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(start >= found_start && end <= found_end); + + /* + * Okay, now that we've found the free space extent which contains the + * free space that we are removing, there are four cases: + * + * 1. We're using the whole extent: delete the key we found and + * decrement the free space extent count. + * 2. 
We are using part of the extent starting at the beginning: delete + * the key we found and insert a new key representing the leftover at + * the end. There is no net change in the number of extents. + * 3. We are using part of the extent ending at the end: delete the key + * we found and insert a new key representing the leftover at the + * beginning. There is no net change in the number of extents. + * 4. We are using part of the extent in the middle: delete the key we + * found and insert two new keys representing the leftovers on each + * side. Where we used to have one extent, we now have two, so increment + * the extent count. We may need to convert the block group to bitmaps + * as a result. + */ + + /* Delete the existing key (cases 1-4). */ + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + /* Add a key for leftovers at the beginning (cases 3 and 4). */ + if (start > found_start) { + key.objectid = found_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = start - found_start; + + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + new_extents++; + } + + /* Add a key for leftovers at the end (cases 2 and 4). 
*/ + if (end < found_end) { + key.objectid = end; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = found_end - end; + + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + new_extents++; + } + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + if (block_group->needs_free_space) { + ret = __add_block_group_free_space(trans, fs_info, block_group, + path); + if (ret) + return ret; + } + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + return modify_free_space_bitmap(trans, fs_info, block_group, + path, start, size, 1); + } else { + return remove_free_space_extent(trans, fs_info, block_group, + path, start, size); + } +} + +int remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_path *path; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + block_group = btrfs_lookup_block_group(fs_info, start); + if (!block_group) { + ASSERT(0); + ret = -ENOENT; + goto out; + } + + mutex_lock(&block_group->free_space_lock); + ret = __remove_from_free_space_tree(trans, fs_info, block_group, path, + start, size); + mutex_unlock(&block_group->free_space_lock); + + btrfs_put_block_group(block_group); +out: + btrfs_free_path(path); + if 
(ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +static int add_free_space_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key, new_key; + u64 found_start, found_end; + u64 end = start + size; + int new_extents = 1; + int ret; + + /* + * We are adding a new extent of free space, but we need to merge + * extents. There are four cases here: + * + * 1. The new extent does not have any immediate neighbors to merge + * with: add the new key and increment the free space extent count. We + * may need to convert the block group to bitmaps as a result. + * 2. The new extent has an immediate neighbor before it: remove the + * previous key and insert a new key combining both of them. There is no + * net change in the number of extents. + * 3. The new extent has an immediate neighbor after it: remove the next + * key and insert a new key combining both of them. There is no net + * change in the number of extents. + * 4. The new extent has immediate neighbors on both sides: remove both + * of the keys and insert a new key combining all of them. Where we used + * to have two extents, we now have one, so decrement the extent count. + */ + + new_key.objectid = start; + new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + new_key.offset = size; + + /* Search for a neighbor on the left. 
*/ + if (start == block_group->key.objectid) + goto right; + key.objectid = start - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { + ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); + btrfs_release_path(path); + goto right; + } + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(found_start >= block_group->key.objectid && + found_end > block_group->key.objectid); + ASSERT(found_start < start && found_end <= start); + + /* + * Delete the neighbor on the left and absorb it into the new key (cases + * 2 and 4). + */ + if (found_end == start) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + new_key.objectid = found_start; + new_key.offset += key.offset; + new_extents--; + } + btrfs_release_path(path); + +right: + /* Search for a neighbor on the right. */ + if (end == block_group->key.objectid + block_group->key.offset) + goto insert; + key.objectid = end; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { + ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); + btrfs_release_path(path); + goto insert; + } + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(found_start >= block_group->key.objectid && + found_end > block_group->key.objectid); + ASSERT((found_start < start && found_end <= start) || + (found_start >= end && found_end > end)); + + /* + * Delete the neighbor on the right and absorb it into the new key + * (cases 3 and 4). 
+ */ + if (found_start == end) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + new_key.offset += key.offset; + new_extents--; + } + btrfs_release_path(path); + +insert: + /* Insert the new key (cases 1-4). */ + ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0); + if (ret) + goto out; + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +int __add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + if (block_group->needs_free_space) { + ret = __add_block_group_free_space(trans, fs_info, block_group, + path); + if (ret) + return ret; + } + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + return modify_free_space_bitmap(trans, fs_info, block_group, + path, start, size, 0); + } else { + return add_free_space_extent(trans, fs_info, block_group, path, + start, size); + } +} + +int add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_path *path; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + block_group = btrfs_lookup_block_group(fs_info, start); + if (!block_group) { + ASSERT(0); + ret = -ENOENT; + goto out; + } + + mutex_lock(&block_group->free_space_lock); + ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start, + size); + mutex_unlock(&block_group->free_space_lock); + + 
btrfs_put_block_group(block_group); +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +/* + * Populate the free space tree by walking the extent tree. Operations on the + * extent tree that happen as a result of writes to the free space tree will go + * through the normal add/remove hooks. + */ +static int populate_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path, *path2; + struct btrfs_key key; + u64 start, end; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + path2 = btrfs_alloc_path(); + if (!path2) { + btrfs_free_path(path); + return -ENOMEM; + } + + ret = add_new_free_space_info(trans, fs_info, block_group, path2); + if (ret) + goto out; + + mutex_lock(&block_group->free_space_lock); + + /* + * Iterate through all of the extent and metadata items in this block + * group, adding the free space between them and the free space at the + * end. Note that EXTENT_ITEM and METADATA_ITEM are less than + * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's + * contained in. 
+ */ + key.objectid = block_group->key.objectid; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); + if (ret < 0) + goto out_locked; + ASSERT(ret == 0); + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + while (1) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { + if (key.objectid >= end) + break; + + if (start < key.objectid) { + ret = __add_to_free_space_tree(trans, fs_info, + block_group, + path2, start, + key.objectid - + start); + if (ret) + goto out_locked; + } + start = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY) + start += fs_info->tree_root->nodesize; + else + start += key.offset; + } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + if (key.objectid != block_group->key.objectid) + break; + } + + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + goto out_locked; + if (ret) + break; + } + if (start < end) { + ret = __add_to_free_space_tree(trans, fs_info, block_group, + path2, start, end - start); + if (ret) + goto out_locked; + } + + ret = 0; +out_locked: + mutex_unlock(&block_group->free_space_lock); +out: + btrfs_free_path(path2); + btrfs_free_path(path); + return ret; +} + +int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *free_space_root; + struct btrfs_block_group_cache *block_group; + struct rb_node *node; + int ret; + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + fs_info->creating_free_space_tree = 1; + free_space_root = btrfs_create_tree(trans, fs_info, + BTRFS_FREE_SPACE_TREE_OBJECTID); + if (IS_ERR(free_space_root)) { + ret = PTR_ERR(free_space_root); + goto abort; + } + fs_info->free_space_root = free_space_root; + + node 
= rb_first(&fs_info->block_group_cache_tree); + while (node) { + block_group = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + ret = populate_free_space_tree(trans, fs_info, block_group); + if (ret) + goto abort; + node = rb_next(node); + } + + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + fs_info->creating_free_space_tree = 0; + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + return 0; + +abort: + fs_info->creating_free_space_tree = 0; + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); + return ret; +} + +static int clear_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int nr; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + key.objectid = 0; + key.type = 0; + key.offset = 0; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + nr = btrfs_header_nritems(path->nodes[0]); + if (!nr) + break; + + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); + if (ret) + goto out; + + btrfs_release_path(path); + } + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *free_space_root = fs_info->free_space_root; + int ret; + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); + fs_info->free_space_root = NULL; + + ret = clear_free_space_tree(trans, free_space_root); + if (ret) + goto abort; + + ret = btrfs_del_root(trans, tree_root, &free_space_root->root_key); + if (ret) + goto abort; + + list_del(&free_space_root->dirty_list); + + btrfs_tree_lock(free_space_root->node); + 
clean_tree_block(trans, tree_root->fs_info, free_space_root->node); + btrfs_tree_unlock(free_space_root->node); + btrfs_free_tree_block(trans, free_space_root, free_space_root->node, + 0, 1); + + free_extent_buffer(free_space_root->node); + free_extent_buffer(free_space_root->commit_root); + kfree(free_space_root); + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + return 0; + +abort: + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); + return ret; +} + +static int __add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + u64 start, end; + int ret; + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + block_group->needs_free_space = 0; + + ret = add_new_free_space_info(trans, fs_info, block_group, path); + if (ret) + return ret; + + return __add_to_free_space_tree(trans, fs_info, block_group, path, + block_group->key.objectid, + block_group->key.offset); +} + +int add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_path *path = NULL; + int ret = 0; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + mutex_lock(&block_group->free_space_lock); + if (!block_group->needs_free_space) + goto out; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = __add_block_group_free_space(trans, fs_info, block_group, path); + +out: + btrfs_free_path(path); + mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +int remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_root *root = 
fs_info->free_space_root; + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + u64 start, end; + int done = 0, nr; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + if (block_group->needs_free_space) { + /* We never added this block group to the free space tree. */ + return 0; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + nr++; + path->slots[0]--; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY || + found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + ret = 0; +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, + struct btrfs_path *path, + u32 expected_extent_count) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + int prev_bit = 0, bit; + /* Initialize to silence GCC. 
*/ + u64 extent_start = 0; + u64 end, offset; + u64 total_found = 0; + u32 extent_count = 0; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + root = fs_info->free_space_root; + + end = block_group->key.objectid + block_group->key.offset; + + while (1) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_FREE_SPACE_INFO_KEY) + break; + + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + ASSERT(key.objectid < end && key.objectid + key.offset <= end); + + caching_ctl->progress = key.objectid; + + offset = key.objectid; + while (offset < key.objectid + key.offset) { + bit = free_space_test_bit(block_group, path, offset); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + total_found += add_new_free_space(block_group, + fs_info, + extent_start, + offset); + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + extent_count++; + } + prev_bit = bit; + offset += block_group->sectorsize; + } + } + if (prev_bit == 1) { + total_found += add_new_free_space(block_group, fs_info, + extent_start, end); + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + caching_ctl->progress = (u64)-1; + + ret = 0; +out: + return ret; +} + +static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, + struct btrfs_path *path, + u32 expected_extent_count) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + u64 end; + u64 total_found = 0; + u32 extent_count = 0; + int ret; + + block_group = caching_ctl->block_group; + fs_info = 
block_group->fs_info; + root = fs_info->free_space_root; + + end = block_group->key.objectid + block_group->key.offset; + + while (1) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_FREE_SPACE_INFO_KEY) + break; + + ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); + ASSERT(key.objectid < end && key.objectid + key.offset <= end); + + caching_ctl->progress = key.objectid; + + total_found += add_new_free_space(block_group, fs_info, + key.objectid, + key.objectid + key.offset); + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + caching_ctl->progress = (u64)-1; + + ret = 0; +out: + return ret; +} + +int load_free_space_tree(struct btrfs_caching_control *caching_ctl) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_free_space_info *info; + struct btrfs_path *path; + u32 extent_count, flags; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Just like caching_thread() doesn't want to deadlock on the extent + * tree, we don't want to deadlock on the free space tree. 
+ */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 1; + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + flags = btrfs_free_space_flags(path->nodes[0], info); + + /* + * We left path pointing to the free space info item, so now + * load_free_space_foo can just iterate through the free space tree from + * there. + */ + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) + ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + else + ret = load_free_space_extents(caching_ctl, path, extent_count); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h new file mode 100644 index 000000000000..54ffced3bce8 --- /dev/null +++ b/fs/btrfs/free-space-tree.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_TREE +#define __BTRFS_FREE_SPACE_TREE + +/* + * The default size for new free space bitmap items. The last bitmap in a block + * group may be truncated, and none of the free space tree code assumes that + * existing bitmaps are this size. 
+ */ +#define BTRFS_FREE_SPACE_BITMAP_SIZE 256 +#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE) + +void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group); +int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); +int load_free_space_tree(struct btrfs_caching_control *caching_ctl); +int add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size); +int remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size); + +/* Exposed for testing. */ +struct btrfs_free_space_info * +search_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow); +int __add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size); +int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size); +int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); +int convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); +int free_space_test_bit(struct btrfs_block_group_cache 
*block_group, + struct btrfs_path *path, u64 offset); + +#endif diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 767a6056ac45..e50316c4af15 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -48,7 +48,7 @@ static int caching_kthread(void *data) /* Since the commit root is read-only, we can safely skip locking. */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 2; + path->reada = READA_FORWARD; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.offset = 0; @@ -282,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) } } -#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space)) +#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space)) #define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) /* @@ -334,7 +334,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } -static struct btrfs_free_space_op free_ino_op = { +static const struct btrfs_free_space_op free_ino_op = { .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -356,7 +356,7 @@ static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, return false; } -static struct btrfs_free_space_op pinned_free_ino_op = { +static const struct btrfs_free_space_op pinned_free_ino_op = { .recalc_thresholds = pinned_recalc_thresholds, .use_bitmap = pinned_use_bitmap, }; @@ -515,7 +515,7 @@ out: return ret; } -static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) { struct btrfs_path *path; int ret; @@ -555,13 +555,6 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) int ret; mutex_lock(&root->objectid_mutex); - if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); - if (ret) - goto out; - } - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { ret = -ENOSPC; goto out; diff --git 
a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index ddb347bfee23..c8e864b2d530 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -9,5 +9,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans); int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a70c5790f8f5..e28f3d4691af 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,13 @@ struct btrfs_iget_args { struct btrfs_root *root; }; +struct btrfs_dio_data { + u64 outstanding_extents; + u64 reserve; + u64 unsubmitted_oe_range_start; + u64 unsubmitted_oe_range_end; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_dir_ro_inode_operations; @@ -74,17 +81,16 @@ static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct address_space_operations btrfs_symlink_aops; static const struct file_operations btrfs_dir_file_operations; -static struct extent_io_ops btrfs_extent_io_ops; +static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; -static struct kmem_cache *btrfs_delalloc_work_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; #define S_SHIFT 12 -static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, @@ -414,15 +420,15 @@ static noinline void compress_file_range(struct inode *inode, unsigned long nr_pages_ret = 0; unsigned long 
total_compressed = 0; unsigned long total_in = 0; - unsigned long max_compressed = 128 * 1024; - unsigned long max_uncompressed = 128 * 1024; + unsigned long max_compressed = SZ_128K; + unsigned long max_uncompressed = SZ_128K; int i; int will_compress; int compress_type = root->fs_info->compress_type; int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ - if ((end - start + 1) < 16 * 1024 && + if ((end - start + 1) < SZ_16K && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); @@ -430,7 +436,7 @@ static noinline void compress_file_range(struct inode *inode, again: will_compress = 0; nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; - nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE); /* * we don't want to send crud past the end of i_size through @@ -944,7 +950,7 @@ static noinline int cow_file_range(struct inode *inode, disk_num_bytes = num_bytes; /* if this is a small write inside eof, kick off defrag */ - if (num_bytes < 64 * 1024 && + if (num_bytes < SZ_64K && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); @@ -1107,7 +1113,7 @@ static noinline void async_cow_submit(struct btrfs_work *work) * atomic_sub_return implies a barrier for waitqueue_active */ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < - 5 * 1024 * 1024 && + 5 * SZ_1M && waitqueue_active(&root->fs_info->async_submit_wait)) wake_up(&root->fs_info->async_submit_wait); @@ -1132,7 +1138,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; - int limit = 10 * 1024 * 1024; + int limit = 10 * SZ_1M; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); @@ -1148,7 +1154,7 @@ static int cow_file_range_async(struct inode 
*inode, struct page *locked_page, !btrfs_test_opt(root, FORCE_COMPRESS)) cur_end = end; else - cur_end = min(end, start + 512 * 1024 - 1); + cur_end = min(end, start + SZ_512K - 1); async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); @@ -1989,7 +1995,7 @@ again: page_start = page_offset(page); page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ @@ -2482,7 +2488,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, lock_start = backref->file_pos; lock_end = backref->file_pos + backref->num_bytes - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - 0, &cached); + &cached); ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); if (ordered) { @@ -2874,7 +2880,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - 0, &cached_state); + &cached_state); ret = test_range_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, @@ -3106,56 +3112,46 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, start, (size_t)(end - start + 1)); } -struct delayed_iput { - struct list_head list; - struct inode *inode; -}; - -/* JDM: If this is fs-wide, why can't we add a pointer to - * btrfs_inode instead and avoid the allocation? 
*/ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct delayed_iput *delayed; + struct btrfs_inode *binode = BTRFS_I(inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; - delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); - delayed->inode = inode; - spin_lock(&fs_info->delayed_iput_lock); - list_add_tail(&delayed->list, &fs_info->delayed_iputs); + if (binode->delayed_iput_count == 0) { + ASSERT(list_empty(&binode->delayed_iput)); + list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + } else { + binode->delayed_iput_count++; + } spin_unlock(&fs_info->delayed_iput_lock); } void btrfs_run_delayed_iputs(struct btrfs_root *root) { - LIST_HEAD(list); struct btrfs_fs_info *fs_info = root->fs_info; - struct delayed_iput *delayed; - int empty; spin_lock(&fs_info->delayed_iput_lock); - empty = list_empty(&fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); - if (empty) - return; - - down_read(&fs_info->delayed_iput_sem); - - spin_lock(&fs_info->delayed_iput_lock); - list_splice_init(&fs_info->delayed_iputs, &list); - spin_unlock(&fs_info->delayed_iput_lock); - - while (!list_empty(&list)) { - delayed = list_entry(list.next, struct delayed_iput, list); - list_del(&delayed->list); - iput(delayed->inode); - kfree(delayed); + while (!list_empty(&fs_info->delayed_iputs)) { + struct btrfs_inode *inode; + + inode = list_first_entry(&fs_info->delayed_iputs, + struct btrfs_inode, delayed_iput); + if (inode->delayed_iput_count) { + inode->delayed_iput_count--; + list_move_tail(&inode->delayed_iput, + &fs_info->delayed_iputs); + } else { + list_del_init(&inode->delayed_iput); + } + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + spin_lock(&fs_info->delayed_iput_lock); } - - up_read(&root->fs_info->delayed_iput_sem); + spin_unlock(&fs_info->delayed_iput_lock); } /* @@ -3351,7 +3347,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = 
-ENOMEM; goto out; } - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -3550,10 +3546,10 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, int scanned = 0; if (!xattr_access) { - xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)); - xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)); + xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)); + xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); } slot++; @@ -3774,6 +3770,7 @@ cache_acl: break; case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; break; default: @@ -4317,7 +4314,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; /* * We want to drop from the next block forward in case this new size is @@ -4348,7 +4345,7 @@ search_again: * up a huge file in a single leaf. 
Most of the time that * bytes_deleted is > 0, it will be huge by the time we get here */ - if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (be_nice && bytes_deleted > SZ_32M) { if (btrfs_should_end_transaction(trans, root)) { err = -EAGAIN; goto error; @@ -4591,7 +4588,7 @@ error: btrfs_free_path(path); - if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (be_nice && bytes_deleted > SZ_32M) { unsigned long updates = trans->delayed_ref_updates; if (updates) { trans->delayed_ref_updates = 0; @@ -4668,7 +4665,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(io_tree, page_start, page_end, &cached_state); set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, page_start); @@ -4799,7 +4796,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) while (1) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(io_tree, hole_start, block_end - 1, 0, + lock_extent_bits(io_tree, hole_start, block_end - 1, &cached_state); ordered = btrfs_lookup_ordered_range(inode, hole_start, block_end - hole_start); @@ -4875,26 +4872,6 @@ next: return err; } -static int wait_snapshoting_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - -static void wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshoting(root); - if (ret) - break; - wait_on_atomic_t(&root->will_be_snapshoted, - wait_snapshoting_atomic_t, - TASK_UNINTERRUPTIBLE); - } -} - static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4926,7 +4903,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * truncation, it must capture all writes that happened before * this truncation. 
*/ - wait_for_snapshot_creation(root); + btrfs_wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { btrfs_end_write_no_snapshoting(root); @@ -5111,7 +5088,7 @@ static void evict_inode_truncate_pages(struct inode *inode) end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, start, end, 0, &cached_state); + lock_extent_bits(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5304,7 +5281,6 @@ void btrfs_evict_inode(struct inode *inode) no_delete: btrfs_remove_delayed_node(inode); clear_inode(inode); - return; } /* @@ -5753,7 +5729,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); @@ -6481,7 +6457,7 @@ out_unlock_inode: static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); u64 index; @@ -6507,6 +6483,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; goto fail; } @@ -6540,9 +6517,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_log_new_name(trans, inode, NULL, parent); } - btrfs_end_transaction(trans, root); btrfs_balance_delayed_items(root); fail: + if (trans) + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -6687,7 +6665,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, } static noinline int uncompress_inline(struct btrfs_path *path, - struct inode *inode, struct page *page, + struct page *page, size_t pg_offset, u64 extent_offset, struct 
btrfs_file_extent_item *item) { @@ -6784,7 +6762,7 @@ again: * Chances are we'll be called again, so go ahead and do * readahead */ - path->reada = 1; + path->reada = READA_FORWARD; } ret = btrfs_lookup_file_extent(trans, root, path, @@ -6883,8 +6861,7 @@ next: if (create == 0 && !PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, inode, page, - pg_offset, + ret = uncompress_inline(path, page, pg_offset, extent_offset, item); if (ret) { err = ret; @@ -7380,7 +7357,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, while (1) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, cached_state); + cached_state); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure theres no ordered @@ -7408,25 +7385,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } else { - /* Screw you mmap */ - ret = btrfs_fdatawrite_range(inode, lockstart, lockend); - if (ret) - break; - ret = filemap_fdatawait_range(inode->i_mapping, - lockstart, - lockend); - if (ret) - break; - /* - * If we found a page that couldn't be invalidated just - * fall back to buffered. + * We could trigger writeback for this range (and wait + * for it to complete) and then invalidate the pages for + * this range (through invalidate_inode_pages2_range()), + * but that can lead us to a deadlock with a concurrent + * call to readpages() (a buffered read or a defrag call + * triggered a readahead) on a page lock due to an + * ordered dio extent we created before but did not have + * yet a corresponding bio submitted (whence it can not + * complete), which makes readpages() wait for that + * ordered extent to complete while holding a lock on + * that page. 
*/ - ret = invalidate_inode_pages2_range(inode->i_mapping, - lockstart >> PAGE_CACHE_SHIFT, - lockend >> PAGE_CACHE_SHIFT); - if (ret) - break; + ret = -ENOTBLK; + break; } cond_resched(); @@ -7482,11 +7455,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, return em; } -struct btrfs_dio_data { - u64 outstanding_extents; - u64 reserve; -}; - static void adjust_dio_outstanding_extents(struct inode *inode, struct btrfs_dio_data *dio_data, const u64 len) @@ -7670,6 +7638,7 @@ unlock: btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; + dio_data->unsubmitted_oe_range_end = start + len; current->journal_info = dio_data; } @@ -7992,22 +7961,22 @@ static void btrfs_endio_direct_read(struct bio *bio) bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio) +static void btrfs_endio_direct_write_update_ordered(struct inode *inode, + const u64 offset, + const u64 bytes, + const int uptodate) { - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered = NULL; - u64 ordered_offset = dip->logical_offset; - u64 ordered_bytes = dip->bytes; - struct bio *dio_bio; + u64 ordered_offset = offset; + u64 ordered_bytes = bytes; int ret; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, - !bio->bi_error); + uptodate); if (!ret) goto out_test; @@ -8020,13 +7989,22 @@ out_test: * our bio might span multiple ordered extents. 
If we haven't * completed the accounting for the whole dio, go back and try again */ - if (ordered_offset < dip->logical_offset + dip->bytes) { - ordered_bytes = dip->logical_offset + dip->bytes - - ordered_offset; + if (ordered_offset < offset + bytes) { + ordered_bytes = offset + bytes - ordered_offset; ordered = NULL; goto again; } - dio_bio = dip->dio_bio; +} + +static void btrfs_endio_direct_write(struct bio *bio) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct bio *dio_bio = dip->dio_bio; + + btrfs_endio_direct_write_update_ordered(dip->inode, + dip->logical_offset, + dip->bytes, + !bio->bi_error); kfree(dip); @@ -8334,6 +8312,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->subio_endio = btrfs_subio_endio_read; } + /* + * Reset the range for unsubmitted ordered extents (to a 0 length range) + * even if we fail to submit a bio, because in such case we do the + * corresponding error handling below and it must not be done a second + * time by btrfs_direct_IO(). + */ + if (write) { + struct btrfs_dio_data *dio_data = current->journal_info; + + dio_data->unsubmitted_oe_range_end = dip->logical_offset + + dip->bytes; + dio_data->unsubmitted_oe_range_start = + dio_data->unsubmitted_oe_range_end; + } + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; @@ -8362,24 +8355,15 @@ free_ordered: dip = NULL; io_bio = NULL; } else { - if (write) { - struct btrfs_ordered_extent *ordered; - - ordered = btrfs_lookup_ordered_extent(inode, - file_offset); - set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); - /* - * Decrements our ref on the ordered extent and removes - * the ordered extent from the inode's ordered tree, - * doing all the proper resource cleanup such as for the - * reserved space and waking up any waiters for this - * ordered extent (through btrfs_remove_ordered_extent). 
- */ - btrfs_finish_ordered_io(ordered); - } else { + if (write) + btrfs_endio_direct_write_update_ordered(inode, + file_offset, + dio_bio->bi_iter.bi_size, + 0); + else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - } + dio_bio->bi_error = -EIO; /* * Releases and cleans up our dio_bio, no need to bio_put() @@ -8463,7 +8447,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); relock = true; } ret = btrfs_delalloc_reserve_space(inode, offset, count); @@ -8479,6 +8463,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * originally calculated. Abuse current->journal_info for this. */ dio_data.reserve = round_up(count, root->sectorsize); + dio_data.unsubmitted_oe_range_start = (u64)offset; + dio_data.unsubmitted_oe_range_end = (u64)offset; current->journal_info = &dio_data; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { @@ -8497,6 +8483,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (dio_data.reserve) btrfs_delalloc_release_space(inode, offset, dio_data.reserve); + /* + * On error we might have left some ordered extents + * without submitting corresponding bios for them, so + * cleanup them up to avoid other tasks getting them + * and waiting for them to complete forever. 
+ */ + if (dio_data.unsubmitted_oe_range_start < + dio_data.unsubmitted_oe_range_end) + btrfs_endio_direct_write_update_ordered(inode, + dio_data.unsubmitted_oe_range_start, + dio_data.unsubmitted_oe_range_end - + dio_data.unsubmitted_oe_range_start, + 0); } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); @@ -8505,7 +8504,7 @@ out: if (wakeup) inode_dio_end(inode); if (relock) - mutex_lock(&inode->i_mutex); + inode_lock(inode); return ret; } @@ -8534,15 +8533,28 @@ int btrfs_readpage(struct file *file, struct page *page) static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; - + struct inode *inode = page->mapping->host; + int ret; if (current->flags & PF_MEMALLOC) { redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; } + + /* + * If we are under memory pressure we will call this directly from the + * VM, we need to make sure we have the inode referenced for the ordered + * extent. If not just return like we didn't do anything. 
+ */ + if (!igrab(inode)) { + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; + } tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_write_full_page(tree, page, btrfs_get_extent, wbc); + ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc); + btrfs_add_delayed_iput(inode); + return ret; } static int btrfs_writepages(struct address_space *mapping, @@ -8614,7 +8626,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, } if (!inode_evicting) - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(tree, page_start, page_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { /* @@ -8652,7 +8664,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_put_ordered_extent(ordered); if (!inode_evicting) { cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, + lock_extent_bits(tree, page_start, page_end, &cached_state); } } @@ -8750,7 +8762,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(io_tree, page_start, page_end, &cached_state); set_page_extent_mapped(page); /* @@ -9024,6 +9036,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; + ei->delayed_iput_count = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; @@ -9048,6 +9061,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); + INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); return inode; @@ -9152,15 +9166,14 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); if (btrfs_free_space_cachep) kmem_cache_destroy(btrfs_free_space_cachep); - if (btrfs_delalloc_work_cachep) - 
kmem_cache_destroy(btrfs_delalloc_work_cachep); } int btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + init_once); if (!btrfs_inode_cachep) goto fail; @@ -9188,13 +9201,6 @@ int btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; - btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", - sizeof(struct btrfs_delalloc_work), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - NULL); - if (!btrfs_delalloc_work_cachep) - goto fail; - return 0; fail: btrfs_destroy_cachep(); @@ -9418,14 +9424,10 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) delalloc_work = container_of(work, struct btrfs_delalloc_work, work); inode = delalloc_work->inode; - if (delalloc_work->wait) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - } else { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); - } if (delalloc_work->delay_iput) btrfs_add_delayed_iput(inode); @@ -9435,18 +9437,17 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) } struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int wait, int delay_iput) + int delay_iput) { struct btrfs_delalloc_work *work; - work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + work = kmalloc(sizeof(*work), GFP_NOFS); if (!work) return NULL; init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - work->wait = wait; work->delay_iput = delay_iput; WARN_ON_ONCE(!inode); btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, @@ -9458,7 +9459,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, void 
btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) { wait_for_completion(&work->completion); - kmem_cache_free(btrfs_delalloc_work_cachep, work); + kfree(work); } /* @@ -9494,7 +9495,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, } spin_unlock(&root->delalloc_lock); - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + work = btrfs_alloc_delalloc_work(inode, delay_iput); if (!work) { if (delay_iput) btrfs_add_delayed_iput(inode); @@ -9636,9 +9637,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, /* * 2 items for inode item and ref * 2 items for dir items + * 1 item for updating parent inode item + * 1 item for the inline extent item * 1 item for xattr if selinux is on */ - trans = btrfs_start_transaction(root, 5); + trans = btrfs_start_transaction(root, 7); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -9669,10 +9672,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock_inode; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); - if (err) - goto out_unlock_inode; - path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -9705,10 +9704,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_free_path(path); inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); + /* + * Last step, add directory indexes for our symlink inode. This is the + * last step to avoid extra cleanup of these indexes if an error happens + * elsewhere above. 
+ */ + if (!err) + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) { drop_inode = 1; goto out_unlock_inode; @@ -9759,7 +9766,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* * If we are severely fragmented we could end up with really @@ -9994,7 +10001,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .setattr = btrfs_setattr, .mknod = btrfs_mknod, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, @@ -10023,7 +10030,7 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static struct extent_io_ops btrfs_extent_io_ops = { +static const struct extent_io_ops btrfs_extent_io_ops = { .fill_delalloc = run_delalloc_range, .submit_bio_hook = btrfs_submit_bio_hook, .merge_bio_hook = btrfs_merge_bio_hook, @@ -10071,7 +10078,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, @@ -10085,7 +10092,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, @@ -10094,13 +10101,12 @@ static const struct inode_operations btrfs_special_inode_operations = { }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = 
page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .update_time = btrfs_update_time, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index da94138eb85e..952172ca7e45 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -240,7 +240,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ip_oldflags = ip->flags; i_oldflags = inode->i_flags; @@ -358,7 +358,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); return ret; } @@ -568,6 +568,10 @@ static noinline int create_subvol(struct inode *dir, goto fail; } + mutex_lock(&new_root->objectid_mutex); + new_root->highest_objectid = new_dirid; + mutex_unlock(&new_root->objectid_mutex); + /* * insert the directory item */ @@ -655,22 +659,28 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) + return -ENOMEM; + + pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), + GFP_NOFS); + pending_snapshot->path = btrfs_alloc_path(); + if (!pending_snapshot->root_item || !pending_snapshot->path) { + ret = -ENOMEM; + goto free_pending; + } + atomic_inc(&root->will_be_snapshoted); smp_mb__after_atomic(); btrfs_wait_for_no_snapshoting_writes(root); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) - goto out; + goto dec_and_free; btrfs_wait_ordered_extents(root, -1); - pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) { - ret = 
-ENOMEM; - goto out; - } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -686,7 +696,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, &pending_snapshot->qgroup_reserved, false); if (ret) - goto free; + goto dec_and_free; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -737,11 +747,14 @@ fail: btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, pending_snapshot->qgroup_reserved); -free: - kfree(pending_snapshot); -out: +dec_and_free: if (atomic_dec_and_test(&root->will_be_snapshoted)) wake_up_atomic_t(&root->will_be_snapshoted); +free_pending: + kfree(pending_snapshot->root_item); + btrfs_free_path(pending_snapshot->path); + kfree(pending_snapshot); + return ret; } @@ -868,7 +881,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return error; } @@ -992,7 +1005,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) u64 end = start + len - 1; /* get the big lock and read metadata off disk */ - lock_extent_bits(io_tree, start, end, 0, &cached); + lock_extent_bits(io_tree, start, end, &cached); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); @@ -1016,7 +1029,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) ret = false; else if ((em->block_start + em->block_len == next->block_start) && - (em->block_len > 128 * 1024 && next->block_len > 128 * 1024)) + (em->block_len > SZ_128K && next->block_len > SZ_128K)) ret = false; free_extent_map(next); @@ -1140,7 +1153,7 @@ again: page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { lock_extent_bits(tree, page_start, page_end, - 0, &cached_state); + &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); unlock_extent_cached(tree, page_start, page_end, @@ 
-1200,7 +1213,7 @@ again: page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, 0, &cached_state); + page_start, page_end - 1, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, @@ -1262,9 +1275,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT; unsigned long cluster = max_cluster; - u64 new_align = ~((u64)128 * 1024 - 1); + u64 new_align = ~((u64)SZ_128K - 1); struct page **pages = NULL; if (isize == 0) @@ -1281,7 +1294,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } if (extent_thresh == 0) - extent_thresh = 256 * 1024; + extent_thresh = SZ_256K; /* * if we were not given a file, allocate a readahead @@ -1313,7 +1326,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (newer_than) { ret = find_new_extents(root, inode, newer_than, - &newer_off, 64 * 1024); + &newer_off, SZ_64K); if (!ret) { range->start = newer_off; /* @@ -1380,18 +1393,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += cluster; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (newer_than) { if (newer_off == (u64)-1) @@ -1403,9 +1416,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, newer_off = max(newer_off + 1, (u64)i 
<< PAGE_CACHE_SHIFT); - ret = find_new_extents(root, inode, - newer_than, &newer_off, - 64 * 1024); + ret = find_new_extents(root, inode, newer_than, + &newer_off, SZ_64K); if (!ret) { range->start = newer_off; i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; @@ -1453,9 +1465,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, out_ra: if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (!file) kfree(ra); @@ -1571,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = old_size + new_size; } - if (new_size < 256 * 1024 * 1024) { + if (new_size < SZ_256M) { ret = -EINVAL; goto out_free; } @@ -2160,7 +2172,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, struct inode *inode; int ret; size_t buf_size; - const size_t buf_limit = 16 * 1024 * 1024; + const size_t buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2418,7 +2430,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Don't allow to delete a subvolume with send in progress. 
This is @@ -2531,7 +2543,7 @@ out_up_write: spin_unlock(&dest->root_item_lock); } out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) { d_invalidate(dentry); btrfs_invalidate_inodes(dest); @@ -2547,7 +2559,7 @@ out_unlock_inode: out_dput: dput(dentry); out_unlock_dir: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); out_drop_write: mnt_drop_write_file(file); out: @@ -2845,8 +2857,8 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - mutex_unlock(&inode1->i_mutex); - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode1); + inode_unlock(inode2); } static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) @@ -2854,8 +2866,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) if (inode1 < inode2) swap(inode1, inode2); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode1, I_MUTEX_PARENT); + inode_lock_nested(inode2, I_MUTEX_CHILD); } static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, @@ -2962,7 +2974,7 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, flush_dcache_page(dst_page); if (memcmp(addr, dst_addr, cmp_len)) - ret = BTRFS_SAME_DATA_DIFFERS; + ret = -EBADE; kunmap_atomic(addr); kunmap_atomic(dst_addr); @@ -3014,7 +3026,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, return 0; if (same_inode) { - mutex_lock(&src->i_mutex); + inode_lock(src); ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) @@ -3089,62 +3101,25 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, btrfs_cmp_data_free(&cmp); out_unlock: if (same_inode) - mutex_unlock(&src->i_mutex); + inode_unlock(src); else btrfs_double_inode_unlock(src, dst); return ret; } -#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) +#define 
BTRFS_MAX_DEDUPE_LEN SZ_16M -static long btrfs_ioctl_file_extent_same(struct file *file, - struct btrfs_ioctl_same_args __user *argp) +ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, + struct file *dst_file, u64 dst_loff) { - struct btrfs_ioctl_same_args *same = NULL; - struct btrfs_ioctl_same_extent_info *info; - struct inode *src = file_inode(file); - u64 off; - u64 len; - int i; - int ret; - unsigned long size; + struct inode *src = file_inode(src_file); + struct inode *dst = file_inode(dst_file); u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - bool is_admin = capable(CAP_SYS_ADMIN); - u16 count; + ssize_t res; - if (!(file->f_mode & FMODE_READ)) - return -EINVAL; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - if (get_user(count, &argp->dest_count)) { - ret = -EFAULT; - goto out; - } - - size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); - - same = memdup_user(argp, size); - - if (IS_ERR(same)) { - ret = PTR_ERR(same); - same = NULL; - goto out; - } - - off = same->logical_offset; - len = same->length; - - /* - * Limit the total length we will dedupe for each operation. - * This is intended to bound the total time spent in this - * ioctl to something sane. - */ - if (len > BTRFS_MAX_DEDUPE_LEN) - len = BTRFS_MAX_DEDUPE_LEN; + if (olen > BTRFS_MAX_DEDUPE_LEN) + olen = BTRFS_MAX_DEDUPE_LEN; if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { /* @@ -3152,58 +3127,13 @@ static long btrfs_ioctl_file_extent_same(struct file *file, * result, btrfs_cmp_data() won't correctly handle * this situation without an update. 
*/ - ret = -EINVAL; - goto out; - } - - ret = -EISDIR; - if (S_ISDIR(src->i_mode)) - goto out; - - ret = -EACCES; - if (!S_ISREG(src->i_mode)) - goto out; - - /* pre-format output fields to sane values */ - for (i = 0; i < count; i++) { - same->info[i].bytes_deduped = 0ULL; - same->info[i].status = 0; - } - - for (i = 0, info = same->info; i < count; i++, info++) { - struct inode *dst; - struct fd dst_file = fdget(info->fd); - if (!dst_file.file) { - info->status = -EBADF; - continue; - } - dst = file_inode(dst_file.file); - - if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { - info->status = -EINVAL; - } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { - info->status = -EXDEV; - } else if (S_ISDIR(dst->i_mode)) { - info->status = -EISDIR; - } else if (!S_ISREG(dst->i_mode)) { - info->status = -EACCES; - } else { - info->status = btrfs_extent_same(src, off, len, dst, - info->logical_offset); - if (info->status == 0) - info->bytes_deduped += len; - } - fdput(dst_file); + return -EINVAL; } - ret = copy_to_user(argp, same, size); - if (ret) - ret = -EFAULT; - -out: - mnt_drop_write_file(file); - kfree(same); - return ret; + res = btrfs_extent_same(src, loff, olen, dst, dst_loff); + if (res) + return res; + return olen; } static int clone_finish_inode_update(struct btrfs_trans_handle *trans, @@ -3478,7 +3408,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, return ret; } - path->reada = 2; + path->reada = READA_FORWARD; /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; @@ -3779,17 +3709,16 @@ out: return ret; } -static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static noinline int btrfs_clone_files(struct file *file, struct file *file_src, + u64 off, u64 olen, u64 destoff) { struct inode *inode = file_inode(file); + struct inode *src = file_inode(file_src); struct btrfs_root *root = BTRFS_I(inode)->root; - struct fd src_file; - struct 
inode *src; int ret; u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; - int same_inode = 0; + int same_inode = src == inode; /* * TODO: @@ -3802,54 +3731,25 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * be either compressed or non-compressed. */ - /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) - return -EINVAL; - if (btrfs_root_readonly(root)) return -EROFS; - ret = mnt_want_write_file(file); - if (ret) - return ret; - - src_file = fdget(srcfd); - if (!src_file.file) { - ret = -EBADF; - goto out_drop_write; - } - - ret = -EXDEV; - if (src_file.file->f_path.mnt != file->f_path.mnt) - goto out_fput; - - src = file_inode(src_file.file); - - ret = -EINVAL; - if (src == inode) - same_inode = 1; - - /* the src must be open for reading */ - if (!(src_file.file->f_mode & FMODE_READ)) - goto out_fput; + if (file_src->f_path.mnt != file->f_path.mnt || + src->i_sb != inode->i_sb) + return -EXDEV; /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - goto out_fput; + return -EINVAL; - ret = -EISDIR; if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - goto out_fput; - - ret = -EXDEV; - if (src->i_sb != inode->i_sb) - goto out_fput; + return -EISDIR; if (!same_inode) { btrfs_double_inode_lock(src, inode); } else { - mutex_lock(&src->i_mutex); + inode_lock(src); } /* determine range to clone */ @@ -3920,22 +3820,26 @@ out_unlock: if (!same_inode) btrfs_double_inode_unlock(src, inode); else - mutex_unlock(&src->i_mutex); -out_fput: - fdput(src_file); -out_drop_write: - mnt_drop_write_file(file); + inode_unlock(src); return ret; } -static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) { - struct 
btrfs_ioctl_clone_range_args args; + ssize_t ret; - if (copy_from_user(&args, argp, sizeof(args))) - return -EFAULT; - return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, - args.src_length, args.dest_offset); + ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out); + if (ret == 0) + ret = len; + return ret; +} + +int btrfs_clone_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, u64 len) +{ + return btrfs_clone_files(dst_file, src_file, off, len, destoff); } /* @@ -4147,7 +4051,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) return -ENOMEM; space_args.total_spaces = 0; - dest = kmalloc(alloc_size, GFP_NOFS); + dest = kmalloc(alloc_size, GFP_KERNEL); if (!dest) return -ENOMEM; dest_orig = dest; @@ -4524,7 +4428,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; } - size = min_t(u32, loi->size, 64 * 1024); + size = min_t(u32, loi->size, SZ_64K); inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); @@ -4673,7 +4577,7 @@ locked: goto out_bargs; } - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; goto out_bargs; @@ -4759,7 +4663,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root, goto out; } - bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + bargs = kzalloc(sizeof(*bargs), GFP_KERNEL); if (!bargs) { ret = -ENOMEM; goto out; @@ -5019,7 +4923,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - qsa = kzalloc(sizeof(*qsa), GFP_NOFS); + qsa = kzalloc(sizeof(*qsa), GFP_KERNEL); if (!qsa) return -ENOMEM; @@ -5149,7 +5053,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, goto out; } - args64 = kmalloc(sizeof(*args64), GFP_NOFS); + args64 = kmalloc(sizeof(*args64), GFP_KERNEL); if (!args64) { ret = -ENOMEM; goto out; @@ -5286,7 +5190,7 @@ out_unlock: 
static int btrfs_ioctl_get_supported_features(struct file *file, void __user *arg) { - static struct btrfs_ioctl_feature_flags features[3] = { + static const struct btrfs_ioctl_feature_flags features[3] = { INIT_FEATURE_FLAGS(SUPP), INIT_FEATURE_FLAGS(SAFE_SET), INIT_FEATURE_FLAGS(SAFE_CLEAR) @@ -5485,10 +5389,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: return btrfs_ioctl_balance(file, NULL); - case BTRFS_IOC_CLONE: - return btrfs_ioctl_clone(file, arg, 0, 0, 0); - case BTRFS_IOC_CLONE_RANGE: - return btrfs_ioctl_clone_range(file, argp); case BTRFS_IOC_TRANS_START: return btrfs_ioctl_trans_start(file); case BTRFS_IOC_TRANS_END: @@ -5566,8 +5466,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_fslabel(file, argp); case BTRFS_IOC_SET_FSLABEL: return btrfs_ioctl_set_fslabel(file, argp); - case BTRFS_IOC_FILE_EXTENT_SAME: - return btrfs_ioctl_file_extent_same(file, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: return btrfs_ioctl_get_supported_features(file, argp); case BTRFS_IOC_GET_FEATURES: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 8077461fc56a..d13128c70ddd 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -56,7 +56,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) atomic_dec(&eb->spinning_readers); read_unlock(&eb->lock); } - return; } /* @@ -96,7 +95,6 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); } - return; } /* diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 1a33d3eb36de..55161369fab1 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -503,7 +503,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) } spin_unlock_irqrestore(&table->cache_lock, flags); - return; } /* @@ -610,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } +static int rbio_stripe_page_index(struct 
btrfs_raid_bio *rbio, int stripe, + int index) +{ + return stripe * rbio->stripe_npages + index; +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; +} + /* * helper to index into the pstripe */ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) { - index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data, index); } /* @@ -627,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; - - index += ((rbio->nr_data + 1) * rbio->stripe_len) >> - PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data + 1, index); } /* @@ -890,6 +901,7 @@ static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; int err = bio->bi_error; + int max_errors; if (err) fail_bio_stripe(rbio, bio); @@ -902,11 +914,12 @@ static void raid_write_end_io(struct bio *bio) err = 0; /* OK, we have read all the stripes we need to. */ - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 
+ 0 : rbio->bbio->max_errors; + if (atomic_read(&rbio->error) > max_errors) err = -EIO; rbio_orig_end_io(rbio, err); - return; } /* @@ -949,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, */ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) { - unsigned long nr = stripe_len * nr_stripes; - return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); + return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes; } /* @@ -968,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, void *p; rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + - DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), - GFP_NOFS); + DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) * + sizeof(long), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1023,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[i] = page; - ClearPageUptodate(page); } return 0; } -/* allocate pages for just the p/q stripes */ +/* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { int i; struct page *page; - i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); for (; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) @@ -1123,18 +1134,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) } /* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) -{ - int index; - index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); - index += page; - return rbio->stripe_pages[index]; -} - -/* * helper function to walk our bio list and populate the bio_pages array with * the result. 
This seems expensive, but it is faster than constantly * searching through the bio list as we setup the IO in finish_rmw or stripe @@ -1177,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_bio *bbio = rbio->bbio; void *pointers[rbio->real_stripes]; - int stripe_len = rbio->stripe_len; int nr_data = rbio->nr_data; int stripe; int pagenr; @@ -1185,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) int q_stripe = -1; struct bio_list bio_list; struct bio *bio; - int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; int ret; bio_list_init(&bio_list); @@ -1228,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { @@ -1270,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * everything else. 
*/ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1294,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) if (!bbio->tgtdev_map[stripe]) continue; - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1508,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -1527,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; /* * we want to find all the pages missing from @@ -1803,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int pagenr, stripe; void **pointers; int faila = -1, failb = -1; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); struct page *page; int err; int i; @@ -1826,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. 
@@ -1937,7 +1932,7 @@ pstripe: * other endio functions will fiddle the uptodate bits */ if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < rbio->stripe_npages; i++) { if (faila != -1) { page = rbio_stripe_page(rbio, faila, i); SetPageUptodate(page); @@ -2033,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -2057,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) continue; } - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* @@ -2281,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[index] = page; - ClearPageUptodate(page); } } return 0; } -/* - * end io function used by finish_rmw. 
When we finally - * get here, we've written a full stripe - */ -static void raid_write_parity_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; - - if (bio->bi_error) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = 0; - - if (atomic_read(&rbio->error)) - err = -EIO; - - rbio_orig_end_io(rbio, err); -} - static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { @@ -2464,7 +2432,7 @@ submit_write: break; bio->bi_private = rbio; - bio->bi_end_io = raid_write_parity_end_io; + bio->bi_end_io = raid_write_end_io; submit_bio(WRITE, bio); } return; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b4ca5454ef1a..fd1c4d982463 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -708,8 +708,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } - path1->reada = 1; - path2->reada = 2; + path1->reada = READA_FORWARD; + path2->reada = READA_FORWARD; node = alloc_backref_node(cache); if (!node) { @@ -2130,7 +2130,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -3030,7 +3030,7 @@ int prealloc_file_extent_cluster(struct inode *inode, int ret = 0; BUG_ON(cluster->start != cluster->boundary[0]); - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = btrfs_check_data_free_space(inode, cluster->start, cluster->end + 1 - cluster->start); @@ -3057,7 +3057,7 @@ int prealloc_file_extent_cluster(struct inode *inode, btrfs_free_reserved_data_space(inode, cluster->start, cluster->end + 1 - cluster->start); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -3527,7 +3527,7 @@ static int find_data_references(struct reloc_control *rc, path = 
btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { @@ -3917,7 +3917,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; ret = prepare_to_relocate(rc); if (ret) { @@ -4343,7 +4343,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_TREE_RELOC_OBJECTID; key.type = BTRFS_ROOT_ITEM_KEY; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b091d94ceef6..92bf5ee732fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1514,8 +1514,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); - - return; } static inline int scrub_check_fsid(u8 fsid[], @@ -2815,7 +2813,7 @@ out: static inline int scrub_calc_parity_bitmap_len(int nsectors) { - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); + return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); } static void scrub_parity_get(struct scrub_parity *sparity) @@ -3460,7 +3458,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (em->start != chunk_offset) goto out; @@ -3507,7 +3505,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; @@ -3735,27 +3733,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) fs_info->scrub_workers = - btrfs_alloc_workqueue("btrfs-scrub", flags, + btrfs_alloc_workqueue("scrub", flags, 1, 4); 
else fs_info->scrub_workers = - btrfs_alloc_workqueue("btrfs-scrub", flags, + btrfs_alloc_workqueue("scrub", flags, max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + btrfs_alloc_workqueue("scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + btrfs_alloc_workqueue("scrubnc", flags, 1, 0); if (!fs_info->scrub_nocow_workers) goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = - btrfs_alloc_workqueue("btrfs-scrubparity", flags, + btrfs_alloc_workqueue("scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; @@ -4211,7 +4209,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len, io_tree = &BTRFS_I(inode)->io_tree; - lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + lock_extent_bits(io_tree, lockstart, lockend, &cached_state); ordered = btrfs_lookup_ordered_range(inode, lockstart, len); if (ordered) { btrfs_put_ordered_extent(ordered); @@ -4281,7 +4279,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, return PTR_ERR(inode); /* Avoid truncate/dio/punch hole.. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode_dio_wait(inode); physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; @@ -4360,7 +4358,7 @@ next_page: } ret = COPY_COMPLETE; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); return ret; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 355a458cba1a..63a6152be04b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1469,7 +1469,21 @@ static int read_symlink(struct btrfs_root *root, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret); + if (ret) { + /* + * An empty symlink inode. 
Can happen in rare error paths when + * creating a symlink (transaction committed before the inode + * eviction handler removed the symlink inode items and a crash + * happened in between or the subvol was snapshoted in between). + * Print an informative message to dmesg/syslog so that the user + * can delete the symlink. + */ + btrfs_err(root->fs_info, + "Found empty symlink inode %llu at root %llu", + ino, root->root_key.objectid); + ret = -EIO; + goto out; + } ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 48d425aef05b..02e00166c4da 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -22,8 +22,8 @@ #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" #define BTRFS_SEND_STREAM_VERSION 1 -#define BTRFS_SEND_BUF_SIZE (1024 * 64) -#define BTRFS_SEND_READ_SIZE (1024 * 48) +#define BTRFS_SEND_BUF_SIZE SZ_64K +#define BTRFS_SEND_READ_SIZE (48 * SZ_1K) enum btrfs_tlv_type { BTRFS_TLV_U8, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 24154e422945..d41e09fe8e38 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -295,10 +295,11 @@ enum { Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, - Opt_no_space_cache, Opt_recovery, Opt_skip_balance, - Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_space_cache, Opt_space_cache_version, Opt_clear_cache, + Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, + Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, + Opt_skip_balance, Opt_check_integrity, + Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, 
Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, @@ -309,7 +310,7 @@ enum { Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_degraded, "degraded"}, {Opt_subvol, "subvol=%s"}, {Opt_subvolid, "subvolid=%s"}, @@ -340,6 +341,7 @@ static match_table_t tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_space_cache, "space_cache"}, + {Opt_space_cache_version, "space_cache=%s"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_enospc_debug, "enospc_debug"}, @@ -381,9 +383,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; + enum btrfs_compression_type saved_compress_type; + bool saved_compress_force; + int no_compress = 0; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (cache_gen) + if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE)) + btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); + else if (cache_gen) btrfs_set_opt(info->mount_opt, SPACE_CACHE); if (!options) @@ -458,6 +465,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) /* Fallthrough */ case Opt_compress: case Opt_compress_type: + saved_compress_type = btrfs_test_opt(root, COMPRESS) ? 
+ info->compress_type : BTRFS_COMPRESS_NONE; + saved_compress_force = + btrfs_test_opt(root, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -466,6 +477,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); + no_compress = 0; } else if (strcmp(args[0].from, "lzo") == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; @@ -473,25 +485,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); + no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; + no_compress++; } else { ret = -EINVAL; goto out; } if (compress_force) { - btrfs_set_and_info(root, FORCE_COMPRESS, - "force %s compression", - compress_type); + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); } else { - if (!btrfs_test_opt(root, COMPRESS)) - btrfs_info(root->fs_info, - "btrfs: use %s compression", - compress_type); /* * If we remount from compress-force=xxx to * compress=xxx, we need clear FORCE_COMPRESS @@ -500,6 +508,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } + if ((btrfs_test_opt(root, COMPRESS) && + (info->compress_type != saved_compress_type || + compress_force != saved_compress_force)) || + (!btrfs_test_opt(root, COMPRESS) && + no_compress == 1)) { + btrfs_info(root->fs_info, + "%s %s compression", + (compress_force) ? 
"force" : "use", + compress_type); + } + compress_force = false; break; case Opt_ssd: btrfs_set_and_info(root, SSD, @@ -617,15 +636,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) "turning off discard"); break; case Opt_space_cache: - btrfs_set_and_info(root, SPACE_CACHE, - "enabling disk space caching"); + case Opt_space_cache_version: + if (token == Opt_space_cache || + strcmp(args[0].from, "v1") == 0) { + btrfs_clear_opt(root->fs_info->mount_opt, + FREE_SPACE_TREE); + btrfs_set_and_info(root, SPACE_CACHE, + "enabling disk space caching"); + } else if (strcmp(args[0].from, "v2") == 0) { + btrfs_clear_opt(root->fs_info->mount_opt, + SPACE_CACHE); + btrfs_set_and_info(root, FREE_SPACE_TREE, + "enabling free space tree"); + } else { + ret = -EINVAL; + goto out; + } break; case Opt_rescan_uuid_tree: btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - btrfs_clear_and_info(root, SPACE_CACHE, - "disabling disk space caching"); + if (btrfs_test_opt(root, SPACE_CACHE)) { + btrfs_clear_and_info(root, SPACE_CACHE, + "disabling disk space caching"); + } + if (btrfs_test_opt(root, FREE_SPACE_TREE)) { + btrfs_clear_and_info(root, FREE_SPACE_TREE, + "disabling free space tree"); + } break; case Opt_inode_cache: btrfs_set_pending_and_info(info, INODE_MAP_CACHE, @@ -754,8 +793,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && + !btrfs_test_opt(root, FREE_SPACE_TREE) && + !btrfs_test_opt(root, CLEAR_CACHE)) { + btrfs_err(root->fs_info, "cannot disable free space tree"); + ret = -EINVAL; + + } if (!ret && btrfs_test_opt(root, SPACE_CACHE)) btrfs_info(root->fs_info, "disk space caching is enabled"); + if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE)) + btrfs_info(root->fs_info, "using free space tree"); kfree(orig); return ret; } @@ -1162,6 +1210,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, 
",noacl"); if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); + else if (btrfs_test_opt(root, FREE_SPACE_TREE)) + seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, RESCAN_UUID_TREE)) @@ -1514,9 +1564,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if ((flags ^ s->s_flags) & MS_RDONLY) error = -EBUSY; } else { - char b[BDEVNAME_SIZE]; - - strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); @@ -1865,7 +1913,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * btrfs starts at an offset of at least 1MB when doing chunk * allocation. */ - skip_space = 1024 * 1024; + skip_space = SZ_1M; /* user can set the offset in fs_info->alloc_start. */ if (fs_info->alloc_start && @@ -1956,6 +2004,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * there are other factors that may change the result (like a new metadata * chunk). * + * If metadata is exhausted, f_bavail will be 0. + * * FIXME: not accurate for mixed block groups, total and free/used are ok, * available appears slightly larger. 
*/ @@ -1967,11 +2017,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; + u64 total_free_meta = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)fs_info->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; + u64 thresh = 0; /* * holding chunk_muext to avoid allocating new chunks, holding @@ -1997,6 +2049,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + total_free_meta += found->disk_total - found->disk_used; total_used += found->disk_used; } @@ -2019,6 +2073,24 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; + /* + * We calculate the remaining metadata space minus global reserve. If + * this is (supposedly) smaller than zero, there's no space. But this + * does not hold in practice, the exhausted state happens where's still + * some positive delta. So we apply some guesswork and compare the + * delta to a 4M threshold. (Practically observed delta was ~2M.) + * + * We probably cannot calculate the exact threshold value because this + * depends on the internal reservations requested by various + * operations, so some operations that consume a few metadata will + * succeed even if the Avail is zero. But this is better than the other + * way around. 
+ */ + thresh = 4 * 1024 * 1024; + + if (total_free_meta - thresh < block_rsv->size) + buf->f_bavail = 0; + buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_namelen = BTRFS_NAME_LEN; @@ -2225,6 +2297,9 @@ static int btrfs_run_sanity_tests(void) if (ret) goto out; ret = btrfs_test_qgroups(); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(); out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 9626252ee6b4..b1d920b30070 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -21,6 +21,9 @@ #include <linux/magic.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../free-space-cache.h" +#include "../free-space-tree.h" +#include "../transaction.h" #include "../volumes.h" #include "../disk-io.h" #include "../qgroup.h" @@ -122,6 +125,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + extent_io_tree_init(&fs_info->freed_extents[0], NULL); + extent_io_tree_init(&fs_info->freed_extents[1], NULL); + fs_info->pinned_extents = &fs_info->freed_extents[0]; return fs_info; } @@ -169,3 +175,55 @@ void btrfs_free_dummy_root(struct btrfs_root *root) kfree(root); } +struct btrfs_block_group_cache * +btrfs_alloc_dummy_block_group(unsigned long length) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + cache->fs_info = btrfs_alloc_dummy_fs_info(); + if (!cache->fs_info) { + kfree(cache->free_space_ctl); + kfree(cache); + return NULL; + } + + cache->key.objectid = 0; + cache->key.offset = length; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = 4096; 
+ cache->full_stripe_len = 4096; + + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + btrfs_init_free_space_ctl(cache); + mutex_init(&cache->free_space_lock); + + return cache; +} + +void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache) +{ + if (!cache) + return; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + kfree(cache->free_space_ctl); + kfree(cache); +} + +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + INIT_LIST_HEAD(&trans->qgroup_ref_list); + trans->type = __TRANS_DUMMY; +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index fd3954224480..054b8c73c951 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -24,17 +24,23 @@ #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) struct btrfs_root; +struct btrfs_trans_handle; int btrfs_test_free_space_cache(void); int btrfs_test_extent_buffer_operations(void); int btrfs_test_extent_io(void); int btrfs_test_inodes(void); int btrfs_test_qgroups(void); +int btrfs_test_free_space_tree(void); int btrfs_init_test_fs(void); void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); void btrfs_free_dummy_root(struct btrfs_root *root); +struct btrfs_block_group_cache * +btrfs_alloc_dummy_block_group(unsigned long length); +void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else static inline int btrfs_test_free_space_cache(void) { @@ -63,6 +69,10 @@ static inline int btrfs_test_qgroups(void) { return 0; } +static inline int btrfs_test_free_space_tree(void) +{ + return 0; +} #endif #endif diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 9e9f2368177d..e29fa297e053 100644 --- 
a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/slab.h> +#include <linux/sizes.h> #include "btrfs-tests.h" #include "../extent_io.h" @@ -70,12 +72,14 @@ static int test_find_delalloc(void) struct page *page; struct page *locked_page = NULL; unsigned long index = 0; - u64 total_dirty = 256 * 1024 * 1024; - u64 max_bytes = 128 * 1024 * 1024; + u64 total_dirty = SZ_256M; + u64 max_bytes = SZ_128M; u64 start, end, test_start; u64 found; int ret = -EINVAL; + test_msg("Running find delalloc tests\n"); + inode = btrfs_new_test_inode(); if (!inode) { test_msg("Failed to allocate test inode\n"); @@ -133,7 +137,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - test_start = 64 * 1024 * 1024; + test_start = SZ_64M; locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_CACHE_SHIFT); if (!locked_page) { @@ -220,8 +224,8 @@ static int test_find_delalloc(void) * Now to test where we run into a page that is no longer dirty in the * range we want to find. 
*/ - page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024)) - >> PAGE_CACHE_SHIFT); + page = find_get_page(inode->i_mapping, + (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT); if (!page) { test_msg("Couldn't find our page\n"); goto out_bits; @@ -268,8 +272,139 @@ out: return ret; } +static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, + unsigned long len) +{ + unsigned long i, x; + + memset(bitmap, 0, len); + memset_extent_buffer(eb, 0, 0, len); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Bitmap was not zeroed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting all bits failed\n"); + return -EINVAL; + } + + bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing all bits failed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting straddling pages failed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + bitmap_clear(bitmap, + (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing straddling pages failed\n"); + return -EINVAL; + } + + /* + * Generate a wonky pseudo-random bit pattern for the sake of not using + * something repetitive that could miss some hypothetical off-by-n bug. 
+ */ + x = 0; + for (i = 0; i < len / sizeof(long); i++) { + x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL; + bitmap[i] = x; + } + write_extent_buffer(eb, bitmap, 0, len); + + for (i = 0; i < len * BITS_PER_BYTE; i++) { + int bit, bit1; + + bit = !!test_bit(i, bitmap); + bit1 = !!extent_buffer_test_bit(eb, 0, i); + if (bit1 != bit) { + test_msg("Testing bit pattern failed\n"); + return -EINVAL; + } + + bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1 != bit) { + test_msg("Testing bit pattern with offset failed\n"); + return -EINVAL; + } + } + + return 0; +} + +static int test_eb_bitmaps(void) +{ + unsigned long len = PAGE_CACHE_SIZE * 4; + unsigned long *bitmap; + struct extent_buffer *eb; + int ret; + + test_msg("Running extent buffer bitmap tests\n"); + + bitmap = kmalloc(len, GFP_NOFS); + if (!bitmap) { + test_msg("Couldn't allocate test bitmap\n"); + return -ENOMEM; + } + + eb = __alloc_dummy_extent_buffer(NULL, 0, len); + if (!eb) { + test_msg("Couldn't allocate test extent buffer\n"); + kfree(bitmap); + return -ENOMEM; + } + + ret = __test_eb_bitmaps(bitmap, eb, len); + if (ret) + goto out; + + /* Do it over again with an extent buffer which isn't page-aligned. 
*/ + free_extent_buffer(eb); + eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len); + if (!eb) { + test_msg("Couldn't allocate test extent buffer\n"); + kfree(bitmap); + return -ENOMEM; + } + + ret = __test_eb_bitmaps(bitmap, eb, len); +out: + free_extent_buffer(eb); + kfree(bitmap); + return ret; +} + int btrfs_test_extent_io(void) { - test_msg("Running find delalloc tests\n"); - return test_find_delalloc(); + int ret; + + test_msg("Running extent I/O tests\n"); + + ret = test_find_delalloc(); + if (ret) + goto out; + + ret = test_eb_bitmaps(); +out: + test_msg("Extent I/O tests finished\n"); + return ret; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 8b72b005bfb9..c9ad97b1e690 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -23,41 +23,6 @@ #include "../free-space-cache.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) -static struct btrfs_block_group_cache *init_test_block_group(void) -{ - struct btrfs_block_group_cache *cache; - - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return NULL; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return NULL; - } - cache->fs_info = btrfs_alloc_dummy_fs_info(); - if (!cache->fs_info) { - kfree(cache->free_space_ctl); - kfree(cache); - return NULL; - } - - cache->key.objectid = 0; - cache->key.offset = 1024 * 1024 * 1024; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; - cache->full_stripe_len = 4096; - - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->bg_list); - - btrfs_init_free_space_ctl(cache); - - return cache; -} /* * This test just does basic sanity checking, making sure we can add an exten @@ -71,59 +36,59 @@ static int test_extents(struct btrfs_block_group_cache *cache) test_msg("Running extent only tests\n"); /* First 
just make sure we can remove an entire entry */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error adding initial extents %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error removing extent %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_4M)) { test_msg("Full remove left some lingering space\n"); return -1; } /* Ok edge and middle cases now */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error adding half extent %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M); if (ret) { test_msg("Error removing tail end %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { test_msg("Error removing front end %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); + ret = btrfs_remove_free_space(cache, SZ_2M, 4096); if (ret) { test_msg("Error removing middle piece %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_1M)) { test_msg("Still have space at the front\n"); return -1; } - if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) { + if (test_check_exists(cache, SZ_2M, 4096)) { test_msg("Still have space in the middle\n"); return -1; } - if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) { test_msg("Still have space at the end\n"); return -1; } @@ -141,30 +106,30 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) test_msg("Running bitmap only tests\n"); - ret = 
test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { test_msg("Couldn't create a bitmap entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error removing bitmap full range %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_4M)) { test_msg("Left some space in bitmap\n"); return -1; } - ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { test_msg("Couldn't add to our bitmap entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M); if (ret) { test_msg("Couldn't remove middle chunk %d\n", ret); return ret; @@ -177,23 +142,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); /* Test a bit straddling two bitmaps */ - ret = test_add_free_space_entry(cache, next_bitmap_offset - - (2 * 1024 * 1024), 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, + SZ_4M, 1); if (ret) { test_msg("Couldn't add space that straddles two bitmaps %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, next_bitmap_offset - - (1 * 1024 * 1024), 2 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M); if (ret) { test_msg("Couldn't remove overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), - 2 * 1024 * 1024)) { + if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) { test_msg("Left some space when removing overlapping\n"); return -1; } @@ -216,43 +179,43 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * bitmap, but the 
free space completely in the extent and then * completely in the bitmap. */ - ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1); if (ret) { test_msg("Couldn't create bitmap entry %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { test_msg("Couldn't remove extent entry %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_1M)) { test_msg("Left remnants after our remove\n"); return -1; } /* Now to add back the extent entry and remove from the bitmap */ - ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { test_msg("Couldn't re-add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M); if (ret) { test_msg("Couldn't remove from bitmap %d\n", ret); return ret; } - if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { + if (test_check_exists(cache, SZ_4M, SZ_1M)) { test_msg("Left remnants in the bitmap\n"); return -1; } @@ -261,19 +224,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * Ok so a little more evil, extent entry and bitmap at the same offset, * removing an overlapping chunk. 
*/ - ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1); if (ret) { test_msg("Couldn't add to a bitmap %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M); if (ret) { test_msg("Couldn't remove overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { + if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) { test_msg("Left over pieces after removing overlapping\n"); return -1; } @@ -281,25 +244,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) __btrfs_remove_free_space_cache(cache->free_space_ctl); /* Now with the extent entry offset into the bitmap */ - ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1); if (ret) { test_msg("Couldn't add space to the bitmap %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0); if (ret) { test_msg("Couldn't add extent to the cache %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M); if (ret) { test_msg("Problem removing overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) { test_msg("Left something behind when removing space"); return -1; } @@ -315,29 +278,26 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * [ del ] */ __btrfs_remove_free_space_cache(cache->free_space_ctl); - ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, - 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 
1); if (ret) { test_msg("Couldn't add bitmap %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, - 5 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M, + 5 * SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M); if (ret) { test_msg("Failed to free our space %d\n", ret); return ret; } - if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024)) { + if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) { test_msg("Left stuff over\n"); return -1; } @@ -350,19 +310,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * to return -EAGAIN back from btrfs_remove_extent, make sure this * doesn't happen. */ - ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M); if (ret) { test_msg("Error removing bitmap and extent overlapping %d\n", ret); return ret; @@ -445,9 +405,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int ret; u64 offset; u64 max_extent_size; - - bool (*use_bitmap_op)(struct btrfs_free_space_ctl *, - struct btrfs_free_space *); + const struct btrfs_free_space_op test_free_space_ops = { + .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds, + .use_bitmap = test_use_bitmap, + }; + const 
struct btrfs_free_space_op *orig_free_space_ops; test_msg("Running space stealing from bitmap to extent\n"); @@ -469,22 +431,21 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that forces use of bitmaps as soon as we have at least 1 * extent entry. */ - use_bitmap_op = cache->free_space_ctl->op->use_bitmap; - cache->free_space_ctl->op->use_bitmap = test_use_bitmap; + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; /* * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024, - 128 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024, - 128 * 1024 * 1024 - 512 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K, + SZ_128M - SZ_512K, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; @@ -502,21 +463,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * [128Mb + 512Kb, 128Mb + 768Kb[ */ ret = btrfs_remove_free_space(cache, - 128 * 1024 * 1024 + 768 * 1024, - 128 * 1024 * 1024 - 768 * 1024); + SZ_128M + 768 * SZ_1K, + SZ_128M - 768 * SZ_1K); if (ret) { test_msg("Failed to free part of bitmap space %d\n", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. 
*/ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) { test_msg("Free space range missing\n"); return -ENOENT; } - if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024, - 256 * 1024)) { + if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) { test_msg("Free space range missing\n"); return -ENOENT; } @@ -525,8 +484,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked * as free anymore. */ - if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024, - 128 * 1024 * 1024 - 768 * 1024)) { + if (test_check_exists(cache, SZ_128M + 768 * SZ_1K, + SZ_128M - 768 * SZ_1K)) { test_msg("Bitmap region not removed from space cache\n"); return -EINVAL; } @@ -535,8 +494,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is * covered by the bitmap, isn't marked as free. */ - if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024, - 256 * 1024)) { + if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -545,8 +503,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered * by the bitmap too, isn't marked as free either. */ - if (test_check_exists(cache, 128 * 1024 * 1024, - 256 * 1024)) { + if (test_check_exists(cache, SZ_128M, SZ_256K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -556,13 +513,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * lets make sure the free space cache marks it as free in the bitmap, * and doesn't insert a new extent entry to represent this region. 
*/ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) { + if (!test_check_exists(cache, SZ_128M, SZ_512K)) { test_msg("Bitmap region not marked as free\n"); return -ENOENT; } @@ -581,8 +538,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024, - 4096); + ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -601,15 +557,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * expand the range covered by the existing extent entry that represents * the free space [128Mb - 256Kb, 128Mb - 128Kb[. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024, - 128 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) { test_msg("Extent region not marked as free\n"); return -ENOENT; } @@ -637,21 +591,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that represents the 1Mb free space, and therefore we're able to * allocate the whole free space at once. 
*/ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, - 1 * 1024 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) { test_msg("Expected region not marked as free\n"); return -ENOENT; } - if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) { + if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) { test_msg("Cache free space is not 1Mb + 4Kb\n"); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 1 * 1024 * 1024, 0, + 0, SZ_1M, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 - 256 * 1024)) { + if (offset != (SZ_128M - SZ_256K)) { test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -670,7 +623,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) { + if (offset != (SZ_128M + SZ_16M)) { test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -691,16 +644,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) /* * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024, - 128 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */ - ret = test_add_free_space_entry(cache, 0, - 128 * 1024 * 1024 - 512 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; @@ -717,22 +668,18 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * [128Mb + 128Kb, 128Mb + 256Kb[ * [128Mb - 768Kb, 128Mb - 512Kb[ */ - ret = btrfs_remove_free_space(cache, - 
0, - 128 * 1024 * 1024 - 768 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K); if (ret) { test_msg("Failed to free part of bitmap space %d\n", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) { test_msg("Free space range missing\n"); return -ENOENT; } - if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, - 256 * 1024)) { + if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) { test_msg("Free space range missing\n"); return -ENOENT; } @@ -741,8 +688,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked * as free anymore. */ - if (test_check_exists(cache, 0, - 128 * 1024 * 1024 - 768 * 1024)) { + if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) { test_msg("Bitmap region not removed from space cache\n"); return -EINVAL; } @@ -751,8 +697,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb - 512Kb, 128Mb[, which is * covered by the bitmap, isn't marked as free. */ - if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024)) { + if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -762,15 +707,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * lets make sure the free space cache marks it as free in the bitmap, * and doesn't insert a new extent entry to represent this region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. 
*/ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { test_msg("Bitmap region not marked as free\n"); return -ENOENT; } @@ -789,7 +732,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192); + ret = btrfs_add_free_space(cache, SZ_32M, 8192); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -800,13 +743,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * expand the range covered by the existing extent entry that represents * the free space [128Mb + 128Kb, 128Mb + 256Kb[. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M, SZ_128K)) { test_msg("Extent region not marked as free\n"); return -ENOENT; } @@ -834,21 +777,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that represents the 1Mb free space, and therefore we're able to * allocate the whole free space at once. 
*/ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, - 1 * 1024 * 1024)) { + if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) { test_msg("Expected region not marked as free\n"); return -ENOENT; } - if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) { + if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) { test_msg("Cache free space is not 1Mb + 8Kb\n"); return -EINVAL; } - offset = btrfs_find_space_for_alloc(cache, - 0, 1 * 1024 * 1024, 0, + offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 - 768 * 1024)) { + if (offset != (SZ_128M - 768 * SZ_1K)) { test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -867,7 +808,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) offset = btrfs_find_space_for_alloc(cache, 0, 8192, 0, &max_extent_size); - if (offset != (32 * 1024 * 1024)) { + if (offset != SZ_32M) { test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -877,7 +818,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) if (ret) return ret; - cache->free_space_ctl->op->use_bitmap = use_bitmap_op; + cache->free_space_ctl->op = orig_free_space_ops; __btrfs_remove_free_space_cache(cache->free_space_ctl); return 0; @@ -891,7 +832,7 @@ int btrfs_test_free_space_cache(void) test_msg("Running btrfs free space cache tests\n"); - cache = init_test_block_group(); + cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024); if (!cache) { test_msg("Couldn't run the tests\n"); return 0; @@ -922,9 +863,7 @@ int btrfs_test_free_space_cache(void) ret = test_steal_space_from_bitmap_to_extent(cache); out: - __btrfs_remove_free_space_cache(cache->free_space_ctl); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); test_msg("Free space cache tests 
finished\n"); return ret; diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c new file mode 100644 index 000000000000..d05fe1ab4808 --- /dev/null +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -0,0 +1,571 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../disk-io.h" +#include "../free-space-tree.h" +#include "../transaction.h" + +struct free_space_extent { + u64 start, length; +}; + +/* + * The test cases align their operations to this in order to hit some of the + * edge cases in the bitmap code. 
+ */ +#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096) + +static int __check_free_space_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path, + struct free_space_extent *extents, + unsigned int num_extents) +{ + struct btrfs_free_space_info *info; + struct btrfs_key key; + int prev_bit = 0, bit; + u64 extent_start = 0, offset, end; + u32 flags, extent_count; + unsigned int i; + int ret; + + info = search_free_space_info(trans, fs_info, cache, path, 0); + if (IS_ERR(info)) { + test_msg("Could not find free space info\n"); + ret = PTR_ERR(info); + goto out; + } + flags = btrfs_free_space_flags(path->nodes[0], info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + + if (extent_count != num_extents) { + test_msg("Extent count is wrong\n"); + ret = -EINVAL; + goto out; + } + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + if (path->slots[0] != 0) + goto invalid; + end = cache->key.objectid + cache->key.offset; + i = 0; + while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY) + goto invalid; + offset = key.objectid; + while (offset < key.objectid + key.offset) { + bit = free_space_test_bit(cache, path, offset); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + if (i >= num_extents) + goto invalid; + if (i >= num_extents || + extent_start != extents[i].start || + offset - extent_start != extents[i].length) + goto invalid; + i++; + } + prev_bit = bit; + offset += cache->sectorsize; + } + } + if (prev_bit == 1) { + if (i >= num_extents || + extent_start != extents[i].start || + end - extent_start != extents[i].length) + goto invalid; + i++; + } + if (i != num_extents) + goto invalid; + } else { + if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 || + path->slots[0] != 0) 
+ goto invalid; + for (i = 0; i < num_extents; i++) { + path->slots[0]++; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY || + key.objectid != extents[i].start || + key.offset != extents[i].length) + goto invalid; + } + } + + ret = 0; +out: + btrfs_release_path(path); + return ret; +invalid: + test_msg("Free space tree is invalid\n"); + ret = -EINVAL; + goto out; +} + +static int check_free_space_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path, + struct free_space_extent *extents, + unsigned int num_extents) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + info = search_free_space_info(trans, fs_info, cache, path, 0); + if (IS_ERR(info)) { + test_msg("Could not find free space info\n"); + btrfs_release_path(path); + return PTR_ERR(info); + } + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + ret = __check_free_space_extents(trans, fs_info, cache, path, extents, + num_extents); + if (ret) + return ret; + + /* Flip it to the other format and check that for good measure. 
*/ + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + ret = convert_free_space_to_extents(trans, fs_info, cache, path); + if (ret) { + test_msg("Could not convert to extents\n"); + return ret; + } + } else { + ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path); + if (ret) { + test_msg("Could not convert to bitmaps\n"); + return ret; + } + } + return __check_free_space_extents(trans, fs_info, cache, path, extents, + num_extents); +} + +static int test_empty_block_group(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset}, + }; + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_all(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = {}; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_beginning(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid + BITMAP_RANGE, + cache->key.offset - BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); + +} + +static int test_remove_end(struct btrfs_trans_handle *trans, + struct 
btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset - BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + + cache->key.offset - BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_middle(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, BITMAP_RANGE}, + {cache->key.objectid + 2 * BITMAP_RANGE, + cache->key.offset - 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_left(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + 
return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_right(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid + BITMAP_RANGE, 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_both(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, 3 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 
BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_none(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, BITMAP_RANGE}, + {cache->key.objectid + 2 * BITMAP_RANGE, BITMAP_RANGE}, + {cache->key.objectid + 4 * BITMAP_RANGE, BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 4 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +typedef int (*test_func_t)(struct btrfs_trans_handle *, + struct btrfs_fs_info *, + struct btrfs_block_group_cache *, + struct btrfs_path *); + +static int run_test(test_func_t test_func, int bitmaps) +{ + struct btrfs_root *root = NULL; + struct btrfs_block_group_cache *cache = NULL; + struct btrfs_trans_handle trans; + struct btrfs_path *path = NULL; + int ret; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate dummy root\n"); + ret = PTR_ERR(root); + goto out; + } + + root->fs_info = 
btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; + goto out; + } + + btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE); + root->fs_info->free_space_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 8192; + + cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE); + if (!cache) { + test_msg("Couldn't allocate dummy block group cache\n"); + ret = -ENOMEM; + goto out; + } + cache->bitmap_low_thresh = 0; + cache->bitmap_high_thresh = (u32)-1; + cache->needs_free_space = 1; + + btrfs_init_dummy_trans(&trans); + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + ret = add_block_group_free_space(&trans, root->fs_info, cache); + if (ret) { + test_msg("Could not add block group free space\n"); + goto out; + } + + if (bitmaps) { + ret = convert_free_space_to_bitmaps(&trans, root->fs_info, + cache, path); + if (ret) { + test_msg("Could not convert block group to bitmaps\n"); + goto out; + } + } + + ret = test_func(&trans, root->fs_info, cache, path); + if (ret) + goto out; + + ret = remove_block_group_free_space(&trans, root->fs_info, cache); + if (ret) { + test_msg("Could not remove block group free space\n"); + goto out; + } + + if (btrfs_header_nritems(root->node) != 0) { + test_msg("Free space tree has leftover items\n"); + ret = -EINVAL; + goto out; + } + + ret = 0; +out: + btrfs_free_path(path); + btrfs_free_dummy_block_group(cache); + btrfs_free_dummy_root(root); + return ret; +} + +static int run_test_both_formats(test_func_t test_func) +{ + int ret; + + ret = run_test(test_func, 0); + if (ret) + return ret; + return 
run_test(test_func, 1); +} + +int btrfs_test_free_space_tree(void) +{ + test_func_t tests[] = { + test_empty_block_group, + test_remove_all, + test_remove_beginning, + test_remove_end, + test_remove_middle, + test_merge_left, + test_merge_right, + test_merge_both, + test_merge_none, + }; + int i; + + test_msg("Running free space tree tests\n"); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + int ret = run_test_both_formats(tests[i]); + if (ret) { + test_msg("%pf failed\n", tests[i]); + return ret; + } + } + + return 0; +} diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 054fc0d97131..5de55fdd28bc 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -100,7 +100,7 @@ static void insert_inode_item_key(struct btrfs_root *root) static void setup_file_extents(struct btrfs_root *root) { int slot = 0; - u64 disk_bytenr = 1 * 1024 * 1024; + u64 disk_bytenr = SZ_1M; u64 offset = 0; /* First we want a hole */ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 846d277b1901..8ea5d34bc5a2 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -23,14 +23,6 @@ #include "../qgroup.h" #include "../backref.h" -static void init_dummy_trans(struct btrfs_trans_handle *trans) -{ - memset(trans, 0, sizeof(*trans)); - trans->transid = 1; - INIT_LIST_HEAD(&trans->qgroup_ref_list); - trans->type = __TRANS_DUMMY; -} - static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) { @@ -44,7 +36,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); ins.objectid = bytenr; ins.type = BTRFS_EXTENT_ITEM_KEY; @@ -94,7 +86,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 refs; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); 
key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -144,7 +136,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, struct btrfs_path *path; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -178,7 +170,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, u64 refs; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -232,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root) struct ulist *new_roots = NULL; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); ret = btrfs_create_qgroup(NULL, fs_info, 5); @@ -326,7 +318,7 @@ static int test_multiple_refs(struct btrfs_root *root) struct ulist *new_roots = NULL; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); test_msg("Qgroup multiple refs test\n"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index be8eae80ff65..b6031ce474f7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) list_del_init(&em->list); free_extent_map(em); } + /* + * If any block groups are found in ->deleted_bgs then it's + * because the transaction was aborted and a commit did not + * happen (things failed before writing the new superblock + * and calling btrfs_finish_extent_commit()), so we can not + * discard the physical locations of the block groups. 
+ */ + while (!list_empty(&transaction->deleted_bgs)) { + struct btrfs_block_group_cache *cache; + + cache = list_first_entry(&transaction->deleted_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&cache->bg_list); + btrfs_put_block_group_trimming(cache); + btrfs_put_block_group(cache); + } kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -634,17 +651,20 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN, 0); + return start_transaction(root, 0, TRANS_JOIN, + BTRFS_RESERVE_NO_FLUSH); } struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0); + return start_transaction(root, 0, TRANS_JOIN_NOLOCK, + BTRFS_RESERVE_NO_FLUSH); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_USERSPACE, 0); + return start_transaction(root, 0, TRANS_USERSPACE, + BTRFS_RESERVE_NO_FLUSH); } /* @@ -662,7 +682,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_ATTACH, 0); + return start_transaction(root, 0, TRANS_ATTACH, + BTRFS_RESERVE_NO_FLUSH); } /* @@ -677,7 +698,8 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - trans = start_transaction(root, 0, TRANS_ATTACH, 0); + trans = start_transaction(root, 0, TRANS_ATTACH, + BTRFS_RESERVE_NO_FLUSH); if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) btrfs_wait_for_commit(root, 0); @@ -1319,17 +1341,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; uuid_le new_uuid; - path = btrfs_alloc_path(); - if (!path) { - pending->error = -ENOMEM; - return 0; - } + 
ASSERT(pending->path); + path = pending->path; - new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); - if (!new_root_item) { - pending->error = -ENOMEM; - goto root_item_alloc_fail; - } + ASSERT(pending->root_item); + new_root_item = pending->root_item; pending->error = btrfs_find_free_objectid(tree_root, &objectid); if (pending->error) @@ -1562,8 +1578,10 @@ clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); -root_item_alloc_fail: + pending->root_item = NULL; btrfs_free_path(path); + pending->path = NULL; + return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 64c8221b6165..72be51f7ca2f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -137,8 +137,10 @@ struct btrfs_pending_snapshot { struct dentry *dentry; struct inode *dir; struct btrfs_root *root; + struct btrfs_root_item *root_item; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; + struct btrfs_path *path; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; u64 qgroup_reserved; diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index f31db4325339..cb65089127cc 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } btrfs_release_path(path); + /* + * We don't need a lock on a leaf. btrfs_realloc_node() will lock all + * leafs from path->nodes[1], so set lowest_level to 1 to avoid later + * a deadlock (attempting to write lock an already write locked leaf). 
+ */ + path->lowest_level = 1; wret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (wret < 0) { @@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, ret = 0; goto out; } - path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, - min_trans); + /* + * The node at level 1 must always be locked when our path has + * keep_locks set and lowest_level is 1, regardless of the value of + * path->slots[1]. + */ + BUG_ON(path->locks[1] == 0); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, &last_ret, @@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, WARN_ON(ret == -EAGAIN); goto out; } + /* + * Now that we reallocated the node we can find the next key. Note that + * btrfs_find_next_key() can release our path and do another search + * without COWing, this is because even with path->keep_locks = 1, + * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a + * node when path->slots[node_level - 1] does not point to the last + * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore + * we search for the next key after reallocating our node. 
+ */ + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, + min_trans); if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a23399e8e3ab..366b335946fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { +const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, @@ -125,6 +125,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); +static void btrfs_close_one_device(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -232,6 +233,7 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); + btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); @@ -1102,7 +1104,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; key.objectid = device->devid; key.offset = start; @@ -1182,7 +1184,7 @@ again: struct map_lookup *map; int i; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { u64 end; @@ -1257,6 +1259,15 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, int 
ret; int slot; struct extent_buffer *l; + u64 min_search_start; + + /* + * We don't want to overwrite the superblock on the drive nor any area + * used by the boot loader (grub for example), so we make sure to start + * at an offset of at least 1MB. + */ + min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + search_start = max(search_start, min_search_start); path = btrfs_alloc_path(); if (!path) @@ -1271,7 +1282,7 @@ again: goto out; } - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; @@ -1397,18 +1408,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { - struct btrfs_root *root = device->dev_root; - u64 search_start; - /* FIXME use last free of some kind */ - - /* - * we don't want to overwrite the superblock on the drive, - * so we make sure to start at an offset of at least 1MB - */ - search_start = max(root->fs_info->alloc_start, 1024ull * 1024); return find_free_dev_extent_start(trans->transaction, device, - num_bytes, search_start, start, len); + num_bytes, 0, start, len); } static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, @@ -1642,7 +1644,6 @@ static void update_dev_time(char *path_name) return; file_update_time(filp); filp_close(filp, NULL); - return; } static int btrfs_rm_dev_item(struct btrfs_root *root, @@ -2755,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, free_extent_map(em); return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; lock_chunks(root->fs_info->chunk_root); check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); @@ -3406,7 +3407,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) list_for_each_entry(device, devices, dev_list) { old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); - size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + 
size_to_free = min_t(u64, size_to_free, SZ_1M); if (!device->writeable || btrfs_device_get_total_bytes(device) - btrfs_device_get_bytes_used(device) > size_to_free || @@ -3723,14 +3724,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } - /* allow dup'ed data chunks only in mixed mode */ - if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { - btrfs_err(fs_info, "dup for data is not allowed"); - ret = -EINVAL; - goto out; - } - /* allow to reduce meta or sys integrity only if force set */ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | @@ -3756,6 +3749,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } while (read_seqretry(&fs_info->profiles_lock, seq)); + if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < + btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { + btrfs_warn(fs_info, + "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", + bctl->meta.target, bctl->data.target); + } + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { fs_info->num_tolerated_disk_barrier_failures = min( btrfs_calc_num_tolerated_disk_barrier_failures(fs_info), @@ -4268,7 +4268,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; lock_chunks(root); @@ -4460,7 +4460,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b) static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) { /* TODO allow them to set a preferred stripe size */ - return 64 * 1024; + return SZ_64K; } static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) @@ -4528,21 +4528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ncopies = btrfs_raid_array[index].ncopies; if (type & BTRFS_BLOCK_GROUP_DATA) { - max_stripe_size = 1024 * 1024 * 1024; + max_stripe_size = SZ_1G; 
max_chunk_size = 10 * max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) - max_stripe_size = 1024 * 1024 * 1024; + if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) + max_stripe_size = SZ_1G; else - max_stripe_size = 256 * 1024 * 1024; + max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 32 * 1024 * 1024; + max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; @@ -4719,7 +4719,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, goto error; } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = start; em->len = num_bytes; em->block_start = 0; @@ -4793,7 +4793,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 dev_offset; u64 stripe_size; int i = 0; - int ret; + int ret = 0; em_tree = &extent_root->fs_info->mapping_tree.map_tree; read_lock(&em_tree->lock); @@ -4814,7 +4814,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); stripe_size = em->orig_block_len; @@ -4824,20 +4824,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with the map's stripes, because the device object's id can change + * at any time during that final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()). 
+ */ + mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; dev_offset = map->stripes[i].physical; ret = btrfs_update_device(trans, device); if (ret) - goto out; + break; ret = btrfs_alloc_dev_extent(trans, device, chunk_root->root_key.objectid, BTRFS_FIRST_CHUNK_TREE_OBJECTID, chunk_offset, dev_offset, stripe_size); if (ret) - goto out; + break; + } + if (ret) { + mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex); + goto out; } stripe = &chunk->stripe; @@ -4850,6 +4862,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; } + mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex); btrfs_set_stack_chunk_length(chunk, chunk_size); btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); @@ -4956,7 +4969,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->missing) { miss_ndevs++; @@ -5036,7 +5049,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) @@ -5072,7 +5085,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); free_extent_map(em); @@ -5093,7 +5106,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup 
*)em->bdev; + map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; free_extent_map(em); @@ -5252,7 +5265,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; offset = logical - em->start; stripe_len = map->stripe_len; @@ -5366,35 +5379,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * target drive. */ for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - tmp_bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = - tmp_bbio->stripes[i].physical; - } + if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add + * the mirror with the lowest physical address + */ + if (found && + physical_of_found <= tmp_bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = tmp_bbio->stripes[i].physical; } - if (found) { - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; - } else { + btrfs_put_bbio(tmp_bbio); + + if (!found) { WARN_ON(1); ret = -EIO; - btrfs_put_bbio(tmp_bbio); goto out; } - btrfs_put_bbio(tmp_bbio); + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } @@ -5794,7 +5805,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, free_extent_map(em); return -EIO; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; length = em->len; rmap_len = map->stripe_len; @@ -6057,7 +6068,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, 
struct bio *bio, bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); - if (bbio->raid_map) { + if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + ((rw & WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { @@ -6198,6 +6210,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_map *em; u64 logical; u64 length; + u64 stripe_len; u64 devid; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; @@ -6206,6 +6219,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + /* Validation check */ + if (!num_stripes) { + btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", + num_stripes); + return -EIO; + } + if (!IS_ALIGNED(logical, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk logical %llu", logical); + return -EIO; + } + if (!length || !IS_ALIGNED(length, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk length %llu", length); + return -EIO; + } + if (!is_power_of_2(stripe_len)) { + btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", + stripe_len); + return -EIO; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)) { + btrfs_err(root->fs_info, "unrecognized chunk type: %llu", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); + return -EIO; + } read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6222,7 +6266,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em = alloc_extent_map(); if (!em) return -ENOMEM; - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); map = 
kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { free_extent_map(em); @@ -6230,7 +6273,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = logical; em->len = length; em->orig_start = 0; @@ -6465,11 +6508,11 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); if (!sb) return -ENOMEM; - btrfs_set_buffer_uptodate(sb); + set_extent_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* * The sb extent buffer is artifical and just used to read the system array. - * btrfs_set_buffer_uptodate() call does not properly mark all it's + * set_extent_buffer_uptodate() call does not properly mark all it's * pages up-to-date when the page is larger: extent does not cover the * whole page and consequently check_page_uptodate does not find all * the page's extents up-to-date (the hole beyond sb), @@ -6512,6 +6555,14 @@ int btrfs_read_sys_array(struct btrfs_root *root) goto out_short_read; num_stripes = btrfs_chunk_num_stripes(sb, chunk); + if (!num_stripes) { + printk(KERN_ERR + "BTRFS: invalid number of stripes %u in sys_array at offset %u\n", + num_stripes, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; @@ -6520,6 +6571,9 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (ret) break; } else { + printk(KERN_ERR + "BTRFS: unexpected item type %u in sys_array at offset %u\n", + (u32)key.type, cur_offset); ret = -EIO; break; } @@ -6921,7 +6975,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, /* In order to kick the device replace finish process */ lock_chunks(root); list_for_each_entry(em, &transaction->pending_chunks, list) { - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < 
map->num_stripes; i++) { dev = map->stripes[i].dev; @@ -6949,7 +7003,7 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) } } -void btrfs_close_one_device(struct btrfs_device *device) +static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; struct btrfs_device *new_device; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d5c84f6b1353..1939ebde63df 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -26,7 +26,7 @@ extern struct mutex uuid_mutex; -#define BTRFS_STRIPE_LEN (64 * 1024) +#define BTRFS_STRIPE_LEN SZ_64K struct buffer_head; struct btrfs_pending_bios { @@ -566,6 +566,5 @@ static inline void unlock_chunks(struct btrfs_root *root) struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); -void btrfs_close_one_device(struct btrfs_device *device); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 1fcd7b6e7564..6c68d6356197 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(mutex_is_locked(&inode->i_mutex)); + ASSERT(inode_is_locked(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, name_len, 0); if (!di) @@ -283,7 +283,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; /* search for our xattrs */ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -351,137 +351,89 @@ err: return ret; } -/* - * List of handlers for synthetic system.* attributes. All real ondisk - * attributes are handled directly. 
- */ -const struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif - NULL, -}; - -/* - * Check if the attribute is in a supported namespace. - * - * This is applied after the check for the synthetic attributes in the system - * namespace. - */ -static int btrfs_is_valid_xattr(const char *name) +static int btrfs_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - int len = strlen(name); - int prefixlen = 0; - - if (!strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN)) - prefixlen = XATTR_SECURITY_PREFIX_LEN; - else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - prefixlen = XATTR_SYSTEM_PREFIX_LEN; - else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) - prefixlen = XATTR_TRUSTED_PREFIX_LEN; - else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - prefixlen = XATTR_USER_PREFIX_LEN; - else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - prefixlen = XATTR_BTRFS_PREFIX_LEN; - else - return -EOPNOTSUPP; - - /* - * The name cannot consist of just prefix - */ - if (len <= prefixlen) - return -EINVAL; + struct inode *inode = d_inode(dentry); - return 0; + name = xattr_full_name(handler, name); + return __btrfs_getxattr(inode, name, buffer, size); } -ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) +static int btrfs_xattr_handler_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, + int flags) { - int ret; + struct inode *inode = d_inode(dentry); - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. 
- */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, buffer, size); + name = xattr_full_name(handler, name); + return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); +} - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - return __btrfs_getxattr(d_inode(dentry), name, buffer, size); +static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, + struct dentry *dentry, + const char *name, const void *value, + size_t size, int flags) +{ + name = xattr_full_name(handler, name); + return btrfs_set_prop(d_inode(dentry), name, value, size, flags); } +static const struct xattr_handler btrfs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_btrfs_xattr_handler = { + .prefix = XATTR_BTRFS_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set_prop, +}; + +const struct xattr_handler *btrfs_xattr_handlers[] = { + &btrfs_security_xattr_handler, +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &btrfs_trusted_xattr_handler, + &btrfs_user_xattr_handler, + &btrfs_btrfs_xattr_handler, + NULL, +}; + int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - int ret; - /* - * The permission on security.* and system.* is not checked - * in permission(). 
- */ if (btrfs_root_readonly(root)) return -EROFS; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, size, flags); - - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - - if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(d_inode(dentry), name, - value, size, flags); - - if (size == 0) - value = ""; /* empty EA, do not remove */ - - return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size, - flags); + return generic_setxattr(dentry, name, value, size, flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) { struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - int ret; - /* - * The permission on security.* and system.* is not checked - * in permission(). - */ if (btrfs_root_readonly(root)) return -EROFS; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. 
- */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); - - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - - if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(d_inode(dentry), name, - NULL, 0, XATTR_REPLACE); - - return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0, - XATTR_REPLACE); + return generic_removexattr(dentry, name); } static int btrfs_initxattrs(struct inode *inode, @@ -494,7 +446,7 @@ static int btrfs_initxattrs(struct inode *inode, for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + - strlen(xattr->name) + 1, GFP_NOFS); + strlen(xattr->name) + 1, GFP_KERNEL); if (!name) { err = -ENOMEM; break; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 5049608d1388..96807b3d22f5 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -28,8 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); -extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size); extern int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int btrfs_removexattr(struct dentry *dentry, const char *name); |