diff options
Diffstat (limited to 'fs/btrfs')
63 files changed, 4489 insertions, 3535 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9a0ff3384381..e738f6206ea5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o discard.o + block-rsv.o delalloc-space.o block-group.o discard.o reflink.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 1d32a07bb2d1..309516e6a968 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -395,3 +395,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work) { set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } + +void btrfs_flush_workqueue(struct btrfs_workqueue *wq) +{ + if (wq->high) + flush_workqueue(wq->high->normal_wq); + + flush_workqueue(wq->normal->normal_wq); +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index a4434301d84d..3204daa51b95 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -44,5 +44,6 @@ void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work); struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); +void btrfs_flush_workqueue(struct btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e5d85311d5d5..9c380e7edf62 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -347,33 +347,10 @@ static int add_prelim_ref(const struct btrfs_fs_info *fs_info, return -ENOMEM; ref->root_id = root_id; - if (key) { + if (key) ref->key_for_search = *key; - /* - * We can often find data backrefs with an offset that is too - * large (>= LLONG_MAX, maximum allowed file offset) due to - * underflows when subtracting a file's offset with the data - * offset of its corresponding extent data item. This can - * happen for example in the clone ioctl. - * So if we detect such case we set the search key's offset to - * zero to make sure we will find the matching file extent item - * at add_all_parents(), otherwise we will miss it because the - * offset taken form the backref is much larger then the offset - * of the file extent item. This can make us scan a very large - * number of file extent items, but at least it will not make - * us miss any. - * This is an ugly workaround for a behaviour that should have - * never existed, but it does and a fix for the clone ioctl - * would touch a lot of places, cause backwards incompatibility - * and would not fix the problem for extents cloned with older - * kernels. - */ - if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY && - ref->key_for_search.offset >= LLONG_MAX) - ref->key_for_search.offset = 0; - } else { + else memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); - } ref->inode_list = NULL; ref->level = level; @@ -409,10 +386,36 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info, wanted_disk_byte, count, sc, gfp_mask); } +static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) +{ + struct rb_node **p = &preftrees->direct.root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct prelim_ref *ref = NULL; + struct prelim_ref target = {0}; + int result; + + target.parent = bytenr; + + while (*p) { + parent = *p; + ref = rb_entry(parent, struct prelim_ref, rbnode); + result = prelim_ref_compare(ref, &target); + + if (result < 0) + p = &(*p)->rb_left; + else if (result > 0) + p = &(*p)->rb_right; + else + return 1; + } + return 0; +} + static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, struct prelim_ref *ref, + struct ulist *parents, + struct preftrees *preftrees, struct prelim_ref *ref, int level, u64 time_seq, const u64 *extent_item_pos, - u64 total_refs, bool ignore_offset) + bool ignore_offset) { int ret = 0; int slot; @@ -424,6 +427,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, u64 disk_byte; u64 wanted_disk_byte = ref->wanted_disk_byte; u64 count = 0; + u64 data_offset; if (level != 0) { eb = path->nodes[level]; @@ -434,18 +438,26 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } /* - * We normally enter this function with the path already pointing to - * the first item to check. But sometimes, we may enter it with - * slot==nritems. In that case, go to the next leaf before we continue. + * 1. We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot == nritems. + * 2. We are searching for normal backref but bytenr of this leaf + * matches shared data backref + * 3. The leaf owner is not equal to the root we are searching + * + * For these cases, go to the next leaf before we continue. */ - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb) || + is_shared_data_backref(preftrees, eb->start) || + ref->root_id != btrfs_header_owner(eb)) { if (time_seq == SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); } - while (!ret && count < total_refs) { + while (!ret && count < ref->count) { eb = path->nodes[0]; slot = path->slots[0]; @@ -455,13 +467,31 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, key.type != BTRFS_EXTENT_DATA_KEY) break; + /* + * We are searching for normal backref but bytenr of this leaf + * matches shared data backref, OR + * the leaf owner is not equal to the root we are searching for + */ + if (slot == 0 && + (is_shared_data_backref(preftrees, eb->start) || + ref->root_id != btrfs_header_owner(eb))) { + if (time_seq == SEQ_LAST) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, time_seq); + continue; + } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + data_offset = btrfs_file_extent_offset(eb, fi); if (disk_byte == wanted_disk_byte) { eie = NULL; old = NULL; - count++; + if (ref->key_for_search.offset == key.offset - data_offset) + count++; + else + goto next; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -502,9 +532,9 @@ next: */ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, + struct preftrees *preftrees, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, u64 total_refs, - bool ignore_offset) + const u64 *extent_item_pos, bool ignore_offset) { struct btrfs_root *root; struct btrfs_key root_key; @@ -512,23 +542,25 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, int ret = 0; int root_level; int level = ref->level; - int index; + struct btrfs_key search_key = ref->key_for_search; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - root = btrfs_get_fs_root(fs_info, &root_key, false); if (IS_ERR(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); + goto out_free; + } + + if (!path->search_commit_root && + test_bit(BTRFS_ROOT_DELETING, &root->state)) { + ret = -ENOENT; goto out; } if (btrfs_is_testing(fs_info)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -ENOENT; goto out; } @@ -540,21 +572,36 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, else root_level = btrfs_old_root_level(root, time_seq); - if (root_level + 1 == level) { - srcu_read_unlock(&fs_info->subvol_srcu, index); + if (root_level + 1 == level) goto out; - } + /* + * We can often find data backrefs with an offset that is too large + * (>= LLONG_MAX, maximum allowed file offset) due to underflows when + * subtracting a file's offset with the data offset of its + * corresponding extent data item. This can happen for example in the + * clone ioctl. + * + * So if we detect such case we set the search key's offset to zero to + * make sure we will find the matching file extent item at + * add_all_parents(), otherwise we will miss it because the offset + * taken form the backref is much larger then the offset of the file + * extent item. This can make us scan a very large number of file + * extent items, but at least it will not make us miss any. + * + * This is an ugly workaround for a behaviour that should have never + * existed, but it does and a fix for the clone ioctl would touch a lot + * of places, cause backwards incompatibility and would not fix the + * problem for extents cloned with older kernels. + */ + if (search_key.type == BTRFS_EXTENT_DATA_KEY && + search_key.offset >= LLONG_MAX) + search_key.offset = 0; path->lowest_level = level; if (time_seq == SEQ_LAST) - ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, - 0, 0); + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, - time_seq); - - /* root node has been locked, we can release @subvol_srcu safely here */ - srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = btrfs_search_old_slot(root, &search_key, path, time_seq); btrfs_debug(fs_info, "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", @@ -574,9 +621,11 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos, total_refs, ignore_offset); + ret = add_all_parents(root, path, parents, preftrees, ref, level, + time_seq, extent_item_pos, ignore_offset); out: + btrfs_put_root(root); +out_free: path->lowest_level = 0; btrfs_release_path(path); return ret; @@ -609,7 +658,7 @@ unode_aux_to_inode_list(struct ulist_node *node) static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, - const u64 *extent_item_pos, u64 total_refs, + const u64 *extent_item_pos, struct share_check *sc, bool ignore_offset) { int err; @@ -653,9 +702,9 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(fs_info, path, time_seq, ref, - parents, extent_item_pos, - total_refs, ignore_offset); + err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, + ref, parents, extent_item_pos, + ignore_offset); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. @@ -758,8 +807,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, */ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head, u64 seq, - struct preftrees *preftrees, u64 *total_refs, - struct share_check *sc) + struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; struct btrfs_delayed_extent_op *extent_op = head->extent_op; @@ -793,7 +841,6 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, default: BUG(); } - *total_refs += count; switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ @@ -876,7 +923,7 @@ out: static int add_inline_refs(const struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, int *info_level, struct preftrees *preftrees, - u64 *total_refs, struct share_check *sc) + struct share_check *sc) { int ret = 0; int slot; @@ -900,7 +947,6 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); - *total_refs += btrfs_extent_refs(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); @@ -1125,8 +1171,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct prelim_ref *ref; struct rb_node *node; struct extent_inode_elem *eie = NULL; - /* total of both direct AND indirect refs! */ - u64 total_refs = 0; struct preftrees preftrees = { .direct = PREFTREE_INIT, .indirect = PREFTREE_INIT, @@ -1195,7 +1239,7 @@ again: } spin_unlock(&delayed_refs->lock); ret = add_delayed_refs(fs_info, head, time_seq, - &preftrees, &total_refs, sc); + &preftrees, sc); mutex_unlock(&head->mutex); if (ret) goto out; @@ -1216,8 +1260,7 @@ again: (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { ret = add_inline_refs(fs_info, path, bytenr, - &info_level, &preftrees, - &total_refs, sc); + &info_level, &preftrees, sc); if (ret) goto out; ret = add_keyed_refs(fs_info, path, bytenr, info_level, @@ -1236,7 +1279,7 @@ again: WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, total_refs, sc, ignore_offset); + extent_item_pos, sc, ignore_offset); if (ret) goto out; @@ -1362,10 +1405,10 @@ static void free_leaf_list(struct ulist *blocks) * * returns 0 on success, <0 on error */ -static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset) +int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **leafs, + const u64 *extent_item_pos, bool ignore_offset) { int ret; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 777f61dc081e..723d6da99114 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -40,6 +40,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); +int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **leafs, + const u64 *extent_item_pos, bool ignore_offset); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **roots, bool ignore_offset); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 404e050ce8ee..786849fcc319 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -460,7 +460,7 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end int ret; while (start < end) { - ret = find_first_extent_bit(info->pinned_extents, start, + ret = find_first_extent_bit(&info->excluded_extents, start, &extent_start, &extent_end, EXTENT_DIRTY | EXTENT_UPTODATE, NULL); @@ -856,9 +856,9 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) found_raid1c34 = true; up_read(&sinfo->groups_sem); } - if (found_raid56) + if (!found_raid56) btrfs_clear_fs_incompat(fs_info, RAID56); - if (found_raid1c34) + if (!found_raid1c34) btrfs_clear_fs_incompat(fs_info, RAID1C34); } } @@ -1248,6 +1248,55 @@ out: return ret; } +static bool clean_pinned_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_transaction *prev_trans = NULL; + const u64 start = bg->start; + const u64 end = start + bg->length - 1; + int ret; + + spin_lock(&fs_info->trans_lock); + if (trans->transaction->list.prev != &fs_info->trans_list) { + prev_trans = list_last_entry(&trans->transaction->list, + struct btrfs_transaction, list); + refcount_inc(&prev_trans->use_count); + } + spin_unlock(&fs_info->trans_lock); + + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, another + * task might be running finish_extent_commit() for the previous + * transaction N - 1, and have seen a range belonging to the block + * group in pinned_extents before we were able to clear the whole block + * group range from pinned_extents. This means that task can lookup for + * the block group after we unpinned it from pinned_extents and removed + * it, leading to a BUG_ON() at unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + if (prev_trans) { + ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, + EXTENT_DIRTY); + if (ret) + goto err; + } + + ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, + EXTENT_DIRTY); + if (ret) + goto err; + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + + return true; + +err: + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(bg); + return false; +} + /* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. @@ -1265,7 +1314,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { - u64 start, end; int trimming; block_group = list_first_entry(&fs_info->unused_bgs, @@ -1344,35 +1392,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * We could have pending pinned extents for this block group, * just delete them, we don't care about them anymore. */ - start = block_group->start; - end = start + block_group->length - 1; - /* - * Hold the unused_bg_unpin_mutex lock to avoid racing with - * btrfs_finish_extent_commit(). If we are at transaction N, - * another task might be running finish_extent_commit() for the - * previous transaction N - 1, and have seen a range belonging - * to the block group in freed_extents[] before we were able to - * clear the whole block group range from freed_extents[]. This - * means that task can lookup for the block group after we - * unpinned it from freed_extents[] and removed it, leading to - * a BUG_ON() at btrfs_unpin_extent_range(). - */ - mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); + if (!clean_pinned_extents(trans, block_group)) goto end_trans; - } - ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); - goto end_trans; - } - mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* * At this point, the block_group is read only and should fail @@ -1987,6 +2008,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_release_path(path); } + rcu_read_lock(); list_for_each_entry_rcu(space_info, &info->space_info, list) { if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | @@ -2007,6 +2029,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list) inc_block_group_ro(cache, 1); } + rcu_read_unlock(); btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); @@ -2345,7 +2368,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group, return 0; } - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; again: inode = lookup_free_space_inode(block_group, path); @@ -2881,7 +2904,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, &cache->space_info->total_bytes_pinned, num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(info->pinned_extents, + set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); } diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index d07bd41a7c1e..27efec8f7c5b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,6 +6,98 @@ #include "space-info.h" #include "transaction.h" +/* + * HOW DO BLOCK RESERVES WORK + * + * Think of block_rsv's as buckets for logically grouped metadata + * reservations. Each block_rsv has a ->size and a ->reserved. ->size is + * how large we want our block rsv to be, ->reserved is how much space is + * currently reserved for this block reserve. + * + * ->failfast exists for the truncate case, and is described below. + * + * NORMAL OPERATION + * + * -> Reserve + * Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill + * + * We call into btrfs_reserve_metadata_bytes() with our bytes, which is + * accounted for in space_info->bytes_may_use, and then add the bytes to + * ->reserved, and ->size in the case of btrfs_block_rsv_add. + * + * ->size is an over-estimation of how much we may use for a particular + * operation. + * + * -> Use + * Entrance: btrfs_use_block_rsv + * + * When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv() + * to determine the appropriate block_rsv to use, and then verify that + * ->reserved has enough space for our tree block allocation. Once + * successful we subtract fs_info->nodesize from ->reserved. + * + * -> Finish + * Entrance: btrfs_block_rsv_release + * + * We are finished with our operation, subtract our individual reservation + * from ->size, and then subtract ->size from ->reserved and free up the + * excess if there is any. + * + * There is some logic here to refill the delayed refs rsv or the global rsv + * as needed, otherwise the excess is subtracted from + * space_info->bytes_may_use. + * + * TYPES OF BLOCK RESERVES + * + * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK + * These behave normally, as described above, just within the confines of the + * lifetime of their particular operation (transaction for the whole trans + * handle lifetime, for example). + * + * BLOCK_RSV_GLOBAL + * It is impossible to properly account for all the space that may be required + * to make our extent tree updates. This block reserve acts as an overflow + * buffer in case our delayed refs reserve does not reserve enough space to + * update the extent tree. + * + * We can steal from this in some cases as well, notably on evict() or + * truncate() in order to help users recover from ENOSPC conditions. + * + * BLOCK_RSV_DELALLOC + * The individual item sizes are determined by the per-inode size + * calculations, which are described with the delalloc code. This is pretty + * straightforward, it's just the calculation of ->size encodes a lot of + * different items, and thus it gets used when updating inodes, inserting file + * extents, and inserting checksums. + * + * BLOCK_RSV_DELREFS + * We keep a running tally of how many delayed refs we have on the system. + * We assume each one of these delayed refs are going to use a full + * reservation. We use the transaction items and pre-reserve space for every + * operation, and use this reservation to refill any gap between ->size and + * ->reserved that may exist. + * + * From there it's straightforward, removing a delayed ref means we remove its + * count from ->size and free up reservations as necessary. Since this is + * the most dynamic block reserve in the system, we will try to refill this + * block reserve first with any excess returned by any other block reserve. + * + * BLOCK_RSV_EMPTY + * This is the fallback block reserve to make us try to reserve space if we + * don't have a specific bucket for this allocation. It is mostly used for + * updating the device tree and such, since that is a separate pool we're + * content to just reserve space from the space_info on demand. + * + * BLOCK_RSV_TEMP + * This is used by things like truncate and iput. We will temporarily + * allocate a block reserve, set it to some size, and then truncate bytes + * until we have no space left. With ->failfast set we'll simply return + * ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try + * to make a new reservation. This is because these operations are + * unbounded, so we want to do as much work as we can, and then back off and + * re-reserve. + */ + static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes, @@ -111,7 +203,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, { if (!rsv) return; - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); kfree(rsv); } @@ -178,9 +270,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, u64 *qgroup_to_release) +u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + u64 *qgroup_to_release) { struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; @@ -297,9 +389,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved < block_rsv->size) { num_bytes = block_rsv->size - block_rsv->reserved; - block_rsv->reserved += num_bytes; btrfs_space_info_update_bytes_may_use(fs_info, sinfo, num_bytes); + block_rsv->reserved = block_rsv->size; } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; btrfs_space_info_update_bytes_may_use(fs_info, sinfo, @@ -344,7 +436,8 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) { - btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1, + NULL); WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index d1428bb73fc5..0b6ae5302837 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -73,7 +73,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, int min_factor); void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes, bool update_size); -u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, +u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, u64 *qgroup_to_release); void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info); @@ -82,20 +82,12 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); - -static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); -} - static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) { btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); - btrfs_block_rsv_release(fs_info, block_rsv, 0); + btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL); } #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4e12a477d32e..27a1fefce508 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -60,6 +60,12 @@ struct btrfs_inode { */ struct extent_io_tree io_failure_tree; + /* + * Keep track of where the inode has extent items mapped in order to + * make sure the i_size adjustments are accurate + */ + struct extent_io_tree file_extent_tree; + /* held while logging the inode in tree-log.c */ struct mutex log_mutex; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index a0ce69f2d27c..32e11a23b47f 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -77,7 +77,6 @@ #include <linux/sched.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include <linux/mutex.h> #include <linux/genhd.h> #include <linux/blkdev.h> @@ -152,11 +151,8 @@ struct btrfsic_block { struct list_head ref_to_list; /* list */ struct list_head ref_from_list; /* list */ struct btrfsic_block *next_in_same_bio; - void *orig_bio_bh_private; - union { - bio_end_io_t *bio; - bh_end_io_t *bh; - } orig_bio_bh_end_io; + void *orig_bio_private; + bio_end_io_t *orig_bio_end_io; int submit_bio_bh_rw; u64 flush_gen; /* only valid if !never_written */ }; @@ -325,14 +321,12 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 dev_bytenr, char **mapped_datav, unsigned int num_pages, struct bio *bio, int *bio_is_patched, - struct buffer_head *bh, int submit_bio_bh_rw); static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const block, struct btrfs_super_block *const super_hdr); static void btrfsic_bio_end_io(struct bio *bp); -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, const struct btrfsic_block *block, int recursion_level); @@ -399,8 +393,8 @@ static void btrfsic_block_init(struct btrfsic_block *b) b->never_written = 0; b->mirror_num = 0; b->next_in_same_bio = NULL; - b->orig_bio_bh_private = NULL; - b->orig_bio_bh_end_io.bio = NULL; + b->orig_bio_private = NULL; + b->orig_bio_end_io = NULL; INIT_LIST_HEAD(&b->collision_resolving_node); INIT_LIST_HEAD(&b->all_blocks_node); INIT_LIST_HEAD(&b->ref_to_list); @@ -767,29 +761,31 @@ static int btrfsic_process_superblock_dev_mirror( struct btrfs_fs_info *fs_info = state->fs_info; struct btrfs_super_block *super_tmp; u64 dev_bytenr; - struct buffer_head *bh; struct btrfsic_block *superblock_tmp; int pass; struct block_device *const superblock_bdev = device->bdev; + struct page *page; + struct address_space *mapping = superblock_bdev->bd_inode->i_mapping; + int ret = 0; /* super block bytenr is always the unmapped device bytenr */ dev_bytenr = btrfs_sb_offset(superblock_mirror_num); if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) return -1; - bh = __bread(superblock_bdev, dev_bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (NULL == bh) + + page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page)) return -1; - super_tmp = (struct btrfs_super_block *) - (bh->b_data + (dev_bytenr & (BTRFS_BDEV_BLOCKSIZE - 1))); + + super_tmp = page_address(page); if (btrfs_super_bytenr(super_tmp) != dev_bytenr || btrfs_super_magic(super_tmp) != BTRFS_MAGIC || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || btrfs_super_sectorsize(super_tmp) != state->datablock_size) { - brelse(bh); - return 0; + ret = 0; + goto out; } superblock_tmp = @@ -800,8 +796,8 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp = btrfsic_block_alloc(); if (NULL == superblock_tmp) { pr_info("btrfsic: error, kmalloc failed!\n"); - brelse(bh); - return -1; + ret = -1; + goto out; } /* for superblock, only the dev_bytenr makes sense */ superblock_tmp->dev_bytenr = dev_bytenr; @@ -885,8 +881,8 @@ static int btrfsic_process_superblock_dev_mirror( mirror_num)) { pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n", next_bytenr, mirror_num); - brelse(bh); - return -1; + ret = -1; + goto out; } next_block = btrfsic_block_lookup_or_add( @@ -895,8 +891,8 @@ static int btrfsic_process_superblock_dev_mirror( mirror_num, NULL); if (NULL == next_block) { btrfsic_release_block_ctx(&tmp_next_block_ctx); - brelse(bh); - return -1; + ret = -1; + goto out; } next_block->disk_key = tmp_disk_key; @@ -907,16 +903,17 @@ static int btrfsic_process_superblock_dev_mirror( BTRFSIC_GENERATION_UNKNOWN); btrfsic_release_block_ctx(&tmp_next_block_ctx); if (NULL == l) { - brelse(bh); - return -1; + ret = -1; + goto out; } } } if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) btrfsic_dump_tree_sub(state, superblock_tmp, 0); - brelse(bh); - return 0; +out: + put_page(page); + return ret; } static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) @@ -1743,7 +1740,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 dev_bytenr, char **mapped_datav, unsigned int num_pages, struct bio *bio, int *bio_is_patched, - struct buffer_head *bh, int submit_bio_bh_rw) { int is_metadata; @@ -1902,9 +1898,9 @@ again: block->is_iodone = 0; BUG_ON(NULL == bio_is_patched); if (!*bio_is_patched) { - block->orig_bio_bh_private = + block->orig_bio_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = + block->orig_bio_end_io = bio->bi_end_io; block->next_in_same_bio = NULL; bio->bi_private = block; @@ -1916,25 +1912,17 @@ again: bio->bi_private; BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io. - bio; + block->orig_bio_private = + chained_block->orig_bio_private; + block->orig_bio_end_io = + chained_block->orig_bio_end_io; block->next_in_same_bio = chained_block; bio->bi_private = block; } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; } else { block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; + block->orig_bio_private = NULL; + block->orig_bio_end_io = NULL; block->next_in_same_bio = NULL; } } @@ -2042,8 +2030,8 @@ again: block->is_iodone = 0; BUG_ON(NULL == bio_is_patched); if (!*bio_is_patched) { - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; block->next_in_same_bio = NULL; bio->bi_private = block; bio->bi_end_io = btrfsic_bio_end_io; @@ -2054,24 +2042,17 @@ again: bio->bi_private; BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io.bio; + block->orig_bio_private = + chained_block->orig_bio_private; + block->orig_bio_end_io = + chained_block->orig_bio_end_io; block->next_in_same_bio = chained_block; bio->bi_private = block; } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; } else { block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; + block->orig_bio_private = NULL; + block->orig_bio_end_io = NULL; block->next_in_same_bio = NULL; } if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) @@ -2112,8 +2093,8 @@ static void btrfsic_bio_end_io(struct bio *bp) iodone_w_error = 1; BUG_ON(NULL == block); - bp->bi_private = block->orig_bio_bh_private; - bp->bi_end_io = block->orig_bio_bh_end_io.bio; + bp->bi_private = block->orig_bio_private; + bp->bi_end_io = block->orig_bio_end_io; do { struct btrfsic_block *next_block; @@ -2146,38 +2127,6 @@ static void btrfsic_bio_end_io(struct bio *bp) bp->bi_end_io(bp); } -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) -{ - struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; - int iodone_w_error = !uptodate; - struct btrfsic_dev_state *dev_state; - - BUG_ON(NULL == block); - dev_state = block->dev_state; - if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", - iodone_w_error, - btrfsic_get_block_type(dev_state->state, block), - block->logical_bytenr, block->dev_state->name, - block->dev_bytenr, block->mirror_num); - - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_PREFLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bh_end_io() new %s flush_gen=%llu\n", - dev_state->name, dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is on disk */ - - bh->b_private = block->orig_bio_bh_private; - bh->b_end_io = block->orig_bio_bh_end_io.bh; - block->is_iodone = 1; /* for FLUSH, this releases the block */ - bh->b_end_io(bh, uptodate); -} - static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const superblock, @@ -2730,63 +2679,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) &btrfsic_dev_state_hashtable); } -int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) - return submit_bh(op, op_flags, bh); - - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bh() might also be called before - * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev); - - /* Only called to write the superblock (incl. FLUSH/FUA) */ - if (NULL != dev_state && - (op == REQ_OP_WRITE) && bh->b_size > 0) { - u64 dev_bytenr; - - dev_bytenr = BTRFS_BDEV_BLOCKSIZE * bh->b_blocknr; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bh(op=0x%x,0x%x, blocknr=%llu (bytenr %llu), size=%zu, data=%p, bdev=%p)\n", - op, op_flags, (unsigned long long)bh->b_blocknr, - dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); - btrfsic_process_written_block(dev_state, dev_bytenr, - &bh->b_data, 1, NULL, - NULL, bh, op_flags); - } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n", - op, op_flags, bh->b_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - pr_info("btrfsic_submit_bh(%s) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->name); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = op_flags; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } - } - mutex_unlock(&btrfsic_mutex); - return submit_bh(op, op_flags, bh); -} - static void __btrfsic_submit_bio(struct bio *bio) { struct btrfsic_dev_state *dev_state; @@ -2838,7 +2730,7 @@ static void __btrfsic_submit_bio(struct bio *bio) btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, bio, &bio_is_patched, - NULL, bio->bi_opf); + bio->bi_opf); bio_for_each_segment(bvec, bio, iter) kunmap(bvec.bv_page); kfree(mapped_datav); @@ -2862,8 +2754,8 @@ static void __btrfsic_submit_bio(struct bio *bio) block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; block->next_in_same_bio = NULL; bio->bi_private = block; bio->bi_end_io = btrfsic_bio_end_io; diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 9bf4359cc44c..bcc730a06cb5 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -7,11 +7,9 @@ #define BTRFS_CHECK_INTEGRITY_H #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh); void btrfsic_submit_bio(struct bio *bio); int btrfsic_submit_bio_wait(struct bio *bio); #else -#define btrfsic_submit_bh submit_bh #define btrfsic_submit_bio submit_bio #define btrfsic_submit_bio_wait submit_bio_wait #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f2ec1a9bae28..bfedbbe2311f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -31,8 +31,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, static const struct btrfs_csums { u16 size; - const char *name; - const char *driver; + const char name[10]; + const char driver[12]; } btrfs_csums[] = { [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, @@ -63,7 +63,8 @@ const char *btrfs_super_csum_name(u16 csum_type) const char *btrfs_super_csum_driver(u16 csum_type) { /* csum type is validated at mount time */ - return btrfs_csums[csum_type].driver ?: + return btrfs_csums[csum_type].driver[0] ? + btrfs_csums[csum_type].driver : btrfs_csums[csum_type].name; } @@ -143,44 +144,6 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) return eb; } -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root. A locked buffer - * is returned, with a reference held. - */ -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) -{ - struct extent_buffer *eb; - - while (1) { - eb = btrfs_root_node(root); - btrfs_tree_lock(eb); - if (eb == root->node) - break; - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - return eb; -} - -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root. A locked buffer - * is returned, with a reference held. - */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) -{ - struct extent_buffer *eb; - - while (1) { - eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); - if (eb == root->node) - break; - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - } - return eb; -} - /* cowonly root (everything not a reference counted cow subvolume), just get * put onto a simple dirty list. transaction.c walks this to make sure they * get properly updated on disk. @@ -341,7 +304,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, struct rb_root *tm_root; struct rb_node *node; struct rb_node *next; - struct seq_list *cur_elem; struct tree_mod_elem *tm; u64 min_seq = (u64)-1; u64 seq_putting = elem->seq; @@ -353,18 +315,20 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, list_del(&elem->list); elem->seq = 0; - list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { - if (cur_elem->seq < min_seq) { - if (seq_putting > cur_elem->seq) { - /* - * blocker with lower sequence number exists, we - * cannot remove anything from the log - */ - write_unlock(&fs_info->tree_mod_log_lock); - return; - } - min_seq = cur_elem->seq; + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *first; + + first = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + if (seq_putting > first->seq) { + /* + * Blocker with lower sequence number exists, we + * cannot remove anything from the log. + */ + write_unlock(&fs_info->tree_mod_log_lock); + return; } + min_seq = first->seq; } /* @@ -962,9 +926,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (new_flags != 0) { int level = btrfs_header_level(buf); - ret = btrfs_set_disk_extent_flags(trans, - buf->start, - buf->len, + ret = btrfs_set_disk_extent_flags(trans, buf, new_flags, level, 0); if (ret) return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 36df977b64d9..8aa7b9dac405 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" +#include "locking.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -596,8 +597,8 @@ struct btrfs_fs_info { /* keep track of unallocated space */ atomic64_t free_chunk_space; - struct extent_io_tree freed_extents[2]; - struct extent_io_tree *pinned_extents; + /* Track ranges which are used by log trees blocks/logged data extents */ + struct extent_io_tree excluded_extents; /* logical->physical extent mapping */ struct extent_map_tree mapping_tree; @@ -696,7 +697,6 @@ struct btrfs_fs_info { struct rw_semaphore cleanup_work_sem; struct rw_semaphore subvol_sem; - struct srcu_struct subvol_srcu; spinlock_t trans_lock; /* @@ -947,6 +947,10 @@ struct btrfs_fs_info { #ifdef CONFIG_BTRFS_DEBUG struct kobject *debug_kobj; struct kobject *discard_debug_kobj; + struct list_head allocated_roots; + + spinlock_t eb_leak_lock; + struct list_head allocated_ebs; #endif }; @@ -955,11 +959,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) return sb->s_fs_info; } -struct btrfs_subvolume_writers { - struct percpu_counter counter; - wait_queue_head_t wait; -}; - /* * The state of btrfs root */ @@ -1131,8 +1130,9 @@ struct btrfs_root { * root_item_lock. */ int dedupe_in_progress; - struct btrfs_subvolume_writers *subv_writers; - atomic_t will_be_snapshotted; + /* For exclusion of snapshot creation and nocow writes */ + struct btrfs_drew_lock snapshot_lock; + atomic_t snapshot_force_cow; /* For qgroup metadata reserved space */ @@ -1149,6 +1149,10 @@ struct btrfs_root { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif + +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif }; struct btrfs_clone_extent_info { @@ -1971,16 +1975,6 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, btrfs_set_header_flags(eb, flags); } -static inline unsigned long btrfs_header_fsid(void) -{ - return offsetof(struct btrfs_header, fsid); -} - -static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb) -{ - return offsetof(struct btrfs_header, chunk_tree_uuid); -} - static inline int btrfs_is_leaf(const struct extent_buffer *eb) { return btrfs_header_level(eb) == 0; @@ -2458,9 +2452,9 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, + int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, @@ -2490,13 +2484,13 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 flags, + struct extent_buffer *eb, u64 flags, int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); @@ -2665,9 +2659,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(struct extent_buffer *leaf); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - int update_ref, int for_reloc); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, + int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2695,23 +2688,6 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info); } -static inline void free_fs_info(struct btrfs_fs_info *fs_info) -{ - kfree(fs_info->balance_ctl); - kfree(fs_info->delayed_root); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info->quota_root); - kfree(fs_info->uuid_root); - kfree(fs_info->free_space_root); - kfree(fs_info->super_copy); - kfree(fs_info->super_for_commit); - kvfree(fs_info); -} - /* tree mod log functions from ctree.c */ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); @@ -2750,9 +2726,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, - int (*check_func)(struct btrfs_fs_info *, u8 *, u8, - u64)); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, @@ -2859,6 +2833,12 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, struct btrfs_file_extent_item *fi, const bool new_inline, struct extent_map *em); +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len); +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len); +void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); +u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, @@ -2996,9 +2976,6 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -3008,6 +2985,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) @@ -3401,6 +3380,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 4cdac4d834f5..1245739a3a6e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -9,6 +9,108 @@ #include "qgroup.h" #include "block-group.h" +/* + * HOW DOES THIS WORK + * + * There are two stages to data reservations, one for data and one for metadata + * to handle the new extents and checksums generated by writing data. + * + * + * DATA RESERVATION + * The general flow of the data reservation is as follows + * + * -> Reserve + * We call into btrfs_reserve_data_bytes() for the user request bytes that + * they wish to write. We make this reservation and add it to + * space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree + * for the range and carry on if this is buffered, or follow up trying to + * make a real allocation if we are pre-allocating or doing O_DIRECT. + * + * -> Use + * At writepages()/prealloc/O_DIRECT time we will call into + * btrfs_reserve_extent() for some part or all of this range of bytes. We + * will make the allocation and subtract space_info->bytes_may_use by the + * original requested length and increase the space_info->bytes_reserved by + * the allocated length. This distinction is important because compression + * may allocate a smaller on disk extent than we previously reserved. + * + * -> Allocation + * finish_ordered_io() will insert the new file extent item for this range, + * and then add a delayed ref update for the extent tree. Once that delayed + * ref is written the extent size is subtracted from + * space_info->bytes_reserved and added to space_info->bytes_used. + * + * Error handling + * + * -> By the reservation maker + * This is the simplest case, we haven't completed our operation and we know + * how much we reserved, we can simply call + * btrfs_free_reserved_data_space*() and it will be removed from + * space_info->bytes_may_use. + * + * -> After the reservation has been made, but before cow_file_range() + * This is specifically for the delalloc case. You must clear + * EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will + * be subtracted from space_info->bytes_may_use. + * + * METADATA RESERVATION + * The general metadata reservation lifetimes are discussed elsewhere, this + * will just focus on how it is used for delalloc space. + * + * We keep track of two things on a per inode bases + * + * ->outstanding_extents + * This is the number of file extent items we'll need to handle all of the + * outstanding DELALLOC space we have in this inode. We limit the maximum + * size of an extent, so a large contiguous dirty area may require more than + * one outstanding_extent, which is why count_max_extents() is used to + * determine how many outstanding_extents get added. + * + * ->csum_bytes + * This is essentially how many dirty bytes we have for this inode, so we + * can calculate the number of checksum items we would have to add in order + * to checksum our outstanding data. + * + * We keep a per-inode block_rsv in order to make it easier to keep track of + * our reservation. We use btrfs_calculate_inode_block_rsv_size() to + * calculate the current theoretical maximum reservation we would need for the + * metadata for this inode. We call this and then adjust our reservation as + * necessary, either by attempting to reserve more space, or freeing up excess + * space. + * + * OUTSTANDING_EXTENTS HANDLING + * + * ->outstanding_extents is used for keeping track of how many extents we will + * need to use for this inode, and it will fluctuate depending on where you are + * in the life cycle of the dirty data. Consider the following normal case for + * a completely clean inode, with a num_bytes < our maximum allowed extent size + * + * -> reserve + * ->outstanding_extents += 1 (current value is 1) + * + * -> set_delalloc + * ->outstanding_extents += 1 (currrent value is 2) + * + * -> btrfs_delalloc_release_extents() + * ->outstanding_extents -= 1 (current value is 1) + * + * We must call this once we are done, as we hold our reservation for the + * duration of our operation, and then assume set_delalloc will update the + * counter appropriately. + * + * -> add ordered extent + * ->outstanding_extents += 1 (current value is 2) + * + * -> btrfs_clear_delalloc_extent + * ->outstanding_extents -= 1 (current value is 1) + * + * -> finish_ordered_io/btrfs_remove_ordered_extent + * ->outstanding_extents -= 1 (current value is 0) + * + * Each stage is responsible for their own accounting of the extent, thus + * making error handling and cleanup easier. + */ + int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; @@ -228,8 +330,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ - released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, - &qgroup_to_release); + released = btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index d3e15e1d4a91..bf1595a42a98 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/iversion.h> +#include <linux/sched/mm.h> #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" @@ -595,8 +596,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, item->bytes_reserved, 0); - btrfs_block_rsv_release(fs_info, rsv, - item->bytes_reserved); + btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); } static int btrfs_delayed_inode_reserve_metadata( @@ -677,8 +677,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, rsv = &fs_info->delayed_block_rsv; trace_btrfs_space_reservation(fs_info, "delayed_inode", node->inode_id, node->bytes_reserved, 0); - btrfs_block_rsv_release(fs_info, rsv, - node->bytes_reserved); + btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL); if (qgroup_free) btrfs_qgroup_free_meta_prealloc(node->root, node->bytes_reserved); @@ -805,11 +804,14 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_delayed_item *delayed_item) { struct extent_buffer *leaf; + unsigned int nofs_flag; char *ptr; int ret; + nofs_flag = memalloc_nofs_save(); ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key, delayed_item->data_len); + memalloc_nofs_restore(nofs_flag); if (ret < 0 && ret != -EEXIST) return ret; @@ -937,6 +939,7 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_delayed_item *curr, *prev; + unsigned int nofs_flag; int ret = 0; do_again: @@ -945,7 +948,9 @@ do_again: if (!curr) goto delete_fail; + nofs_flag = memalloc_nofs_save(); ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); + memalloc_nofs_restore(nofs_flag); if (ret < 0) goto delete_fail; else if (ret > 0) { @@ -1012,6 +1017,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + unsigned int nofs_flag; int mod; int ret; @@ -1024,7 +1030,9 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, else mod = 1; + nofs_flag = memalloc_nofs_save(); ret = btrfs_lookup_inode(trans, root, path, &key, mod); + memalloc_nofs_restore(nofs_flag); if (ret > 0) { btrfs_release_path(path); return -ENOENT; @@ -1075,7 +1083,10 @@ search: key.type = BTRFS_INODE_EXTREF_KEY; key.offset = -1; + + nofs_flag = memalloc_nofs_save(); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + memalloc_nofs_restore(nofs_flag); if (ret < 0) goto err_out; ASSERT(ret); @@ -1139,7 +1150,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) int ret = 0; bool count = (nr > 0); - if (trans->aborted) + if (TRANS_ABORTED(trans)) return -EIO; path = btrfs_alloc_path(); @@ -1760,6 +1771,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, int btrfs_fill_inode(struct inode *inode, u32 *rdev) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; @@ -1779,6 +1791,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); + btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, + round_up(i_size_read(inode), fs_info->sectorsize)); inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 74ae226ffaf0..ca96ef007d8f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -70,7 +70,7 @@ struct btrfs_delayed_item { refcount_t refs; int ins_or_del; u32 data_len; - char data[0]; + char data[]; }; static inline void btrfs_init_delayed_root( diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index dfdb7d4f8406..353cc2994d10 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -82,8 +82,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); u64 released = 0; - released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, - NULL); + released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2ca2a09d0e23..db93909b25e0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -22,6 +22,46 @@ #include "dev-replace.h" #include "sysfs.h" +/* + * Device replace overview + * + * [Objective] + * To copy all extents (both new and on-disk) from source device to target + * device, while still keeping the filesystem read-write. + * + * [Method] + * There are two main methods involved: + * + * - Write duplication + * + * All new writes will be written to both target and source devices, so even + * if replace gets canceled, sources device still contans up-to-date data. + * + * Location: handle_ops_on_dev_replace() from __btrfs_map_block() + * Start: btrfs_dev_replace_start() + * End: btrfs_dev_replace_finishing() + * Content: Latest data/metadata + * + * - Copy existing extents + * + * This happens by re-using scrub facility, as scrub also iterates through + * existing extents from commit root. + * + * Location: scrub_write_block_to_dev_replace() from + * scrub_block_complete() + * Content: Data/meta from commit root. + * + * Due to the content difference, we need to avoid nocow write when dev-replace + * is happening. This is done by marking the block group read-only and waiting + * for NOCOW writes. + * + * After replace is done, the finishing part is done by swapping the target and + * source devices. + * + * Location: btrfs_dev_replace_update_device_in_mapping_tree() from + * btrfs_dev_replace_finishing() + */ + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); static void btrfs_dev_replace_update_device_in_mapping_tree( @@ -472,7 +512,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device); if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); @@ -703,7 +743,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* replace the sysfs entry */ - btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); + btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); btrfs_sysfs_update_devid(tgt_device); btrfs_rm_dev_replace_free_srcdev(src_device); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7fa9bb79ad08..a6cb5cbbdb9f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -7,7 +7,6 @@ #include <linux/blkdev.h> #include <linux/radix-tree.h> #include <linux/writeback.h> -#include <linux/buffer_head.h> #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/slab.h> @@ -42,6 +41,7 @@ #include "ref-verify.h" #include "block-group.h" #include "discard.h" +#include "space-info.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -98,6 +98,12 @@ void __cold btrfs_end_io_wq_exit(void) kmem_cache_destroy(btrfs_end_io_wq_cache); } +static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) +{ + if (fs_info->csum_shash) + crypto_free_shash(fs_info->csum_shash); +} + /* * async submit bios are used to offload expensive checksumming * onto the worker threads. They checksum file and metadata bios @@ -247,47 +253,27 @@ out: /* * Compute the csum of a btree block and store the result to provided buffer. - * - * Returns error if the extent buffer cannot be mapped. */ -static int csum_tree_block(struct extent_buffer *buf, u8 *result) +static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; + const int num_pages = fs_info->nodesize >> PAGE_SHIFT; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - unsigned long len; - unsigned long cur_len; - unsigned long offset = BTRFS_CSUM_SIZE; char *kaddr; - unsigned long map_start; - unsigned long map_len; - int err; + int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); + kaddr = page_address(buf->pages[0]); + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, + PAGE_SIZE - BTRFS_CSUM_SIZE); - len = buf->len - offset; - - while (len > 0) { - /* - * Note: we don't need to check for the err == 1 case here, as - * with the given combination of 'start = BTRFS_CSUM_SIZE (32)' - * and 'min_len = 32' and the currently implemented mapping - * algorithm we cannot cross a page boundary. - */ - err = map_private_extent_buffer(buf, offset, 32, - &kaddr, &map_start, &map_len); - if (WARN_ON(err)) - return err; - cur_len = min(len, map_len - (offset - map_start)); - crypto_shash_update(shash, kaddr + offset - map_start, cur_len); - len -= cur_len; - offset += cur_len; + for (i = 1; i < num_pages; i++) { + kaddr = page_address(buf->pages[i]); + crypto_shash_update(shash, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); - crypto_shash_final(shash, result); - - return 0; } /* @@ -535,10 +521,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) return -EUCLEAN; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, - btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); + offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE) == 0); - if (csum_tree_block(eb, result)) - return -EINVAL; + csum_tree_block(eb, result); if (btrfs_header_level(eb)) ret = btrfs_check_node(eb); @@ -565,7 +551,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb) u8 fsid[BTRFS_FSID_SIZE]; int ret = 1; - read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); + read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE); while (fs_devices) { u8 *metadata_uuid; @@ -596,9 +583,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 found_start; int found_level; struct extent_buffer *eb; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct btrfs_fs_info *fs_info = root->fs_info; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + struct btrfs_fs_info *fs_info; + u16 csum_size; int ret = 0; u8 result[BTRFS_CSUM_SIZE]; int reads_done; @@ -607,6 +593,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto out; eb = (struct extent_buffer *)page->private; + fs_info = eb->fs_info; + csum_size = btrfs_super_csum_size(fs_info->super_copy); /* the pending IO might have been the only thing that kept this buffer * in memory. Make sure we have a ref for all this other checks @@ -647,9 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(eb, result); - if (ret) - goto err; + csum_tree_block(eb, result); if (memcmp_extent_buffer(eb, result, 0, csum_size)) { u32 val; @@ -972,9 +958,7 @@ static int btree_writepages(struct address_space *mapping, static int btree_readpage(struct file *file, struct page *page) { - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent, 0); + return extent_read_full_page(page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) @@ -1100,36 +1084,11 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) } } -static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) -{ - struct btrfs_subvolume_writers *writers; - int ret; - - writers = kmalloc(sizeof(*writers), GFP_NOFS); - if (!writers) - return ERR_PTR(-ENOMEM); - - ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); - if (ret < 0) { - kfree(writers); - return ERR_PTR(ret); - } - - init_waitqueue_head(&writers->wait); - return writers; -} - -static void -btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) -{ - percpu_counter_destroy(&writers->counter); - kfree(writers); -} - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + root->fs_info = fs_info; root->node = NULL; root->commit_root = NULL; root->state = 0; @@ -1173,7 +1132,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); refcount_set(&root->refs, 1); - atomic_set(&root->will_be_snapshotted, 0); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); root->log_transid = 0; @@ -1195,14 +1153,20 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, spin_lock_init(&root->root_item_lock); btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); +#ifdef CONFIG_BTRFS_DEBUG + INIT_LIST_HEAD(&root->leak_list); + spin_lock(&fs_info->fs_roots_radix_lock); + list_add_tail(&root->leak_list, &fs_info->allocated_roots); + spin_unlock(&fs_info->fs_roots_radix_lock); +#endif } static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, - gfp_t flags) + u64 objectid, gfp_t flags) { struct btrfs_root *root = kzalloc(sizeof(*root), flags); if (root) - root->fs_info = fs_info; + __setup_root(root, fs_info, objectid); return root; } @@ -1215,12 +1179,11 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) if (!fs_info) return ERR_PTR(-EINVAL); - root = btrfs_alloc_root(fs_info, GFP_KERNEL); + root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); /* We don't use the stripesize in selftest, set it as sectorsize */ - __setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID); root->alloc_bytenr = 0; return root; @@ -1237,19 +1200,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_key key; unsigned int nofs_flag; int ret = 0; - uuid_le uuid = NULL_UUID_LE; /* * We're holding a transaction handle, so use a NOFS memory allocation * context to avoid deadlock if reclaim happens. */ nofs_flag = memalloc_nofs_save(); - root = btrfs_alloc_root(fs_info, GFP_KERNEL); + root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(root, fs_info, objectid); root->root_key.objectid = objectid; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; @@ -1277,8 +1238,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); if (is_fstree(objectid)) - uuid_le_gen(&uuid); - memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(root->root_item.uuid); + else + export_guid(root->root_item.uuid, &guid_null); root->root_item.drop_level = 0; key.objectid = objectid; @@ -1293,12 +1255,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return root; fail: - if (leaf) { + if (leaf) btrfs_tree_unlock(leaf); - free_extent_buffer(root->commit_root); - free_extent_buffer(leaf); - } - kfree(root); + btrfs_put_root(root); return ERR_PTR(ret); } @@ -1309,12 +1268,10 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct extent_buffer *leaf; - root = btrfs_alloc_root(fs_info, GFP_NOFS); + root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID); - root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; @@ -1331,7 +1288,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { - kfree(root); + btrfs_put_root(root); return ERR_CAST(leaf); } @@ -1387,8 +1344,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; @@ -1401,14 +1358,12 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, if (!path) return ERR_PTR(-ENOMEM); - root = btrfs_alloc_root(fs_info, GFP_NOFS); + root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS); if (!root) { ret = -ENOMEM; goto alloc_fail; } - __setup_root(root, fs_info, key->objectid); - ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); if (ret) { @@ -1424,10 +1379,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); + root->node = NULL; goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - free_extent_buffer(root->node); goto find_fail; } root->commit_root = btrfs_root_node(root); @@ -1436,33 +1391,16 @@ out: return root; find_fail: - kfree(root); + btrfs_put_root(root); alloc_fail: root = ERR_PTR(ret); goto out; } -struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, - struct btrfs_key *location) -{ - struct btrfs_root *root; - - root = btrfs_read_tree_root(tree_root, location); - if (IS_ERR(root)) - return root; - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - set_bit(BTRFS_ROOT_REF_COWS, &root->state); - btrfs_check_and_init_root_item(&root->root_item); - } - - return root; -} - -int btrfs_init_fs_root(struct btrfs_root *root) +static int btrfs_init_fs_root(struct btrfs_root *root) { int ret; - struct btrfs_subvolume_writers *writers; + unsigned int nofs_flag; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), @@ -1472,12 +1410,20 @@ int btrfs_init_fs_root(struct btrfs_root *root) goto fail; } - writers = btrfs_alloc_subvolume_writers(); - if (IS_ERR(writers)) { - ret = PTR_ERR(writers); + /* + * We might be called under a transaction (e.g. indirect backref + * resolution) which could deadlock if it triggers memory reclaim + */ + nofs_flag = memalloc_nofs_save(); + ret = btrfs_drew_lock_init(&root->snapshot_lock); + memalloc_nofs_restore(nofs_flag); + if (ret) goto fail; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + set_bit(BTRFS_ROOT_REF_COWS, &root->state); + btrfs_check_and_init_root_item(&root->root_item); } - root->subv_writers = writers; btrfs_init_free_ino_ctl(root); spin_lock_init(&root->ino_cache_lock); @@ -1505,14 +1451,16 @@ fail: return ret; } -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); + if (root) + root = btrfs_grab_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1530,14 +1478,62 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); - if (ret == 0) + if (ret == 0) { + btrfs_grab_root(root); set_bit(BTRFS_ROOT_IN_RADIX, &root->state); + } spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); return ret; } +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) +{ +#ifdef CONFIG_BTRFS_DEBUG + struct btrfs_root *root; + + while (!list_empty(&fs_info->allocated_roots)) { + root = list_first_entry(&fs_info->allocated_roots, + struct btrfs_root, leak_list); + btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", + root->root_key.objectid, root->root_key.offset, + refcount_read(&root->refs)); + while (refcount_read(&root->refs) > 1) + btrfs_put_root(root); + btrfs_put_root(root); + } +#endif +} + +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) +{ + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->dio_bytes); + percpu_counter_destroy(&fs_info->dev_replace.bio_counter); + btrfs_free_csum_hash(fs_info); + btrfs_free_stripe_hash_table(fs_info); + btrfs_free_ref_cache(fs_info); + kfree(fs_info->balance_ctl); + kfree(fs_info->delayed_root); + btrfs_put_root(fs_info->extent_root); + btrfs_put_root(fs_info->tree_root); + btrfs_put_root(fs_info->chunk_root); + btrfs_put_root(fs_info->dev_root); + btrfs_put_root(fs_info->csum_root); + btrfs_put_root(fs_info->quota_root); + btrfs_put_root(fs_info->uuid_root); + btrfs_put_root(fs_info->free_space_root); + btrfs_put_root(fs_info->fs_root); + btrfs_check_leaked_roots(fs_info); + btrfs_extent_buffer_leak_debug_check(fs_info); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kvfree(fs_info); +} + + struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *location, bool check_ref) @@ -1548,33 +1544,35 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) - return fs_info->tree_root; + return btrfs_grab_root(fs_info->tree_root); if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) - return fs_info->extent_root; + return btrfs_grab_root(fs_info->extent_root); if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) - return fs_info->chunk_root; + return btrfs_grab_root(fs_info->chunk_root); if (location->objectid == BTRFS_DEV_TREE_OBJECTID) - return fs_info->dev_root; + return btrfs_grab_root(fs_info->dev_root); if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) - return fs_info->csum_root; + return btrfs_grab_root(fs_info->csum_root); if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) - return fs_info->quota_root ? fs_info->quota_root : - ERR_PTR(-ENOENT); + return btrfs_grab_root(fs_info->quota_root) ? + fs_info->quota_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_UUID_TREE_OBJECTID) - return fs_info->uuid_root ? fs_info->uuid_root : - ERR_PTR(-ENOENT); + return btrfs_grab_root(fs_info->uuid_root) ? + fs_info->uuid_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return fs_info->free_space_root ? fs_info->free_space_root : - ERR_PTR(-ENOENT); + return btrfs_grab_root(fs_info->free_space_root) ? + fs_info->free_space_root : ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { - if (check_ref && btrfs_root_refs(&root->root_item) == 0) + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { + btrfs_put_root(root); return ERR_PTR(-ENOENT); + } return root; } - root = btrfs_read_fs_root(fs_info->tree_root, location); + root = btrfs_read_tree_root(fs_info->tree_root, location); if (IS_ERR(root)) return root; @@ -1605,15 +1603,14 @@ again: ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - if (ret == -EEXIST) { - btrfs_free_fs_root(root); + btrfs_put_root(root); + if (ret == -EEXIST) goto again; - } goto fail; } return root; fail: - btrfs_free_fs_root(root); + btrfs_put_root(root); return ERR_PTR(ret); } @@ -1985,11 +1982,35 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); + free_root_extent_buffers(info->fs_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); free_root_extent_buffers(info->free_space_root); } +void btrfs_put_root(struct btrfs_root *root) +{ + if (!root) + return; + + if (refcount_dec_and_test(&root->refs)) { + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_dev) + free_anon_bdev(root->anon_dev); + btrfs_drew_lock_destroy(&root->snapshot_lock); + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); +#ifdef CONFIG_BTRFS_DEBUG + spin_lock(&root->fs_info->fs_roots_radix_lock); + list_del_init(&root->leak_list); + spin_unlock(&root->fs_info->fs_roots_radix_lock); +#endif + kfree(root); + } +} + void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; @@ -2001,13 +2022,9 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root, root_list); list_del(&gang[0]->root_list); - if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) btrfs_drop_and_free_fs_root(fs_info, gang[0]); - } else { - free_extent_buffer(gang[0]->node); - free_extent_buffer(gang[0]->commit_root); - btrfs_put_fs_root(gang[0]); - } + btrfs_put_root(gang[0]); } while (1) { @@ -2020,10 +2037,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log_root_tree(NULL, fs_info); - btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); - } } static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) @@ -2069,7 +2084,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; - BTRFS_I(inode)->root = fs_info->tree_root; + BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); btrfs_insert_inode_hash(inode); @@ -2189,11 +2204,6 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) return 0; } -static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) -{ - crypto_free_shash(fs_info->csum_shash); -} - static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { @@ -2208,24 +2218,23 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, return -EIO; } - log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); + log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, + GFP_KERNEL); if (!log_tree_root) return -ENOMEM; - __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); - log_tree_root->node = read_tree_block(fs_info, bytenr, fs_info->generation + 1, level, NULL); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); - kfree(log_tree_root); + log_tree_root->node = NULL; + btrfs_put_root(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + btrfs_put_root(log_tree_root); return -EIO; } /* returns with log_tree_root freed on success */ @@ -2233,8 +2242,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (ret) { btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + btrfs_put_root(log_tree_root); return ret; } @@ -2624,67 +2632,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } -int __cold open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - u32 sectorsize; - u32 nodesize; - u32 stripesize; - u64 generation; - u64 features; - u16 csum_type; - struct btrfs_key location; - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *tree_root; - struct btrfs_root *chunk_root; - int ret; - int err = -EINVAL; - int clear_free_space_tree = 0; - int level; - - tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); - chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); - if (!tree_root || !chunk_root) { - err = -ENOMEM; - goto fail; - } - - ret = init_srcu_struct(&fs_info->subvol_srcu); - if (ret) { - err = ret; - goto fail; - } - - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_srcu; - } - - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_dio_bytes; - } - fs_info->dirty_metadata_batch = PAGE_SIZE * - (1 + ilog2(nr_cpu_ids)); - - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_dirty_metadata_bytes; - } - - ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, - GFP_KERNEL); - if (ret) { - err = ret; - goto fail_delalloc_bytes; - } - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -2711,6 +2660,11 @@ int __cold open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); +#ifdef CONFIG_BTRFS_DEBUG + INIT_LIST_HEAD(&fs_info->allocated_roots); + INIT_LIST_HEAD(&fs_info->allocated_ebs); + spin_lock_init(&fs_info->eb_leak_lock); +#endif extent_map_tree_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); @@ -2727,7 +2681,6 @@ int __cold open_ctree(struct super_block *sb, atomic_set(&fs_info->reada_works_cnt, 0); atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); - fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; @@ -2746,21 +2699,6 @@ int __cold open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; - goto fail_bio_counter; - } - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - - fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), - GFP_KERNEL); - if (!fs_info->delayed_root) { - err = -ENOMEM; - goto fail_iput; - } - btrfs_init_delayed_root(fs_info->delayed_root); - btrfs_init_scrub(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; @@ -2768,20 +2706,12 @@ int __cold open_ctree(struct super_block *sb, btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); - sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; - sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - - btrfs_init_btree_inode(fs_info); - spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; - extent_io_tree_init(fs_info, &fs_info->freed_extents[0], - IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); - extent_io_tree_init(fs_info, &fs_info->freed_extents[1], - IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); - fs_info->pinned_extents = &fs_info->freed_extents[0]; + extent_io_tree_init(fs_info, &fs_info->excluded_extents, + IO_TREE_FS_EXCLUDED_EXTENTS, NULL); set_bit(BTRFS_FS_BARRIER, &fs_info->flags); mutex_init(&fs_info->ordered_operations_mutex); @@ -2817,23 +2747,135 @@ int __cold open_ctree(struct super_block *sb, fs_info->swapfile_pins = RB_ROOT; fs_info->send_in_progress = 0; +} + +static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) +{ + int ret; + + fs_info->sb = sb; + sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; + sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); + + ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); + if (ret) + return ret; + + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); + if (ret) + return ret; + + fs_info->dirty_metadata_batch = PAGE_SIZE * + (1 + ilog2(nr_cpu_ids)); + + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); + if (ret) + return ret; + + ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, + GFP_KERNEL); + if (ret) + return ret; + + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), + GFP_KERNEL); + if (!fs_info->delayed_root) + return -ENOMEM; + btrfs_init_delayed_root(fs_info->delayed_root); + + return btrfs_alloc_stripe_hash_table(fs_info); +} + +static int btrfs_uuid_rescan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + int ret; - ret = btrfs_alloc_stripe_hash_table(fs_info); + /* + * 1st step is to iterate through the existing UUID tree and + * to delete all entries that contain outdated data. + * 2nd step is to add all missing entries to the UUID tree. + */ + ret = btrfs_uuid_tree_iterate(fs_info); + if (ret < 0) { + if (ret != -EINTR) + btrfs_warn(fs_info, "iterating uuid_tree failed %d", + ret); + up(&fs_info->uuid_tree_rescan_sem); + return ret; + } + return btrfs_uuid_scan_kthread(data); +} + +static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_rescan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 stripesize; + u64 generation; + u64 features; + u16 csum_type; + struct btrfs_key location; + struct btrfs_super_block *disk_super; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + int ret; + int err = -EINVAL; + int clear_free_space_tree = 0; + int level; + + ret = init_mount_fs_info(fs_info, sb); if (ret) { err = ret; - goto fail_alloc; + goto fail; } - __setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); + /* These need to be init'ed before we start creating inodes and such. */ + tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, + GFP_KERNEL); + fs_info->tree_root = tree_root; + chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, + GFP_KERNEL); + fs_info->chunk_root = chunk_root; + if (!tree_root || !chunk_root) { + err = -ENOMEM; + goto fail; + } + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail; + } + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + btrfs_init_btree_inode(fs_info); invalidate_bdev(fs_devices->latest_bdev); /* * Read super block and check the signature bytes only */ - bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); + disk_super = btrfs_read_dev_super(fs_devices->latest_bdev); + if (IS_ERR(disk_super)) { + err = PTR_ERR(disk_super); goto fail_alloc; } @@ -2841,18 +2883,19 @@ int __cold open_ctree(struct super_block *sb, * Verify the type first, if that or the the checksum value are * corrupted, we'll find out */ - csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); + csum_type = btrfs_super_csum_type(disk_super); if (!btrfs_supported_super_csum(csum_type)) { btrfs_err(fs_info, "unsupported checksum algorithm: %u", csum_type); err = -EINVAL; - brelse(bh); + btrfs_release_disk_super(disk_super); goto fail_alloc; } ret = btrfs_init_csum_hash(fs_info, csum_type); if (ret) { err = ret; + btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -2860,11 +2903,11 @@ int __cold open_ctree(struct super_block *sb, * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ - if (btrfs_check_super_csum(fs_info, bh->b_data)) { + if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; - brelse(bh); - goto fail_csum; + btrfs_release_disk_super(disk_super); + goto fail_alloc; } /* @@ -2872,8 +2915,8 @@ int __cold open_ctree(struct super_block *sb, * following bytes up to INFO_SIZE, the checksum is calculated from * the whole block of INFO_SIZE */ - memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); - brelse(bh); + memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy)); + btrfs_release_disk_super(disk_super); disk_super = fs_info->super_copy; @@ -2901,11 +2944,11 @@ int __cold open_ctree(struct super_block *sb, if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } if (!btrfs_super_root(disk_super)) - goto fail_csum; + goto fail_alloc; /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -2920,7 +2963,7 @@ int __cold open_ctree(struct super_block *sb, ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; - goto fail_csum; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super) & @@ -2930,7 +2973,7 @@ int __cold open_ctree(struct super_block *sb, "cannot mount because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super); @@ -2974,7 +3017,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_err(fs_info, "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", nodesize, sectorsize); - goto fail_csum; + goto fail_alloc; } /* @@ -2990,7 +3033,7 @@ int __cold open_ctree(struct super_block *sb, "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3021,8 +3064,6 @@ int __cold open_ctree(struct super_block *sb, generation = btrfs_super_chunk_root_generation(disk_super); level = btrfs_super_chunk_root_level(disk_super); - __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); - chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), generation, level, NULL); @@ -3038,7 +3079,8 @@ int __cold open_ctree(struct super_block *sb, chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); + offsetof(struct btrfs_header, chunk_tree_uuid), + BTRFS_UUID_SIZE); ret = btrfs_read_chunk_tree(fs_info); if (ret) { @@ -3061,6 +3103,18 @@ int __cold open_ctree(struct super_block *sb, if (ret) goto fail_tree_roots; + /* + * If we have a uuid root and we're not being told to rescan we need to + * check the generation here so we can set the + * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the + * transaction during a balance or the log replay without updating the + * uuid generation, and then if we crash we would rescan the uuid tree, + * even though it was perfectly fine. + */ + if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && + fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + ret = btrfs_verify_dev_extents(fs_info); if (ret) { btrfs_err(fs_info, @@ -3164,6 +3218,7 @@ int __cold open_ctree(struct super_block *sb, /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; @@ -3195,10 +3250,11 @@ int __cold open_ctree(struct super_block *sb, location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; - fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); btrfs_warn(fs_info, "failed to read fs tree: %d", err); + fs_info->fs_root = NULL; goto fail_qgroup; } @@ -3283,8 +3339,6 @@ int __cold open_ctree(struct super_block *sb, close_ctree(fs_info); return ret; } - } else { - set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); } set_bit(BTRFS_FS_OPEN, &fs_info->flags); @@ -3327,90 +3381,78 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); -fail_csum: - btrfs_free_csum_hash(fs_info); fail_alloc: -fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); -fail_bio_counter: - percpu_counter_destroy(&fs_info->dev_replace.bio_counter); -fail_delalloc_bytes: - percpu_counter_destroy(&fs_info->delalloc_bytes); -fail_dirty_metadata_bytes: - percpu_counter_destroy(&fs_info->dirty_metadata_bytes); -fail_dio_bytes: - percpu_counter_destroy(&fs_info->dio_bytes); -fail_srcu: - cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); -static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +static void btrfs_end_super_write(struct bio *bio) { - if (uptodate) { - set_buffer_uptodate(bh); - } else { - struct btrfs_device *device = (struct btrfs_device *) - bh->b_private; - - btrfs_warn_rl_in_rcu(device->fs_info, - "lost page write due to IO error on %s", - rcu_str_deref(device->name)); - /* note, we don't set_buffer_write_io_error because we have - * our own ways of dealing with the IO errors - */ - clear_buffer_uptodate(bh); - btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); + struct btrfs_device *device = bio->bi_private; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + struct page *page; + + bio_for_each_segment_all(bvec, bio, iter_all) { + page = bvec->bv_page; + + if (bio->bi_status) { + btrfs_warn_rl_in_rcu(device->fs_info, + "lost page write due to IO error on %s (%d)", + rcu_str_deref(device->name), + blk_status_to_errno(bio->bi_status)); + ClearPageUptodate(page); + SetPageError(page); + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_WRITE_ERRS); + } else { + SetPageUptodate(page); + } + + put_page(page); + unlock_page(page); } - unlock_buffer(bh); - put_bh(bh); + + bio_put(bio); } -int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, - struct buffer_head **bh_ret) +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, + int copy_num) { - struct buffer_head *bh; struct btrfs_super_block *super; + struct page *page; u64 bytenr; + struct address_space *mapping = bdev->bd_inode->i_mapping; bytenr = btrfs_sb_offset(copy_num); if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); - bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); - /* - * If we fail to read from the underlying devices, as of now - * the best option we have is to mark it EIO. - */ - if (!bh) - return -EIO; + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page)) + return ERR_CAST(page); - super = (struct btrfs_super_block *)bh->b_data; + super = page_address(page); if (btrfs_super_bytenr(super) != bytenr || btrfs_super_magic(super) != BTRFS_MAGIC) { - brelse(bh); - return -EINVAL; + btrfs_release_disk_super(super); + return ERR_PTR(-EINVAL); } - *bh_ret = bh; - return 0; + return super; } -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) { - struct buffer_head *bh; - struct buffer_head *latest = NULL; - struct btrfs_super_block *super; + struct btrfs_super_block *super, *latest = NULL; int i; u64 transid = 0; - int ret = -EINVAL; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. @@ -3418,48 +3460,41 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - ret = btrfs_read_dev_one_super(bdev, i, &bh); - if (ret) + super = btrfs_read_dev_one_super(bdev, i); + if (IS_ERR(super)) continue; - super = (struct btrfs_super_block *)bh->b_data; - if (!latest || btrfs_super_generation(super) > transid) { - brelse(latest); - latest = bh; + if (latest) + btrfs_release_disk_super(super); + + latest = super; transid = btrfs_super_generation(super); - } else { - brelse(bh); } } - if (!latest) - return ERR_PTR(ret); - - return latest; + return super; } /* * Write superblock @sb to the @device. Do not wait for completion, all the - * buffer heads we write are pinned. + * pages we use for writing are locked. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * - * Return number of errors when buffer head is not found or submission fails. + * Return number of errors when page is not found or submission fails. */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct btrfs_fs_info *fs_info = device->fs_info; + struct address_space *mapping = device->bdev->bd_inode->i_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - struct buffer_head *bh; int i; - int ret; int errors = 0; u64 bytenr; - int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3467,6 +3502,10 @@ static int write_dev_supers(struct btrfs_device *device, shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { + struct page *page; + struct bio *bio; + struct btrfs_super_block *disk_super; + bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) @@ -3479,37 +3518,45 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); crypto_shash_final(shash, sb->csum); - /* One reference for us, and we leave it for the caller */ - bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { + page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, + GFP_NOFS); + if (!page) { btrfs_err(device->fs_info, - "couldn't get super buffer head for bytenr %llu", + "couldn't get super block page for bytenr %llu", bytenr); errors++; continue; } - memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + /* Bump the refcount for wait_dev_supers() */ + get_page(page); - /* one reference for submit_bh */ - get_bh(bh); + disk_super = page_address(page); + memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); - set_buffer_uptodate(bh); - lock_buffer(bh); - bh->b_end_io = btrfs_end_buffer_write_sync; - bh->b_private = device; + /* + * Directly use bios here instead of relying on the page cache + * to do I/O, so we don't lose the ability to do integrity + * checking. + */ + bio = bio_alloc(GFP_NOFS, 1); + bio_set_dev(bio, device->bdev); + bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; + bio->bi_private = device; + bio->bi_end_io = btrfs_end_super_write; + __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, + offset_in_page(bytenr)); /* - * we fua the first super. The others we allow - * to go down lazy. + * We FUA only the first super block. The others we allow to + * go down lazy and there's a short window where the on-disk + * copies might still contain the older version. */ - op_flags = REQ_SYNC | REQ_META | REQ_PRIO; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) - op_flags |= REQ_FUA; - ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh); - if (ret) - errors++; + bio->bi_opf |= REQ_FUA; + + btrfsic_submit_bio(bio); } return errors < i ? 0 : -1; } @@ -3518,12 +3565,11 @@ static int write_dev_supers(struct btrfs_device *device, * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * - * Return number of errors when buffer head is not found or not marked up to + * Return number of errors when page is not found or not marked up to * date. */ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { - struct buffer_head *bh; int i; int errors = 0; bool primary_failed = false; @@ -3533,32 +3579,34 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { + struct page *page; + bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; - bh = __find_get_block(device->bdev, - bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { + page = find_get_page(device->bdev->bd_inode->i_mapping, + bytenr >> PAGE_SHIFT); + if (!page) { errors++; if (i == 0) primary_failed = true; continue; } - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { + /* Page is submitted locked and unlocked once the IO completes */ + wait_on_page_locked(page); + if (PageError(page)) { errors++; if (i == 0) primary_failed = true; } - /* drop our reference */ - brelse(bh); + /* Drop our reference */ + put_page(page); - /* drop the reference from the writing run */ - brelse(bh); + /* Drop the reference from the writing run */ + put_page(page); } /* log error, force error return */ @@ -3830,20 +3878,19 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { + bool drop_ref = false; + spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); - if (btrfs_root_refs(&root->root_item) == 0) - synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); if (root->reloc_root) { - free_extent_buffer(root->reloc_root->node); - free_extent_buffer(root->reloc_root->commit_root); - btrfs_put_fs_root(root->reloc_root); + btrfs_put_root(root->reloc_root); root->reloc_root = NULL; } } @@ -3852,22 +3899,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, __btrfs_remove_free_space_cache(root->free_ino_pinned); if (root->free_ino_ctl) __btrfs_remove_free_space_cache(root->free_ino_ctl); - btrfs_free_fs_root(root); -} - -void btrfs_free_fs_root(struct btrfs_root *root) -{ - iput(root->ino_cache_inode); - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); - if (root->anon_dev) - free_anon_bdev(root->anon_dev); - if (root->subv_writers) - btrfs_free_subvolume_writers(root->subv_writers); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); - btrfs_put_fs_root(root); + if (root->ino_cache_inode) { + iput(root->ino_cache_inode); + root->ino_cache_inode = NULL; + } + if (drop_ref) + btrfs_put_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -3877,15 +3914,14 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) int i = 0; int err = 0; unsigned int ret = 0; - int index; while (1) { - index = srcu_read_lock(&fs_info->subvol_srcu); + spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); if (!ret) { - srcu_read_unlock(&fs_info->subvol_srcu, index); + spin_unlock(&fs_info->fs_roots_radix_lock); break; } root_objectid = gang[ret - 1]->root_key.objectid + 1; @@ -3897,9 +3933,9 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) continue; } /* grab all the search result for later use */ - gang[i] = btrfs_grab_fs_root(gang[i]); + gang[i] = btrfs_grab_root(gang[i]); } - srcu_read_unlock(&fs_info->subvol_srcu, index); + spin_unlock(&fs_info->fs_roots_radix_lock); for (i = 0; i < ret; i++) { if (!gang[i]) @@ -3908,7 +3944,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) err = btrfs_orphan_cleanup(gang[i]); if (err) break; - btrfs_put_fs_root(gang[i]); + btrfs_put_root(gang[i]); } root_objectid++; } @@ -3916,7 +3952,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) /* release the uncleaned roots due to error */ for (; i < ret; i++) { if (gang[i]) - btrfs_put_fs_root(gang[i]); + btrfs_put_root(gang[i]); } return err; } @@ -3988,6 +4024,19 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ btrfs_delete_unused_bgs(fs_info); + /* + * There might be existing delayed inode workers still running + * and holding an empty delayed inode item. We must wait for + * them to complete first because they can create a transaction. + * This happens when someone calls btrfs_balance_delayed_items() + * and then a transaction commit runs the same delayed nodes + * before any delayed worker has done something with the nodes. + * We must wait for any worker here and not at transaction + * commit time since that could cause a deadlock. + * This is a very rare case. + */ + btrfs_flush_workqueue(fs_info->delayed_workers); + ret = btrfs_commit_super(fs_info); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); @@ -4018,8 +4067,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); - btrfs_free_fs_roots(fs_info); - btrfs_put_block_group_cache(fs_info); /* @@ -4031,6 +4078,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); + btrfs_free_fs_roots(fs_info); /* * We must free the block groups after dropping the fs_roots as we could @@ -4050,16 +4098,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_mapping_tree_free(&fs_info->mapping_tree); btrfs_close_devices(fs_info->fs_devices); - - percpu_counter_destroy(&fs_info->dirty_metadata_bytes); - percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->dio_bytes); - percpu_counter_destroy(&fs_info->dev_replace.bio_counter); - cleanup_srcu_struct(&fs_info->subvol_srcu); - - btrfs_free_csum_hash(fs_info); - btrfs_free_stripe_hash_table(fs_info); - btrfs_free_ref_cache(fs_info); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -4233,7 +4271,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, spin_lock(&delayed_refs->lock); if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); - btrfs_info(fs_info, "delayed_refs has NO entry"); + btrfs_debug(fs_info, "delayed_refs has NO entry"); return ret; } @@ -4267,14 +4305,36 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (pin_bytes) - btrfs_pin_extent(fs_info, head->bytenr, - head->num_bytes, 1); + if (pin_bytes) { + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, head->bytenr); + BUG_ON(!cache); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, + cache->space_info, head->num_bytes); + cache->reserved -= head->num_bytes; + cache->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + percpu_counter_add_batch( + &cache->space_info->total_bytes_pinned, + head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); + + btrfs_put_block_group(cache); + + btrfs_error_unpin_extent_range(fs_info, head->bytenr, + head->bytenr + head->num_bytes - 1); + } btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } + btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); @@ -4324,12 +4384,12 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); spin_unlock(&fs_info->delalloc_root_lock); btrfs_destroy_delalloc_inodes(root); - btrfs_put_fs_root(root); + btrfs_put_root(root); spin_lock(&fs_info->delalloc_root_lock); } @@ -4370,16 +4430,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, } static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *pinned_extents) + struct extent_io_tree *unpin) { - struct extent_io_tree *unpin; u64 start; u64 end; int ret; - bool loop = true; - unpin = pinned_extents; -again: while (1) { struct extent_state *cached_state = NULL; @@ -4404,15 +4460,6 @@ again: cond_resched(); } - if (loop) { - if (unpin == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; - loop = false; - goto again; - } - return 0; } @@ -4500,12 +4547,10 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, wake_up(&fs_info->transaction_wait); btrfs_destroy_delayed_inodes(fs_info); - btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); - btrfs_destroy_pinned_extent(fs_info, - fs_info->pinned_extents); + btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); @@ -4557,7 +4602,6 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) btrfs_destroy_all_ordered_extents(fs_info); btrfs_destroy_delayed_inodes(fs_info); btrfs_assert_delayed_root_empty(fs_info); - btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); btrfs_destroy_all_delalloc_inodes(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8c2d6cf1ce59..cd629113f61c 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,6 +39,8 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -54,15 +56,12 @@ int __cold open_ctree(struct super_block *sb, char *options); void __cold close_ctree(struct btrfs_fs_info *fs_info); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); -int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, - struct buffer_head **bh_ret); +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, + int copy_num); int btrfs_commit_super(struct btrfs_fs_info *fs_info); -struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, - struct btrfs_key *location); -int btrfs_init_fs_root(struct btrfs_root *root); -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id); +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); @@ -70,19 +69,13 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key, bool check_ref); -static inline struct btrfs_root * -btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, - struct btrfs_key *location) -{ - return btrfs_get_fs_root(fs_info, location, true); -} +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); @@ -95,19 +88,16 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); * If you want to ensure the whole tree is safe, you should use * fs_info->subvol_srcu */ -static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { + if (!root) + return NULL; if (refcount_inc_not_zero(&root->refs)) return root; return NULL; } -static inline void btrfs_put_fs_root(struct btrfs_root *root) -{ - if (refcount_dec_and_test(&root->refs)) - kfree(root); -} - +void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 72e312cae69d..2bb25d2dc44b 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -57,16 +57,14 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return type; } -static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, - int check_generation) +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; struct inode *inode; struct btrfs_key key; - int index; - int err = 0; if (objectid < BTRFS_FIRST_FREE_OBJECTID) return ERR_PTR(-ESTALE); @@ -75,25 +73,18 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto fail; - } + root = btrfs_get_fs_root(fs_info, &key, true); + if (IS_ERR(root)) + return ERR_CAST(root); key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(sb, &key, root); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto fail; - } - - srcu_read_unlock(&fs_info->subvol_srcu, index); + btrfs_put_root(root); + if (IS_ERR(inode)) + return ERR_CAST(inode); if (check_generation && generation != inode->i_generation) { iput(inode); @@ -101,9 +92,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, } return d_obtain_alias(inode); -fail: - srcu_read_unlock(&fs_info->subvol_srcu, index); - return ERR_PTR(err); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -152,7 +140,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); } -static struct dentry *btrfs_get_parent(struct dentry *child) +struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = d_inode(child); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 57488ecd7d4e..f32f4113c976 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -18,4 +18,9 @@ struct btrfs_fid { u64 parent_root_objectid; } __attribute__ ((packed)); +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation); +struct dentry *btrfs_get_parent(struct dentry *child); + #endif diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index a3febe746c79..b4a7bad3e82e 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -36,13 +36,14 @@ struct io_failure_record; #define CHUNK_TRIMMED EXTENT_DEFRAG enum { - IO_TREE_FS_INFO_FREED_EXTENTS0, - IO_TREE_FS_INFO_FREED_EXTENTS1, + IO_TREE_FS_PINNED_EXTENTS, + IO_TREE_FS_EXCLUDED_EXTENTS, IO_TREE_INODE_IO, IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, IO_TREE_TRANS_DIRTY_PAGES, IO_TREE_ROOT_DIRTY_LOG_PAGES, + IO_TREE_INODE_FILE_EXTENT, IO_TREE_SELFTEST, }; @@ -222,6 +223,8 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state **cached_state); void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, unsigned bits); +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0163fdd59f8f..54a64d1e18c6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -64,10 +64,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; - set_extent_bits(&fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE); - set_extent_bits(&fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE); + set_extent_bits(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE); return 0; } @@ -79,10 +77,8 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) start = cache->start; end = start + cache->length - 1; - clear_extent_bits(&fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE); - clear_extent_bits(&fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE); + clear_extent_bits(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE); } static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) @@ -1193,24 +1189,6 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add) -{ - int ret; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); - } else { - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); - } - return ret; -} - static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, @@ -1469,7 +1447,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, @@ -1494,11 +1471,17 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ - ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, - owner, offset, refs_to_add); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, path, bytenr, parent, + root_objectid); + } else { + ret = insert_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset, + refs_to_add); + } if (ret) btrfs_abort_transaction(trans, ret); out: @@ -1583,7 +1566,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, int err = 0; int metadata = !extent_op->is_data; - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) @@ -1604,7 +1587,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->reada = READA_FORWARD; path->leave_spinning = 1; ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); if (ret < 0) { @@ -1703,10 +1685,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, { int ret = 0; - if (trans->aborted) { + if (TRANS_ABORTED(trans)) { if (insert_reserved) - btrfs_pin_extent(trans->fs_info, node->bytenr, - node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); return 0; } @@ -1721,8 +1702,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, else BUG(); if (ret && insert_reserved) - btrfs_pin_extent(trans->fs_info, node->bytenr, - node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); return ret; } @@ -1867,8 +1847,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { - btrfs_pin_extent(fs_info, head->bytenr, - head->num_bytes, 1); + btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); if (head->is_data) { ret = btrfs_del_csums(trans, fs_info->csum_root, head->bytenr, head->num_bytes); @@ -2191,7 +2170,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, int run_all = count == (unsigned long)-1; /* We'll clean this up in btrfs_cleanup_transaction */ - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) @@ -2238,7 +2217,7 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 flags, + struct extent_buffer *eb, u64 flags, int level, int is_data) { struct btrfs_delayed_extent_op *extent_op; @@ -2254,7 +2233,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->is_data = is_data ? true : false; extent_op->level = level; - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; @@ -2588,7 +2567,8 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) return bytenr; } -static int pin_down_extent(struct btrfs_block_group *cache, +static int pin_down_extent(struct btrfs_trans_handle *trans, + struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -2607,22 +2587,20 @@ static int pin_down_extent(struct btrfs_block_group *cache, percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(fs_info->pinned_extents, bytenr, + set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; } -int btrfs_pin_extent(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_block_group *cache; - ASSERT(fs_info->running_transaction); - - cache = btrfs_lookup_block_group(fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(cache, bytenr, num_bytes, reserved); + pin_down_extent(trans, cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; @@ -2631,13 +2609,15 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_block_group *cache; int ret; - cache = btrfs_lookup_block_group(fs_info, bytenr); + btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes); + + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); if (!cache) return -EINVAL; @@ -2649,7 +2629,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, */ btrfs_cache_block_group(cache, 1); - pin_down_extent(cache, bytenr, num_bytes, 0); + pin_down_extent(trans, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); @@ -2763,11 +2743,6 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) } } - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - fs_info->pinned_extents = &fs_info->freed_extents[1]; - else - fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->commit_root_sem); btrfs_update_global_block_rsv(fs_info); @@ -2908,12 +2883,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) u64 end; int ret; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; + unpin = &trans->transaction->pinned_extents; - while (!trans->aborted) { + while (!TRANS_ABORTED(trans)) { struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); @@ -2923,6 +2895,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + clear_extent_bits(&fs_info->excluded_extents, start, + end, EXTENT_UPTODATE); if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, @@ -2950,7 +2925,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) u64 trimmed = 0; ret = -EROFS; - if (!trans->aborted) + if (!TRANS_ABORTED(trans)) ret = btrfs_discard_extent(fs_info, block_group->start, block_group->length, @@ -3000,7 +2975,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; @@ -3301,7 +3275,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(cache, buf->start, buf->len, 1); + pin_down_extent(trans, cache, buf->start, buf->len, 1); btrfs_put_block_group(cache); goto out; } @@ -3345,7 +3319,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) (ref->type == BTRFS_REF_DATA && ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ - btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); + btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); old_ref_mod = new_ref_mod = 0; ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { @@ -3438,6 +3412,10 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, +}; + /* * Structure used internally for find_free_extent() function. Wraps needed * parameters. @@ -3454,6 +3432,8 @@ struct find_free_extent_ctl { /* For clustered allocation */ u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; bool have_caching_bg; bool orig_have_caching_bg; @@ -3489,6 +3469,12 @@ struct find_free_extent_ctl { /* Found result */ u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; }; @@ -3501,11 +3487,11 @@ struct find_free_extent_ctl { * Return 0 means we have found a location and set ffe_ctl->found_offset. */ static int find_free_extent_clustered(struct btrfs_block_group *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group **cluster_bg_ret) + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **cluster_bg_ret) { struct btrfs_block_group *cluster_bg; + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 aligned_cluster; u64 offset; int ret; @@ -3605,9 +3591,9 @@ refill_cluster: * Return -EAGAIN to inform caller that we need to re-search this block group */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl) + struct find_free_extent_ctl *ffe_ctl) { + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 offset; /* @@ -3663,16 +3649,101 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, return 0; } +static int do_allocation_clustered(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + int ret; + + /* We want to try and use the cluster allocator, so lets look there */ + if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { + ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); + if (ret >= 0 || ret == -EAGAIN) + return ret; + /* ret == -ENOENT case falls through */ + } + + return find_free_extent_unclustered(block_group, ffe_ctl); +} + +static int do_allocation(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return do_allocation_clustered(block_group, ffe_ctl, bg_ret); + default: + BUG(); + } +} + +static void release_block_group(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + int delalloc) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + ffe_ctl->retry_clustered = false; + ffe_ctl->retry_unclustered = false; + break; + default: + BUG(); + } + + BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != + ffe_ctl->index); + btrfs_release_block_group(block_group, delalloc); +} + +static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + if (!ffe_ctl->use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } +} + +static void found_extent(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + found_extent_clustered(ffe_ctl, ins); + break; + default: + BUG(); + } +} + +static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + /* + * If we can't allocate a new chunk we've already looped through + * at least once, move on to the NO_EMPTY_SIZE case. + */ + ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + return 0; + default: + BUG(); + } +} + /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. * Return <0 means we failed to locate any free extent. */ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - struct btrfs_free_cluster *last_ptr, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, - int full_search, bool use_cluster) + bool full_search) { struct btrfs_root *root = fs_info->extent_root; int ret; @@ -3689,11 +3760,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return 1; if (ins->objectid) { - if (!use_cluster && last_ptr) { - spin_lock(&last_ptr->lock); - last_ptr->window_start = ins->objectid; - spin_unlock(&last_ptr->lock); - } + found_extent(ffe_ctl, ins); return 0; } @@ -3739,16 +3806,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, CHUNK_ALLOC_FORCE); - /* - * If we can't allocate a new chunk we've already looped - * through at least once, move on to the NO_EMPTY_SIZE - * case. - */ - if (ret == -ENOSPC) - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - /* Do not bail out on ENOSPC since we can do more. */ - if (ret < 0 && ret != -ENOSPC) + if (ret == -ENOSPC) + ret = chunk_allocation_failed(ffe_ctl); + else if (ret < 0) btrfs_abort_transaction(trans, ret); else ret = 0; @@ -3759,6 +3820,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) + return -ENOSPC; + /* * Don't loop again if we already have no empty_size and * no empty_cluster. @@ -3774,6 +3838,71 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, + struct btrfs_key *ins) +{ + /* + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. + */ + if (space_info->max_extent_size) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + ffe_ctl->num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + ffe_ctl->use_cluster = false; + } + spin_unlock(&space_info->lock); + } + + ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl->empty_cluster); + if (ffe_ctl->last_ptr) { + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + ffe_ctl->hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + ffe_ctl->hint_byte = last_ptr->window_start; + ffe_ctl->use_cluster = false; + } + spin_unlock(&last_ptr->lock); + } + + return 0; +} + +static int prepare_allocation(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return prepare_allocation_clustered(fs_info, ffe_ctl, + space_info, ins); + default: + BUG(); + } +} + /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: @@ -3801,16 +3930,14 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, */ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 ram_bytes, u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, + u64 hint_byte_orig, struct btrfs_key *ins, u64 flags, int delalloc) { int ret = 0; int cache_block_group_error = 0; - struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group *block_group = NULL; struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; - bool use_cluster = true; bool full_search = false; WARN_ON(num_bytes < fs_info->sectorsize); @@ -3819,13 +3946,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ffe_ctl.empty_size = empty_size; ffe_ctl.flags = flags; ffe_ctl.search_start = 0; - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; ffe_ctl.delalloc = delalloc; ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); ffe_ctl.have_caching_bg = false; ffe_ctl.orig_have_caching_bg = false; ffe_ctl.found_offset = 0; + ffe_ctl.hint_byte = hint_byte_orig; + ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + + /* For clustered allocation */ + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; + ffe_ctl.last_ptr = NULL; + ffe_ctl.use_cluster = true; ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; @@ -3839,51 +3972,14 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, return -ENOSPC; } - /* - * If our free space is heavily fragmented we may not be able to make - * big contiguous allocations, so instead of doing the expensive search - * for free space, simply return ENOSPC with our max_extent_size so we - * can go ahead and search for a more manageable chunk. - * - * If our max_extent_size is large enough for our allocation simply - * disable clustering since we will likely not be able to find enough - * space to create a cluster and induce latency trying. - */ - if (unlikely(space_info->max_extent_size)) { - spin_lock(&space_info->lock); - if (space_info->max_extent_size && - num_bytes > space_info->max_extent_size) { - ins->offset = space_info->max_extent_size; - spin_unlock(&space_info->lock); - return -ENOSPC; - } else if (space_info->max_extent_size) { - use_cluster = false; - } - spin_unlock(&space_info->lock); - } - - last_ptr = fetch_cluster_info(fs_info, space_info, - &ffe_ctl.empty_cluster); - if (last_ptr) { - spin_lock(&last_ptr->lock); - if (last_ptr->block_group) - hint_byte = last_ptr->window_start; - if (last_ptr->fragmented) { - /* - * We still set window_start so we can keep track of the - * last place we found an allocation to try and save - * some time. - */ - hint_byte = last_ptr->window_start; - use_cluster = false; - } - spin_unlock(&last_ptr->lock); - } + ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins); + if (ret < 0) + return ret; ffe_ctl.search_start = max(ffe_ctl.search_start, first_logical_byte(fs_info, 0)); - ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); - if (ffe_ctl.search_start == hint_byte) { + ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte); + if (ffe_ctl.search_start == ffe_ctl.hint_byte) { block_group = btrfs_lookup_block_group(fs_info, ffe_ctl.search_start); /* @@ -3924,6 +4020,8 @@ search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[ffe_ctl.index], list) { + struct btrfs_block_group *bg_ret; + /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) continue; @@ -3984,39 +4082,20 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; - /* - * Ok we want to try and use the cluster allocator, so - * lets look there - */ - if (last_ptr && use_cluster) { - struct btrfs_block_group *cluster_bg = NULL; - - ret = find_free_extent_clustered(block_group, last_ptr, - &ffe_ctl, &cluster_bg); - - if (ret == 0) { - if (cluster_bg && cluster_bg != block_group) { - btrfs_release_block_group(block_group, - delalloc); - block_group = cluster_bg; - } - goto checks; - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { - goto loop; + bg_ret = NULL; + ret = do_allocation(block_group, &ffe_ctl, &bg_ret); + if (ret == 0) { + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, delalloc); + block_group = bg_ret; } - /* ret == -ENOENT case falls through */ - } - - ret = find_free_extent_unclustered(block_group, last_ptr, - &ffe_ctl); - if (ret == -EAGAIN) + } else if (ret == -EAGAIN) { goto have_block_group; - else if (ret > 0) + } else if (ret > 0) { goto loop; - /* ret == 0 case falls through */ -checks: + } + + /* Checks */ ffe_ctl.search_start = round_up(ffe_ctl.found_offset, fs_info->stripesize); @@ -4050,17 +4129,12 @@ checks: btrfs_release_block_group(block_group, delalloc); break; loop: - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; - BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != - ffe_ctl.index); - btrfs_release_block_group(block_group, delalloc); + release_block_group(block_group, &ffe_ctl, delalloc); cond_resched(); } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, - full_search, use_cluster); + ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search); if (ret > 0) goto search; @@ -4189,18 +4263,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, + u64 len) { struct btrfs_block_group *cache; int ret = 0; - cache = btrfs_lookup_block_group(fs_info, start); + cache = btrfs_lookup_block_group(trans->fs_info, start); if (!cache) { - btrfs_err(fs_info, "unable to find block group for %llu", start); + btrfs_err(trans->fs_info, "unable to find block group for %llu", + start); return -ENOSPC; } - ret = pin_down_extent(cache, start, len, 1); + ret = pin_down_extent(trans, cache, start, len, 1); btrfs_put_block_group(cache); return ret; } @@ -4430,6 +4506,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1); + if (ret) + btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); btrfs_put_block_group(block_group); return ret; } @@ -4748,8 +4826,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, eb->start, - eb->len, flag, + ret = btrfs_set_disk_extent_flags(trans, eb, flag, btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; @@ -5207,9 +5284,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * * If called with for_reloc == 0, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; @@ -5248,9 +5323,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, if (err) goto out_end_trans; - if (block_rsv) - trans->block_rsv = block_rsv; - /* * This will help us catch people modifying the fs tree while we're * dropping it. It is unsafe to mess with the fs tree while it's being @@ -5378,8 +5450,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, err = PTR_ERR(trans); goto out_free; } - if (block_rsv) - trans->block_rsv = block_rsv; } } btrfs_release_path(path); @@ -5411,13 +5481,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } - if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); - } else { - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - btrfs_put_fs_root(root); - } + else + btrfs_put_root(root); root_dropped = true; out_end_trans: btrfs_end_transaction_throttle(trans); @@ -5747,47 +5814,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) return bg_ret; return dev_ret; } - -/* - * btrfs_{start,end}_write_no_snapshotting() are similar to - * mnt_{want,drop}_write(), they are used to prevent some tasks from writing - * data into the page cache through nocow before the subvolume is snapshoted, - * but flush the data into disk after the snapshot creation, or to prevent - * operations while snapshotting is ongoing and that cause the snapshot to be - * inconsistent (writes followed by expanding truncates for example). - */ -void btrfs_end_write_no_snapshotting(struct btrfs_root *root) -{ - percpu_counter_dec(&root->subv_writers->counter); - cond_wake_up(&root->subv_writers->wait); -} - -int btrfs_start_write_no_snapshotting(struct btrfs_root *root) -{ - if (atomic_read(&root->will_be_snapshotted)) - return 0; - - percpu_counter_inc(&root->subv_writers->counter); - /* - * Make sure counter is updated before we check for snapshot creation. - */ - smp_mb(); - if (atomic_read(&root->will_be_snapshotted)) { - btrfs_end_write_no_snapshotting(root); - return 0; - } - return 1; -} - -void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshotting(root); - if (ret) - break; - wait_var_event(&root->will_be_snapshotted, - !atomic_read(&root->will_be_snapshotted)); - } -} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c0f202741e09..39e45b8a5031 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -35,42 +35,54 @@ static inline bool extent_state_in_tree(const struct extent_state *state) } #ifdef CONFIG_BTRFS_DEBUG -static LIST_HEAD(buffers); static LIST_HEAD(states); - static DEFINE_SPINLOCK(leak_lock); -static inline -void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) +static inline void btrfs_leak_debug_add(spinlock_t *lock, + struct list_head *new, + struct list_head *head) { unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); + spin_lock_irqsave(lock, flags); list_add(new, head); - spin_unlock_irqrestore(&leak_lock, flags); + spin_unlock_irqrestore(lock, flags); } -static inline -void btrfs_leak_debug_del(struct list_head *entry) +static inline void btrfs_leak_debug_del(spinlock_t *lock, + struct list_head *entry) { unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); + spin_lock_irqsave(lock, flags); list_del(entry); - spin_unlock_irqrestore(&leak_lock, flags); + spin_unlock_irqrestore(lock, flags); } -static inline void btrfs_extent_buffer_leak_debug_check(void) +void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) { struct extent_buffer *eb; + unsigned long flags; - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); - pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", - eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); + /* + * If we didn't get into open_ctree our allocated_ebs will not be + * initialized, so just skip this. + */ + if (!fs_info->allocated_ebs.next) + return; + + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + while (!list_empty(&fs_info->allocated_ebs)) { + eb = list_first_entry(&fs_info->allocated_ebs, + struct extent_buffer, leak_list); + pr_err( + "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, + btrfs_header_owner(eb)); list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } static inline void btrfs_extent_state_leak_debug_check(void) @@ -107,9 +119,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, } } #else -#define btrfs_leak_debug_add(new, head) do {} while (0) -#define btrfs_leak_debug_del(entry) do {} while (0) -#define btrfs_extent_buffer_leak_debug_check() do {} while (0) +#define btrfs_leak_debug_add(lock, new, head) do {} while (0) +#define btrfs_leak_debug_del(lock, entry) do {} while (0) #define btrfs_extent_state_leak_debug_check() do {} while (0) #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif @@ -122,7 +133,6 @@ struct tree_entry { struct extent_page_data { struct bio *bio; - struct extent_io_tree *tree; /* tells writepage not to lock the state bits for this range * it still does the unlocking */ @@ -246,8 +256,6 @@ void __cold extent_state_cache_exit(void) void __cold extent_io_exit(void) { - btrfs_extent_buffer_leak_debug_check(); - /* * Make sure all delayed rcu free are flushed before we * destroy caches. @@ -257,6 +265,15 @@ void __cold extent_io_exit(void) bioset_exit(&btrfs_bioset); } +/* + * For the file_extent_tree, we want to hold the inode lock when we lookup and + * update the disk_i_size, but lockdep will complain because our io_tree we hold + * the tree lock and get the inode lock when setting delalloc. These two things + * are unrelated, so make a class for the file_extent_tree so we don't get the + * two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; + void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner, void *private_data) @@ -268,6 +285,8 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, spin_lock_init(&tree->lock); tree->private_data = private_data; tree->owner = owner; + if (owner == IO_TREE_INODE_FILE_EXTENT) + lockdep_set_class(&tree->lock, &file_extent_tree_class); } void extent_io_tree_release(struct extent_io_tree *tree) @@ -314,7 +333,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->state = 0; state->failrec = NULL; RB_CLEAR_NODE(&state->rb_node); - btrfs_leak_debug_add(&state->leak_list, &states); + btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); @@ -327,7 +346,7 @@ void free_extent_state(struct extent_state *state) return; if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); - btrfs_leak_debug_del(&state->leak_list); + btrfs_leak_debug_del(&leak_lock, &state->leak_list); trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } @@ -1053,6 +1072,16 @@ hit_next: goto out; } + /* + * If this extent already has all the bits we want set, then + * skip it, not necessary to split it or do anything with it. + */ + if ((state->state & bits) == bits) { + start = state->end + 1; + cache_state(state, cached_state); + goto search_again; + } + prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); @@ -1568,6 +1597,43 @@ out: } /** + * find_contiguous_extent_bit: find a contiguous area of bits + * @tree - io tree to check + * @start - offset to start the search from + * @start_ret - the first offset we found with the bits set + * @end_ret - the final contiguous range of the bits that were set + * @bits - bits to look for + * + * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges + * to set bits appropriately, and then merge them again. During this time it + * will drop the tree->lock, so use this helper if you want to find the actual + * contiguous area for given bits. We will search to the first bit we find, and + * then walk down the tree until we find a non-contiguous area. The area + * returned will be the full contiguous area with the bits set. + */ +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, bits); + if (state) { + *start_ret = state->start; + *end_ret = state->end; + while ((state = next_state(state)) != NULL) { + if (state->start > (*end_ret + 1)) + break; + *end_ret = state->end; + } + ret = 0; + } + spin_unlock(&tree->lock); + return ret; +} + +/** * find_first_clear_extent_bit - find the first range that has @bits not set. * This range could start before @start. * @@ -2926,7 +2992,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) /* * @opf: bio REQ_OP_* and REQ_* flags as one value - * @tree: tree so we can call our merge_bio hook * @wbc: optional writeback control for io accounting * @page: page to add to the bio * @pg_offset: offset of the new bio or to check whether we are adding @@ -2939,7 +3004,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @bio_flags: flags of the current bio to see if we can merge them */ -static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, +static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, struct page *page, u64 offset, size_t size, unsigned long pg_offset, @@ -2954,6 +3019,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct bio *bio; size_t page_size = min_t(size_t, size, PAGE_SIZE); sector_t sector = offset >> 9; + struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; ASSERT(bio_ret); @@ -3062,8 +3128,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct extent_io_tree *tree, - struct page *page, +static int __do_readpage(struct page *page, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, @@ -3086,6 +3151,7 @@ static int __do_readpage(struct extent_io_tree *tree, size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; unsigned long this_bio_flag = 0; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; set_page_extent_mapped(page); @@ -3242,7 +3308,7 @@ static int __do_readpage(struct extent_io_tree *tree, continue; } - ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, + ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, page, offset, disk_io_size, pg_offset, bio, end_bio_extent_readpage, mirror_num, @@ -3269,8 +3335,7 @@ out: return ret; } -static inline void contiguous_readpages(struct extent_io_tree *tree, - struct page *pages[], int nr_pages, +static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, struct bio **bio, @@ -3280,17 +3345,16 @@ static inline void contiguous_readpages(struct extent_io_tree *tree, struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; - btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, + __do_readpage(pages[index], btrfs_get_extent, em_cached, bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -static int __extent_read_full_page(struct extent_io_tree *tree, - struct page *page, +static int __extent_read_full_page(struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, unsigned long *bio_flags, @@ -3301,21 +3365,21 @@ static int __extent_read_full_page(struct extent_io_tree *tree, u64 end = start + PAGE_SIZE - 1; int ret; - btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, + ret = __do_readpage(page, get_extent, NULL, bio, mirror_num, bio_flags, read_flags, NULL); return ret; } -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num) +int extent_read_full_page(struct page *page, get_extent_t *get_extent, + int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, + ret = __extent_read_full_page(page, get_extent, &bio, mirror_num, &bio_flags, 0); if (bio) ret = submit_one_bio(bio, mirror_num, bio_flags); @@ -3423,7 +3487,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, unsigned long nr_written, int *nr_ret) { - struct extent_io_tree *tree = epd->tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 start = page_offset(page); u64 page_end = start + PAGE_SIZE - 1; u64 end; @@ -3509,7 +3573,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page->index, cur, end); } - ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, offset, iosize, pg_offset, &epd->bio, end_bio_extent_writepage, @@ -3830,8 +3894,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) { - struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; u32 nritems; int i, num_pages; @@ -3864,7 +3926,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, p, offset, PAGE_SIZE, 0, &epd->bio, end_bio_extent_buffer_writepage, @@ -3897,14 +3959,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; struct extent_buffer *eb, *prev_eb = NULL; struct extent_page_data epd = { .bio = NULL, - .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; int done = 0; int nr_to_write_done = 0; @@ -4018,7 +4079,39 @@ retry: end_write_bio(&epd, ret); return ret; } - ret = flush_write_bio(&epd); + /* + * If something went wrong, don't allow any metadata write bio to be + * submitted. + * + * This would prevent use-after-free if we had dirty pages not + * cleaned up, which can still happen by fuzzed images. + * + * - Bad extent tree + * Allowing existing tree block to be allocated for other trees. + * + * - Log tree operations + * Exiting tree blocks get allocated to log tree, bumps its + * generation, then get cleaned in tree re-balance. + * Such tree block will not be written back, since it's clean, + * thus no WRITTEN flag set. + * And after log writes back, this tree block is not traced by + * any dirty extent_io_tree. + * + * - Offending tree block gets re-dirtied from its original owner + * Since it has bumped generation, no WRITTEN flag, it can be + * reused without COWing. This tree block will not be traced + * by btrfs_transaction::dirty_pages. + * + * Now such dirty tree block will not be cleaned by any dirty + * extent io tree. Thus we don't want to submit such wild eb + * if the fs already has error. + */ + if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + ret = flush_write_bio(&epd); + } else { + ret = -EUCLEAN; + end_write_bio(&epd, ret); + } return ret; } @@ -4190,7 +4283,6 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) int ret; struct extent_page_data epd = { .bio = NULL, - .tree = &BTRFS_I(page->mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4212,14 +4304,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, { int ret = 0; struct address_space *mapping = inode->i_mapping; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *page; unsigned long nr_pages = (end - start + PAGE_SIZE) >> PAGE_SHIFT; struct extent_page_data epd = { .bio = NULL, - .tree = tree, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, }; @@ -4263,7 +4353,6 @@ int extent_writepages(struct address_space *mapping, int ret = 0; struct extent_page_data epd = { .bio = NULL, - .tree = &BTRFS_I(mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4285,7 +4374,6 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, unsigned long bio_flags = 0; struct page *pagepool[16]; struct extent_map *em_cached = NULL; - struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; int nr = 0; u64 prev_em_start = (u64)-1; @@ -4312,7 +4400,7 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - contiguous_readpages(tree, pagepool, nr, contig_start, + contiguous_readpages(pagepool, nr, contig_start, contig_end, &em_cached, &bio, &bio_flags, &prev_em_start); } @@ -4796,7 +4884,6 @@ out_free_ulist: static void __free_extent_buffer(struct extent_buffer *eb) { - btrfs_leak_debug_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } @@ -4862,6 +4949,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { btrfs_release_extent_buffer_pages(eb); + btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); __free_extent_buffer(eb); } @@ -4883,7 +4971,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); - btrfs_leak_debug_add(&eb->leak_list, &buffers); + btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, + &fs_info->allocated_ebs); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); @@ -5230,6 +5319,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) } static int release_extent_buffer(struct extent_buffer *eb) + __releases(&eb->refs_lock) { lockdep_assert_held(&eb->refs_lock); @@ -5248,6 +5338,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } + btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -5405,7 +5496,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; - struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -5453,7 +5543,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = __extent_read_full_page(tree, page, + err = __extent_read_full_page(page, btree_get_extent, &bio, mirror_num, &bio_flags, REQ_META); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5d205bbaafdc..2ed65bd0760e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -189,8 +189,8 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num); +int extent_read_full_page(struct page *page, get_extent_t *get_extent, + int mirror_num); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); @@ -325,4 +325,11 @@ bool find_lock_delalloc_range(struct inode *inode, #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); + +#ifdef CONFIG_BTRFS_DEBUG +void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info); +#else +#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0) +#endif + #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6f417ff68980..bd6229fb2b6f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -237,6 +237,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) struct extent_map *merge = NULL; struct rb_node *rb; + /* + * We can't modify an extent map that is in the tree and that is being + * used by another task, as it can cause that other task to see it in + * inconsistent state during the merging. We always have 1 reference for + * the tree and 1 for this task (which is unpinning the extent map or + * clearing the logging flag), so anything > 2 means it's being used by + * other tasks too. + */ + if (refcount_read(&em->refs) > 2) + return; + if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c2f365662d55..b618ad5339ba 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -23,6 +23,97 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) +/** + * @inode - the inode we want to update the disk_i_size for + * @new_i_size - the i_size we want to set to, 0 if we use i_size + * + * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read() + * returns as it is perfectly fine with a file that has holes without hole file + * extent items. + * + * However without NO_HOLES we need to only return the area that is contiguous + * from the 0 offset of the file. Otherwise we could end up adjust i_size up + * to an extent that has a gap in between. + * + * Finally new_i_size should only be set in the case of truncate where we're not + * ready to use i_size_read() as the limiter yet. + */ +void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + u64 start, end, i_size; + int ret; + + i_size = new_i_size ?: i_size_read(inode); + if (btrfs_fs_incompat(fs_info, NO_HOLES)) { + BTRFS_I(inode)->disk_i_size = i_size; + return; + } + + spin_lock(&BTRFS_I(inode)->lock); + ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0, + &start, &end, EXTENT_DIRTY); + if (!ret && start == 0) + i_size = min(i_size, end + 1); + else + i_size = 0; + BTRFS_I(inode)->disk_i_size = i_size; + spin_unlock(&BTRFS_I(inode)->lock); +} + +/** + * @inode - the inode we're modifying + * @start - the start file offset of the file extent we've inserted + * @len - the logical length of the file extent item + * + * Call when we are inserting a new file extent where there was none before. + * Does not need to call this in the case where we're replacing an existing file + * extent, however if not sure it's fine to call this multiple times. + * + * The start and len must match the file extent item, so thus must be sectorsize + * aligned. + */ +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len) +{ + if (len == 0) + return 0; + + ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); + + if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) + return 0; + return set_extent_bits(&inode->file_extent_tree, start, start + len - 1, + EXTENT_DIRTY); +} + +/** + * @inode - the inode we're modifying + * @start - the start file offset of the file extent we've inserted + * @len - the logical length of the file extent item + * + * Called when we drop a file extent, for example when we truncate. Doesn't + * need to be called for cases where we're replacing a file extent, like when + * we've COWed a file extent. + * + * The start and len must match the file extent item, so thus must be sectorsize + * aligned. + */ +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len) +{ + if (len == 0) + return 0; + + ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) || + len == (u64)-1); + + if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) + return 0; + return clear_extent_bit(&inode->file_extent_tree, start, + start + len - 1, EXTENT_DIRTY, 0, 0, NULL); +} + static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, u16 csum_size) { @@ -949,18 +1040,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, btrfs_item_key_to_cpu(leaf, &key, slot); extent_start = key.offset; - - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - extent_end = extent_start + - btrfs_file_extent_num_bytes(leaf, fi); - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - size_t size; - size = btrfs_file_extent_ram_bytes(leaf, fi); - extent_end = ALIGN(extent_start + size, - fs_info->sectorsize); - } - + extent_end = btrfs_file_extent_end(path); em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -1007,3 +1087,30 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, root->root_key.objectid); } } + +/* + * Returns the end offset (non inclusive) of the file extent item the given path + * points to. If it points to an inline extent, the returned offset is rounded + * up to the sector size. + */ +u64 btrfs_file_extent_end(const struct btrfs_path *path) +{ + const struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 end; + + btrfs_item_key_to_cpu(leaf, &key, slot); + ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + end = btrfs_file_extent_ram_bytes(leaf, fi); + end = ALIGN(key.offset + end, leaf->fs_info->sectorsize); + } else { + end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + } + + return end; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a16da274c9aa..8a144f9cb7ac 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -27,6 +27,7 @@ #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" +#include "reflink.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -277,7 +278,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; - int index; int ret; /* get the inode */ @@ -285,9 +285,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - - inode_root = btrfs_read_fs_root_no_name(fs_info, &key); + inode_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(inode_root)) { ret = PTR_ERR(inode_root); goto cleanup; @@ -297,11 +295,11 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root); + btrfs_put_root(inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto cleanup; } - srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -337,7 +335,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; cleanup: - srcu_read_unlock(&fs_info->subvol_srcu, index); kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return ret; } @@ -1552,15 +1549,14 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, u64 num_bytes; int ret; - ret = btrfs_start_write_no_snapshotting(root); - if (!ret) + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; - btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); num_bytes = lockend - lockstart + 1; @@ -1568,7 +1564,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); @@ -1674,7 +1670,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, data_reserved, pos, write_bytes); else - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); break; } @@ -1778,7 +1774,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); if (only_release_metadata && copied > 0) { lockstart = round_down(pos, @@ -1807,7 +1803,7 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { @@ -2071,6 +2067,16 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(&ctx, inode); /* + * Set the range to full if the NO_HOLES feature is not enabled. + * This is to avoid missing file extent items representing holes after + * replaying the log. + */ + if (!btrfs_fs_incompat(fs_info, NO_HOLES)) { + start = 0; + end = LLONG_MAX; + } + + /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by * multi-task, and make the performance up. See @@ -2092,19 +2098,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges, and assertion failures during - * hole detection. Do this while holding the inode lock, to avoid races - * with other tasks. - */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } - - /* * Before we acquired the inode's lock, someone may have dirtied more * pages in the target range. We need to make sure that writeback for * any such pages does not start while we are logging the inode, because @@ -2124,6 +2117,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2486,6 +2480,11 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + clone_info->file_offset, clone_len); + if (ret) + return ret; + /* If it's a hole, nothing more needs to be done. */ if (clone_info->disk_offset == 0) return 0; @@ -2596,6 +2595,24 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } + } else if (!clone_info && cur_offset < drop_end) { + /* + * We are past the i_size here, but since we didn't + * insert holes we need to clear the mapped area so we + * know to not set disk_i_size in this area until a new + * file extent is inserted here. + */ + ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + cur_offset, drop_end - cur_offset); + if (ret) { + /* + * We couldn't clear our area, so we could + * presumably adjust up and corrupt the fs, so + * we need to abort. + */ + btrfs_abort_transaction(trans, ret); + break; + } } if (clone_info && drop_end > clone_info->file_offset) { @@ -2686,6 +2703,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); goto out_trans; } + } else if (!clone_info && cur_offset < drop_end) { + /* See the comment in the loop above for the reasoning here. */ + ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + cur_offset, drop_end - cur_offset); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + } if (clone_info) { ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, @@ -2935,7 +2961,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode, inode->i_ctime = current_time(inode); i_size_write(inode, end); - btrfs_ordered_update_i_size(inode, end, NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode(trans, root, inode); ret2 = btrfs_end_transaction(trans); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0598fd3c6e3f..3613da065a73 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -371,10 +371,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) } } -static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode, - int uptodate) +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { struct page *page; + struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; @@ -732,7 +732,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, readahead_cache(inode); - ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + ret = io_ctl_prepare_pages(&io_ctl, true); if (ret) goto out; @@ -1067,6 +1067,7 @@ fail: } static noinline_for_stack int write_pinned_extent_entries( + struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, int *entries) @@ -1085,7 +1086,7 @@ static noinline_for_stack int write_pinned_extent_entries( * We shouldn't have switched the pinned extents yet so this is the * right one */ - unpin = block_group->fs_info->pinned_extents; + unpin = &trans->transaction->pinned_extents; start = block_group->start; @@ -1190,7 +1191,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; if (block_group) { -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG btrfs_err(root->fs_info, "failed to write free space cache for block group %llu", block_group->start); @@ -1291,7 +1292,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* Lock all pages first so we can lock the extent safely. */ - ret = io_ctl_prepare_pages(io_ctl, inode, 0); + ret = io_ctl_prepare_pages(io_ctl, false); if (ret) goto out_unlock; @@ -1317,7 +1318,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * If this changes while we are working we'll get added back to * the dirty list and redo it. No locking needed */ - ret = write_pinned_extent_entries(block_group, io_ctl, &entries); + ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); if (ret) goto out_nospc_locked; @@ -1366,18 +1367,6 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, return 0; -out: - io_ctl->inode = NULL; - io_ctl_free(io_ctl); - if (ret) { - invalidate_inode_pages2(inode->i_mapping); - BTRFS_I(inode)->generation = 0; - } - btrfs_update_inode(trans, root, inode); - if (must_iput) - iput(inode); - return ret; - out_nospc_locked: cleanup_bitmap_list(&bitmap_list); spin_unlock(&ctl->tree_lock); @@ -1390,7 +1379,17 @@ out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); - goto out; +out: + io_ctl->inode = NULL; + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + } + btrfs_update_inode(trans, root, inode); + if (must_iput) + iput(inode); + return ret; } int btrfs_write_out_cache(struct btrfs_trans_handle *trans, @@ -1416,7 +1415,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, block_group, &block_group->io_ctl, trans); if (ret) { -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG btrfs_err(fs_info, "failed to write free space cache for block group %llu", block_group->start); @@ -4036,7 +4035,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, if (release_metadata) btrfs_delalloc_release_metadata(BTRFS_I(inode), inode->i_size, true); -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG btrfs_err(fs_info, "failed to write free ino cache for root %llu", root->root_key.objectid); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 258cb3fae17a..8b1f5c8897b7 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1251,9 +1251,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_free_tree_block(trans, free_space_root, free_space_root->node, 0, 1); - free_extent_buffer(free_space_root->node); - free_extent_buffer(free_space_root->commit_root); - kfree(free_space_root); + btrfs_put_root(free_space_root); return btrfs_commit_transaction(trans); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d5c9c69d8263..6009e0e939b5 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -515,7 +515,7 @@ out_release: trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved); + trans->bytes_reserved, NULL); out: trans->block_rsv = rsv; trans->bytes_reserved = num_bytes; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b3ec93ff911..320d1062068d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -28,6 +28,7 @@ #include <linux/magic.h> #include <linux/iversion.h> #include <linux/swap.h> +#include <linux/migrate.h> #include <linux/sched/mm.h> #include <asm/unaligned.h> #include "misc.h" @@ -242,6 +243,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); /* + * We align size to sectorsize for inline extents just for simplicity + * sake. + */ + size = ALIGN(size, root->fs_info->sectorsize); + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size); + if (ret) + goto fail; + + /* * we're an inline extent, so nobody can * extend the file past i_size without locking * a page we already have locked. @@ -2446,6 +2456,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos, + ram_bytes); + if (ret) + goto out; + /* * Release the reserved range from inode dirty range map, as it is * already moved into delayed_ref_head @@ -2536,7 +2551,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) */ btrfs_qgroup_free_data(inode, NULL, start, ordered_extent->num_bytes); - btrfs_ordered_update_i_size(inode, 0, ordered_extent); + btrfs_inode_safe_disk_i_size_write(inode, 0); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -2607,7 +2622,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - btrfs_ordered_update_i_size(inode, 0, ordered_extent); + btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); @@ -3187,6 +3202,8 @@ static int btrfs_read_locked_inode(struct inode *inode, i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, + round_up(i_size_read(inode), fs_info->sectorsize)); inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); @@ -4085,6 +4102,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; + const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); + struct extent_state *cached_state = NULL; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4101,6 +4120,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = READA_BACK; + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + /* * We want to drop from the next block forward in case this new size is * not block aligned since we will be keeping the last block of the @@ -4137,7 +4160,6 @@ search_again: goto out; } - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; @@ -4153,6 +4175,8 @@ search_again: } while (1) { + u64 clear_start = 0, clear_len = 0; + fi = NULL; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -4203,6 +4227,8 @@ search_again: if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; + + clear_start = found_key.offset; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); if (!del_item) { u64 orig_num_bytes = @@ -4210,6 +4236,7 @@ search_again: extent_num_bytes = ALIGN(new_size - found_key.offset, fs_info->sectorsize); + clear_start = ALIGN(new_size, fs_info->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - @@ -4235,6 +4262,7 @@ search_again: inode_sub_bytes(inode, num_dec); } } + clear_len = num_dec; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* * we can't truncate inline items that have had @@ -4256,12 +4284,33 @@ search_again: */ ret = NEED_TRUNCATE_BLOCK; break; + } else { + /* + * Inline extents are special, we just treat + * them as a full sector worth in the file + * extent tree just for simplicity sake. + */ + clear_len = fs_info->sectorsize; } if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); } delete: + /* + * We use btrfs_truncate_inode_items() to clean up log trees for + * multiple fsyncs, and in this case we don't want to clear the + * file extent range because it's just the log. + */ + if (root == BTRFS_I(inode)->root) { + ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + clear_start, clear_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + } + if (del_item) last_size = found_key.offset; else @@ -4289,7 +4338,6 @@ delete: root == fs_info->tree_root)) { struct btrfs_ref ref = { 0 }; - btrfs_set_path_blocking(path); bytes_deleted += extent_num_bytes; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, @@ -4364,7 +4412,9 @@ out: ASSERT(last_size >= new_size); if (!ret && last_size > new_size) last_size = new_size; - btrfs_ordered_update_i_size(inode, last_size, NULL); + btrfs_inode_safe_disk_i_size_write(inode, last_size); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, + (u64)-1, &cached_state); } btrfs_free_path(path); @@ -4570,7 +4620,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (size <= hole_start) return 0; - btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start, + btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start, block_end - 1, &cached_state); cur_offset = hole_start; while (1) { @@ -4583,14 +4633,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) } last_byte = min(extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); + hole_size = last_byte - cur_offset; + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; - hole_size = last_byte - cur_offset; err = maybe_insert_hole(root, inode, cur_offset, hole_size); if (err) break; + + err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + cur_offset, hole_size); + if (err) + break; + btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); @@ -4622,6 +4679,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_size - 1, 0); } free_extent_map(hole_em); + } else { + err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + cur_offset, hole_size); + if (err) + break; } next: free_extent_map(em); @@ -4665,24 +4727,24 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * truncation, it must capture all writes that happened before * this truncation. */ - btrfs_wait_for_snapshot_creation(root); + btrfs_drew_write_lock(&root->snapshot_lock); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); return ret; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); return PTR_ERR(trans); } i_size_write(inode, newsize); - btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { @@ -5092,7 +5154,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, btrfs_release_path(path); - new_root = btrfs_read_fs_root_no_name(fs_info, location); + new_root = btrfs_get_fs_root(fs_info, location, true); if (IS_ERR(new_root)) { err = PTR_ERR(new_root); goto out; @@ -5173,7 +5235,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) inode->i_ino = args->location->objectid; memcpy(&BTRFS_I(inode)->location, args->location, sizeof(*args->location)); - BTRFS_I(inode)->root = args->root; + BTRFS_I(inode)->root = btrfs_grab_root(args->root); + BUG_ON(args->root && !BTRFS_I(inode)->root); return 0; } @@ -5254,7 +5317,7 @@ static struct inode *new_simple_dir(struct super_block *s, if (!inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = root; + BTRFS_I(inode)->root = btrfs_grab_root(root); memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -5301,7 +5364,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *sub_root = root; struct btrfs_key location; u8 di_type = 0; - int index; int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) @@ -5328,7 +5390,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - index = srcu_read_lock(&fs_info->subvol_srcu); ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root); if (ret < 0) { @@ -5339,7 +5400,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } else { inode = btrfs_iget(dir->i_sb, &location, sub_root); } - srcu_read_unlock(&fs_info->subvol_srcu, index); + if (root != sub_root) + btrfs_put_root(sub_root); if (!IS_ERR(inode) && root != sub_root) { down_read(&fs_info->cleanup_work_sem); @@ -5820,7 +5882,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->dir_index = *index; - BTRFS_I(inode)->root = root; + BTRFS_I(inode)->root = btrfs_grab_root(root); BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -6457,6 +6519,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; + extent_end = btrfs_file_extent_end(path); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ @@ -6467,18 +6530,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, btrfs_ino(inode)); goto out; } - extent_end = extent_start + - btrfs_file_extent_num_bytes(leaf, item); - trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, extent_start); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - size_t size; - - size = btrfs_file_extent_ram_bytes(leaf, item); - extent_end = ALIGN(extent_start + size, - fs_info->sectorsize); - trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, path->slots[0], extent_start); @@ -7777,6 +7831,7 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, { struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); + u16 csum_size; blk_status_t ret; /* @@ -7796,7 +7851,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, file_offset -= dip->logical_offset; file_offset >>= inode->i_sb->s_blocksize_bits; - io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); + csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy); + io_bio->csum = orig_io_bio->csum + csum_size * file_offset; return 0; } @@ -8203,9 +8259,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent, 0); + return extent_read_full_page(page, btrfs_get_extent, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -8264,6 +8318,39 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) return __btrfs_releasepage(page, gfp_flags); } +#ifdef CONFIG_MIGRATION +static int btrfs_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + int ret; + + ret = migrate_page_move_mapping(mapping, newpage, page, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (page_has_private(page)) { + ClearPagePrivate(page); + get_page(newpage); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + SetPagePrivate(newpage); + } + + if (PagePrivate2(page)) { + ClearPagePrivate2(page); + SetPagePrivate2(newpage); + } + + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +#endif + static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { @@ -8639,7 +8726,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) break; } - btrfs_block_rsv_release(fs_info, rsv, -1); + btrfs_block_rsv_release(fs_info, rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); BUG_ON(ret); /* shouldn't happen */ @@ -8664,7 +8751,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) ret = PTR_ERR(trans); goto out; } - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); } if (trans) { @@ -8768,6 +8855,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); extent_io_tree_init(fs_info, &ei->io_failure_tree, IO_TREE_INODE_IO_FAILURE, inode); + extent_io_tree_init(fs_info, &ei->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT, inode); ei->io_tree.track_uptodate = true; ei->io_failure_tree.track_uptodate = true; atomic_set(&ei->sync_writers, 0); @@ -8834,6 +8923,8 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); + btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); + btrfs_put_root(BTRFS_I(inode)->root); } int btrfs_drop_inode(struct inode *inode) @@ -9488,6 +9579,10 @@ out_fail: ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); if (ret) commit_transaction = true; + } else if (sync_log) { + mutex_lock(&root->log_mutex); + list_del(&ctx.list); + mutex_unlock(&root->log_mutex); } if (commit_transaction) { ret = btrfs_commit_transaction(trans); @@ -9657,14 +9752,14 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); list_move_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); ret = start_delalloc_inodes(root, nr, false); - btrfs_put_fs_root(root); + btrfs_put_root(root); if (ret < 0) goto out; @@ -9818,6 +9913,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; + u64 clear_offset = start; u64 i_size; u64 cur_bytes; u64 last_alloc = (u64)-1; @@ -9852,6 +9948,15 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans); break; } + + /* + * We've reserved this space, and thus converted it from + * ->bytes_may_use to ->bytes_reserved. Any error that happens + * from here on out we will only need to clear our reservation + * for the remaining unreserved area, so advance our + * clear_offset by our extent size. + */ + clear_offset += ins.offset; btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; @@ -9916,7 +10021,7 @@ next: else i_size = cur_offset; i_size_write(inode, i_size); - btrfs_ordered_update_i_size(inode, i_size, NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); } ret = btrfs_update_inode(trans, root, inode); @@ -9931,9 +10036,9 @@ next: if (own_trans) btrfs_end_transaction(trans); } - if (cur_offset < end) - btrfs_free_reserved_data_space(inode, NULL, cur_offset, - end - cur_offset + 1); + if (clear_offset < end) + btrfs_free_reserved_data_space(inode, NULL, clear_offset, + end - clear_offset + 1); return ret; } @@ -10452,6 +10557,9 @@ static const struct address_space_operations btrfs_aops = { .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, +#ifdef CONFIG_MIGRATION + .migratepage = btrfs_migratepage, +#endif .set_page_dirty = btrfs_set_page_dirty, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4f4b13830b25..40b729dce91c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -28,6 +28,7 @@ #include <linux/iversion.h> #include "ctree.h" #include "disk-io.h" +#include "export.h" #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" @@ -86,10 +87,6 @@ struct btrfs_ioctl_send_args_32 { struct btrfs_ioctl_send_args_32) #endif -static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff, - int no_time_update); - /* Mask out flags that are inappropriate for the given type of inode. */ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, unsigned int flags) @@ -554,7 +551,6 @@ int __pure btrfs_is_empty_uuid(u8 *uuid) static noinline int create_subvol(struct inode *dir, struct dentry *dentry, const char *name, int namelen, - u64 *async_transid, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -573,7 +569,6 @@ static noinline int create_subvol(struct inode *dir, u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; - uuid_le new_uuid; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -643,8 +638,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_root_generation_v2(root_item, btrfs_root_generation(root_item)); - uuid_le_gen(&new_uuid); - memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(root_item->uuid); btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); root_item->ctime = root_item->otime; @@ -666,7 +660,7 @@ static noinline int create_subvol(struct inode *dir, goto fail; key.offset = (u64)-1; - new_root = btrfs_read_fs_root_no_name(fs_info, &key); + new_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); @@ -676,6 +670,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); + btrfs_put_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ btrfs_abort_transaction(trans, ret); @@ -727,14 +722,7 @@ fail: trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(fs_info, &block_rsv); - if (async_transid) { - *async_transid = trans->transid; - err = btrfs_commit_transaction_async(trans, 1); - if (err) - err = btrfs_commit_transaction(trans); - } else { - err = btrfs_commit_transaction(trans); - } + err = btrfs_commit_transaction(trans); if (err && !ret) ret = err; @@ -752,8 +740,7 @@ fail_free: } static int create_snapshot(struct btrfs_root *root, struct inode *dir, - struct dentry *dentry, - u64 *async_transid, bool readonly, + struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -789,11 +776,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, * possible. This is to avoid later writeback (running dealloc) to * fallback to COW mode and unexpectedly fail with ENOSPC. */ - atomic_inc(&root->will_be_snapshotted); - smp_mb__after_atomic(); - /* wait for no snapshot writes */ - wait_event(root->subv_writers->wait, - percpu_counter_sum(&root->subv_writers->counter) == 0); + btrfs_drew_read_lock(&root->snapshot_lock); ret = btrfs_start_delalloc_snapshot(root); if (ret) @@ -841,14 +824,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); spin_unlock(&fs_info->trans_lock); - if (async_transid) { - *async_transid = trans->transid; - ret = btrfs_commit_transaction_async(trans, 1); - if (ret) - ret = btrfs_commit_transaction(trans); - } else { - ret = btrfs_commit_transaction(trans); - } + + ret = btrfs_commit_transaction(trans); if (ret) goto fail; @@ -869,12 +846,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, d_instantiate(dentry, inode); ret = 0; fail: + btrfs_put_root(pending_snapshot->snap); btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); dec_and_free: if (snapshot_force_cow) atomic_dec(&root->snapshot_force_cow); - if (atomic_dec_and_test(&root->will_be_snapshotted)) - wake_up_var(&root->will_be_snapshotted); + btrfs_drew_read_unlock(&root->snapshot_lock); + free_pending: kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); @@ -953,7 +931,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) static noinline int btrfs_mksubvol(const struct path *parent, const char *name, int namelen, struct btrfs_root *snap_src, - u64 *async_transid, bool readonly, + bool readonly, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); @@ -989,13 +967,11 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) goto out_up_read; - if (snap_src) { - error = create_snapshot(snap_src, dir, dentry, - async_transid, readonly, inherit); - } else { - error = create_subvol(dir, dentry, name, namelen, - async_transid, inherit); - } + if (snap_src) + error = create_snapshot(snap_src, dir, dentry, readonly, inherit); + else + error = create_subvol(dir, dentry, name, namelen, inherit); + if (!error) fsnotify_mkdir(dir, dentry); out_up_read: @@ -1711,9 +1687,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = round_down(new_size, fs_info->sectorsize); - btrfs_info_in_rcu(fs_info, "new size for %s is %llu", - rcu_str_deref(device->name), new_size); - if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -1726,6 +1699,11 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = btrfs_shrink_device(device, new_size); } /* equal, nothing need to do */ + if (ret == 0 && new_size != old_size) + btrfs_info_in_rcu(fs_info, + "resize device %s (devid %llu) from %llu to %llu", + rcu_str_deref(device->name), device->devid, + old_size, new_size); out_free: kfree(vol_args); out: @@ -1734,9 +1712,9 @@ out: return ret; } -static noinline int btrfs_ioctl_snap_create_transid(struct file *file, +static noinline int __btrfs_ioctl_snap_create(struct file *file, const char *name, unsigned long fd, int subvol, - u64 *transid, bool readonly, + bool readonly, struct btrfs_qgroup_inherit *inherit) { int namelen; @@ -1763,7 +1741,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid, readonly, inherit); + NULL, readonly, inherit); } else { struct fd src = fdget(fd); struct inode *src_inode; @@ -1786,7 +1764,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, } else { ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, - transid, readonly, inherit); + readonly, inherit); } fdput(src); } @@ -1810,9 +1788,8 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, - NULL, false, NULL); + ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, + subvol, false, NULL); kfree(vol_args); return ret; @@ -1823,8 +1800,6 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, { struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; - u64 transid = 0; - u64 *ptr = NULL; bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; @@ -1836,22 +1811,11 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (vol_args->flags & - ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | - BTRFS_SUBVOL_QGROUP_INHERIT)) { + if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; goto free_args; } - if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - - btrfs_warn(fs_info, -"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7"); - - ptr = &transid; - } if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { @@ -1866,18 +1830,10 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, } } - ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, ptr, - readonly, inherit); + ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, + subvol, readonly, inherit); if (ret) goto free_inherit; - - if (ptr && copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), - ptr, sizeof(*ptr))) - ret = -EFAULT; - free_inherit: kfree(inherit); free_args: @@ -1936,11 +1892,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_write; } - if (flags & BTRFS_SUBVOL_CREATE_ASYNC) { - ret = -EINVAL; - goto out_drop_write; - } - if (flags & ~BTRFS_SUBVOL_RDONLY) { ret = -EOPNOTSUPP; goto out_drop_write; @@ -2174,12 +2125,12 @@ static noinline int search_ioctl(struct inode *inode, if (sk->tree_id == 0) { /* search the root of the inode that was passed */ - root = BTRFS_I(inode)->root; + root = btrfs_grab_root(BTRFS_I(inode)->root); } else { key.objectid = sk->tree_id; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(info, &key); + root = btrfs_get_fs_root(info, &key, true); if (IS_ERR(root)) { btrfs_free_path(path); return PTR_ERR(root); @@ -2208,6 +2159,7 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; err: sk->nr_items = num_found; + btrfs_put_root(root); btrfs_free_path(path); return ret; } @@ -2314,9 +2266,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.objectid = tree_id; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(info, &key); + root = btrfs_get_fs_root(info, &key, true); if (IS_ERR(root)) { ret = PTR_ERR(root); + root = NULL; goto out; } @@ -2367,6 +2320,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, name[total_len] = '\0'; ret = 0; out: + btrfs_put_root(root); btrfs_free_path(path); return ret; } @@ -2383,7 +2337,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, unsigned long item_len; struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; - struct btrfs_root *root; + struct btrfs_root *root = NULL; struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; @@ -2408,7 +2362,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, key.objectid = treeid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(fs_info, &key); + root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; @@ -2420,15 +2374,15 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - goto out; + goto out_put; } else if (ret > 0) { ret = btrfs_previous_item(root, path, dirid, BTRFS_INODE_REF_KEY); if (ret < 0) { - goto out; + goto out_put; } else if (ret > 0) { ret = -ENOENT; - goto out; + goto out_put; } } @@ -2442,7 +2396,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, total_len += len + 1; if (ptr < args->path) { ret = -ENAMETOOLONG; - goto out; + goto out_put; } *(ptr + len) = '/'; @@ -2453,10 +2407,10 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, ret = btrfs_previous_item(root, path, dirid, BTRFS_INODE_ITEM_KEY); if (ret < 0) { - goto out; + goto out_put; } else if (ret > 0) { ret = -ENOENT; - goto out; + goto out_put; } leaf = path->nodes[0]; @@ -2464,26 +2418,26 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, btrfs_item_key_to_cpu(leaf, &key2, slot); if (key2.objectid != dirid) { ret = -ENOENT; - goto out; + goto out_put; } temp_inode = btrfs_iget(sb, &key2, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); - goto out; + goto out_put; } ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { ret = -EACCES; - goto out; + goto out_put; } if (key.offset == upper_limit.objectid) break; if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { ret = -EACCES; - goto out; + goto out_put; } btrfs_release_path(path); @@ -2494,15 +2448,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, memmove(args->path, ptr, total_len); args->path[total_len] = '\0'; + btrfs_put_root(root); + root = NULL; btrfs_release_path(path); } /* Get the bottom subvolume's name from ROOT_REF */ - root = fs_info->tree_root; key.objectid = treeid; key.type = BTRFS_ROOT_REF_KEY; key.offset = args->treeid; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (ret > 0) { @@ -2529,6 +2484,8 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, read_extent_buffer(leaf, args->name, item_off, item_len); args->name[item_len] = 0; +out_put: + btrfs_put_root(root); out: btrfs_free_path(path); return ret; @@ -2653,10 +2610,10 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) key.objectid = BTRFS_I(inode)->root->root_key.objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(fs_info, &key); + root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(root)) { ret = PTR_ERR(root); - goto out; + goto out_free; } root_item = &root->root_item; @@ -2689,16 +2646,14 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) if (key.objectid != BTRFS_FS_TREE_OBJECTID) { /* Search root tree for ROOT_BACKREF of this subvolume */ - root = fs_info->tree_root; - key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(fs_info->tree_root, path); if (ret < 0) { goto out; } else if (ret > 0) { @@ -2733,6 +2688,8 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) ret = -EFAULT; out: + btrfs_put_root(root); +out_free: btrfs_free_path(path); kzfree(subvol_info); return ret; @@ -2836,7 +2793,8 @@ out: } static noinline int btrfs_ioctl_snap_destroy(struct file *file, - void __user *arg) + void __user *arg, + bool destroy_v2) { struct dentry *parent = file->f_path.dentry; struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); @@ -2845,34 +2803,120 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; - struct btrfs_ioctl_vol_args *vol_args; - int namelen; + struct btrfs_ioctl_vol_args *vol_args = NULL; + struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; + char *subvol_name, *subvol_name_ptr = NULL; + int subvol_namelen; int err = 0; + bool destroy_parent = false; - if (!S_ISDIR(dir->i_mode)) - return -ENOTDIR; + if (destroy_v2) { + vol_args2 = memdup_user(arg, sizeof(*vol_args2)); + if (IS_ERR(vol_args2)) + return PTR_ERR(vol_args2); - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { + err = -EOPNOTSUPP; + goto out; + } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - namelen = strlen(vol_args->name); - if (strchr(vol_args->name, '/') || - strncmp(vol_args->name, "..", namelen) == 0) { - err = -EINVAL; - goto out; + /* + * If SPEC_BY_ID is not set, we are looking for the subvolume by + * name, same as v1 currently does. + */ + if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { + vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; + subvol_name = vol_args2->name; + + err = mnt_want_write_file(file); + if (err) + goto out; + } else { + if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out; + } + + err = mnt_want_write_file(file); + if (err) + goto out; + + dentry = btrfs_get_dentry(fs_info->sb, + BTRFS_FIRST_FREE_OBJECTID, + vol_args2->subvolid, 0, 0); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_drop_write; + } + + /* + * Change the default parent since the subvolume being + * deleted can be outside of the current mount point. + */ + parent = btrfs_get_parent(dentry); + + /* + * At this point dentry->d_name can point to '/' if the + * subvolume we want to destroy is outsite of the + * current mount point, so we need to release the + * current dentry and execute the lookup to return a new + * one with ->d_name pointing to the + * <mount point>/subvol_name. + */ + dput(dentry); + if (IS_ERR(parent)) { + err = PTR_ERR(parent); + goto out_drop_write; + } + dir = d_inode(parent); + + /* + * If v2 was used with SPEC_BY_ID, a new parent was + * allocated since the subvolume can be outside of the + * current mount point. Later on we need to release this + * new parent dentry. + */ + destroy_parent = true; + + subvol_name_ptr = btrfs_get_subvol_name_from_objectid( + fs_info, vol_args2->subvolid); + if (IS_ERR(subvol_name_ptr)) { + err = PTR_ERR(subvol_name_ptr); + goto free_parent; + } + /* subvol_name_ptr is already NULL termined */ + subvol_name = (char *)kbasename(subvol_name_ptr); + } + } else { + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = 0; + subvol_name = vol_args->name; + + err = mnt_want_write_file(file); + if (err) + goto out; } - err = mnt_want_write_file(file); - if (err) - goto out; + subvol_namelen = strlen(subvol_name); + + if (strchr(subvol_name, '/') || + strncmp(subvol_name, "..", subvol_namelen) == 0) { + err = -EINVAL; + goto free_subvol_name; + } + if (!S_ISDIR(dir->i_mode)) { + err = -ENOTDIR; + goto free_subvol_name; + } err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (err == -EINTR) - goto out_drop_write; - dentry = lookup_one_len(vol_args->name, parent, namelen); + goto free_subvol_name; + dentry = lookup_one_len(subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock_dir; @@ -2941,9 +2985,15 @@ out_dput: dput(dentry); out_unlock_dir: inode_unlock(dir); +free_subvol_name: + kfree(subvol_name_ptr); +free_parent: + if (destroy_parent) + dput(parent); out_drop_write: mnt_drop_write_file(file); out: + kfree(vol_args2); kfree(vol_args); return err; } @@ -3069,8 +3119,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; } - /* Check for compatibility reject unknown flags */ - if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) { + if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { ret = -EOPNOTSUPP; goto out; } @@ -3220,733 +3269,6 @@ out: return ret; } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - if (inode1 < inode2) { - swap(inode1, inode2); - swap(loff1, loff2); - } else if (inode1 == inode2 && loff2 < loff1) { - swap(loff1, loff2); - } - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, - struct inode *dst, u64 dst_loff) -{ - const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - int ret; - - /* - * Lock destination range to serialize with concurrent readpages() and - * source range to serialize with relocation. - */ - btrfs_double_extent_lock(src, loff, dst, dst_loff, len); - ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); - - return ret; -} - -#define BTRFS_MAX_DEDUPE_LEN SZ_16M - -static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, - struct inode *dst, u64 dst_loff) -{ - int ret; - u64 i, tail_len, chunk_count; - struct btrfs_root *root_dst = BTRFS_I(dst)->root; - - spin_lock(&root_dst->root_item_lock); - if (root_dst->send_in_progress) { - btrfs_warn_rl(root_dst->fs_info, -"cannot deduplicate to root %llu while send operations are using it (%d in progress)", - root_dst->root_key.objectid, - root_dst->send_in_progress); - spin_unlock(&root_dst->root_item_lock); - return -EAGAIN; - } - root_dst->dedupe_in_progress++; - spin_unlock(&root_dst->root_item_lock); - - tail_len = olen % BTRFS_MAX_DEDUPE_LEN; - chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); - - for (i = 0; i < chunk_count; i++) { - ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff); - if (ret) - goto out; - - loff += BTRFS_MAX_DEDUPE_LEN; - dst_loff += BTRFS_MAX_DEDUPE_LEN; - } - - if (tail_len > 0) - ret = btrfs_extent_same_range(src, loff, tail_len, dst, - dst_loff); -out: - spin_lock(&root_dst->root_item_lock); - root_dst->dedupe_in_progress--; - spin_unlock(&root_dst->root_item_lock); - - return ret; -} - -static int clone_finish_inode_update(struct btrfs_trans_handle *trans, - struct inode *inode, - u64 endoff, - const u64 destoff, - const u64 olen, - int no_time_update) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - - inode_inc_iversion(inode); - if (!no_time_update) - inode->i_mtime = inode->i_ctime = current_time(inode); - /* - * We round up to the block size at eof when determining which - * extents to clone above, but shouldn't round up the file size. - */ - if (endoff > destoff + olen) - endoff = destoff + olen; - if (endoff > inode->i_size) - btrfs_i_size_write(BTRFS_I(inode), endoff); - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - goto out; - } - ret = btrfs_end_transaction(trans); -out: - return ret; -} - -/* - * Make sure we do not end up inserting an inline extent into a file that has - * already other (non-inline) extents. If a file has an inline extent it can - * not have any other extents and the (single) inline extent must start at the - * file offset 0. Failing to respect these rules will lead to file corruption, - * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc - * - * We can have extents that have been already written to disk or we can have - * dirty ranges still in delalloc, in which case the extent maps and items are - * created only when we run delalloc, and the delalloc ranges might fall outside - * the range we are currently locking in the inode's io tree. So we check the - * inode's i_size because of that (i_size updates are done while holding the - * i_mutex, which we are holding here). - * We also check to see if the inode has a size not greater than "datal" but has - * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are - * protected against such concurrent fallocate calls by the i_mutex). - * - * If the file has no extents but a size greater than datal, do not allow the - * copy because we would need turn the inline extent into a non-inline one (even - * with NO_HOLES enabled). If we find our destination inode only has one inline - * extent, just overwrite it with the source inline extent if its size is less - * than the source extent's size, or we could copy the source inline extent's - * data into the destination inode's inline extent if the later is greater then - * the former. - */ -static int clone_copy_inline_extent(struct inode *dst, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_key *new_key, - const u64 drop_start, - const u64 datal, - const u64 skip, - const u64 size, - char *inline_data) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); - struct btrfs_root *root = BTRFS_I(dst)->root; - const u64 aligned_end = ALIGN(new_key->offset + datal, - fs_info->sectorsize); - int ret; - struct btrfs_key key; - - if (new_key->offset > 0) - return -EOPNOTSUPP; - - key.objectid = btrfs_ino(BTRFS_I(dst)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - return ret; - } else if (ret > 0) { - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ret; - else if (ret > 0) - goto copy_inline_extent; - } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && - key.type == BTRFS_EXTENT_DATA_KEY) { - ASSERT(key.offset > 0); - return -EOPNOTSUPP; - } - } else if (i_size_read(dst) <= datal) { - struct btrfs_file_extent_item *ei; - u64 ext_len; - - /* - * If the file size is <= datal, make sure there are no other - * extents following (can happen do to an fallocate call with - * the flag FALLOC_FL_KEEP_SIZE). - */ - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - /* - * If it's an inline extent, it can not have other extents - * following it. - */ - if (btrfs_file_extent_type(path->nodes[0], ei) == - BTRFS_FILE_EXTENT_INLINE) - goto copy_inline_extent; - - ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); - if (ext_len > aligned_end) - return -EOPNOTSUPP; - - ret = btrfs_next_item(root, path); - if (ret < 0) { - return ret; - } else if (ret == 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && - key.type == BTRFS_EXTENT_DATA_KEY) - return -EOPNOTSUPP; - } - } - -copy_inline_extent: - /* - * We have no extent items, or we have an extent at offset 0 which may - * or may not be inlined. All these cases are dealt the same way. - */ - if (i_size_read(dst) > datal) { - /* - * If the destination inode has an inline extent... - * This would require copying the data from the source inline - * extent into the beginning of the destination's inline extent. - * But this is really complex, both extents can be compressed - * or just one of them, which would require decompressing and - * re-compressing data (which could increase the new compressed - * size, not allowing the compressed data to fit anymore in an - * inline extent). - * So just don't support this case for now (it should be rare, - * we are not really saving space when cloning inline extents). - */ - return -EOPNOTSUPP; - } - - btrfs_release_path(path); - ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); - if (ret) - return ret; - ret = btrfs_insert_empty_item(trans, root, path, new_key, size); - if (ret) - return ret; - - if (skip) { - const u32 start = btrfs_file_extent_calc_inline_size(0); - - memmove(inline_data + start, inline_data + start + skip, datal); - } - - write_extent_buffer(path->nodes[0], inline_data, - btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]), - size); - inode_add_bytes(dst, datal); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); - - return 0; -} - -/** - * btrfs_clone() - clone a range from inode file to another - * - * @src: Inode to clone from - * @inode: Inode to clone to - * @off: Offset within source to start clone from - * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen - * @destoff: Offset within @inode to start clone - * @no_time_update: Whether to update mtime/ctime on the target inode - */ -static int btrfs_clone(struct inode *src, struct inode *inode, - const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff, int no_time_update) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path = NULL; - struct extent_buffer *leaf; - struct btrfs_trans_handle *trans; - char *buf = NULL; - struct btrfs_key key; - u32 nritems; - int slot; - int ret; - const u64 len = olen_aligned; - u64 last_dest_end = destoff; - - ret = -ENOMEM; - buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); - if (!buf) - return ret; - - path = btrfs_alloc_path(); - if (!path) { - kvfree(buf); - return ret; - } - - path->reada = READA_FORWARD; - /* clone data */ - key.objectid = btrfs_ino(BTRFS_I(src)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = off; - - while (1) { - u64 next_key_min_offset = key.offset + 1; - struct btrfs_file_extent_item *extent; - int type; - u32 size; - struct btrfs_key new_key; - u64 disko = 0, diskl = 0; - u64 datao = 0, datal = 0; - u8 comp; - u64 drop_start; - - /* - * note the key will change type as we walk through the - * tree. - */ - path->leave_spinning = 1; - ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, - 0, 0); - if (ret < 0) - goto out; - /* - * First search, if no extent item that starts at offset off was - * found but the previous item is an extent item, it's possible - * it might overlap our target range, therefore process it. - */ - if (key.offset == off && ret > 0 && path->slots[0] > 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0] - 1); - if (key.type == BTRFS_EXTENT_DATA_KEY) - path->slots[0]--; - } - - nritems = btrfs_header_nritems(path->nodes[0]); -process_slot: - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(BTRFS_I(src)->root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - nritems = btrfs_header_nritems(path->nodes[0]); - } - leaf = path->nodes[0]; - slot = path->slots[0]; - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.type > BTRFS_EXTENT_DATA_KEY || - key.objectid != btrfs_ino(BTRFS_I(src))) - break; - - ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - comp = btrfs_file_extent_compression(leaf, extent); - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - disko = btrfs_file_extent_disk_bytenr(leaf, extent); - diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); - datao = btrfs_file_extent_offset(leaf, extent); - datal = btrfs_file_extent_num_bytes(leaf, extent); - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - /* Take upper bound, may be compressed */ - datal = btrfs_file_extent_ram_bytes(leaf, extent); - } - - /* - * The first search might have left us at an extent item that - * ends before our target range's start, can happen if we have - * holes and NO_HOLES feature enabled. - */ - if (key.offset + datal <= off) { - path->slots[0]++; - goto process_slot; - } else if (key.offset >= off + len) { - break; - } - next_key_min_offset = key.offset + datal; - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), - size); - - btrfs_release_path(path); - path->leave_spinning = 0; - - memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = btrfs_ino(BTRFS_I(inode)); - if (off <= key.offset) - new_key.offset = key.offset + destoff - off; - else - new_key.offset = destoff; - - /* - * Deal with a hole that doesn't have an extent item that - * represents it (NO_HOLES feature enabled). - * This hole is either in the middle of the cloning range or at - * the beginning (fully overlaps it or partially overlaps it). - */ - if (new_key.offset != last_dest_end) - drop_start = last_dest_end; - else - drop_start = new_key.offset; - - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - struct btrfs_clone_extent_info clone_info; - - /* - * a | --- range to clone ---| b - * | ------------- extent ------------- | - */ - - /* Subtract range b */ - if (key.offset + datal > off + len) - datal = off + len - key.offset; - - /* Subtract range a */ - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - clone_info.disk_offset = disko; - clone_info.disk_len = diskl; - clone_info.data_offset = datao; - clone_info.data_len = datal; - clone_info.file_offset = new_key.offset; - clone_info.extent_buf = buf; - clone_info.item_size = size; - ret = btrfs_punch_hole_range(inode, path, - drop_start, - new_key.offset + datal - 1, - &clone_info, &trans); - if (ret) - goto out; - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 skip = 0; - u64 trim = 0; - - if (off > key.offset) { - skip = off - key.offset; - new_key.offset += skip; - } - - if (key.offset + datal > off + len) - trim = key.offset + datal - (off + len); - - if (comp && (skip || trim)) { - ret = -EINVAL; - goto out; - } - size -= skip + trim; - datal -= skip + trim; - - /* - * If our extent is inline, we know we will drop or - * adjust at most 1 extent item in the destination root. - * - * 1 - adjusting old extent (we may have to split it) - * 1 - add new extent - * 1 - inode update - */ - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - - ret = clone_copy_inline_extent(inode, trans, path, - &new_key, drop_start, - datal, skip, size, buf); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - goto out; - } - } - - btrfs_release_path(path); - - last_dest_end = ALIGN(new_key.offset + datal, - fs_info->sectorsize); - ret = clone_finish_inode_update(trans, inode, last_dest_end, - destoff, olen, no_time_update); - if (ret) - goto out; - if (new_key.offset + datal >= destoff + len) - break; - - btrfs_release_path(path); - key.offset = next_key_min_offset; - - if (fatal_signal_pending(current)) { - ret = -EINTR; - goto out; - } - } - ret = 0; - - if (last_dest_end < destoff + len) { - /* - * We have an implicit hole that fully or partially overlaps our - * cloning range at its end. This means that we either have the - * NO_HOLES feature enabled or the implicit hole happened due to - * mixing buffered and direct IO writes against this file. - */ - btrfs_release_path(path); - path->leave_spinning = 0; - - ret = btrfs_punch_hole_range(inode, path, - last_dest_end, destoff + len - 1, - NULL, &trans); - if (ret) - goto out; - - ret = clone_finish_inode_update(trans, inode, destoff + len, - destoff, olen, no_time_update); - } - -out: - btrfs_free_path(path); - kvfree(buf); - return ret; -} - -static noinline int btrfs_clone_files(struct file *file, struct file *file_src, - u64 off, u64 olen, u64 destoff) -{ - struct inode *inode = file_inode(file); - struct inode *src = file_inode(file_src); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; - u64 len = olen; - u64 bs = fs_info->sb->s_blocksize; - - /* - * TODO: - * - split compressed inline extents. annoying: we need to - * decompress into destination's address_space (the file offset - * may change, so source mapping won't do), then recompress (or - * otherwise reinsert) a subrange. - * - * - split destination inode's inline extents. The inline extents can - * be either compressed or non-compressed. - */ - - /* - * VFS's generic_remap_file_range_prep() protects us from cloning the - * eof block into the middle of a file, which would result in corruption - * if the file size is not blocksize aligned. So we don't need to check - * for that case here. - */ - if (off + len == src->i_size) - len = ALIGN(src->i_size, bs) - off; - - if (destoff > inode->i_size) { - const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); - - ret = btrfs_cont_expand(inode, inode->i_size, destoff); - if (ret) - return ret; - /* - * We may have truncated the last block if the inode's size is - * not sector size aligned, so we need to wait for writeback to - * complete before proceeding further, otherwise we can race - * with cloning and attempt to increment a reference to an - * extent that no longer exists (writeback completed right after - * we found the previous extent covering eof and before we - * attempted to increment its reference count). - */ - ret = btrfs_wait_ordered_range(inode, wb_start, - destoff - wb_start); - if (ret) - return ret; - } - - /* - * Lock destination range to serialize with concurrent readpages() and - * source range to serialize with relocation. - */ - btrfs_double_extent_lock(src, off, inode, destoff, len); - ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - btrfs_double_extent_unlock(src, off, inode, destoff, len); - /* - * Truncate page cache pages so that future reads will see the cloned - * data immediately and not the previous data. - */ - truncate_inode_pages_range(&inode->i_data, - round_down(destoff, PAGE_SIZE), - round_up(destoff + len, PAGE_SIZE) - 1); - - return ret; -} - -static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; - bool same_inode = inode_out == inode_in; - u64 wb_len; - int ret; - - if (!(remap_flags & REMAP_FILE_DEDUP)) { - struct btrfs_root *root_out = BTRFS_I(inode_out)->root; - - if (btrfs_root_readonly(root_out)) - return -EROFS; - - if (file_in->f_path.mnt != file_out->f_path.mnt || - inode_in->i_sb != inode_out->i_sb) - return -EXDEV; - } - - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { - return -EINVAL; - } - - /* - * Now that the inodes are locked, we need to start writeback ourselves - * and can not rely on the writeback from the VFS's generic helper - * generic_remap_file_range_prep() because: - * - * 1) For compression we must call filemap_fdatawrite_range() range - * twice (btrfs_fdatawrite_range() does it for us), and the generic - * helper only calls it once; - * - * 2) filemap_fdatawrite_range(), called by the generic helper only - * waits for the writeback to complete, i.e. for IO to be done, and - * not for the ordered extents to complete. We need to wait for them - * to complete so that new file extent items are in the fs tree. - */ - if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) - wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); - else - wb_len = ALIGN(*len, bs); - - /* - * Since we don't lock ranges, wait for ongoing lockless dio writes (as - * any in progress could create its ordered extents after we wait for - * existing ordered extents below). - */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - /* - * Workaround to make sure NOCOW buffered write reach disk as NOCOW. - * - * Btrfs' back references do not have a block level granularity, they - * work at the whole extent level. - * NOCOW buffered write without data space reserved may not be able - * to fall back to CoW due to lack of data space, thus could cause - * data loss. - * - * Here we take a shortcut by flushing the whole inode, so that all - * nocow write should reach disk as nocow before we increase the - * reference of the extent. We could do better by only flushing NOCOW - * data, but that needs extra accounting. - * - * Also we don't need to check ASYNC_EXTENT, as async extent will be - * CoWed anyway, not affecting nocow part. - */ - ret = filemap_flush(inode_in->i_mapping); - if (ret < 0) - return ret; - - ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), - wb_len); - if (ret < 0) - return ret; - ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), - wb_len); - if (ret < 0) - return ret; - - return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - len, remap_flags); -} - -loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, loff_t len, - unsigned int remap_flags) -{ - struct inode *src_inode = file_inode(src_file); - struct inode *dst_inode = file_inode(dst_file); - bool same_inode = dst_inode == src_inode; - int ret; - - if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) - return -EINVAL; - - if (same_inode) - inode_lock(src_inode); - else - lock_two_nondirectories(src_inode, dst_inode); - - ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, - &len, remap_flags); - if (ret < 0 || len == 0) - goto out_unlock; - - if (remap_flags & REMAP_FILE_DEDUP) - ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); - else - ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); - -out_unlock: - if (same_inode) - inode_unlock(src_inode); - else - unlock_two_nondirectories(src_inode, dst_inode); - - return ret < 0 ? ret : len; -} - static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); @@ -3955,7 +3277,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_trans_handle *trans; - struct btrfs_path *path; + struct btrfs_path *path = NULL; struct btrfs_key location; struct btrfs_disk_key disk_key; u64 objectid = 0; @@ -3981,49 +3303,51 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) location.type = BTRFS_ROOT_ITEM_KEY; location.offset = (u64)-1; - new_root = btrfs_read_fs_root_no_name(fs_info, &location); + new_root = btrfs_get_fs_root(fs_info, &location, true); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); goto out; } if (!is_fstree(new_root->root_key.objectid)) { ret = -ENOENT; - goto out; + goto out_free; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto out; + goto out_free; } path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_free_path(path); ret = PTR_ERR(trans); - goto out; + goto out_free; } dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { - btrfs_free_path(path); + btrfs_release_path(path); btrfs_end_transaction(trans); btrfs_err(fs_info, "Umm, you don't have the default diritem, this isn't going to work"); ret = -ENOENT; - goto out; + goto out_free; } btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); + btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans); +out_free: + btrfs_put_root(new_root); + btrfs_free_path(path); out: mnt_drop_write_file(file); return ret; @@ -5465,7 +4789,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SUBVOL_CREATE_V2: return btrfs_ioctl_snap_create_v2(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: - return btrfs_ioctl_snap_destroy(file, argp); + return btrfs_ioctl_snap_destroy(file, argp, false); + case BTRFS_IOC_SNAP_DESTROY_V2: + return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: return btrfs_ioctl_subvol_getflags(file, argp); case BTRFS_IOC_SUBVOL_SETFLAGS: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 571c4826c428..fb647d8cf527 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -523,3 +523,138 @@ void btrfs_unlock_up_safe(struct btrfs_path *path, int level) path->locks[i] = 0; } } + +/* + * Loop around taking references on and locking the root node of the tree until + * we end up with a lock on the root node. + * + * Return: root extent buffer with write lock held + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + if (eb == root->node) + break; + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* + * Loop around taking references on and locking the root node of the tree until + * we end up with a lock on the root node. + * + * Return: root extent buffer with read lock held + */ +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_read_lock(eb); + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* + * DREW locks + * ========== + * + * DREW stands for double-reader-writer-exclusion lock. It's used in situation + * where you want to provide A-B exclusion but not AA or BB. + * + * Currently implementation gives more priority to reader. If a reader and a + * writer both race to acquire their respective sides of the lock the writer + * would yield its lock as soon as it detects a concurrent reader. Additionally + * if there are pending readers no new writers would be allowed to come in and + * acquire the lock. + */ + +int btrfs_drew_lock_init(struct btrfs_drew_lock *lock) +{ + int ret; + + ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL); + if (ret) + return ret; + + atomic_set(&lock->readers, 0); + init_waitqueue_head(&lock->pending_readers); + init_waitqueue_head(&lock->pending_writers); + + return 0; +} + +void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock) +{ + percpu_counter_destroy(&lock->writers); +} + +/* Return true if acquisition is successful, false otherwise */ +bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) +{ + if (atomic_read(&lock->readers)) + return false; + + percpu_counter_inc(&lock->writers); + + /* Ensure writers count is updated before we check for pending readers */ + smp_mb(); + if (atomic_read(&lock->readers)) { + btrfs_drew_write_unlock(lock); + return false; + } + + return true; +} + +void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) +{ + while (true) { + if (btrfs_drew_try_write_lock(lock)) + return; + wait_event(lock->pending_writers, !atomic_read(&lock->readers)); + } +} + +void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) +{ + percpu_counter_dec(&lock->writers); + cond_wake_up(&lock->pending_readers); +} + +void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) +{ + atomic_inc(&lock->readers); + + /* + * Ensure the pending reader count is perceieved BEFORE this reader + * goes to sleep in case of active writers. This guarantees new writers + * won't be allowed and that the current reader will be woken up when + * the last active writer finishes its jobs. + */ + smp_mb__after_atomic(); + + wait_event(lock->pending_readers, + percpu_counter_sum(&lock->writers) == 0); +} + +void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) +{ + /* + * atomic_dec_and_test implies a full barrier, so woken up writers + * are guaranteed to see the decrement + */ + if (atomic_dec_and_test(&lock->readers)) + wake_up(&lock->pending_writers); +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 21a285883e89..d715846c10b8 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -6,6 +6,9 @@ #ifndef BTRFS_LOCKING_H #define BTRFS_LOCKING_H +#include <linux/atomic.h> +#include <linux/wait.h> +#include <linux/percpu_counter.h> #include "extent_io.h" #define BTRFS_WRITE_LOCK 1 @@ -13,6 +16,8 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 +struct btrfs_path; + void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); @@ -48,4 +53,19 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) BUG(); } +struct btrfs_drew_lock { + atomic_t readers; + struct percpu_counter writers; + wait_queue_head_t pending_writers; + wait_queue_head_t pending_readers; +}; + +int btrfs_drew_lock_init(struct btrfs_drew_lock *lock); +void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock); +void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); +bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); +void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); +void btrfs_drew_read_lock(struct btrfs_drew_lock *lock); +void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock); + #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ecb9fb6a6fe0..e13b3d28c063 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -580,7 +580,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, ordered_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); list_move_tail(&root->ordered_root, &fs_info->ordered_roots); @@ -588,7 +588,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, done = btrfs_wait_ordered_extents(root, nr, range_start, range_len); - btrfs_put_fs_root(root); + btrfs_put_root(root); spin_lock(&fs_info->ordered_root_lock); if (nr != U64_MAX) { @@ -679,10 +679,15 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) } btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; + /* + * If the ordered extent had an error save the error but don't + * exit without waiting first for all other ordered extents in + * the range to complete. + */ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) ret = -EIO; btrfs_put_ordered_extent(ordered); - if (ret || end == 0 || end == start) + if (end == 0 || end == start) break; end--; } @@ -781,134 +786,6 @@ out: } /* - * After an extent is done, call this to conditionally update the on disk - * i_size. i_size is updated to cover any fully written part of the file. - */ -int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, - struct btrfs_ordered_extent *ordered) -{ - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - u64 disk_i_size; - u64 new_i_size; - u64 i_size = i_size_read(inode); - struct rb_node *node; - struct rb_node *prev = NULL; - struct btrfs_ordered_extent *test; - int ret = 1; - u64 orig_offset = offset; - - spin_lock_irq(&tree->lock); - if (ordered) { - offset = entry_end(ordered); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) - offset = min(offset, - ordered->file_offset + - ordered->truncated_len); - } else { - offset = ALIGN(offset, btrfs_inode_sectorsize(inode)); - } - disk_i_size = BTRFS_I(inode)->disk_i_size; - - /* - * truncate file. - * If ordered is not NULL, then this is called from endio and - * disk_i_size will be updated by either truncate itself or any - * in-flight IOs which are inside the disk_i_size. - * - * Because btrfs_setsize() may set i_size with disk_i_size if truncate - * fails somehow, we need to make sure we have a precise disk_i_size by - * updating it as usual. - * - */ - if (!ordered && disk_i_size > i_size) { - BTRFS_I(inode)->disk_i_size = orig_offset; - ret = 0; - goto out; - } - - /* - * if the disk i_size is already at the inode->i_size, or - * this ordered extent is inside the disk i_size, we're done - */ - if (disk_i_size == i_size) - goto out; - - /* - * We still need to update disk_i_size if outstanding_isize is greater - * than disk_i_size. - */ - if (offset <= disk_i_size && - (!ordered || ordered->outstanding_isize <= disk_i_size)) - goto out; - - /* - * walk backward from this ordered extent to disk_i_size. - * if we find an ordered extent then we can't update disk i_size - * yet - */ - if (ordered) { - node = rb_prev(&ordered->rb_node); - } else { - prev = tree_search(tree, offset); - /* - * we insert file extents without involving ordered struct, - * so there should be no ordered struct cover this offset - */ - if (prev) { - test = rb_entry(prev, struct btrfs_ordered_extent, - rb_node); - BUG_ON(offset_in_entry(test, offset)); - } - node = prev; - } - for (; node; node = rb_prev(node)) { - test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - - /* We treat this entry as if it doesn't exist */ - if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) - continue; - - if (entry_end(test) <= disk_i_size) - break; - if (test->file_offset >= i_size) - break; - - /* - * We don't update disk_i_size now, so record this undealt - * i_size. Or we will not know the real i_size. - */ - if (test->outstanding_isize < offset) - test->outstanding_isize = offset; - if (ordered && - ordered->outstanding_isize > test->outstanding_isize) - test->outstanding_isize = ordered->outstanding_isize; - goto out; - } - new_i_size = min_t(u64, offset, i_size); - - /* - * Some ordered extents may completed before the current one, and - * we hold the real i_size in ->outstanding_isize. - */ - if (ordered && ordered->outstanding_isize > new_i_size) - new_i_size = min_t(u64, ordered->outstanding_isize, i_size); - BTRFS_I(inode)->disk_i_size = new_i_size; - ret = 0; -out: - /* - * We need to do this because we can't remove ordered extents until - * after the i_disk_size has been updated and then the inode has been - * updated to reflect the change, so we need to tell anybody who finds - * this ordered extent that we've already done all the real work, we - * just haven't completed all the other work. - */ - if (ordered) - set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); - spin_unlock_irq(&tree->lock); - return ret; -} - -/* * search the ordered extents for one corresponding to 'offset' and * try to find a checksum. This is used because we allow pages to * be reclaimed before their checksum is actually put into the btree @@ -958,7 +835,6 @@ out: * btrfs_flush_ordered_range - Lock the passed range and ensures all pending * ordered extents in it are run to completion. * - * @tree: IO tree used for locking out other users of the range * @inode: Inode whose ordered tree is to be searched * @start: Beginning of range to flush * @end: Last byte of range to lock @@ -968,8 +844,7 @@ out: * This function always returns with the given range locked, ensuring after it's * called no order extent can be pending. */ -void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, - struct btrfs_inode *inode, u64 start, +void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state) { @@ -981,7 +856,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, cachedp = cached_state; while (1) { - lock_extent_bits(tree, start, end, cachedp); + lock_extent_bits(&inode->io_tree, start, end, cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); if (!ordered) { @@ -994,7 +869,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, refcount_dec(&cache->refs); break; } - unlock_extent_cached(tree, start, end, cachedp); + unlock_extent_cached(&inode->io_tree, start, end, cachedp); btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); btrfs_put_ordered_extent(ordered); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 3beb4da4ab41..c01c9698250b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -52,11 +52,6 @@ enum { BTRFS_ORDERED_DIRECT, /* We had an io error when writing this out */ BTRFS_ORDERED_IOERR, - /* - * indicates whether this ordered extent has done its due diligence in - * updating the isize - */ - BTRFS_ORDERED_UPDATED_ISIZE, /* Set when we have to truncate an extent */ BTRFS_ORDERED_TRUNCATED, /* Regular IO for COW */ @@ -182,16 +177,13 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len); -int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, - struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); -void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, - struct btrfs_inode *inode, u64 start, +void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); int __init ordered_data_init(void); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index deb59e7cfcac..ff1ff90e48b1 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -383,7 +383,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, if (need_reserve) { btrfs_block_rsv_release(fs_info, trans->block_rsv, - num_bytes); + num_bytes, NULL); if (ret) return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 98d9a50352d6..c3888fb367e7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1030,6 +1030,7 @@ out_add_root: ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); + fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); } @@ -1037,11 +1038,8 @@ out_add_root: out_free_path: btrfs_free_path(path); out_free_root: - if (ret) { - free_extent_buffer(quota_root->node); - free_extent_buffer(quota_root->commit_root); - kfree(quota_root); - } + if (ret) + btrfs_put_root(quota_root); out: if (ret) { ulist_free(fs_info->qgroup_ulist); @@ -1104,9 +1102,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); - free_extent_buffer(quota_root->node); - free_extent_buffer(quota_root->commit_root); - kfree(quota_root); + btrfs_put_root(quota_root); end_trans: ret = btrfs_end_transaction(trans); @@ -3237,7 +3233,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, } mutex_lock(&fs_info->qgroup_rescan_lock); - spin_lock(&fs_info->qgroup_lock); if (init_flags) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { @@ -3252,7 +3247,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, } if (ret) { - spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); return ret; } @@ -3263,9 +3257,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, sizeof(fs_info->qgroup_rescan_progress)); fs_info->qgroup_rescan_progress.objectid = progress_objectid; init_completion(&fs_info->qgroup_rescan_completion); - fs_info->qgroup_rescan_running = true; - - spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); btrfs_init_work(&fs_info->qgroup_rescan_work, @@ -3326,8 +3317,11 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup_rescan_zero_tracking(fs_info); + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); + mutex_unlock(&fs_info->qgroup_rescan_lock); return 0; } @@ -3339,9 +3333,7 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, int ret = 0; mutex_lock(&fs_info->qgroup_rescan_lock); - spin_lock(&fs_info->qgroup_lock); running = fs_info->qgroup_rescan_running; - spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); if (!running) @@ -3363,9 +3355,13 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) { - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); + mutex_unlock(&fs_info->qgroup_rescan_lock); + } } /* @@ -4002,3 +3998,16 @@ out: } return ret; } + +void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) +{ + struct btrfs_qgroup_extent_record *entry; + struct btrfs_qgroup_extent_record *next; + struct rb_root *root; + + root = &trans->delayed_refs.dirty_extent_root; + rbtree_postorder_for_each_entry_safe(entry, next, root, node) { + ulist_free(entry->old_roots); + kfree(entry); + } +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 236f12224d52..1bc654459469 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -414,5 +414,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, u64 last_snapshot); int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); +void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); #endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a8e53c8e7b01..c870ef70f817 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -206,7 +206,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) struct btrfs_stripe_hash *h; int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; int i; - int table_size; if (info->stripe_hash_table) return 0; @@ -218,8 +217,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) * Try harder to allocate and fallback to vmalloc to lower the chance * of a failing mount. */ - table_size = sizeof(*table) + sizeof(*h) * num_entries; - table = kvzalloc(table_size, GFP_KERNEL); + table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); if (!table) return -ENOMEM; @@ -1196,22 +1194,19 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) int nr_data = rbio->nr_data; int stripe; int pagenr; - int p_stripe = -1; - int q_stripe = -1; + bool has_qstripe; struct bio_list bio_list; struct bio *bio; int ret; bio_list_init(&bio_list); - if (rbio->real_stripes - rbio->nr_data == 1) { - p_stripe = rbio->real_stripes - 1; - } else if (rbio->real_stripes - rbio->nr_data == 2) { - p_stripe = rbio->real_stripes - 2; - q_stripe = rbio->real_stripes - 1; - } else { + if (rbio->real_stripes - rbio->nr_data == 1) + has_qstripe = false; + else if (rbio->real_stripes - rbio->nr_data == 2) + has_qstripe = true; + else BUG(); - } /* at this point we either have a full stripe, * or we've read the full stripe from the drive. @@ -1255,7 +1250,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) SetPageUptodate(p); pointers[stripe++] = kmap(p); - if (q_stripe != -1) { + if (has_qstripe) { /* * raid6, add the qstripe and call the @@ -2353,8 +2348,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int nr_data = rbio->nr_data; int stripe; int pagenr; - int p_stripe = -1; - int q_stripe = -1; + bool has_qstripe; struct page *p_page = NULL; struct page *q_page = NULL; struct bio_list bio_list; @@ -2364,14 +2358,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, bio_list_init(&bio_list); - if (rbio->real_stripes - rbio->nr_data == 1) { - p_stripe = rbio->real_stripes - 1; - } else if (rbio->real_stripes - rbio->nr_data == 2) { - p_stripe = rbio->real_stripes - 2; - q_stripe = rbio->real_stripes - 1; - } else { + if (rbio->real_stripes - rbio->nr_data == 1) + has_qstripe = false; + else if (rbio->real_stripes - rbio->nr_data == 2) + has_qstripe = true; + else BUG(); - } if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { is_replace = 1; @@ -2393,7 +2385,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, goto cleanup; SetPageUptodate(p_page); - if (q_stripe != -1) { + if (has_qstripe) { q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!q_page) { __free_page(p_page); @@ -2416,8 +2408,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* then add the parity stripe */ pointers[stripe++] = kmap(p_page); - if (q_stripe != -1) { - + if (has_qstripe) { /* * raid6, add the qstripe and call the * library function to fill in our p/q diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index a97dc74a4d3d..5c1a617eb25d 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -8,7 +8,7 @@ struct rcu_string { struct rcu_head rcu; - char str[0]; + char str[]; }; static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index b57f3618e58e..7887317033c9 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -744,6 +744,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, */ be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); if (IS_ERR(be)) { + kfree(ref); kfree(ra); ret = PTR_ERR(be); goto out; @@ -757,6 +758,8 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "re-allocated a block that still has references to it!"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); goto out_unlock; } @@ -800,6 +803,15 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, kfree(ref); kfree(ra); goto out_unlock; + } else if (be->num_refs == 0) { + btrfs_err(fs_info, + "trying to do action %d for a bytenr that has 0 total references", + action); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + goto out_unlock; } if (!parent) { @@ -819,6 +831,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a existing root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } @@ -834,6 +847,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "attempting to add another ref for an existing ref on a tree block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c new file mode 100644 index 000000000000..d1973141d3bb --- /dev/null +++ b/fs/btrfs/reflink.c @@ -0,0 +1,804 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/blkdev.h> +#include <linux/iversion.h> +#include "compression.h" +#include "ctree.h" +#include "delalloc-space.h" +#include "reflink.h" +#include "transaction.h" + +#define BTRFS_MAX_DEDUPE_LEN SZ_16M + +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, + struct inode *inode, + u64 endoff, + const u64 destoff, + const u64 olen, + int no_time_update) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + inode_inc_iversion(inode); + if (!no_time_update) + inode->i_mtime = inode->i_ctime = current_time(inode); + /* + * We round up to the block size at eof when determining which + * extents to clone above, but shouldn't round up the file size. + */ + if (endoff > destoff + olen) + endoff = destoff + olen; + if (endoff > inode->i_size) { + i_size_write(inode, endoff); + btrfs_inode_safe_disk_i_size_write(inode, 0); + } + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } + ret = btrfs_end_transaction(trans); +out: + return ret; +} + +static int copy_inline_to_page(struct inode *inode, + const u64 file_offset, + char *inline_data, + const u64 size, + const u64 datal, + const u8 comp_type) +{ + const u64 block_size = btrfs_inode_sectorsize(inode); + const u64 range_end = file_offset + block_size - 1; + const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); + char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); + struct extent_changeset *data_reserved = NULL; + struct page *page = NULL; + int ret; + + ASSERT(IS_ALIGNED(file_offset, block_size)); + + /* + * We have flushed and locked the ranges of the source and destination + * inodes, we also have locked the inodes, so we are safe to do a + * reservation here. Also we must not do the reservation while holding + * a transaction open, otherwise we would deadlock. + */ + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, + block_size); + if (ret) + goto out; + + page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT, + btrfs_alloc_write_mask(inode->i_mapping)); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + + set_page_extent_mapped(page); + clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, NULL); + ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); + if (ret) + goto out_unlock; + + if (comp_type == BTRFS_COMPRESS_NONE) { + char *map; + + map = kmap(page); + memcpy(map, data_start, datal); + flush_dcache_page(page); + kunmap(page); + } else { + ret = btrfs_decompress(comp_type, data_start, page, 0, + inline_size, datal); + if (ret) + goto out_unlock; + flush_dcache_page(page); + } + + /* + * If our inline data is smaller then the block/page size, then the + * remaining of the block/page is equivalent to zeroes. We had something + * like the following done: + * + * $ xfs_io -f -c "pwrite -S 0xab 0 500" file + * $ sync # (or fsync) + * $ xfs_io -c "falloc 0 4K" file + * $ xfs_io -c "pwrite -S 0xcd 4K 4K" + * + * So what's in the range [500, 4095] corresponds to zeroes. + */ + if (datal < block_size) { + char *map; + + map = kmap(page); + memset(map + datal, 0, block_size - datal); + flush_dcache_page(page); + kunmap(page); + } + + SetPageUptodate(page); + ClearPageChecked(page); + set_page_dirty(page); +out_unlock: + if (page) { + unlock_page(page); + put_page(page); + } + if (ret) + btrfs_delalloc_release_space(inode, data_reserved, file_offset, + block_size, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); +out: + extent_changeset_free(data_reserved); + + return ret; +} + +/* + * Deal with cloning of inline extents. We try to copy the inline extent from + * the source inode to destination inode when possible. When not possible we + * copy the inline extent's data into the respective page of the inode. + */ +static int clone_copy_inline_extent(struct inode *dst, + struct btrfs_path *path, + struct btrfs_key *new_key, + const u64 drop_start, + const u64 datal, + const u64 size, + const u8 comp_type, + char *inline_data, + struct btrfs_trans_handle **trans_out) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); + struct btrfs_root *root = BTRFS_I(dst)->root; + const u64 aligned_end = ALIGN(new_key->offset + datal, + fs_info->sectorsize); + struct btrfs_trans_handle *trans = NULL; + int ret; + struct btrfs_key key; + + if (new_key->offset > 0) { + ret = copy_inline_to_page(dst, new_key->offset, inline_data, + size, datal, comp_type); + goto out; + } + + key.objectid = btrfs_ino(BTRFS_I(dst)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + goto copy_inline_extent; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == btrfs_ino(BTRFS_I(dst)) && + key.type == BTRFS_EXTENT_DATA_KEY) { + /* + * There's an implicit hole at file offset 0, copy the + * inline extent's data to the page. + */ + ASSERT(key.offset > 0); + ret = copy_inline_to_page(dst, new_key->offset, + inline_data, size, datal, + comp_type); + goto out; + } + } else if (i_size_read(dst) <= datal) { + struct btrfs_file_extent_item *ei; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + /* + * If it's an inline extent replace it with the source inline + * extent, otherwise copy the source inline extent data into + * the respective page at the destination inode. + */ + if (btrfs_file_extent_type(path->nodes[0], ei) == + BTRFS_FILE_EXTENT_INLINE) + goto copy_inline_extent; + + ret = copy_inline_to_page(dst, new_key->offset, inline_data, + size, datal, comp_type); + goto out; + } + +copy_inline_extent: + ret = 0; + /* + * We have no extent items, or we have an extent at offset 0 which may + * or may not be inlined. All these cases are dealt the same way. + */ + if (i_size_read(dst) > datal) { + /* + * At the destination offset 0 we have either a hole, a regular + * extent or an inline extent larger then the one we want to + * clone. Deal with all these cases by copying the inline extent + * data into the respective page at the destination inode. + */ + ret = copy_inline_to_page(dst, new_key->offset, inline_data, + size, datal, comp_type); + goto out; + } + + btrfs_release_path(path); + /* + * If we end up here it means were copy the inline extent into a leaf + * of the destination inode. We know we will drop or adjust at most one + * extent item in the destination root. + * + * 1 unit - adjusting old extent (we may have to split it) + * 1 unit - add new extent + * 1 unit - inode update + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); + if (ret) + goto out; + ret = btrfs_insert_empty_item(trans, root, path, new_key, size); + if (ret) + goto out; + + write_extent_buffer(path->nodes[0], inline_data, + btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]), + size); + inode_add_bytes(dst, datal); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); +out: + if (!ret && !trans) { + /* + * No transaction here means we copied the inline extent into a + * page of the destination inode. + * + * 1 unit to update inode item + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + } + } + if (ret && trans) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + if (!ret) + *trans_out = trans; + + return ret; +} + +/** + * btrfs_clone() - clone a range from inode file to another + * + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen + * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode + */ +static int btrfs_clone(struct inode *src, struct inode *inode, + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff, int no_time_update) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_path *path = NULL; + struct extent_buffer *leaf; + struct btrfs_trans_handle *trans; + char *buf = NULL; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + const u64 len = olen_aligned; + u64 last_dest_end = destoff; + + ret = -ENOMEM; + buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!buf) + return ret; + + path = btrfs_alloc_path(); + if (!path) { + kvfree(buf); + return ret; + } + + path->reada = READA_FORWARD; + /* Clone data */ + key.objectid = btrfs_ino(BTRFS_I(src)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = off; + + while (1) { + u64 next_key_min_offset = key.offset + 1; + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + u64 drop_start; + + /* Note the key will change type as we walk through the tree */ + path->leave_spinning = 1; + ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, + 0, 0); + if (ret < 0) + goto out; + /* + * First search, if no extent item that starts at offset off was + * found but the previous item is an extent item, it's possible + * it might overlap our target range, therefore process it. + */ + if (key.offset == off && ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + nritems = btrfs_header_nritems(path->nodes[0]); +process_slot: + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(BTRFS_I(src)->root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.type > BTRFS_EXTENT_DATA_KEY || + key.objectid != btrfs_ino(BTRFS_I(src))) + break; + + ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + disko = btrfs_file_extent_disk_bytenr(leaf, extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* Take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, extent); + } + + /* + * The first search might have left us at an extent item that + * ends before our target range's start, can happen if we have + * holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { + path->slots[0]++; + goto process_slot; + } else if (key.offset >= off + len) { + break; + } + next_key_min_offset = key.offset + datal; + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = btrfs_ino(BTRFS_I(inode)); + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; + + /* + * Deal with a hole that doesn't have an extent item that + * represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning range or at + * the beginning (fully overlaps it or partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + struct btrfs_clone_extent_info clone_info; + + /* + * a | --- range to clone ---| b + * | ------------- extent ------------- | + */ + + /* Subtract range b */ + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* Subtract range a */ + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + clone_info.disk_offset = disko; + clone_info.disk_len = diskl; + clone_info.data_offset = datao; + clone_info.data_len = datal; + clone_info.file_offset = new_key.offset; + clone_info.extent_buf = buf; + clone_info.item_size = size; + ret = btrfs_punch_hole_range(inode, path, drop_start, + new_key.offset + datal - 1, &clone_info, + &trans); + if (ret) + goto out; + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* + * Inline extents always have to start at file offset 0 + * and can never be bigger then the sector size. We can + * never clone only parts of an inline extent, since all + * reflink operations must start at a sector size aligned + * offset, and the length must be aligned too or end at + * the i_size (which implies the whole inlined data). + */ + ASSERT(key.offset == 0); + ASSERT(datal <= fs_info->sectorsize); + if (key.offset != 0 || datal > fs_info->sectorsize) + return -EUCLEAN; + + ret = clone_copy_inline_extent(inode, path, &new_key, + drop_start, datal, size, + comp, buf, &trans); + if (ret) + goto out; + } + + btrfs_release_path(path); + + last_dest_end = ALIGN(new_key.offset + datal, + fs_info->sectorsize); + ret = clone_finish_inode_update(trans, inode, last_dest_end, + destoff, olen, no_time_update); + if (ret) + goto out; + if (new_key.offset + datal >= destoff + len) + break; + + btrfs_release_path(path); + key.offset = next_key_min_offset; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + } + ret = 0; + + if (last_dest_end < destoff + len) { + /* + * We have an implicit hole that fully or partially overlaps our + * cloning range at its end. This means that we either have the + * NO_HOLES feature enabled or the implicit hole happened due to + * mixing buffered and direct IO writes against this file. + */ + btrfs_release_path(path); + path->leave_spinning = 0; + + ret = btrfs_punch_hole_range(inode, path, last_dest_end, + destoff + len - 1, NULL, &trans); + if (ret) + goto out; + + ret = clone_finish_inode_update(trans, inode, destoff + len, + destoff, olen, no_time_update); + } + +out: + btrfs_free_path(path); + kvfree(buf); + return ret; +} + +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + } else if (inode1 == inode2 && loff2 < loff1) { + swap(loff1, loff2); + } + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, + struct inode *dst, u64 dst_loff) +{ + const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + int ret; + + /* + * Lock destination range to serialize with concurrent readpages() and + * source range to serialize with relocation. + */ + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + + return ret; +} + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, + struct inode *dst, u64 dst_loff) +{ + int ret; + u64 i, tail_len, chunk_count; + struct btrfs_root *root_dst = BTRFS_I(dst)->root; + + spin_lock(&root_dst->root_item_lock); + if (root_dst->send_in_progress) { + btrfs_warn_rl(root_dst->fs_info, +"cannot deduplicate to root %llu while send operations are using it (%d in progress)", + root_dst->root_key.objectid, + root_dst->send_in_progress); + spin_unlock(&root_dst->root_item_lock); + return -EAGAIN; + } + root_dst->dedupe_in_progress++; + spin_unlock(&root_dst->root_item_lock); + + tail_len = olen % BTRFS_MAX_DEDUPE_LEN; + chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); + + for (i = 0; i < chunk_count; i++) { + ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, + dst, dst_loff); + if (ret) + goto out; + + loff += BTRFS_MAX_DEDUPE_LEN; + dst_loff += BTRFS_MAX_DEDUPE_LEN; + } + + if (tail_len > 0) + ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); +out: + spin_lock(&root_dst->root_item_lock); + root_dst->dedupe_in_progress--; + spin_unlock(&root_dst->root_item_lock); + + return ret; +} + +static noinline int btrfs_clone_files(struct file *file, struct file *file_src, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = file_inode(file); + struct inode *src = file_inode(file_src); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret; + int wb_ret; + u64 len = olen; + u64 bs = fs_info->sb->s_blocksize; + + /* + * VFS's generic_remap_file_range_prep() protects us from cloning the + * eof block into the middle of a file, which would result in corruption + * if the file size is not blocksize aligned. So we don't need to check + * for that case here. + */ + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + + if (destoff > inode->i_size) { + const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); + + ret = btrfs_cont_expand(inode, inode->i_size, destoff); + if (ret) + return ret; + /* + * We may have truncated the last block if the inode's size is + * not sector size aligned, so we need to wait for writeback to + * complete before proceeding further, otherwise we can race + * with cloning and attempt to increment a reference to an + * extent that no longer exists (writeback completed right after + * we found the previous extent covering eof and before we + * attempted to increment its reference count). + */ + ret = btrfs_wait_ordered_range(inode, wb_start, + destoff - wb_start); + if (ret) + return ret; + } + + /* + * Lock destination range to serialize with concurrent readpages() and + * source range to serialize with relocation. + */ + btrfs_double_extent_lock(src, off, inode, destoff, len); + ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); + btrfs_double_extent_unlock(src, off, inode, destoff, len); + + /* + * We may have copied an inline extent into a page of the destination + * range, so wait for writeback to complete before truncating pages + * from the page cache. This is a rare case. + */ + wb_ret = btrfs_wait_ordered_range(inode, destoff, len); + ret = ret ? ret : wb_ret; + /* + * Truncate page cache pages so that future reads will see the cloned + * data immediately and not the previous data. + */ + truncate_inode_pages_range(&inode->i_data, + round_down(destoff, PAGE_SIZE), + round_up(destoff + len, PAGE_SIZE) - 1); + + return ret; +} + +static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + bool same_inode = inode_out == inode_in; + u64 wb_len; + int ret; + + if (!(remap_flags & REMAP_FILE_DEDUP)) { + struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + + if (btrfs_root_readonly(root_out)) + return -EROFS; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + } + + /* Don't make the dst file partly checksummed */ + if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + return -EINVAL; + } + + /* + * Now that the inodes are locked, we need to start writeback ourselves + * and can not rely on the writeback from the VFS's generic helper + * generic_remap_file_range_prep() because: + * + * 1) For compression we must call filemap_fdatawrite_range() range + * twice (btrfs_fdatawrite_range() does it for us), and the generic + * helper only calls it once; + * + * 2) filemap_fdatawrite_range(), called by the generic helper only + * waits for the writeback to complete, i.e. for IO to be done, and + * not for the ordered extents to complete. We need to wait for them + * to complete so that new file extent items are in the fs tree. + */ + if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) + wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + else + wb_len = ALIGN(*len, bs); + + /* + * Since we don't lock ranges, wait for ongoing lockless dio writes (as + * any in progress could create its ordered extents after we wait for + * existing ordered extents below). + */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + /* + * Workaround to make sure NOCOW buffered write reach disk as NOCOW. + * + * Btrfs' back references do not have a block level granularity, they + * work at the whole extent level. + * NOCOW buffered write without data space reserved may not be able + * to fall back to CoW due to lack of data space, thus could cause + * data loss. + * + * Here we take a shortcut by flushing the whole inode, so that all + * nocow write should reach disk as nocow before we increase the + * reference of the extent. We could do better by only flushing NOCOW + * data, but that needs extra accounting. + * + * Also we don't need to check ASYNC_EXTENT, as async extent will be + * CoWed anyway, not affecting nocow part. + */ + ret = filemap_flush(inode_in->i_mapping); + if (ret < 0) + return ret; + + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), + wb_len); + if (ret < 0) + return ret; + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), + wb_len); + if (ret < 0) + return ret; + + return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); +} + +loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, loff_t len, + unsigned int remap_flags) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + bool same_inode = dst_inode == src_inode; + int ret; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (same_inode) + inode_lock(src_inode); + else + lock_two_nondirectories(src_inode, dst_inode); + + ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + if (remap_flags & REMAP_FILE_DEDUP) + ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + else + ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); + +out_unlock: + if (same_inode) + inode_unlock(src_inode); + else + unlock_two_nondirectories(src_inode, dst_inode); + + return ret < 0 ? ret : len; +} diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h new file mode 100644 index 000000000000..ecb309b4dad0 --- /dev/null +++ b/fs/btrfs/reflink.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_REFLINK_H +#define BTRFS_REFLINK_H + +#include <linux/fs.h> + +loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); + +#endif /* BTRFS_REFLINK_H */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 995d4b8b1cfd..f65595602aa8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -9,6 +9,7 @@ #include <linux/blkdev.h> #include <linux/rbtree.h> #include <linux/slab.h> +#include <linux/error-injection.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -22,6 +23,54 @@ #include "print-tree.h" #include "delalloc-space.h" #include "block-group.h" +#include "backref.h" + +/* + * Relocation overview + * + * [What does relocation do] + * + * The objective of relocation is to relocate all extents of the target block + * group to other block groups. + * This is utilized by resize (shrink only), profile converting, compacting + * space, or balance routine to spread chunks over devices. + * + * Before | After + * ------------------------------------------------------------------ + * BG A: 10 data extents | BG A: deleted + * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated) + * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated) + * + * [How does relocation work] + * + * 1. Mark the target block group read-only + * New extents won't be allocated from the target block group. + * + * 2.1 Record each extent in the target block group + * To build a proper map of extents to be relocated. + * + * 2.2 Build data reloc tree and reloc trees + * Data reloc tree will contain an inode, recording all newly relocated + * data extents. + * There will be only one data reloc tree for one data block group. + * + * Reloc tree will be a special snapshot of its source tree, containing + * relocated tree blocks. + * Each tree referring to a tree block in target block group will get its + * reloc tree built. + * + * 2.3 Swap source tree with its corresponding reloc tree + * Each involved tree only refers to new extents after swap. + * + * 3. Cleanup reloc trees and data reloc tree. + * As old extents in the target block group are still referenced by reloc + * trees, we need to clean them up before really freeing the target block + * group. + * + * The main complexity is in steps 2.2 and 2.3. + * + * The entry point of relocation is relocate_block_group() function. + */ /* * backref_node, mapping_node and tree_block start with this @@ -256,6 +305,7 @@ static void free_backref_node(struct backref_cache *cache, { if (node) { cache->nr_nodes--; + btrfs_put_root(node->root); kfree(node); } } @@ -589,22 +639,7 @@ static struct btrfs_root *find_reloc_root(struct reloc_control *rc, root = (struct btrfs_root *)node->data; } spin_unlock(&rc->reloc_root_tree.lock); - return root; -} - -static int is_cowonly_root(u64 root_objectid) -{ - if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || - root_objectid == BTRFS_EXTENT_TREE_OBJECTID || - root_objectid == BTRFS_CHUNK_TREE_OBJECTID || - root_objectid == BTRFS_DEV_TREE_OBJECTID || - root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID || - root_objectid == BTRFS_UUID_TREE_OBJECTID || - root_objectid == BTRFS_QUOTA_TREE_OBJECTID || - root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return 1; - return 0; + return btrfs_grab_root(root); } static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, @@ -614,10 +649,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, key.objectid = root_objectid; key.type = BTRFS_ROOT_ITEM_KEY; - if (is_cowonly_root(root_objectid)) - key.offset = 0; - else - key.offset = (u64)-1; + key.offset = (u64)-1; return btrfs_get_fs_root(fs_info, &key, false); } @@ -711,8 +743,6 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } - path1->reada = READA_FORWARD; - path2->reada = READA_FORWARD; node = alloc_backref_node(cache); if (!node) { @@ -899,10 +929,12 @@ again: /* tree root */ ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr); - if (should_ignore_root(root)) + if (should_ignore_root(root)) { + btrfs_put_root(root); list_add(&cur->list, &useless); - else + } else { cur->root = root; + } break; } @@ -915,6 +947,7 @@ again: ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); path2->lowest_level = 0; if (ret < 0) { + btrfs_put_root(root); err = ret; goto out; } @@ -930,6 +963,7 @@ again: root->root_key.objectid, node_key->objectid, node_key->type, node_key->offset); + btrfs_put_root(root); err = -ENOENT; goto out; } @@ -941,15 +975,18 @@ again: if (!path2->nodes[level]) { ASSERT(btrfs_root_bytenr(&root->root_item) == lower->bytenr); - if (should_ignore_root(root)) + if (should_ignore_root(root)) { + btrfs_put_root(root); list_add(&lower->list, &useless); - else + } else { lower->root = root; + } break; } edge = alloc_backref_edge(cache); if (!edge) { + btrfs_put_root(root); err = -ENOMEM; goto out; } @@ -959,6 +996,7 @@ again: if (!rb_node) { upper = alloc_backref_node(cache); if (!upper) { + btrfs_put_root(root); free_backref_edge(cache, edge); err = -ENOMEM; goto out; @@ -1006,8 +1044,10 @@ again: edge->node[LOWER] = lower; edge->node[UPPER] = upper; - if (rb_node) + if (rb_node) { + btrfs_put_root(root); break; + } lower = upper; upper = NULL; } @@ -1186,7 +1226,7 @@ out: free_backref_node(cache, lower); } - free_backref_node(cache, node); + remove_backref_node(cache, node); return ERR_PTR(err); } ASSERT(!node || !node->detached); @@ -1244,7 +1284,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, new_node->level = node->level; new_node->lowest = node->lowest; new_node->checked = 1; - new_node->root = dest; + new_node->root = btrfs_grab_root(dest); + ASSERT(new_node->root); if (!node->lowest) { list_for_each_entry(edge, &node->lower, list[UPPER]) { @@ -1298,7 +1339,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (!node) return -ENOMEM; - node->bytenr = root->node->start; + node->bytenr = root->commit_root->start; node->data = root; spin_lock(&rc->reloc_root_tree.lock); @@ -1325,14 +1366,16 @@ static void __del_reloc_root(struct btrfs_root *root) struct rb_node *rb_node; struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; + bool put_ref = false; if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + RB_CLEAR_NODE(&node->rb_node); } spin_unlock(&rc->reloc_root_tree.lock); if (!node) @@ -1340,9 +1383,22 @@ static void __del_reloc_root(struct btrfs_root *root) BUG_ON((struct btrfs_root *)node->data != root); } + /* + * We only put the reloc root here if it's on the list. There's a lot + * of places where the pattern is to splice the rc->reloc_roots, process + * the reloc roots, and then add the reloc root back onto + * rc->reloc_roots. If we call __del_reloc_root while it's off of the + * list we don't want the reference being dropped, because the guy + * messing with the list is in charge of the reference. + */ spin_lock(&fs_info->trans_lock); - list_del_init(&root->root_list); + if (!list_empty(&root->root_list)) { + put_ref = true; + list_del_init(&root->root_list); + } spin_unlock(&fs_info->trans_lock); + if (put_ref) + btrfs_put_root(root); kfree(node); } @@ -1350,7 +1406,7 @@ static void __del_reloc_root(struct btrfs_root *root) * helper to update the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +static int __update_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; @@ -1359,7 +1415,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1371,7 +1427,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) BUG_ON((struct btrfs_root *)node->data != root); spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = new_bytenr; + node->bytenr = root->node->start; rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); @@ -1447,8 +1503,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(root_item); - reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key); + reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); BUG_ON(IS_ERR(reloc_root)); + set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state); reloc_root->last_trans = trans->transid; return reloc_root; } @@ -1456,6 +1513,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, /* * create reloc tree for a given fs tree. reloc tree is just a * snapshot of the fs tree with special root objectid. + * + * The reloc_root comes out of here with two references, one for + * root->reloc_root, and another for being on the rc->reloc_roots list. */ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -1467,6 +1527,10 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int clear_rsv = 0; int ret; + if (!rc || !rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + /* * The subvolume has reloc tree but the swap is finished, no need to * create/update the dead reloc tree @@ -1480,10 +1544,6 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, return 0; } - if (!rc || !rc->create_reloc_tree || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - return 0; - if (!trans->reloc_reserved) { rsv = trans->block_rsv; trans->block_rsv = rc->block_rsv; @@ -1495,7 +1555,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, ret = __add_reloc_root(reloc_root); BUG_ON(ret < 0); - root->reloc_root = reloc_root; + root->reloc_root = btrfs_grab_root(reloc_root); return 0; } @@ -1516,6 +1576,13 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, reloc_root = root->reloc_root; root_item = &reloc_root->root_item; + /* + * We are probably ok here, but __del_reloc_root() will drop its ref of + * the root. We have the ref for root->reloc_root, but just in case + * hold it while we update the reloc root. + */ + btrfs_grab_root(reloc_root); + /* root->reloc_root will stay until current relocation finished */ if (fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { @@ -1529,6 +1596,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, } if (reloc_root->commit_root != reloc_root->node) { + __update_reloc_root(reloc_root); btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->commit_root = btrfs_root_node(reloc_root); @@ -1537,7 +1605,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, fs_info->tree_root, &reloc_root->root_key, root_item); BUG_ON(ret); - + btrfs_put_root(reloc_root); out: return 0; } @@ -2211,7 +2279,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans, btrfs_update_reloc_root(trans, root); if (list_empty(&root->reloc_dirty_list)) { - btrfs_grab_fs_root(root); + btrfs_grab_root(root); list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); } } @@ -2231,24 +2299,34 @@ static int clean_dirty_subvols(struct reloc_control *rc) list_del_init(&root->reloc_dirty_list); root->reloc_root = NULL; - if (reloc_root) { - - ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); - if (ret2 < 0 && !ret) - ret = ret2; - } /* * Need barrier to ensure clear_bit() only happens after * root->reloc_root = NULL. Pairs with have_reloc_root. */ smp_wmb(); clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); - btrfs_put_fs_root(root); + if (reloc_root) { + /* + * btrfs_drop_snapshot drops our ref we hold for + * ->reloc_root. If it fails however we must + * drop the ref ourselves. + */ + ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); + if (ret2 < 0) { + btrfs_put_root(reloc_root); + if (!ret) + ret = ret2; + } + } + btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ - ret2 = btrfs_drop_snapshot(root, NULL, 0, 1); - if (ret2 < 0 && !ret) - ret = ret2; + ret2 = btrfs_drop_snapshot(root, 0, 1); + if (ret2 < 0) { + btrfs_put_root(root); + if (!ret) + ret = ret2; + } } } return ret; @@ -2325,6 +2403,18 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, trans = NULL; goto out; } + + /* + * At this point we no longer have a reloc_control, so we can't + * depend on btrfs_init_reloc_root to update our last_trans. + * + * But that's ok, we started the trans handle on our + * corresponding fs_root, which means it's been added to the + * dirty list. At commit time we'll still call + * btrfs_update_reloc_root() and update our root item + * appropriately. + */ + reloc_root->last_trans = trans->transid; trans->block_rsv = rc->block_rsv; replaced = 0; @@ -2435,7 +2525,7 @@ again: if (IS_ERR(trans)) { if (!err) btrfs_block_rsv_release(fs_info, rc->block_rsv, - num_bytes); + num_bytes, NULL); return PTR_ERR(trans); } @@ -2443,7 +2533,7 @@ again: if (num_bytes != rc->merging_rsv_size) { btrfs_end_transaction(trans); btrfs_block_rsv_release(fs_info, rc->block_rsv, - num_bytes); + num_bytes, NULL); goto again; } } @@ -2468,6 +2558,7 @@ again: btrfs_update_reloc_root(trans, root); list_add(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(root); } list_splice(&reloc_roots, &rc->reloc_roots); @@ -2488,10 +2579,6 @@ void free_reloc_roots(struct list_head *list) reloc_root = list_entry(list->next, struct btrfs_root, root_list); __del_reloc_root(reloc_root); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - reloc_root->node = NULL; - reloc_root->commit_root = NULL; } } @@ -2529,6 +2616,7 @@ again: BUG_ON(root->reloc_root != reloc_root); ret = merge_reloc_root(rc, root); + btrfs_put_root(root); if (ret) { if (list_empty(&reloc_root->root_list)) list_add_tail(&reloc_root->root_list, @@ -2561,7 +2649,21 @@ out: free_reloc_roots(&reloc_roots); } - BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + /* + * We used to have + * + * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + * + * here, but it's wrong. If we fail to start the transaction in + * prepare_to_merge() we will have only 0 ref reloc roots, none of which + * have actually been removed from the reloc_root_tree rb tree. This is + * fine because we're bailing here, and we hold a reference on the root + * for the list that holds it, so these roots will be cleaned up when we + * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root + * will be cleaned up on unmount. + * + * The remaining nodes will be cleaned up by free_reloc_control. + */ } static void free_block_list(struct rb_root *blocks) @@ -2580,6 +2682,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = reloc_root->fs_info; struct btrfs_root *root; + int ret; if (reloc_root->last_trans == trans->transid) return 0; @@ -2587,8 +2690,10 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); + ret = btrfs_record_root_in_trans(trans, root); + btrfs_put_root(root); - return btrfs_record_root_in_trans(trans, root); + return ret; } static noinline_for_stack @@ -2621,7 +2726,9 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, BUG_ON(next->new_bytenr); BUG_ON(!list_empty(&next->list)); next->new_bytenr = root->node->start; - next->root = root; + btrfs_put_root(next->root); + next->root = btrfs_grab_root(root); + ASSERT(next->root); list_add_tail(&next->list, &rc->backref_cache.changed); __mark_block_processed(rc, next); @@ -3040,7 +3147,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - BUG_ON(block->key_ready); eb = read_tree_block(fs_info, block->bytenr, block->key.offset, block->level, NULL); if (IS_ERR(eb)) { @@ -3073,6 +3179,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, if (!node) return 0; + /* + * If we fail here we want to drop our backref_node because we are going + * to start over and regenerate the tree for it. + */ + ret = reserve_metadata_space(trans, rc, node); + if (ret) + goto out; + BUG_ON(node->processed); root = select_one_root(node); if (root == ERR_PTR(-ENOENT)) { @@ -3080,12 +3194,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, goto out; } - if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { - ret = reserve_metadata_space(trans, rc, node); - if (ret) - goto out; - } - if (root) { if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { BUG_ON(node->new_bytenr); @@ -3093,7 +3201,9 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, btrfs_record_root_in_trans(trans, root); root = root->reloc_root; node->new_bytenr = root->node->start; - node->root = root; + btrfs_put_root(node->root); + node->root = btrfs_grab_root(root); + ASSERT(node->root); list_add_tail(&node->list, &rc->backref_cache.changed); } else { path->lowest_level = node->level; @@ -3161,9 +3271,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { - if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) - err = ret; - goto out; + err = ret; + break; } } out: @@ -3264,6 +3373,15 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, return ret; } +/* + * Allow error injection to test balance cancellation + */ +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +{ + return atomic_read(&fs_info->balance_cancel_req); +} +ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); + static int relocate_file_extent_cluster(struct inode *inode, struct file_extent_cluster *cluster) { @@ -3385,6 +3503,10 @@ static int relocate_file_extent_cluster(struct inode *inode, btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); + if (btrfs_should_cancel_balance(fs_info)) { + ret = -ECANCELED; + goto out; + } } WARN_ON(nr != cluster->nr); out: @@ -3556,31 +3678,6 @@ out: return ret; } -/* - * helper to check if the block use full backrefs for pointers in it - */ -static int block_use_full_backref(struct reloc_control *rc, - struct extent_buffer *eb) -{ - u64 flags; - int ret; - - if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || - btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) - return 1; - - ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info, - eb->start, btrfs_header_level(eb), 1, - NULL, &flags); - BUG_ON(ret); - - if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = 1; - else - ret = 0; - return ret; -} - static int delete_block_group_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group *block_group, struct inode *inode, @@ -3624,172 +3721,40 @@ out: } /* - * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY - * this function scans fs tree to find blocks reference the data extent + * Locate the free space cache EXTENT_DATA in root tree leaf and delete the + * cache inode, to avoid free space cache data extent blocking data relocation. */ -static int find_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - struct rb_root *blocks) +static int delete_v1_space_cache(struct extent_buffer *leaf, + struct btrfs_block_group *block_group, + u64 data_bytenr) { - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct btrfs_path *path; - struct tree_block *block; - struct btrfs_root *root; - struct btrfs_file_extent_item *fi; - struct rb_node *rb_node; + u64 space_cache_ino; + struct btrfs_file_extent_item *ei; struct btrfs_key key; - u64 ref_root; - u64 ref_objectid; - u64 ref_offset; - u32 ref_count; - u32 nritems; - int err = 0; - int added = 0; - int counted; + bool found = false; + int i; int ret; - ref_root = btrfs_extent_data_ref_root(leaf, ref); - ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); - ref_offset = btrfs_extent_data_ref_offset(leaf, ref); - ref_count = btrfs_extent_data_ref_count(leaf, ref); - - /* - * This is an extent belonging to the free space cache, lets just delete - * it and redo the search. - */ - if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { - ret = delete_block_group_cache(fs_info, rc->block_group, - NULL, ref_objectid); - if (ret != -ENOENT) - return ret; - ret = 0; - } - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = READA_FORWARD; - - root = read_fs_root(fs_info, ref_root); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out; - } - - key.objectid = ref_objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - if (ref_offset > ((u64)-1 << 32)) - key.offset = 0; - else - key.offset = ref_offset; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - /* - * the references in tree blocks that use full backrefs - * are not counted in - */ - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - - while (ref_count > 0) { - while (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; - goto out; - } - if (WARN_ON(ret > 0)) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - added = 0; - - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - } + if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) + return 0; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (WARN_ON(key.objectid != ref_objectid || - key.type != BTRFS_EXTENT_DATA_KEY)) + for (i = 0; i < btrfs_header_nritems(leaf); i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { + found = true; + space_cache_ino = key.objectid; break; - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - goto next; - - if (btrfs_file_extent_disk_bytenr(leaf, fi) != - extent_key->objectid) - goto next; - - key.offset -= btrfs_file_extent_offset(leaf, fi); - if (key.offset != ref_offset) - goto next; - - if (counted) - ref_count--; - if (added) - goto next; - - if (!tree_block_processed(leaf->start, rc)) { - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = leaf->start; - btrfs_item_key_to_cpu(leaf, &block->key, 0); - block->level = 0; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, - &block->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, - block->bytenr); } - if (counted) - added = 1; - else - path->slots[0] = nritems; -next: - path->slots[0]++; - } -out: - btrfs_free_path(path); - return err; + if (!found) + return -ENOENT; + ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, + space_cache_ino); + return ret; } /* @@ -3801,91 +3766,41 @@ int add_data_references(struct reloc_control *rc, struct btrfs_path *path, struct rb_root *blocks) { - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_data_ref *dref; - struct btrfs_extent_inline_ref *iref; - unsigned long ptr; - unsigned long end; - u32 blocksize = rc->extent_root->fs_info->nodesize; + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct ulist *leaves = NULL; + struct ulist_iterator leaf_uiter; + struct ulist_node *ref_node = NULL; + const u32 blocksize = fs_info->nodesize; int ret = 0; - int err = 0; - - eb = path->nodes[0]; - ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - end = ptr + btrfs_item_size_nr(eb, path->slots[0]); - ptr += sizeof(struct btrfs_extent_item); - while (ptr < end) { - iref = (struct btrfs_extent_inline_ref *)ptr; - key.type = btrfs_get_extent_inline_ref_type(eb, iref, - BTRFS_REF_TYPE_DATA); - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - key.offset = btrfs_extent_inline_ref_offset(eb, iref); - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else { - ret = -EUCLEAN; - btrfs_err(rc->extent_root->fs_info, - "extent %llu slot %d has an invalid inline ref type", - eb->start, path->slots[0]); - } - if (ret) { - err = ret; - goto out; - } - ptr += btrfs_extent_inline_ref_size(key.type); - } - WARN_ON(ptr > end); + btrfs_release_path(path); + ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, + 0, &leaves, NULL, true); + if (ret < 0) + return ret; - while (1) { - cond_resched(); - eb = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret < 0) { - err = ret; - break; - } - if (ret > 0) - break; - eb = path->nodes[0]; - } + ULIST_ITER_INIT(&leaf_uiter); + while ((ref_node = ulist_next(leaves, &leaf_uiter))) { + struct extent_buffer *eb; - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_key->objectid) + eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); break; - - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); - ret = -EINVAL; - } else { - ret = 0; } - if (ret) { - err = ret; + ret = delete_v1_space_cache(eb, rc->block_group, + extent_key->objectid); + free_extent_buffer(eb); + if (ret < 0) + break; + ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); + if (ret < 0) break; - } - path->slots[0]++; } -out: - btrfs_release_path(path); - if (err) + if (ret < 0) free_block_list(blocks); - return err; + ulist_free(leaves); + return ret; } /* @@ -4137,12 +4052,6 @@ restart: if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { - /* - * if we fail to relocate tree blocks, force to update - * backref cache when committing transaction. - */ - rc->backref_cache.last_trans = trans->transid - 1; - if (ret != -EAGAIN) { err = ret; break; @@ -4166,6 +4075,10 @@ restart: break; } } + if (btrfs_should_cancel_balance(fs_info)) { + err = -ECANCELED; + break; + } } if (trans && progress && err == -ENOSPC) { ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); @@ -4195,15 +4108,23 @@ restart: set_reloc_control(rc); backref_cache_cleanup(&rc->backref_cache); - btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); + /* + * Even in the case when the relocation is cancelled, we should all go + * through prepare_to_merge() and merge_reloc_roots(). + * + * For error (including cancelled balance), prepare_to_merge() will + * mark all reloc trees orphan, then queue them for cleanup in + * merge_reloc_roots() + */ err = prepare_to_merge(rc, err); merge_reloc_roots(rc); rc->merge_reloc_tree = 0; unset_reloc_control(rc); - btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root); @@ -4212,10 +4133,10 @@ restart: goto out_free; } btrfs_commit_transaction(trans); +out_free: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) err = ret; -out_free: btrfs_free_block_rsv(fs_info, rc->block_rsv); btrfs_free_path(path); return err; @@ -4271,8 +4192,10 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, return ERR_CAST(root); trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_put_root(root); return ERR_CAST(trans); + } err = btrfs_find_free_objectid(root, &objectid); if (err) @@ -4290,6 +4213,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, err = btrfs_orphan_add(trans, BTRFS_I(inode)); out: + btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (err) { @@ -4317,6 +4241,18 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) return rc; } +static void free_reloc_control(struct reloc_control *rc) +{ + struct mapping_node *node, *tmp; + + free_reloc_roots(&rc->reloc_roots); + rbtree_postorder_for_each_entry_safe(node, tmp, + &rc->reloc_root_tree.rb_root, rb_node) + kfree(node); + + kfree(rc); +} + /* * Print the block group being relocated */ @@ -4461,7 +4397,7 @@ out: btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); btrfs_put_block_group(rc->block_group); - kfree(rc); + free_reloc_control(rc); return err; } @@ -4537,12 +4473,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.type != BTRFS_ROOT_ITEM_KEY) break; - reloc_root = btrfs_read_fs_root(root, &key); + reloc_root = btrfs_read_tree_root(root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; } + set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state); list_add(&reloc_root->root_list, &reloc_roots); if (btrfs_root_refs(&reloc_root->root_item) > 0) { @@ -4559,6 +4496,8 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = ret; goto out; } + } else { + btrfs_put_root(fs_root); } } @@ -4584,9 +4523,8 @@ int btrfs_recover_relocation(struct btrfs_root *root) trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { - unset_reloc_control(rc); err = PTR_ERR(trans); - goto out_free; + goto out_unset; } rc->merge_reloc_tree = 1; @@ -4606,17 +4544,18 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); list_add_tail(&reloc_root->root_list, &reloc_roots); - goto out_free; + goto out_unset; } err = __add_reloc_root(reloc_root); BUG_ON(err < 0); /* -ENOMEM or logic error */ - fs_root->reloc_root = reloc_root; + fs_root->reloc_root = btrfs_grab_root(reloc_root); + btrfs_put_root(fs_root); } err = btrfs_commit_transaction(trans); if (err) - goto out_free; + goto out_unset; merge_reloc_roots(rc); @@ -4625,15 +4564,16 @@ int btrfs_recover_relocation(struct btrfs_root *root) trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto out_free; + goto out_clean; } err = btrfs_commit_transaction(trans); - +out_clean: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) err = ret; -out_free: - kfree(rc); +out_unset: + unset_reloc_control(rc); + free_reloc_control(rc); out: if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); @@ -4643,10 +4583,12 @@ out: if (err == 0) { /* cleanup orphan inode in data relocation tree */ fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(fs_root)) + if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); - else + } else { err = btrfs_orphan_cleanup(fs_root); + btrfs_put_root(fs_root); + } } return err; } @@ -4720,11 +4662,6 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (buf == root->node) - __update_reloc_root(root, cow->start); - } - level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) @@ -4795,6 +4732,10 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, /* * called after snapshot is created. migrate block reservation * and create reloc root for the newly created snapshot + * + * This is similar to btrfs_init_reloc_root(), we come out of here with two + * references held on the reloc_root, one for root->reloc_root and one for + * rc->reloc_roots. */ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending) @@ -4827,7 +4768,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, ret = __add_reloc_root(reloc_root); BUG_ON(ret < 0); - new_root->reloc_root = reloc_root; + new_root->reloc_root = btrfs_grab_root(reloc_root); if (rc->create_reloc_tree) ret = clone_backref_node(trans, rc, root, reloc_root); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 612411c74550..668f22844017 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -22,7 +22,6 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, struct btrfs_root_item *item) { - uuid_le uuid; u32 len; int need_reset = 0; @@ -44,8 +43,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, sizeof(*item) - offsetof(struct btrfs_root_item, generation_v2)); - uuid_le_gen(&uuid); - memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(item->uuid); } } @@ -255,25 +253,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) root_key.objectid = key.offset; key.offset++; - /* - * The root might have been inserted already, as before we look - * for orphan roots, log replay might have happened, which - * triggers a transaction commit and qgroup accounting, which - * in turn reads and inserts fs roots while doing backref - * walking. - */ - root = btrfs_lookup_fs_root(fs_info, root_key.objectid); - if (root) { - WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, - &root->state)); - if (btrfs_root_refs(&root->root_item) == 0) { - set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); - btrfs_add_dead_root(root); - } - continue; - } - - root = btrfs_read_fs_root(tree_root, &root_key); + root = btrfs_get_fs_root(fs_info, &root_key, false); err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { break; @@ -300,25 +280,12 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) continue; } - err = btrfs_init_fs_root(root); - if (err) { - btrfs_free_fs_root(root); - break; - } - - set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); - - err = btrfs_insert_fs_root(fs_info, root); - if (err) { - BUG_ON(err == -EEXIST); - btrfs_free_fs_root(root); - break; - } - + WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); if (btrfs_root_refs(&root->root_item) == 0) { set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); } + btrfs_put_root(root); } btrfs_free_path(path); @@ -553,5 +520,5 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 61b37c56a7fb..adaf8ab694d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -149,7 +149,7 @@ struct scrub_parity { */ unsigned long *ebitmap; - unsigned long bitmap[0]; + unsigned long bitmap[]; }; struct scrub_ctx { @@ -653,7 +653,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, root_key.objectid = root; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); + local_root = btrfs_get_fs_root(fs_info, &root_key, true); if (IS_ERR(local_root)) { ret = PTR_ERR(local_root); goto err; @@ -668,6 +668,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); if (ret) { + btrfs_put_root(local_root); btrfs_release_path(swarn->path); goto err; } @@ -688,6 +689,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, ipath = init_ipath(4096, local_root, swarn->path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { + btrfs_put_root(local_root); ret = PTR_ERR(ipath); ipath = NULL; goto err; @@ -711,6 +713,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); + btrfs_put_root(local_root); free_ipath(ipath); return 0; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a055b657cb85..c5f41bd86765 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5586,10 +5586,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) { struct btrfs_path *path; struct btrfs_root *root = sctx->send_root; - struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 extent_end; - u8 type; int ret; path = alloc_path_for_send(); @@ -5609,18 +5606,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) goto out; - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], fi); - if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); - extent_end = ALIGN(key.offset + size, - sctx->send_root->fs_info->sectorsize); - } else { - extent_end = key.offset + - btrfs_file_extent_num_bytes(path->nodes[0], fi); - } - sctx->cur_inode_last_extent = extent_end; + sctx->cur_inode_last_extent = btrfs_file_extent_end(path); out: btrfs_free_path(path); return ret; @@ -5674,16 +5660,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, break; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(leaf, fi); - - extent_end = ALIGN(key.offset + size, - root->fs_info->sectorsize); - } else { - extent_end = key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - } + extent_end = btrfs_file_extent_end(path); if (extent_end <= start) goto next; if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { @@ -5704,9 +5681,6 @@ out: static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { - struct btrfs_file_extent_item *fi; - u64 extent_end; - u8 type; int ret = 0; if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) @@ -5718,18 +5692,6 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, return ret; } - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], fi); - if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); - extent_end = ALIGN(key->offset + size, - sctx->send_root->fs_info->sectorsize); - } else { - extent_end = key->offset + - btrfs_file_extent_num_bytes(path->nodes[0], fi); - } - if (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset) { /* @@ -5755,7 +5717,7 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, else ret = 0; } - sctx->cur_inode_last_extent = extent_end; + sctx->cur_inode_last_extent = btrfs_file_extent_end(path); return ret; } @@ -7066,7 +7028,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) int clone_sources_to_rollback = 0; unsigned alloc_size; int sort_clone_roots = 0; - int index; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -7193,11 +7154,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - - clone_root = btrfs_read_fs_root_no_name(fs_info, &key); + clone_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(clone_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(clone_root); goto out; } @@ -7205,20 +7163,19 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (!btrfs_root_readonly(clone_root) || btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); + btrfs_put_root(clone_root); ret = -EPERM; goto out; } if (clone_root->dedupe_in_progress) { dedupe_in_progress_warn(clone_root); spin_unlock(&clone_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); + btrfs_put_root(clone_root); ret = -EAGAIN; goto out; } clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; @@ -7232,11 +7189,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - index = srcu_read_lock(&fs_info->subvol_srcu); - - sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); + sctx->parent_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(sctx->parent_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(sctx->parent_root); goto out; } @@ -7246,20 +7200,16 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (!btrfs_root_readonly(sctx->parent_root) || btrfs_root_dead(sctx->parent_root)) { spin_unlock(&sctx->parent_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } if (sctx->parent_root->dedupe_in_progress) { dedupe_in_progress_warn(sctx->parent_root); spin_unlock(&sctx->parent_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EAGAIN; goto out; } spin_unlock(&sctx->parent_root->root_item_lock); - - srcu_read_unlock(&fs_info->subvol_srcu, index); } /* @@ -7267,7 +7217,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) * is behind the current send position. This is checked while searching * for possible clone sources. */ - sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root; + sctx->clone_roots[sctx->clone_roots_cnt++].root = + btrfs_grab_root(sctx->send_root); /* We do a bsearch later */ sort(sctx->clone_roots, sctx->clone_roots_cnt, @@ -7352,18 +7303,24 @@ out: } if (sort_clone_roots) { - for (i = 0; i < sctx->clone_roots_cnt; i++) + for (i = 0; i < sctx->clone_roots_cnt; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); + btrfs_put_root(sctx->clone_roots[i].root); + } } else { - for (i = 0; sctx && i < clone_sources_to_rollback; i++) + for (i = 0; sctx && i < clone_sources_to_rollback; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); + btrfs_put_root(sctx->clone_roots[i].root); + } btrfs_root_dec_send_in_progress(send_root); } - if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { btrfs_root_dec_send_in_progress(sctx->parent_root); + btrfs_put_root(sctx->parent_root); + } kvfree(clone_sources_tmp); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 01297c5b2666..8b0fe053a25d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -10,6 +10,153 @@ #include "transaction.h" #include "block-group.h" +/* + * HOW DOES SPACE RESERVATION WORK + * + * If you want to know about delalloc specifically, there is a separate comment + * for that with the delalloc code. This comment is about how the whole system + * works generally. + * + * BASIC CONCEPTS + * + * 1) space_info. This is the ultimate arbiter of how much space we can use. + * There's a description of the bytes_ fields with the struct declaration, + * refer to that for specifics on each field. Suffice it to say that for + * reservations we care about total_bytes - SUM(space_info->bytes_) when + * determining if there is space to make an allocation. There is a space_info + * for METADATA, SYSTEM, and DATA areas. + * + * 2) block_rsv's. These are basically buckets for every different type of + * metadata reservation we have. You can see the comment in the block_rsv + * code on the rules for each type, but generally block_rsv->reserved is how + * much space is accounted for in space_info->bytes_may_use. + * + * 3) btrfs_calc*_size. These are the worst case calculations we used based + * on the number of items we will want to modify. We have one for changing + * items, and one for inserting new items. Generally we use these helpers to + * determine the size of the block reserves, and then use the actual bytes + * values to adjust the space_info counters. + * + * MAKING RESERVATIONS, THE NORMAL CASE + * + * We call into either btrfs_reserve_data_bytes() or + * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with + * num_bytes we want to reserve. + * + * ->reserve + * space_info->bytes_may_reserve += num_bytes + * + * ->extent allocation + * Call btrfs_add_reserved_bytes() which does + * space_info->bytes_may_reserve -= num_bytes + * space_info->bytes_reserved += extent_bytes + * + * ->insert reference + * Call btrfs_update_block_group() which does + * space_info->bytes_reserved -= extent_bytes + * space_info->bytes_used += extent_bytes + * + * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority) + * + * Assume we are unable to simply make the reservation because we do not have + * enough space + * + * -> __reserve_bytes + * create a reserve_ticket with ->bytes set to our reservation, add it to + * the tail of space_info->tickets, kick async flush thread + * + * ->handle_reserve_ticket + * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set + * on the ticket. + * + * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space + * Flushes various things attempting to free up space. + * + * -> btrfs_try_granting_tickets() + * This is called by anything that either subtracts space from + * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the + * space_info->total_bytes. This loops through the ->priority_tickets and + * then the ->tickets list checking to see if the reservation can be + * completed. If it can the space is added to space_info->bytes_may_use and + * the ticket is woken up. + * + * -> ticket wakeup + * Check if ->bytes == 0, if it does we got our reservation and we can carry + * on, if not return the appropriate error (ENOSPC, but can be EINTR if we + * were interrupted.) + * + * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY + * + * Same as the above, except we add ourselves to the + * space_info->priority_tickets, and we do not use ticket->wait, we simply + * call flush_space() ourselves for the states that are safe for us to call + * without deadlocking and hope for the best. + * + * THE FLUSHING STATES + * + * Generally speaking we will have two cases for each state, a "nice" state + * and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to + * reduce the locking over head on the various trees, and even to keep from + * doing any work at all in the case of delayed refs. Each of these delayed + * things however hold reservations, and so letting them run allows us to + * reclaim space so we can make new reservations. + * + * FLUSH_DELAYED_ITEMS + * Every inode has a delayed item to update the inode. Take a simple write + * for example, we would update the inode item at write time to update the + * mtime, and then again at finish_ordered_io() time in order to update the + * isize or bytes. We keep these delayed items to coalesce these operations + * into a single operation done on demand. These are an easy way to reclaim + * metadata space. + * + * FLUSH_DELALLOC + * Look at the delalloc comment to get an idea of how much space is reserved + * for delayed allocation. We can reclaim some of this space simply by + * running delalloc, but usually we need to wait for ordered extents to + * reclaim the bulk of this space. + * + * FLUSH_DELAYED_REFS + * We have a block reserve for the outstanding delayed refs space, and every + * delayed ref operation holds a reservation. Running these is a quick way + * to reclaim space, but we want to hold this until the end because COW can + * churn a lot and we can avoid making some extent tree modifications if we + * are able to delay for as long as possible. + * + * ALLOC_CHUNK + * We will skip this the first time through space reservation, because of + * overcommit and we don't want to have a lot of useless metadata space when + * our worst case reservations will likely never come true. + * + * RUN_DELAYED_IPUTS + * If we're freeing inodes we're likely freeing checksums, file extent + * items, and extent tree items. Loads of space could be freed up by these + * operations, however they won't be usable until the transaction commits. + * + * COMMIT_TRANS + * may_commit_transaction() is the ultimate arbiter on whether we commit the + * transaction or not. In order to avoid constantly churning we do all the + * above flushing first and then commit the transaction as the last resort. + * However we need to take into account things like pinned space that would + * be freed, plus any delayed work we may not have gotten rid of in the case + * of metadata. + * + * OVERCOMMIT + * + * Because we hold so many reservations for metadata we will allow you to + * reserve more space than is currently free in the currently allocate + * metadata space. This only happens with metadata, data does not allow + * overcommitting. + * + * You can see the current logic for when we allow overcommit in + * btrfs_can_overcommit(), but it only applies to unallocated space. If there + * is no unallocated space to be had, all reservations are kept within the + * free space in the allocated metadata chunks. + * + * Because of overcommitting, you generally want to use the + * btrfs_can_overcommit() logic for metadata allocations, as it does the right + * thing with or without extra unallocated space. + */ + u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) { @@ -159,25 +306,19 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) return (global->size << 1); } -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) +static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + enum btrfs_reserve_flush_enum flush) { u64 profile; u64 avail; - u64 used; int factor; - /* Don't overcommit when in mixed mode. */ - if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) - return 0; - if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) profile = btrfs_system_alloc_profile(fs_info); else profile = btrfs_metadata_alloc_profile(fs_info); - used = btrfs_space_info_used(space_info, true); avail = atomic64_read(&fs_info->free_chunk_space); /* @@ -198,6 +339,22 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + return avail; +} + +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + u64 avail; + u64 used; + + /* Don't overcommit when in mixed mode */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + used = btrfs_space_info_used(space_info, true); + avail = calc_available_free_space(fs_info, space_info, flush); if (used + bytes < space_info->total_bytes + avail) return 1; @@ -232,6 +389,8 @@ again: space_info, ticket->bytes); list_del_init(&ticket->list); + ASSERT(space_info->reclaim_size >= ticket->bytes); + space_info->reclaim_size -= ticket->bytes; ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); @@ -627,15 +786,26 @@ static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { - struct reserve_ticket *ticket; u64 used; + u64 avail; u64 expected; - u64 to_reclaim = 0; + u64 to_reclaim = space_info->reclaim_size; + + lockdep_assert_held(&space_info->lock); + + avail = calc_available_free_space(fs_info, space_info, + BTRFS_RESERVE_FLUSH_ALL); + used = btrfs_space_info_used(space_info, true); + + /* + * We may be flushing because suddenly we have less space than we had + * before, and now we're well over-committed based on our current free + * space. If that's the case add in our overage so we make sure to put + * appropriate pressure on the flushing state machine. + */ + if (space_info->total_bytes + avail < used) + to_reclaim += used - (space_info->total_bytes + avail); - list_for_each_entry(ticket, &space_info->tickets, list) - to_reclaim += ticket->bytes; - list_for_each_entry(ticket, &space_info->priority_tickets, list) - to_reclaim += ticket->bytes; if (to_reclaim) return to_reclaim; @@ -1020,8 +1190,10 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + ASSERT(space_info->reclaim_size >= 0); ticket.bytes = orig_bytes; ticket.error = 0; + space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); if (flush == BTRFS_RESERVE_FLUSH_ALL) { list_add_tail(&ticket.list, &space_info->tickets); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 24514cd2c6c1..0a5001ef1481 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -54,6 +54,13 @@ struct btrfs_space_info { struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; + + /* + * Size of space that needs to be reclaimed in order to satisfy pending + * tickets + */ + u64 reclaim_size; + /* * tickets_id just indicates the next ticket will be handled, so note * it's not stored per ticket. diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0616a5434793..7932d8d07cff 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -244,7 +244,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; - trans->aborted = errno; + WRITE_ONCE(trans->aborted, errno); /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ if (!trans->dirty && list_empty(&trans->new_bgs)) { @@ -873,7 +873,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; #endif case Opt_err: - btrfs_info(info, "unrecognized mount option '%s'", p); + btrfs_err(info, "unrecognized mount option '%s'", p); ret = -EINVAL; goto out; default: @@ -1024,11 +1024,11 @@ out: return error; } -static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, - u64 subvol_objectid) +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *fs_root; + struct btrfs_root *fs_root = NULL; struct btrfs_root_ref *root_ref; struct btrfs_inode_ref *inode_ref; struct btrfs_key key; @@ -1096,9 +1096,10 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, key.objectid = subvol_objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - fs_root = btrfs_read_fs_root_no_name(fs_info, &key); + fs_root = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(fs_root)) { ret = PTR_ERR(fs_root); + fs_root = NULL; goto err; } @@ -1143,6 +1144,8 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, ptr[0] = '/'; btrfs_release_path(path); } + btrfs_put_root(fs_root); + fs_root = NULL; } btrfs_free_path(path); @@ -1155,6 +1158,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, return name; err: + btrfs_put_root(fs_root); btrfs_free_path(path); kfree(name); return ERR_PTR(ret); @@ -1438,8 +1442,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, goto out; } } - subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), - subvol_objectid); + subvol_name = btrfs_get_subvol_name_from_objectid( + btrfs_sb(mnt->mnt_sb), subvol_objectid); if (IS_ERR(subvol_name)) { root = ERR_CAST(subvol_name); subvol_name = NULL; @@ -1518,14 +1522,17 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree, but we need - * it for searching for existing supers, so this lets us do that and - * then open_ctree will properly initialize everything later. + * then open_ctree will properly initialize the file system specific + * settings later. btrfs_init_fs_info initializes the static elements + * of the fs_info (locks and such) to make cleanup easier if we find a + * superblock with our given fs_devices later on at sget() time. */ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); if (!fs_info) { error = -ENOMEM; goto error_sec_opts; } + btrfs_init_fs_info(fs_info); fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); @@ -1571,7 +1578,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, if (s->s_root) { btrfs_close_devices(fs_devices); - free_fs_info(fs_info); + btrfs_free_fs_info(fs_info); if ((flags ^ s->s_flags) & SB_RDONLY) error = -EBUSY; } else { @@ -1594,7 +1601,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error_close_devices: btrfs_close_devices(fs_devices); error_fs_info: - free_fs_info(fs_info); + btrfs_free_fs_info(fs_info); error_sec_opts: security_free_mnt_opts(&new_sec_opts); return ERR_PTR(error); @@ -1834,6 +1841,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); ret = -EINVAL; goto restore; } @@ -2168,7 +2177,7 @@ static void btrfs_kill_super(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); kill_anon_super(sb); - free_fs_info(fs_info); + btrfs_free_fs_info(fs_info); } static struct file_system_type btrfs_fs_type = { @@ -2201,7 +2210,7 @@ static int btrfs_control_open(struct inode *inode, struct file *file) } /* - * used by btrfsctl to scan devices when no FS is mounted + * Used by /dev/btrfs-control for devices ioctls. */ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, unsigned long arg) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7436422194da..a39bff64ff24 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -155,7 +155,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj, } else val = can_modify_feature(fa); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } static ssize_t btrfs_feature_attr_store(struct kobject *kobj, @@ -295,7 +295,7 @@ static const struct attribute_group btrfs_feature_attr_group = { static ssize_t rmdir_subvol_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return snprintf(buf, PAGE_SIZE, "0\n"); + return scnprintf(buf, PAGE_SIZE, "0\n"); } BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); @@ -310,12 +310,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj, * This "trick" only works as long as 'enum btrfs_csum_type' has * no holes in it */ - ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", (i == 0 ? "" : " "), btrfs_super_csum_name(i)); } - ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); @@ -350,7 +350,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", atomic64_read(&fs_info->discard_ctl.discardable_bytes)); } BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); @@ -361,7 +361,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%d\n", + return scnprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&fs_info->discard_ctl.discardable_extents)); } BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); @@ -372,7 +372,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); @@ -383,7 +383,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); } BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); @@ -394,7 +394,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%lld\n", fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); @@ -405,7 +405,7 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", + return scnprintf(buf, PAGE_SIZE, "%u\n", READ_ONCE(fs_info->discard_ctl.iops_limit)); } @@ -435,7 +435,7 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", + return scnprintf(buf, PAGE_SIZE, "%u\n", READ_ONCE(fs_info->discard_ctl.kbps_limit)); } @@ -465,7 +465,7 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%llu\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", READ_ONCE(fs_info->discard_ctl.max_discard_size)); } @@ -530,7 +530,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) val = *value_ptr; if (lock) spin_unlock(lock); - return snprintf(buf, PAGE_SIZE, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } static ssize_t global_rsv_size_show(struct kobject *kobj, @@ -576,7 +576,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, val += block_group->used; } up_read(&sinfo->groups_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } static struct attribute *raid_attrs[] = { @@ -613,7 +613,7 @@ static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, { struct btrfs_space_info *sinfo = to_space_info(kobj); s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); - return snprintf(buf, PAGE_SIZE, "%lld\n", val); + return scnprintf(buf, PAGE_SIZE, "%lld\n", val); } SPACE_INFO_ATTR(flags); @@ -670,7 +670,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj, ssize_t ret; spin_lock(&fs_info->super_lock); - ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); spin_unlock(&fs_info->super_lock); return ret; @@ -718,7 +718,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); + return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -728,8 +728,8 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", - fs_info->super_copy->sectorsize); + return scnprintf(buf, PAGE_SIZE, "%u\n", + fs_info->super_copy->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -739,8 +739,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%u\n", - fs_info->super_copy->sectorsize); + return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -752,7 +751,7 @@ static ssize_t quota_override_show(struct kobject *kobj, int quota_override; quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); - return snprintf(buf, PAGE_SIZE, "%d\n", quota_override); + return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override); } static ssize_t quota_override_store(struct kobject *kobj, @@ -790,7 +789,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%pU\n", + return scnprintf(buf, PAGE_SIZE, "%pU\n", fs_info->fs_devices->metadata_uuid); } @@ -802,7 +801,7 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); - return snprintf(buf, PAGE_SIZE, "%s (%s)\n", + return scnprintf(buf, PAGE_SIZE, "%s (%s)\n", btrfs_super_csum_name(csum_type), crypto_shash_driver_name(fs_info->csum_shash)); } @@ -901,6 +900,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { + if (fs_devs->devinfo_kobj) { + kobject_del(fs_devs->devinfo_kobj); + kobject_put(fs_devs->devinfo_kobj); + fs_devs->devinfo_kobj = NULL; + } + if (fs_devs->devices_kobj) { kobject_del(fs_devs->devices_kobj); kobject_put(fs_devs->devices_kobj); @@ -954,7 +959,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); + btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -986,7 +991,7 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) continue; name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; - len += snprintf(str + len, bufsize - len, "%s%s", + len += scnprintf(str + len, bufsize - len, "%s%s", len ? "," : "", name); } @@ -1143,7 +1148,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, /* when one_device is NULL, it removes all device links */ -int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; @@ -1195,11 +1200,11 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); -static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj, +static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { int val; @@ -1208,9 +1213,9 @@ static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } -BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show); +BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, struct kobj_attribute *a, @@ -1222,7 +1227,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); @@ -1235,7 +1240,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - return snprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); @@ -1263,7 +1268,7 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { int error = 0; @@ -1289,7 +1294,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, init_completion(&dev->kobj_unregister); error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, - fs_devices->devices_kobj, "%llu", + fs_devices->devinfo_kobj, "%llu", dev->devid); if (error) { kobject_put(&dev->devid_kobj); @@ -1365,7 +1370,16 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) if (!fs_devs->devices_kobj) { btrfs_err(fs_devs->fs_info, "failed to init sysfs device interface"); - kobject_put(&fs_devs->fsid_kobj); + btrfs_sysfs_remove_fsid(fs_devs); + return -ENOMEM; + } + + fs_devs->devinfo_kobj = kobject_create_and_add("devinfo", + &fs_devs->fsid_kobj); + if (!fs_devs->devinfo_kobj) { + btrfs_err(fs_devs->fs_info, + "failed to init sysfs devinfo kobject"); + btrfs_sysfs_remove_fsid(fs_devs); return -ENOMEM; } @@ -1380,13 +1394,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) btrfs_set_fs_info_ptr(fs_info); - error = btrfs_sysfs_add_device_link(fs_devs, NULL); + error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); if (error) return error; error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_sysfs_rm_device_link(fs_devs, NULL); + btrfs_sysfs_remove_devices_dir(fs_devs, NULL); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c68582add92e..718a26c97833 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -14,9 +14,9 @@ enum btrfs_feature_set { char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char * const btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 84fb3fa940a6..999c14e5d0bd 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -120,6 +120,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) kfree(fs_info); return NULL; } + INIT_LIST_HEAD(&fs_info->fs_devices->devices); + fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), GFP_KERNEL); if (!fs_info->super_copy) { @@ -128,39 +130,10 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) return NULL; } + btrfs_init_fs_info(fs_info); + fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; - - if (init_srcu_struct(&fs_info->subvol_srcu)) { - kfree(fs_info->fs_devices); - kfree(fs_info->super_copy); - kfree(fs_info); - return NULL; - } - - spin_lock_init(&fs_info->buffer_lock); - spin_lock_init(&fs_info->qgroup_lock); - spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->fs_roots_radix_lock); - mutex_init(&fs_info->qgroup_ioctl_lock); - mutex_init(&fs_info->qgroup_rescan_lock); - rwlock_init(&fs_info->tree_mod_log_lock); - fs_info->running_transaction = NULL; - fs_info->qgroup_tree = RB_ROOT; - fs_info->qgroup_ulist = NULL; - atomic64_set(&fs_info->tree_mod_seq, 0); - INIT_LIST_HEAD(&fs_info->dirty_qgroups); - INIT_LIST_HEAD(&fs_info->dead_roots); - INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); - INIT_LIST_HEAD(&fs_info->fs_devices->devices); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - extent_io_tree_init(fs_info, &fs_info->freed_extents[0], - IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); - extent_io_tree_init(fs_info, &fs_info->freed_extents[1], - IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); - extent_map_tree_init(&fs_info->mapping_tree); - fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; @@ -210,8 +183,9 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) } btrfs_free_qgroup_config(fs_info); btrfs_free_fs_roots(fs_info); - cleanup_srcu_struct(&fs_info->subvol_srcu); kfree(fs_info->super_copy); + btrfs_check_leaked_roots(fs_info); + btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->fs_devices); kfree(fs_info); } @@ -223,11 +197,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; - if (root->node) { - /* One for allocate_extent_buffer */ - free_extent_buffer(root->node); - } - kfree(root); + btrfs_put_root(root); } struct btrfs_block_group * diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ac035a6fa003..ce1ca8e73c2d 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -507,6 +507,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) test_err("couldn't insert fs root %d", ret); goto out; } + btrfs_put_root(tmp_root); tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { @@ -521,6 +522,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) test_err("couldn't insert fs root %d", ret); goto out; } + btrfs_put_root(tmp_root); test_msg("running qgroup tests"); ret = test_no_shared_qgroup(root, sectorsize, nodesize); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 33dcc88b428a..8cede6eb9843 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -121,6 +121,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.href_root.rb_root)); + WARN_ON(!RB_EMPTY_ROOT( + &transaction->delayed_refs.dirty_extent_root)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", @@ -219,7 +221,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) WARN_ON_ONCE(!list_empty(&trans->new_bgs)); btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, - trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved, NULL); trans->chunk_bytes_reserved = 0; } @@ -241,7 +243,7 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { - if (cur_trans->aborted) { + if (TRANS_ABORTED(cur_trans)) { spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } @@ -334,6 +336,8 @@ loop: list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); + extent_io_tree_init(fs_info, &cur_trans->pinned_extents, + IO_TREE_FS_PINNED_EXTENTS, NULL); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -457,7 +461,7 @@ static inline int is_transaction_blocked(struct btrfs_transaction *trans) { return (trans->state >= TRANS_STATE_COMMIT_START && trans->state < TRANS_STATE_UNBLOCKED && - !trans->aborted); + !TRANS_ABORTED(trans)); } /* wait for commit against the current transaction to become unblocked @@ -476,7 +480,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) wait_event(fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || - cur_trans->aborted); + TRANS_ABORTED(cur_trans)); btrfs_put_transaction(cur_trans); } else { spin_unlock(&fs_info->trans_lock); @@ -671,7 +675,7 @@ join_fail: alloc_fail: if (num_bytes) btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, - num_bytes); + num_bytes, NULL); reserve_fail: btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved); return ERR_PTR(ret); @@ -894,7 +898,7 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved); + trans->bytes_reserved, NULL); trans->bytes_reserved = 0; } @@ -935,7 +939,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (throttle) btrfs_run_delayed_iputs(info); - if (trans->aborted || + if (TRANS_ABORTED(trans) || test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { wake_up_process(info->transaction_kthread); err = -EIO; @@ -1260,8 +1264,10 @@ void btrfs_add_dead_root(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); - if (list_empty(&root->root_list)) + if (list_empty(&root->root_list)) { + btrfs_grab_root(root); list_add_tail(&root->root_list, &fs_info->dead_roots); + } spin_unlock(&fs_info->trans_lock); } @@ -1475,7 +1481,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 index = 0; u64 objectid; u64 root_flags; - uuid_le new_uuid; ASSERT(pending->path); path = pending->path; @@ -1568,8 +1573,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_generation_v2(new_root_item, trans->transid); - uuid_le_gen(&new_uuid); - memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(new_root_item->uuid); memcpy(new_root_item->parent_uuid, root->root_item.uuid, BTRFS_UUID_SIZE); if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { @@ -1631,7 +1635,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_read_fs_root_no_name(fs_info, &key); + pending->snap = btrfs_get_fs_root(fs_info, &key, true); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); btrfs_abort_transaction(trans, ret); @@ -1680,7 +1684,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto fail; } - ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, + ret = btrfs_uuid_tree_add(trans, new_root_item->uuid, + BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1792,7 +1797,8 @@ static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info, struct btrfs_transaction *trans) { wait_event(fs_info->transaction_blocked_wait, - trans->state >= TRANS_STATE_COMMIT_START || trans->aborted); + trans->state >= TRANS_STATE_COMMIT_START || + TRANS_ABORTED(trans)); } /* @@ -1804,7 +1810,8 @@ static void wait_current_trans_commit_start_and_unblock( struct btrfs_transaction *trans) { wait_event(fs_info->transaction_wait, - trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted); + trans->state >= TRANS_STATE_UNBLOCKED || + TRANS_ABORTED(trans)); } /* @@ -2024,7 +2031,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trans->dirty = true; /* Stop the commit early if ->aborted is set */ - if (unlikely(READ_ONCE(cur_trans->aborted))) { + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; btrfs_end_transaction(trans); return ret; @@ -2098,7 +2105,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wait_for_commit(cur_trans); - if (unlikely(cur_trans->aborted)) + if (TRANS_ABORTED(cur_trans)) ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); @@ -2117,7 +2124,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->trans_lock); wait_for_commit(prev_trans); - ret = prev_trans->aborted; + ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); if (ret) @@ -2171,8 +2178,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); - /* ->aborted might be set after the previous check, so check it */ - if (unlikely(READ_ONCE(cur_trans->aborted))) { + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; goto scrub_continue; } @@ -2189,10 +2195,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * core function of the snapshot creation. */ ret = create_pending_snapshots(trans); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_reloc; /* * We insert the dir indexes of the snapshots and update the inode @@ -2205,16 +2209,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * the nodes and leaves. */ ret = btrfs_run_delayed_items(trans); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_reloc; ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_reloc; /* * make sure none of the code above managed to slip in a @@ -2240,11 +2240,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) mutex_lock(&fs_info->tree_log_mutex); ret = commit_fs_roots(trans); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_tree_log; /* * Since the transaction is done, we can apply the pending changes @@ -2262,39 +2259,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * new delayed refs. Must handle them or qgroup can be wrong. */ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_tree_log; /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. */ ret = btrfs_qgroup_account_extents(trans); - if (ret < 0) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret < 0) + goto unlock_tree_log; ret = commit_cowonly_roots(trans); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } + if (ret) + goto unlock_tree_log; /* * The tasks which save the space cache and inode cache may also * update ->aborted, check it. */ - if (unlikely(READ_ONCE(cur_trans->aborted))) { + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; - mutex_unlock(&fs_info->tree_log_mutex); - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; + goto unlock_tree_log; } btrfs_prepare_extent_commit(fs_info); @@ -2341,6 +2327,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) { btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction"); + /* + * reloc_mutex has been unlocked, tree_log_mutex is still held + * but we can't jump to unlock_tree_log causing double unlock + */ mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } @@ -2389,6 +2379,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; +unlock_tree_log: + mutex_unlock(&fs_info->tree_log_mutex); +unlock_reloc: + mutex_unlock(&fs_info->reloc_mutex); scrub_continue: btrfs_scrub_continue(fs_info); cleanup_transaction: @@ -2432,13 +2426,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); btrfs_kill_all_delayed_nodes(root); + if (root->ino_cache_inode) { + iput(root->ino_cache_inode); + root->ino_cache_inode = NULL; + } if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, NULL, 0, 0); + ret = btrfs_drop_snapshot(root, 0, 0); else - ret = btrfs_drop_snapshot(root, NULL, 1, 0); + ret = btrfs_drop_snapshot(root, 1, 0); + btrfs_put_root(root); return (ret < 0) ? 0 : 1; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 49f7196368f5..31ae8d273065 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -71,6 +71,7 @@ struct btrfs_transaction { */ struct list_head io_bgs; struct list_head dropped_roots; + struct extent_io_tree pinned_extents; /* * we need to make sure block group deletion doesn't race with @@ -115,6 +116,10 @@ struct btrfs_trans_handle { struct btrfs_block_rsv *orig_rsv; refcount_t use_count; unsigned int type; + /* + * Error code of transaction abort, set outside of locks and must use + * the READ_ONCE/WRITE_ONCE access + */ short aborted; bool adding_csums; bool allocating_chunk; @@ -126,6 +131,14 @@ struct btrfs_trans_handle { struct list_head new_bgs; }; +/* + * The abort status can be changed between calls and is not protected by locks. + * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's + * set to a non-zero value it does not change, so the macro should be in checks + * but is not necessary for further reads of the value. + */ +#define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted))) + struct btrfs_pending_snapshot { struct dentry *dentry; struct inode *dir; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7dd7552f53a4..58c111474ba5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,6 +18,8 @@ #include "compression.h" #include "qgroup.h" #include "inode-map.h" +#include "block-group.h" +#include "space-info.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -94,8 +96,8 @@ enum { static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, + u64 start, + u64 end, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -311,7 +313,7 @@ static int process_one_buffer(struct btrfs_root *log, } if (wc->pin) - ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start, + ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, eb->len); if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { @@ -830,6 +832,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, + extent_end - start); + if (ret) + goto out; + inode_add_bytes(inode, nbytes); update_inode: ret = btrfs_update_inode(trans, root, inode); @@ -2659,18 +2666,39 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, return ret; } +/* + * Correctly adjust the reserved bytes occupied by a log tree extent buffer + */ +static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) +{ + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "unable to find block group for %llu", start); + return; + } + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->reserved -= fs_info->nodesize; + cache->space_info->bytes_reserved -= fs_info->nodesize; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + btrfs_put_block_group(cache); +} + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 root_owner; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; struct extent_buffer *cur; - struct extent_buffer *parent; u32 blocksize; int ret = 0; @@ -2690,9 +2718,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); blocksize = fs_info->nodesize; - parent = path->nodes[*level]; - root_owner = btrfs_header_owner(parent); - next = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(next)) return PTR_ERR(next); @@ -2720,18 +2745,16 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + bytenr, blocksize); + if (ret) { + free_extent_buffer(next); + return ret; + } } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); - } - - WARN_ON(root_owner != - BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_pin_reserved_extent(fs_info, - bytenr, blocksize); - if (ret) { - free_extent_buffer(next); - return ret; + unaccount_log_buffer(fs_info, bytenr); } } free_extent_buffer(next); @@ -2762,7 +2785,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 root_owner; int i; int slot; int ret; @@ -2775,13 +2797,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level == 0); return 0; } else { - struct extent_buffer *parent; - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - - root_owner = btrfs_header_owner(parent); ret = wc->process_func(root, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level]), *level); @@ -2799,17 +2814,18 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + path->nodes[*level]->start, + path->nodes[*level]->len); + if (ret) + return ret; } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); - } - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_pin_reserved_extent(fs_info, - path->nodes[*level]->start, - path->nodes[*level]->len); - if (ret) - return ret; + unaccount_log_buffer(fs_info, + path->nodes[*level]->start); + } } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2880,15 +2896,15 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + next->start, next->len); + if (ret) + goto out; } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); + unaccount_log_buffer(fs_info, next->start); } - - ret = btrfs_pin_reserved_extent(fs_info, next->start, - next->len); - if (ret) - goto out; } } @@ -3283,8 +3299,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); - free_extent_buffer(log->node); - kfree(log); + btrfs_put_root(log); } /* @@ -4518,13 +4533,15 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, static int btrfs_log_holes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + const u64 start, + const u64 end) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; const u64 ino = btrfs_ino(inode); const u64 i_size = i_size_read(&inode->vfs_inode); - u64 prev_extent_end = 0; + u64 prev_extent_end = start; int ret; if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) @@ -4532,16 +4549,21 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; + key.offset = start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + while (true) { - struct btrfs_file_extent_item *extent; struct extent_buffer *leaf = path->nodes[0]; - u64 len; + u64 extent_end; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); @@ -4558,9 +4580,18 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) break; + extent_end = btrfs_file_extent_end(path); + if (extent_end <= start) + goto next_slot; + /* We have a hole, log it. */ if (prev_extent_end < key.offset) { - const u64 hole_len = key.offset - prev_extent_end; + u64 hole_len; + + if (key.offset >= end) + hole_len = end - prev_extent_end; + else + hole_len = key.offset - prev_extent_end; /* * Release the path to avoid deadlocks with other code @@ -4590,27 +4621,20 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; } - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(leaf, extent); - prev_extent_end = ALIGN(key.offset + len, - fs_info->sectorsize); - } else { - len = btrfs_file_extent_num_bytes(leaf, extent); - prev_extent_end = key.offset + len; - } - + prev_extent_end = min(extent_end, end); + if (extent_end >= end) + break; +next_slot: path->slots[0]++; cond_resched(); } - if (prev_extent_end < i_size) { + if (prev_extent_end < end && prev_extent_end < i_size) { u64 hole_len; btrfs_release_path(path); - hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); + hole_len = min(ALIGN(i_size, fs_info->sectorsize), end); + hole_len -= prev_extent_end; ret = btrfs_insert_file_extent(trans, root->log_root, ino, prev_extent_end, 0, 0, hole_len, 0, hole_len, @@ -4938,6 +4962,178 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, return ret; } +static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_key *min_key, + const struct btrfs_key *max_key, + struct btrfs_path *path, + struct btrfs_path *dst_path, + const u64 logged_isize, + const bool recursive_logging, + const int inode_only, + const u64 start, + const u64 end, + struct btrfs_log_ctx *ctx, + bool *need_log_inode_item) +{ + struct btrfs_root *root = inode->root; + int ins_start_slot = 0; + int ins_nr = 0; + int ret; + + /* + * We must make sure we don't copy extent items that are entirely out of + * the range [start, end - 1]. This is not just an optimization to avoid + * copying but also needed to avoid a corruption where we end up with + * file extent items in the log tree that have overlapping ranges - this + * can happen if we race with ordered extent completion for ranges that + * are outside our target range. For example we copy an extent item and + * when we move to the next leaf, that extent was trimmed and a new one + * covering a subrange of it, but with a higher key, was inserted - we + * would then copy this other extent too, resulting in a log tree with + * 2 extent items that represent overlapping ranges. + * + * We can copy the entire extents at the range bondaries however, even + * if they cover an area outside the target range. That's ok. + */ + while (1) { + ret = btrfs_search_forward(root, min_key, path, trans->transid); + if (ret < 0) + return ret; + if (ret > 0) { + ret = 0; + break; + } +again: + /* Note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key->objectid != max_key->objectid) + break; + if (min_key->type > max_key->type) + break; + + if (min_key->type == BTRFS_INODE_ITEM_KEY) + *need_log_inode_item = false; + + if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + inode->generation == trans->transid && + !recursive_logging) { + u64 other_ino = 0; + u64 other_parent = 0; + + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], min_key, inode, + &other_ino, &other_parent); + if (ret < 0) { + return ret; + } else if (ret > 0 && ctx && + other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { + if (ins_nr > 0) { + ins_nr++; + } else { + ins_nr = 1; + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, + inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + + ret = log_conflicting_inodes(trans, root, path, + ctx, other_ino, other_parent); + if (ret) + return ret; + btrfs_release_path(path); + goto next_key; + } + } + + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ + if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + goto next_slot; + } + + if (min_key->type == BTRFS_EXTENT_DATA_KEY) { + const u64 extent_end = btrfs_file_extent_end(path); + + if (extent_end <= start) { + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, + path, ins_start_slot, + ins_nr, inode_only, + logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + } + goto next_slot; + } + if (extent_end >= end) { + ins_nr++; + if (ins_nr == 1) + ins_start_slot = path->slots[0]; + break; + } + } + + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key_to_cpu(path->nodes[0], min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, inode_only, + logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + } + btrfs_release_path(path); +next_key: + if (min_key->offset < (u64)-1) { + min_key->offset++; + } else if (min_key->type < max_key->type) { + min_key->type++; + min_key->offset = 0; + } else { + break; + } + } + if (ins_nr) + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, + ins_nr, inode_only, logged_isize); + + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4955,8 +5151,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, + u64 start, + u64 end, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4967,9 +5163,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; int err = 0; int ret; - int nritems; - int ins_start_slot = 0; - int ins_nr; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; @@ -4987,6 +5180,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, return -ENOMEM; } + start = ALIGN_DOWN(start, fs_info->sectorsize); + end = ALIGN(end, fs_info->sectorsize); + min_key.objectid = ino; min_key.type = BTRFS_INODE_ITEM_KEY; min_key.offset = 0; @@ -5100,139 +5296,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, goto out_unlock; } - while (1) { - ins_nr = 0; - ret = btrfs_search_forward(root, &min_key, - path, trans->transid); - if (ret < 0) { - err = ret; - goto out_unlock; - } - if (ret != 0) - break; -again: - /* note, ins_nr might be > 0 here, cleanup outside the loop */ - if (min_key.objectid != ino) - break; - if (min_key.type > max_key.type) - break; - - if (min_key.type == BTRFS_INODE_ITEM_KEY) - need_log_inode_item = false; - - if ((min_key.type == BTRFS_INODE_REF_KEY || - min_key.type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { - u64 other_ino = 0; - u64 other_parent = 0; - - ret = btrfs_check_ref_name_override(path->nodes[0], - path->slots[0], &min_key, inode, - &other_ino, &other_parent); - if (ret < 0) { - err = ret; - goto out_unlock; - } else if (ret > 0 && ctx && - other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { - if (ins_nr > 0) { - ins_nr++; - } else { - ins_nr = 1; - ins_start_slot = path->slots[0]; - } - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - - err = log_conflicting_inodes(trans, root, path, - ctx, other_ino, other_parent); - if (err) - goto out_unlock; - btrfs_release_path(path); - goto next_key; - } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key.type == BTRFS_XATTR_ITEM_KEY) { - if (ins_nr == 0) - goto next_slot; - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - goto next_slot; - } - - if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { - ins_nr++; - goto next_slot; - } else if (!ins_nr) { - ins_start_slot = path->slots[0]; - ins_nr = 1; - goto next_slot; - } - - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 1; - ins_start_slot = path->slots[0]; -next_slot: - - nritems = btrfs_header_nritems(path->nodes[0]); - path->slots[0]++; - if (path->slots[0] < nritems) { - btrfs_item_key_to_cpu(path->nodes[0], &min_key, - path->slots[0]); - goto again; - } - if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - } - btrfs_release_path(path); -next_key: - if (min_key.offset < (u64)-1) { - min_key.offset++; - } else if (min_key.type < max_key.type) { - min_key.type++; - min_key.offset = 0; - } else { - break; - } - } - if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - } + err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, + path, dst_path, logged_isize, + recursive_logging, inode_only, + start, end, ctx, &need_log_inode_item); + if (err) + goto out_unlock; btrfs_release_path(path); btrfs_release_path(dst_path); @@ -5243,7 +5312,7 @@ next_key: if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, root, inode, path); + err = btrfs_log_holes(trans, root, inode, path, start, end); if (err) goto out_unlock; } @@ -6145,7 +6214,7 @@ again: if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_fs_root(log_root_tree, &found_key); + log = btrfs_read_tree_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); btrfs_handle_fs_error(fs_info, ret, @@ -6157,7 +6226,7 @@ again: tmp_key.type = BTRFS_ROOT_ITEM_KEY; tmp_key.offset = (u64)-1; - wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + wc.replay_dest = btrfs_get_fs_root(fs_info, &tmp_key, true); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); @@ -6173,12 +6242,10 @@ again: * each subsequent pass. */ if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(fs_info, + ret = btrfs_pin_extent_for_log_replay(trans, log->node->start, log->node->len); - free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); - kfree(log); + btrfs_put_root(log); if (!ret) goto next; @@ -6214,9 +6281,8 @@ again: } wc.replay_dest->log_root = NULL; - free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); - kfree(log); + btrfs_put_root(wc.replay_dest); + btrfs_put_root(log); if (ret) goto error; @@ -6247,10 +6313,9 @@ next: if (ret) return ret; - free_extent_buffer(log_root_tree->node); log_root_tree->log_root = NULL; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); - kfree(log_root_tree); + btrfs_put_root(log_root_tree); return 0; error: diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 76b84f2397b1..76671a6bcb61 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -246,9 +246,53 @@ out: return ret; } -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, - int (*check_func)(struct btrfs_fs_info *, u8 *, u8, - u64)) +/* + * Check if there's an matching subvolume for given UUID + * + * Return: + * 0 check succeeded, the entry is not outdated + * > 0 if the check failed, the caller should remove the entry + * < 0 if an error occurred + */ +static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, + u8 *uuid, u8 type, u64 subvolid) +{ + struct btrfs_key key; + int ret = 0; + struct btrfs_root *subvol_root; + + if (type != BTRFS_UUID_KEY_SUBVOL && + type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto out; + + key.objectid = subvolid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + subvol_root = btrfs_get_fs_root(fs_info, &key, true); + if (IS_ERR(subvol_root)) { + ret = PTR_ERR(subvol_root); + if (ret == -ENOENT) + ret = 1; + goto out; + } + + switch (type) { + case BTRFS_UUID_KEY_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) + ret = 1; + break; + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.received_uuid, + BTRFS_UUID_SIZE)) + ret = 1; + break; + } + btrfs_put_root(subvol_root); +out: + return ret; +} + +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->uuid_root; struct btrfs_key key; @@ -278,6 +322,10 @@ again_search_slot: } while (1) { + if (btrfs_fs_closing(fs_info)) { + ret = -EINTR; + goto out; + } cond_resched(); leaf = path->nodes[0]; slot = path->slots[0]; @@ -305,7 +353,8 @@ again_search_slot: read_extent_buffer(leaf, &subid_le, offset, sizeof(subid_le)); subid_cpu = le64_to_cpu(subid_le); - ret = check_func(fs_info, uuid, key.type, subid_cpu); + ret = btrfs_check_uuid_tree_entry(fs_info, uuid, + key.type, subid_cpu); if (ret < 0) goto out; if (ret > 0) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9cfc668f91f4..c1909e5f4506 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6,7 +6,6 @@ #include <linux/sched.h> #include <linux/bio.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/ratelimit.h> #include <linux/kthread.h> @@ -500,7 +499,7 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( static int btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, int flush, struct block_device **bdev, - struct buffer_head **bh) + struct btrfs_super_block **disk_super) { int ret; @@ -519,9 +518,9 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, goto error; } invalidate_bdev(*bdev); - *bh = btrfs_read_dev_super(*bdev); - if (IS_ERR(*bh)) { - ret = PTR_ERR(*bh); + *disk_super = btrfs_read_dev_super(*bdev); + if (IS_ERR(*disk_super)) { + ret = PTR_ERR(*disk_super); blkdev_put(*bdev, flags); goto error; } @@ -530,7 +529,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, error: *bdev = NULL; - *bh = NULL; return ret; } @@ -611,7 +609,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, { struct request_queue *q; struct block_device *bdev; - struct buffer_head *bh; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -622,17 +619,16 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &bh); + &bdev, &disk_super); if (ret) return ret; - disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); if (devid != device->devid) - goto error_brelse; + goto error_free_page; if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) - goto error_brelse; + goto error_free_page; device->generation = btrfs_super_generation(disk_super); @@ -641,7 +637,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { pr_err( "BTRFS: Invalid seeding and uuid-changed device detected\n"); - goto error_brelse; + goto error_free_page; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); @@ -667,12 +663,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, fs_devices->rw_devices++; list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); } - brelse(bh); + btrfs_release_disk_super(disk_super); return 0; -error_brelse: - brelse(bh); +error_free_page: + btrfs_release_disk_super(disk_super); blkdev_put(bdev, flags); return -EINVAL; @@ -1209,6 +1205,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->opened = 1; fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; + fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; out: return ret; } @@ -1247,9 +1244,10 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } -static void btrfs_release_disk_super(struct page *page) +void btrfs_release_disk_super(struct btrfs_super_block *super) { - kunmap(page); + struct page *page = virt_to_page(super); + put_page(page); } @@ -1277,17 +1275,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); - if (IS_ERR_OR_NULL(*page)) + if (IS_ERR(*page)) return 1; - p = kmap(*page); + p = page_address(*page); /* align our pointer to the offset of the super block */ *disk_super = p + offset_in_page(bytenr); if (btrfs_super_bytenr(*disk_super) != bytenr || btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { - btrfs_release_disk_super(*page); + btrfs_release_disk_super(p); return 1; } @@ -1350,7 +1348,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, btrfs_free_stale_devices(path, device); } - btrfs_release_disk_super(page); + btrfs_release_disk_super(disk_super); error_bdev_put: blkdev_put(bdev, flags); @@ -1383,6 +1381,59 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, return false; } +static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) +{ + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* + * We don't want to overwrite the superblock on the drive nor + * any area used by the boot loader (grub for example), so we + * make sure to start at an offset of at least 1MB. + */ + return max_t(u64, start, SZ_1M); + default: + BUG(); + } +} + +/** + * dev_extent_hole_check - check if specified hole is suitable for allocation + * @device: the device which we have the hole + * @hole_start: starting position of the hole + * @hole_size: the size of the hole + * @num_bytes: the size of the free space that we need + * + * This function may modify @hole_start and @hole_end to reflect the suitable + * position for allocation. Returns 1 if hole position is updated, 0 otherwise. + */ +static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, + u64 *hole_size, u64 num_bytes) +{ + bool changed = false; + u64 hole_end = *hole_start + *hole_size; + + /* + * Check before we set max_hole_start, otherwise we could end up + * sending back this offset anyway. + */ + if (contains_pending_extent(device, hole_start, *hole_size)) { + if (hole_end >= *hole_start) + *hole_size = hole_end - *hole_start; + else + *hole_size = 0; + changed = true; + } + + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* No extra check */ + break; + default: + BUG(); + } + + return changed; +} /* * find_free_dev_extent_start - find free space in the specified device @@ -1429,12 +1480,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device, int slot; struct extent_buffer *l; - /* - * We don't want to overwrite the superblock on the drive nor any area - * used by the boot loader (grub for example), so we make sure to start - * at an offset of at least 1MB. - */ - search_start = max_t(u64, search_start, SZ_1M); + search_start = dev_extent_search_start(device, search_start); path = btrfs_alloc_path(); if (!path) @@ -1492,18 +1538,8 @@ again: if (key.offset > search_start) { hole_size = key.offset - search_start; - - /* - * Have to check before we set max_hole_start, otherwise - * we could end up sending back this offset anyway. - */ - if (contains_pending_extent(device, &search_start, - hole_size)) { - if (key.offset >= search_start) - hole_size = key.offset - search_start; - else - hole_size = 0; - } + dev_extent_hole_check(device, &search_start, &hole_size, + num_bytes); if (hole_size > max_hole_size) { max_hole_start = search_start; @@ -1542,8 +1578,8 @@ next: */ if (search_end > search_start) { hole_size = search_end - search_start; - - if (contains_pending_extent(device, &search_start, hole_size)) { + if (dev_extent_hole_check(device, &search_start, &hole_size, + num_bytes)) { btrfs_release_path(path); goto again; } @@ -1949,6 +1985,46 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } +static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path) +{ + struct btrfs_super_block *disk_super; + int copy_num; + + if (!bdev) + return; + + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { + struct page *page; + int ret; + + disk_super = btrfs_read_dev_one_super(bdev, copy_num); + if (IS_ERR(disk_super)) + continue; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + + page = virt_to_page(disk_super); + set_page_dirty(page); + lock_page(page); + /* write_on_page() unlocks the page */ + ret = write_one_page(page); + if (ret) + btrfs_warn(fs_info, + "error clearing superblock number %d (%d)", + copy_num, ret); + btrfs_release_disk_super(disk_super); + + } + + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); +} + int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 devid) { @@ -2054,7 +2130,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_rm_device_link(fs_devices, device); + btrfs_sysfs_remove_devices_dir(fs_devices, device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2067,7 +2143,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * supers and free the device. */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) - btrfs_scratch_superblocks(device->bdev, device->name->str); + btrfs_scratch_superblocks(fs_info, device->bdev, + device->name->str); btrfs_close_bdev(device); synchronize_rcu(); @@ -2135,7 +2212,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); + btrfs_scratch_superblocks(fs_info, srcdev->bdev, + srcdev->name->str); } btrfs_close_bdev(srcdev); @@ -2174,7 +2252,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_rm_device_link(fs_devices, tgtdev); + btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2194,7 +2272,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) * is already out of device list, so we don't have to hold * the device_list_mutex lock. */ - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, + tgtdev->name->str); btrfs_close_bdev(tgtdev); synchronize_rcu(); @@ -2209,14 +2288,13 @@ static struct btrfs_device *btrfs_find_device_by_path( u64 devid; u8 *dev_uuid; struct block_device *bdev; - struct buffer_head *bh; struct btrfs_device *device; ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, - fs_info->bdev_holder, 0, &bdev, &bh); + fs_info->bdev_holder, 0, &bdev, &disk_super); if (ret) return ERR_PTR(ret); - disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; if (btrfs_fs_incompat(fs_info, METADATA_UUID)) @@ -2226,7 +2304,7 @@ static struct btrfs_device *btrfs_find_device_by_path( device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, disk_super->fsid, true); - brelse(bh); + btrfs_release_disk_super(disk_super); if (!device) device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); @@ -2522,7 +2600,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path orig_super_num_devices + 1); /* add sysfs device entry */ - btrfs_sysfs_add_device_link(fs_devices, device); + btrfs_sysfs_add_devices_dir(fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2590,7 +2668,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_rm_device_link(fs_devices, device); + btrfs_sysfs_remove_devices_dir(fs_devices, device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); @@ -3723,13 +3801,25 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info) atomic_read(&fs_info->balance_cancel_req) == 0); } -/* Non-zero return value signifies invalidity */ -static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, - u64 allowed) +/* + * Validate target profile against allowed profiles and return true if it's OK. + * Otherwise print the error message and return false. + */ +static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, + const struct btrfs_balance_args *bargs, + u64 allowed, const char *type) { - return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl_arg->target, 1) || - (bctl_arg->target & ~allowed))); + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return true; + + /* Profile is valid and does not have bits outside of the allowed set */ + if (alloc_profile_is_valid(bargs->target, 1) && + (bargs->target & ~allowed) == 0) + return true; + + btrfs_err(fs_info, "balance: invalid convert %s profile %s", + type, btrfs_bg_type_to_raid_name(bargs->target)); + return false; } /* @@ -3904,7 +3994,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || - atomic_read(&fs_info->balance_cancel_req)) { + btrfs_should_cancel_balance(fs_info)) { ret = -EINVAL; goto out; } @@ -3945,24 +4035,9 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (num_devices >= btrfs_raid_array[i].devs_min) allowed |= btrfs_raid_array[i].bg_flag; - if (validate_convert_profile(&bctl->data, allowed)) { - btrfs_err(fs_info, - "balance: invalid convert data profile %s", - btrfs_bg_type_to_raid_name(bctl->data.target)); - ret = -EINVAL; - goto out; - } - if (validate_convert_profile(&bctl->meta, allowed)) { - btrfs_err(fs_info, - "balance: invalid convert metadata profile %s", - btrfs_bg_type_to_raid_name(bctl->meta.target)); - ret = -EINVAL; - goto out; - } - if (validate_convert_profile(&bctl->sys, allowed)) { - btrfs_err(fs_info, - "balance: invalid convert system profile %s", - btrfs_bg_type_to_raid_name(bctl->sys.target)); + if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || + !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || + !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { ret = -EINVAL; goto out; } @@ -4274,7 +4349,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } -static int btrfs_uuid_scan_kthread(void *data) +int btrfs_uuid_scan_kthread(void *data) { struct btrfs_fs_info *fs_info = data; struct btrfs_root *root = fs_info->tree_root; @@ -4286,6 +4361,7 @@ static int btrfs_uuid_scan_kthread(void *data) struct btrfs_root_item root_item; u32 item_size; struct btrfs_trans_handle *trans = NULL; + bool closing = false; path = btrfs_alloc_path(); if (!path) { @@ -4298,6 +4374,10 @@ static int btrfs_uuid_scan_kthread(void *data) key.offset = 0; while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret) { @@ -4397,76 +4477,12 @@ out: btrfs_end_transaction(trans); if (ret) btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); - else + else if (!closing) set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); up(&fs_info->uuid_tree_rescan_sem); return 0; } -/* - * Callback for btrfs_uuid_tree_iterate(). - * returns: - * 0 check succeeded, the entry is not outdated. - * < 0 if an error occurred. - * > 0 if the check failed, which means the caller shall remove the entry. - */ -static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, - u8 *uuid, u8 type, u64 subid) -{ - struct btrfs_key key; - int ret = 0; - struct btrfs_root *subvol_root; - - if (type != BTRFS_UUID_KEY_SUBVOL && - type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) - goto out; - - key.objectid = subid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(subvol_root)) { - ret = PTR_ERR(subvol_root); - if (ret == -ENOENT) - ret = 1; - goto out; - } - - switch (type) { - case BTRFS_UUID_KEY_SUBVOL: - if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) - ret = 1; - break; - case BTRFS_UUID_KEY_RECEIVED_SUBVOL: - if (memcmp(uuid, subvol_root->root_item.received_uuid, - BTRFS_UUID_SIZE)) - ret = 1; - break; - } - -out: - return ret; -} - -static int btrfs_uuid_rescan_kthread(void *data) -{ - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; - int ret; - - /* - * 1st step is to iterate through the existing UUID tree and - * to delete all entries that contain outdated data. - * 2nd step is to add all missing entries to the UUID tree. - */ - ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); - if (ret < 0) { - btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); - up(&fs_info->uuid_tree_rescan_sem); - return ret; - } - return btrfs_uuid_scan_kthread(data); -} - int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; @@ -4509,22 +4525,6 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) -{ - struct task_struct *task; - - down(&fs_info->uuid_tree_rescan_sem); - task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); - if (IS_ERR(task)) { - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_rescan task"); - up(&fs_info->uuid_tree_rescan_sem); - return PTR_ERR(task); - } - - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -4777,96 +4777,111 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID1C34); } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - u64 start, u64 type) -{ - struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct btrfs_device *device; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct btrfs_device_info *devices_info = NULL; - u64 total_avail; - int num_stripes; /* total number of stripes to allocate */ - int data_stripes; /* number of stripes that count for - block group size */ - int sub_stripes; /* sub_stripes info for map */ - int dev_stripes; /* stripes per dev */ - int devs_max; /* max devs to use */ - int devs_min; /* min devs needed */ - int devs_increment; /* ndevs has to be a multiple of this */ - int ncopies; /* how many copies to data has */ - int nparity; /* number of stripes worth of bytes to - store parity information */ - int ret; +/* + * Structure used internally for __btrfs_alloc_chunk() function. + * Wraps needed parameters. + */ +struct alloc_chunk_ctl { + u64 start; + u64 type; + /* Total number of stripes to allocate */ + int num_stripes; + /* sub_stripes info for map */ + int sub_stripes; + /* Stripes per device */ + int dev_stripes; + /* Maximum number of devices to use */ + int devs_max; + /* Minimum number of devices to use */ + int devs_min; + /* ndevs has to be a multiple of this */ + int devs_increment; + /* Number of copies */ + int ncopies; + /* Number of stripes worth of bytes to store parity information */ + int nparity; u64 max_stripe_size; u64 max_chunk_size; + u64 dev_extent_min; u64 stripe_size; u64 chunk_size; int ndevs; - int i; - int j; - int index; - - BUG_ON(!alloc_profile_is_valid(type, 0)); - - if (list_empty(&fs_devices->alloc_list)) { - if (btrfs_test_opt(info, ENOSPC_DEBUG)) - btrfs_debug(info, "%s: no writable device", __func__); - return -ENOSPC; - } - - index = btrfs_bg_flags_to_raid_index(type); +}; - sub_stripes = btrfs_raid_array[index].sub_stripes; - dev_stripes = btrfs_raid_array[index].dev_stripes; - devs_max = btrfs_raid_array[index].devs_max; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); - devs_min = btrfs_raid_array[index].devs_min; - devs_increment = btrfs_raid_array[index].devs_increment; - ncopies = btrfs_raid_array[index].ncopies; - nparity = btrfs_raid_array[index].nparity; +static void init_alloc_chunk_ctl_policy_regular( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + u64 type = ctl->type; if (type & BTRFS_BLOCK_GROUP_DATA) { - max_stripe_size = SZ_1G; - max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; + ctl->max_stripe_size = SZ_1G; + ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - /* for larger filesystems, use larger metadata chunks */ + /* For larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) - max_stripe_size = SZ_1G; + ctl->max_stripe_size = SZ_1G; else - max_stripe_size = SZ_256M; - max_chunk_size = max_stripe_size; + ctl->max_stripe_size = SZ_256M; + ctl->max_chunk_size = ctl->max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = SZ_32M; - max_chunk_size = 2 * max_stripe_size; - devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); + ctl->max_stripe_size = SZ_32M; + ctl->max_chunk_size = 2 * ctl->max_stripe_size; + ctl->devs_max = min_t(int, ctl->devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); } else { - btrfs_err(info, "invalid chunk type 0x%llx requested", - type); BUG(); } /* We don't want a chunk larger than 10% of writable space */ - max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), - max_chunk_size); + ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + ctl->max_chunk_size); + ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; +} + +static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + int index = btrfs_bg_flags_to_raid_index(ctl->type); + + ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; + ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; + ctl->devs_max = btrfs_raid_array[index].devs_max; + if (!ctl->devs_max) + ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); + ctl->devs_min = btrfs_raid_array[index].devs_min; + ctl->devs_increment = btrfs_raid_array[index].devs_increment; + ctl->ncopies = btrfs_raid_array[index].ncopies; + ctl->nparity = btrfs_raid_array[index].nparity; + ctl->ndevs = 0; + + switch (fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); + break; + default: + BUG(); + } +} - devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), - GFP_NOFS); - if (!devices_info) - return -ENOMEM; +static int gather_device_info(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = fs_devices->fs_info; + struct btrfs_device *device; + u64 total_avail; + u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; + int ret; + int ndevs = 0; + u64 max_avail; + u64 dev_offset; /* * in the first pass through the devices list, we gather information * about the available holes on each device. */ - ndevs = 0; list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 max_avail; - u64 dev_offset; - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { WARN(1, KERN_ERR "BTRFS: read-only device in alloc_list\n"); @@ -4884,24 +4899,23 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, total_avail = 0; /* If there is no space on this device, skip it. */ - if (total_avail == 0) + if (total_avail < ctl->dev_extent_min) continue; - ret = find_free_dev_extent(device, - max_stripe_size * dev_stripes, - &dev_offset, &max_avail); + ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, + &max_avail); if (ret && ret != -ENOSPC) - goto error; + return ret; if (ret == 0) - max_avail = max_stripe_size * dev_stripes; + max_avail = dev_extent_want; - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { + if (max_avail < ctl->dev_extent_min) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) btrfs_debug(info, - "%s: devid %llu has no free space, have=%llu want=%u", + "%s: devid %llu has no free space, have=%llu want=%llu", __func__, device->devid, max_avail, - BTRFS_STRIPE_LEN * dev_stripes); + ctl->dev_extent_min); continue; } @@ -4916,6 +4930,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devices_info[ndevs].dev = device; ++ndevs; } + ctl->ndevs = ndevs; /* * now sort the devices by hole size / available space @@ -4923,23 +4938,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); - /* - * Round down to number of usable stripes, devs_increment can be any - * number so we can't use round_down() - */ - ndevs -= ndevs % devs_increment; - - if (ndevs < devs_min) { - ret = -ENOSPC; - if (btrfs_test_opt(info, ENOSPC_DEBUG)) { - btrfs_debug(info, - "%s: not enough devices with free space: have=%d minimum required=%d", - __func__, ndevs, devs_min); - } - goto error; - } + return 0; +} - ndevs = min(ndevs, devs_max); +static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + /* Number of stripes that count for block group size */ + int data_stripes; /* * The primary goal is to maximize the number of stripes, so use as @@ -4948,73 +4954,116 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * The DUP profile stores more than one stripe per device, the * max_avail is the total size so we have to adjust. */ - stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); - num_stripes = ndevs * dev_stripes; + ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, + ctl->dev_stripes); + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; - /* - * this will have to be fixed for RAID1 and RAID10 over - * more drives - */ - data_stripes = (num_stripes - nparity) / ncopies; + /* This will have to be fixed for RAID1 and RAID10 over more drives */ + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; /* - * Use the number of data stripes to figure out how big this chunk - * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size. If it's higher, - * we try to reduce stripe_size. + * Use the number of data stripes to figure out how big this chunk is + * really going to be in terms of logical address space, and compare + * that answer with the max chunk size. If it's higher, we try to + * reduce stripe_size. */ - if (stripe_size * data_stripes > max_chunk_size) { + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { /* * Reduce stripe_size, round it up to a 16MB boundary again and * then use it, unless it ends up being even bigger than the * previous value we had already. */ - stripe_size = min(round_up(div_u64(max_chunk_size, - data_stripes), SZ_16M), - stripe_size); + ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, + data_stripes), SZ_16M), + ctl->stripe_size); } - /* align to BTRFS_STRIPE_LEN */ - stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); + /* Align to BTRFS_STRIPE_LEN */ + ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); + ctl->chunk_size = ctl->stripe_size * data_stripes; - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) { - ret = -ENOMEM; - goto error; + return 0; +} + +static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = fs_devices->fs_info; + + /* + * Round down to number of usable stripes, devs_increment can be any + * number so we can't use round_down() that requires power of 2, while + * rounddown is safe. + */ + ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); + + if (ctl->ndevs < ctl->devs_min) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) { + btrfs_debug(info, + "%s: not enough devices with free space: have=%d minimum required=%d", + __func__, ctl->ndevs, ctl->devs_min); + } + return -ENOSPC; } - map->num_stripes = num_stripes; - for (i = 0; i < ndevs; ++i) { - for (j = 0; j < dev_stripes; ++j) { - int s = i * dev_stripes + j; + ctl->ndevs = min(ctl->ndevs, ctl->devs_max); + + switch (fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + return decide_stripe_size_regular(ctl, devices_info); + default: + BUG(); + } +} + +static int create_chunk(struct btrfs_trans_handle *trans, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + u64 start = ctl->start; + u64 type = ctl->type; + int ret; + int i; + int j; + + map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); + if (!map) + return -ENOMEM; + map->num_stripes = ctl->num_stripes; + + for (i = 0; i < ctl->ndevs; ++i) { + for (j = 0; j < ctl->dev_stripes; ++j) { + int s = i * ctl->dev_stripes + j; map->stripes[s].dev = devices_info[i].dev; map->stripes[s].physical = devices_info[i].dev_offset + - j * stripe_size; + j * ctl->stripe_size; } } map->stripe_len = BTRFS_STRIPE_LEN; map->io_align = BTRFS_STRIPE_LEN; map->io_width = BTRFS_STRIPE_LEN; map->type = type; - map->sub_stripes = sub_stripes; - - chunk_size = stripe_size * data_stripes; + map->sub_stripes = ctl->sub_stripes; - trace_btrfs_chunk_alloc(info, map, start, chunk_size); + trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); em = alloc_extent_map(); if (!em) { kfree(map); - ret = -ENOMEM; - goto error; + return -ENOMEM; } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->map_lookup = map; em->start = start; - em->len = chunk_size; + em->len = ctl->chunk_size; em->block_start = 0; em->block_len = em->len; - em->orig_block_len = stripe_size; + em->orig_block_len = ctl->stripe_size; em_tree = &info->mapping_tree; write_lock(&em_tree->lock); @@ -5022,30 +5071,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (ret) { write_unlock(&em_tree->lock); free_extent_map(em); - goto error; + return ret; } write_unlock(&em_tree->lock); - ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); + ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); if (ret) goto error_del_extent; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; - btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size); + btrfs_device_set_bytes_used(dev, + dev->bytes_used + ctl->stripe_size); if (list_empty(&dev->post_commit_list)) list_add_tail(&dev->post_commit_list, &trans->transaction->dev_update_list); } - atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); + atomic64_sub(ctl->stripe_size * map->num_stripes, + &info->free_chunk_space); free_extent_map(em); check_raid56_incompat_flag(info, type); check_raid1c34_incompat_flag(info, type); - kfree(devices_info); return 0; error_del_extent: @@ -5057,11 +5107,68 @@ error_del_extent: free_extent_map(em); /* One for the tree reference */ free_extent_map(em); -error: + + return ret; +} + +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct btrfs_device_info *devices_info = NULL; + struct alloc_chunk_ctl ctl; + int ret; + + lockdep_assert_held(&info->chunk_mutex); + + if (!alloc_profile_is_valid(type, 0)) { + ASSERT(0); + return -EINVAL; + } + + if (list_empty(&fs_devices->alloc_list)) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, "%s: no writable device", __func__); + return -ENOSPC; + } + + if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(info, "invalid chunk type 0x%llx requested", type); + ASSERT(0); + return -EINVAL; + } + + ctl.start = find_next_chunk(info); + ctl.type = type; + init_alloc_chunk_ctl(fs_devices, &ctl); + + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + ret = gather_device_info(fs_devices, &ctl, devices_info); + if (ret < 0) + goto out; + + ret = decide_stripe_size(fs_devices, &ctl, devices_info); + if (ret < 0) + goto out; + + ret = create_chunk(trans, &ctl, devices_info); + +out: kfree(devices_info); return ret; } +/* + * Chunk allocation falls into two parts. The first part does work + * that makes the new allocated chunk usable, but does not do any operation + * that modifies the chunk tree. The second part does the work that + * requires modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 chunk_size) { @@ -5160,39 +5267,19 @@ out: return ret; } -/* - * Chunk allocation falls into two parts. The first part does work - * that makes the new allocated chunk usable, but does not do any operation - * that modifies the chunk tree. The second part does the work that - * requires modifying the chunk tree. This division is important for the - * bootstrap process of adding storage to a seed btrfs. - */ -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) -{ - u64 chunk_offset; - - lockdep_assert_held(&trans->fs_info->chunk_mutex); - chunk_offset = find_next_chunk(trans->fs_info); - return __btrfs_alloc_chunk(trans, chunk_offset, type); -} - static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - u64 chunk_offset; - u64 sys_chunk_offset; u64 alloc_profile; int ret; - chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_metadata_alloc_profile(fs_info); - ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); + ret = btrfs_alloc_chunk(trans, alloc_profile); if (ret) return ret; - sys_chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_system_alloc_profile(fs_info); - ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); + ret = btrfs_alloc_chunk(trans, alloc_profile); return ret; } @@ -5389,31 +5476,19 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -static inline int parity_smaller(u64 a, u64 b) -{ - return a > b; -} - /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) { - struct btrfs_bio_stripe s; int i; - u64 l; int again = 1; while (again) { again = 0; for (i = 0; i < num_stripes - 1; i++) { - if (parity_smaller(bbio->raid_map[i], - bbio->raid_map[i+1])) { - s = bbio->stripes[i]; - l = bbio->raid_map[i]; - bbio->stripes[i] = bbio->stripes[i+1]; - bbio->raid_map[i] = bbio->raid_map[i+1]; - bbio->stripes[i+1] = s; - bbio->raid_map[i+1] = l; - + /* Swap if parity is on a smaller index */ + if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { + swap(bbio->stripes[i], bbio->stripes[i + 1]); + swap(bbio->raid_map[i], bbio->raid_map[i + 1]); again = 1; } } @@ -5914,10 +5989,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, struct btrfs_io_geometry geom; ASSERT(bbio_ret); - - if (op == BTRFS_MAP_DISCARD) - return __btrfs_map_block_for_discard(fs_info, logical, - length, bbio_ret); + ASSERT(op != BTRFS_MAP_DISCARD); ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); if (ret < 0) @@ -6147,6 +6219,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { + if (op == BTRFS_MAP_DISCARD) + return __btrfs_map_block_for_discard(fs_info, logical, + length, bbio_ret); + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, 0); } @@ -6241,8 +6317,8 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfs_debug_in_rcu(fs_info, "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, - (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, - bio->bi_iter.bi_size); + (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), + dev->devid, bio->bi_iter.bi_size); bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); @@ -7317,36 +7393,6 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, return 0; } -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) -{ - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - int copy_num; - - if (!bdev) - return; - - for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; - copy_num++) { - - if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) - continue; - - disk_super = (struct btrfs_super_block *)bh->b_data; - - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); - } - - /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - - /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); -} - /* * Update the size and bytes used for each device where it changed. This is * delayed since we would otherwise get errors while writing out the diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 409f4816fb89..f067b5934c46 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,8 +17,6 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K -struct buffer_head; - struct btrfs_io_geometry { /* remaining bytes before crossing a stripe */ u64 len; @@ -209,6 +207,10 @@ BTRFS_DEVICE_GETSET_FUNCS(total_bytes); BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes); BTRFS_DEVICE_GETSET_FUNCS(bytes_used); +enum btrfs_chunk_allocation_policy { + BTRFS_CHUNK_ALLOC_REGULAR, +}; + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ u8 metadata_uuid[BTRFS_FSID_SIZE]; @@ -258,7 +260,10 @@ struct btrfs_fs_devices { /* sysfs kobjects */ struct kobject fsid_kobj; struct kobject *devices_kobj; + struct kobject *devinfo_kobj; struct completion kobj_unregister; + + enum btrfs_chunk_allocation_policy chunk_alloc_policy; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -460,7 +465,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data); int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); @@ -473,7 +478,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, @@ -483,6 +487,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +void btrfs_release_disk_super(struct btrfs_super_block *super); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) |