Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- | fs/btrfs/extent-tree.c | 653
1 files changed, 244 insertions, 409 deletions
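Not part of the patch itself: the central change in this diff is that the extent reference paths stop taking long (bytenr, num_bytes, parent, root_objectid, owner, offset) argument lists and instead pass a struct btrfs_ref filled in by the btrfs_init_generic_ref()/btrfs_init_data_ref()/btrfs_init_tree_ref() helpers visible in the hunks below. What follows is only an illustrative sketch of the resulting caller pattern for a data extent; the wrapper function and its parameters are hypothetical, while the helper and field names are the ones used in the diff.

/* Hypothetical caller; helper names match the converted call sites below. */
static int example_add_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, u64 bytenr,
				u64 num_bytes, u64 parent, u64 ref_root,
				u64 ino, u64 file_offset)
{
	struct btrfs_ref ref = { 0 };

	/* Generic part: action, disk bytenr, length, parent (0 if not shared) */
	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
			       num_bytes, parent);
	/* Data-specific part: owning root, inode number, file offset */
	btrfs_init_data_ref(&ref, ref_root, ino, file_offset);
	ref.real_root = root->root_key.objectid;

	/* Replaces the old eight-argument btrfs_inc_extent_ref() call */
	return btrfs_inc_extent_ref(trans, &ref);
}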
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1d49694e6ae3..f79e477a378e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -643,7 +643,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, if (btrfs_test_opt(fs_info, SPACE_CACHE)) { mutex_lock(&caching_ctl->mutex); - ret = load_free_space_cache(fs_info, cache); + ret = load_free_space_cache(cache); spin_lock(&cache->lock); if (ret == 1) { @@ -756,14 +756,15 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, return NULL; } -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, - bool metadata, u64 root_objectid) +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_ref *ref) { struct btrfs_space_info *space_info; + s64 num_bytes = -ref->len; u64 flags; - if (metadata) { - if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + if (ref->type == BTRFS_REF_METADATA) { + if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) flags = BTRFS_BLOCK_GROUP_SYSTEM; else flags = BTRFS_BLOCK_GROUP_METADATA; @@ -1704,7 +1705,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - btrfs_extend_item(fs_info, path, size); + btrfs_extend_item(path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1779,7 +1780,6 @@ void update_inline_extent_backref(struct btrfs_path *path, int *last_ref) { struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; @@ -1834,7 +1834,7 @@ void update_inline_extent_backref(struct btrfs_path *path, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - btrfs_truncate_item(fs_info, path, item_size, 1); + btrfs_truncate_item(path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); } @@ -1905,7 +1905,6 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 *discarded_bytes) { @@ -2043,39 +2042,28 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) + struct btrfs_ref *generic_ref) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod, new_ref_mod; int ret; - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && - root_objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && + generic_ref->action); + BUG_ON(generic_ref->type == BTRFS_REF_METADATA && + generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); - btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid, - owner, offset, BTRFS_ADD_DELAYED_REF); - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, - num_bytes, parent, - root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, - &old_ref_mod, &new_ref_mod); - } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, - num_bytes, parent, - root_objectid, owner, offset, - 0, BTRFS_ADD_DELAYED_REF, + if (generic_ref->type == BTRFS_REF_METADATA) 
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref, + NULL, &old_ref_mod, &new_ref_mod); + else + ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0, &old_ref_mod, &new_ref_mod); - } - if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) { - bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + btrfs_ref_tree_mod(fs_info, generic_ref); - add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid); - } + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) + add_pinned_bytes(fs_info, generic_ref); return ret; } @@ -2877,97 +2865,6 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) return btrfs_check_space_for_delayed_refs(trans->fs_info); } -struct async_delayed_refs { - struct btrfs_root *root; - u64 transid; - int count; - int error; - int sync; - struct completion wait; - struct btrfs_work work; -}; - -static inline struct async_delayed_refs * -to_async_delayed_refs(struct btrfs_work *work) -{ - return container_of(work, struct async_delayed_refs, work); -} - -static void delayed_ref_async_start(struct btrfs_work *work) -{ - struct async_delayed_refs *async = to_async_delayed_refs(work); - struct btrfs_trans_handle *trans; - struct btrfs_fs_info *fs_info = async->root->fs_info; - int ret; - - /* if the commit is already started, we don't need to wait here */ - if (btrfs_transaction_blocked(fs_info)) - goto done; - - trans = btrfs_join_transaction(async->root); - if (IS_ERR(trans)) { - async->error = PTR_ERR(trans); - goto done; - } - - /* - * trans->sync means that when we call end_transaction, we won't - * wait on delayed refs - */ - trans->sync = true; - - /* Don't bother flushing if we got into a different transaction */ - if (trans->transid > async->transid) - goto end; - - ret = btrfs_run_delayed_refs(trans, async->count); - if (ret) - async->error = ret; -end: - ret = btrfs_end_transaction(trans); - if (ret && !async->error) - async->error = ret; -done: - if (async->sync) - complete(&async->wait); - else - kfree(async); -} - -int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, - unsigned long count, u64 transid, int wait) -{ - struct async_delayed_refs *async; - int ret; - - async = kmalloc(sizeof(*async), GFP_NOFS); - if (!async) - return -ENOMEM; - - async->root = fs_info->tree_root; - async->count = count; - async->error = 0; - async->transid = transid; - if (wait) - async->sync = 1; - else - async->sync = 0; - init_completion(&async->wait); - - btrfs_init_work(&async->work, btrfs_extent_refs_helper, - delayed_ref_async_start, NULL, NULL); - - btrfs_queue_work(fs_info->extent_workers, &async->work); - - if (wait) { - wait_for_completion(&async->wait); - ret = async->error; - kfree(async); - return ret; - } - return 0; -} - /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -3036,7 +2933,6 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data) { @@ -3053,8 +2949,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->is_data = is_data ? 
true : false; extent_op->level = level; - ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr, - num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; @@ -3246,13 +3141,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, u32 nritems; struct btrfs_key key; struct btrfs_file_extent_item *fi; + struct btrfs_ref generic_ref = { 0 }; + bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); int i; + int action; int level; int ret = 0; - int (*process_func)(struct btrfs_trans_handle *, - struct btrfs_root *, - u64, u64, u64, u64, u64, u64); - if (btrfs_is_testing(fs_info)) return 0; @@ -3264,15 +3158,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) return 0; - if (inc) - process_func = btrfs_inc_extent_ref; - else - process_func = btrfs_free_extent; - if (full_backref) parent = buf->start; else parent = 0; + if (inc) + action = BTRFS_ADD_DELAYED_REF; + else + action = BTRFS_DROP_DELAYED_REF; for (i = 0; i < nritems; i++) { if (level == 0) { @@ -3290,16 +3183,30 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); key.offset -= btrfs_file_extent_offset(buf, fi); - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, key.objectid, - key.offset); + btrfs_init_generic_ref(&generic_ref, action, bytenr, + num_bytes, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, + key.offset); + generic_ref.skip_qgroup = for_reloc; + if (inc) + ret = btrfs_inc_extent_ref(trans, &generic_ref); + else + ret = btrfs_free_extent(trans, &generic_ref); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = fs_info->nodesize; - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0); + btrfs_init_generic_ref(&generic_ref, action, bytenr, + num_bytes, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_tree_ref(&generic_ref, level - 1, ref_root); + generic_ref.skip_qgroup = for_reloc; + if (inc) + ret = btrfs_inc_extent_ref(trans, &generic_ref); + else + ret = btrfs_free_extent(trans, &generic_ref); if (ret) goto fail; } @@ -3322,10 +3229,10 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, } static int write_one_cache_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *extent_root = fs_info->extent_root; unsigned long bi; @@ -3348,10 +3255,10 @@ fail: } -static struct btrfs_block_group_cache * -next_block_group(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache) +static struct btrfs_block_group_cache *next_block_group( + struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = cache->fs_info; struct rb_node *node; spin_lock(&fs_info->block_group_cache_lock); @@ -3404,7 +3311,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, if (trans->aborted) return 0; again: - inode = lookup_free_space_inode(fs_info, block_group, path); + inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); btrfs_release_path(path); @@ -3418,8 +3325,7 @@ again: if (block_group->ro) goto 
out_free; - ret = create_free_space_inode(fs_info, trans, block_group, - path); + ret = create_free_space_inode(trans, block_group, path); if (ret) goto out_free; goto again; @@ -3538,9 +3444,9 @@ out: return ret; } -int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *cache, *tmp; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_path *path; @@ -3652,8 +3558,7 @@ again: if (cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL; - ret = btrfs_write_out_cache(fs_info, trans, - cache, path); + ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { num_started++; should_put = 0; @@ -3673,8 +3578,7 @@ again: } } if (!ret) { - ret = write_one_cache_group(trans, fs_info, - path, cache); + ret = write_one_cache_group(trans, path, cache); /* * Our block group might still be attached to the list * of new block groups in the transaction handle of some @@ -3744,9 +3648,9 @@ again: return ret; } -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; @@ -3809,8 +3713,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL; - ret = btrfs_write_out_cache(fs_info, trans, - cache, path); + ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { num_started++; should_put = 0; @@ -3824,8 +3727,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, } } if (!ret) { - ret = write_one_cache_group(trans, fs_info, - path, cache); + ret = write_one_cache_group(trans, path, cache); /* * One of the free space endio workers might have * created a new block group while updating a free space @@ -3842,8 +3744,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, if (ret == -ENOENT) { wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); - ret = write_one_cache_group(trans, fs_info, - path, cache); + ret = write_one_cache_group(trans, path, cache); } if (ret) btrfs_abort_transaction(trans, ret); @@ -4732,6 +4633,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; + u64 dio_bytes; u64 async_pages; u64 items; long time_left; @@ -4747,7 +4649,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); - if (delalloc_bytes == 0) { + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + if (delalloc_bytes == 0 && dio_bytes == 0) { if (trans) return; if (wait_ordered) @@ -4755,8 +4658,16 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, return; } + /* + * If we are doing more ordered than delalloc we need to just wait on + * ordered extents, otherwise we'll waste time trying to flush delalloc + * that likely won't give us the space back we need. 
+ */ + if (dio_bytes > delalloc_bytes) + wait_ordered = true; + loops = 0; - while (delalloc_bytes && loops < 3) { + while ((delalloc_bytes || dio_bytes) && loops < 3) { nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; /* @@ -4806,6 +4717,7 @@ skip_async: } delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); } } @@ -5803,85 +5715,6 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv, - u64 *metadata_bytes, u64 *qgroup_bytes) -{ - *metadata_bytes = 0; - *qgroup_bytes = 0; - - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) - *metadata_bytes = block_rsv->size - block_rsv->reserved; - if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) - *qgroup_bytes = block_rsv->qgroup_rsv_size - - block_rsv->qgroup_rsv_reserved; - spin_unlock(&block_rsv->lock); -} - -/** - * btrfs_inode_rsv_refill - refill the inode block rsv. - * @inode - the inode we are refilling. - * @flush - the flushing restriction. - * - * Essentially the same as btrfs_block_rsv_refill, except it uses the - * block_rsv->size as the minimum size. We'll either refill the missing amount - * or return if we already have enough space. This will also handle the reserve - * tracepoint for the reserved amount. - */ -static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_root *root = inode->root; - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 num_bytes, last = 0; - u64 qgroup_num_bytes; - int ret = -ENOSPC; - - calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes); - if (num_bytes == 0) - return 0; - - do { - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, - true); - if (ret) - return ret; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (ret) { - btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); - last = num_bytes; - /* - * If we are fragmented we can end up with a lot of - * outstanding extents which will make our size be much - * larger than our reserved amount. - * - * If the reservation happens here, it might be very - * big though not needed in the end, if the delalloc - * flushing happens. - * - * If this is the case try and do the reserve again. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL) - calc_refill_bytes(block_rsv, &num_bytes, - &qgroup_num_bytes); - if (num_bytes == 0) - return 0; - } - } while (ret && last != num_bytes); - - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, false); - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 1); - - /* Don't forget to increase qgroup_rsv_reserved */ - spin_lock(&block_rsv->lock); - block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; - spin_unlock(&block_rsv->lock); - } - return ret; -} - static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, u64 *qgroup_to_release) @@ -6174,7 +6007,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, * * This is overestimating in most cases. 
*/ - qgroup_rsv_size = outstanding_extents * fs_info->nodesize; + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; spin_lock(&block_rsv->lock); block_rsv->size = reserve_size; @@ -6182,9 +6015,25 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); } +static void calc_inode_reservations(struct btrfs_fs_info *fs_info, + u64 num_bytes, u64 *meta_reserve, + u64 *qgroup_reserve) +{ + u64 nr_extents = count_max_extents(num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + + /* We add one for the inode update at finish ordered time */ + *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, + nr_extents + csum_leaves + 1); + *qgroup_reserve = nr_extents * fs_info->nodesize; +} + int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 meta_reserve, qgroup_reserve; unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; @@ -6214,7 +6063,31 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - /* Add our new extents and calculate the new rsv size. */ + /* + * We always want to do it this way, every other way is wrong and ends + * in tears. Pre-reserving the amount we are going to add will always + * be the right way, because otherwise if we have enough parallelism we + * could end up with thousands of inodes all holding little bits of + * reservations they were able to make previously and the only way to + * reclaim that space is to ENOSPC out the operations and clear + * everything out and try again, which is bad. This way we just + * over-reserve slightly, and clean up the mess when we are done. + */ + calc_inode_reservations(fs_info, num_bytes, &meta_reserve, + &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + if (ret) + goto out_fail; + ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); + if (ret) + goto out_qgroup; + + /* + * Now we need to update our outstanding extents and csum bytes _first_ + * and then add the reservation to the block_rsv. This keeps us from + * racing with an ordered completion or some such that would think it + * needs to free the reservation we just made. 
+ */ spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); @@ -6222,22 +6095,21 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - ret = btrfs_inode_rsv_refill(inode, flush); - if (unlikely(ret)) - goto out_fail; + /* Now we can safely add our space to our block rsv */ + block_rsv_add_bytes(block_rsv, meta_reserve, false); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), meta_reserve, 1); + + spin_lock(&block_rsv->lock); + block_rsv->qgroup_rsv_reserved += qgroup_reserve; + spin_unlock(&block_rsv->lock); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return 0; - +out_qgroup: + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); out_fail: - spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, -nr_extents); - inode->csum_bytes -= num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - btrfs_inode_rsv_release(inode, true); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); @@ -6361,9 +6233,9 @@ void btrfs_delalloc_release_space(struct inode *inode, } static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *info, u64 bytenr, - u64 num_bytes, int alloc) + u64 bytenr, u64 num_bytes, int alloc) { + struct btrfs_fs_info *info = trans->fs_info; struct btrfs_block_group_cache *cache = NULL; u64 total = num_bytes; u64 old_val; @@ -6444,7 +6316,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, if (list_empty(&cache->dirty_list)) { list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); - trans->transaction->num_dirty_bgs++; trans->delayed_ref_updates++; btrfs_get_block_group(cache); } @@ -6491,10 +6362,11 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) return bytenr; } -static int pin_down_extent(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, +static int pin_down_extent(struct btrfs_block_group_cache *cache, u64 bytenr, u64 num_bytes, int reserved) { + struct btrfs_fs_info *fs_info = cache->fs_info; + spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; @@ -6526,7 +6398,7 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, cache = btrfs_lookup_block_group(fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved); + pin_down_extent(cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; @@ -6553,7 +6425,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, */ cache_block_group(cache, 1); - pin_down_extent(fs_info, cache, bytenr, num_bytes, 0); + pin_down_extent(cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); @@ -6607,9 +6479,9 @@ out_lock: return ret; } -int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +int btrfs_exclude_logged_extents(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_file_extent_item *item; struct btrfs_key key; int found_type; @@ -7198,7 +7070,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } - ret = update_block_group(trans, info, bytenr, num_bytes, 0); + ret = update_block_group(trans, bytenr, 
num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -7272,21 +7144,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ref generic_ref = { 0 }; int pin = 1; int ret; + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, + buf->start, buf->len, parent); + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), + root->root_key.objectid); + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { int old_ref_mod, new_ref_mod; - btrfs_ref_tree_mod(root, buf->start, buf->len, parent, - root->root_key.objectid, - btrfs_header_level(buf), 0, - BTRFS_DROP_DELAYED_REF); - ret = btrfs_add_delayed_tree_ref(trans, buf->start, - buf->len, parent, - root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, + btrfs_ref_tree_mod(fs_info, &generic_ref); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, &old_ref_mod, &new_ref_mod); BUG_ON(ret); /* -ENOMEM */ pin = old_ref_mod >= 0 && new_ref_mod < 0; @@ -7305,8 +7176,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(fs_info, cache, buf->start, - buf->len, 1); + pin_down_extent(cache, buf->start, buf->len, 1); btrfs_put_block_group(cache); goto out; } @@ -7320,8 +7190,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } out: if (pin) - add_pinned_bytes(fs_info, buf->len, true, - root->root_key.objectid); + add_pinned_bytes(fs_info, &generic_ref); if (last_ref) { /* @@ -7333,52 +7202,43 @@ out: } /* Can return -ENOMEM */ -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset) +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; - if (root_objectid != BTRFS_TREE_LOG_OBJECTID) - btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, - root_objectid, owner, offset, - BTRFS_DROP_DELAYED_REF); - /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. 
*/ - if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { - WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + if ((ref->type == BTRFS_REF_METADATA && + ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + (ref->type == BTRFS_REF_DATA && + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ - btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); + btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); old_ref_mod = new_ref_mod = 0; ret = 0; - } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, - num_bytes, parent, - root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, + } else if (ref->type == BTRFS_REF_METADATA) { + ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, &old_ref_mod, &new_ref_mod); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, - num_bytes, parent, - root_objectid, owner, offset, - 0, BTRFS_DROP_DELAYED_REF, + ret = btrfs_add_delayed_data_ref(trans, ref, 0, &old_ref_mod, &new_ref_mod); } - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) { - bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + if (!((ref->type == BTRFS_REF_METADATA && + ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + (ref->type == BTRFS_REF_DATA && + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + btrfs_ref_tree_mod(fs_info, ref); - add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid); - } + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) + add_pinned_bytes(fs_info, ref); return ret; } @@ -7569,7 +7429,6 @@ static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group_cache **cluster_bg_ret) { - struct btrfs_fs_info *fs_info = bg->fs_info; struct btrfs_block_group_cache *cluster_bg; u64 aligned_cluster; u64 offset; @@ -7629,9 +7488,8 @@ refill_cluster: aligned_cluster = max_t(u64, ffe_ctl->empty_cluster + ffe_ctl->empty_size, bg->full_stripe_len); - ret = btrfs_find_space_cluster(fs_info, bg, last_ptr, - ffe_ctl->search_start, ffe_ctl->num_bytes, - aligned_cluster); + ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, + ffe_ctl->num_bytes, aligned_cluster); if (ret == 0) { /* Now pull our allocation out of this cluster */ offset = btrfs_alloc_from_cluster(bg, last_ptr, @@ -8281,7 +8139,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, } if (pin) - pin_down_extent(fs_info, cache, start, len, 1); + pin_down_extent(cache, start, len, 1); else { if (btrfs_test_opt(fs_info, DISCARD)) ret = btrfs_discard_extent(fs_info, start, len, NULL); @@ -8370,7 +8228,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1); + ret = update_block_group(trans, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -8460,7 +8318,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(trans, fs_info, extent_key.objectid, + ret = update_block_group(trans, extent_key.objectid, fs_info->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -8478,19 +8336,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { + struct btrfs_ref generic_ref = { 0 }; int ret; BUG_ON(root->root_key.objectid == 
BTRFS_TREE_LOG_OBJECTID); - btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0, - root->root_key.objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT); - - ret = btrfs_add_delayed_data_ref(trans, ins->objectid, - ins->offset, 0, - root->root_key.objectid, owner, - offset, ram_bytes, - BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); + btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, + ins->objectid, ins->offset, 0); + btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); + btrfs_ref_tree_mod(root->fs_info, &generic_ref); + ret = btrfs_add_delayed_data_ref(trans, &generic_ref, + ram_bytes, NULL, NULL); return ret; } @@ -8563,7 +8419,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); - clean_tree_block(fs_info, buf); + btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking_write(buf); @@ -8682,6 +8538,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; struct btrfs_delayed_extent_op *extent_op; + struct btrfs_ref generic_ref = { 0 }; u64 flags = 0; int ret; u32 blocksize = fs_info->nodesize; @@ -8736,13 +8593,12 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; - btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, - root_objectid, level, 0, - BTRFS_ADD_DELAYED_EXTENT); - ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, - ins.offset, parent, - root_objectid, level, - BTRFS_ADD_DELAYED_EXTENT, + btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, + ins.objectid, ins.offset, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_tree_ref(&generic_ref, level, root_objectid); + btrfs_ref_tree_mod(fs_info, &generic_ref); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op, NULL, NULL); if (ret) goto out_free_delayed; @@ -8918,7 +8774,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start, + ret = btrfs_set_disk_extent_flags(trans, eb->start, eb->len, flag, btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ @@ -8987,6 +8843,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 parent; struct btrfs_key key; struct btrfs_key first_key; + struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; int reada = 0; @@ -9159,9 +9016,10 @@ skip: wc->drop_level = level; find_next_key(path, level, &wc->drop_progress); - ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, - parent, root->root_key.objectid, - level - 1, 0); + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + fs_info->nodesize, parent); + btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); + ret = btrfs_free_extent(trans, &ref); if (ret) goto out_unlock; } @@ -9251,21 +9109,23 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_qgroup_trace_leaf_items(trans, eb); - if (ret) { - btrfs_err_rl(fs_info, - "error %d accounting leaf items. 
Quota is out of sync, rescan required.", + if (is_fstree(root->root_key.objectid)) { + ret = btrfs_qgroup_trace_leaf_items(trans, eb); + if (ret) { + btrfs_err_rl(fs_info, + "error %d accounting leaf items, quota is out of sync, rescan required", ret); + } } } - /* make block locked assertion in clean_tree_block happy */ + /* make block locked assertion in btrfs_clean_tree_block happy */ if (!path->locks[level] && btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); btrfs_set_lock_blocking_write(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - clean_tree_block(fs_info, eb); + btrfs_clean_tree_block(eb); } if (eb == root->node) { @@ -9921,12 +9781,10 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) */ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_root *root = fs_info->extent_root; struct btrfs_block_group_cache *block_group; struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - struct btrfs_trans_handle *trans; u64 min_free; u64 dev_min = 1; u64 dev_nr = 0; @@ -10025,13 +9883,6 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) min_free = div64_u64(min_free, dev_min); } - /* We need to do this so that we can look at pending chunks */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - mutex_lock(&fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 dev_offset; @@ -10042,7 +9893,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free && !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = find_free_dev_extent(trans, device, min_free, + ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -10058,7 +9909,6 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) "no space to allocate a new chunk for block group %llu", block_group->key.objectid); mutex_unlock(&fs_info->chunk_mutex); - btrfs_end_transaction(trans); out: btrfs_put_block_group(block_group); return ret; @@ -10159,7 +10009,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) if (block_group->iref) break; spin_unlock(&block_group->lock); - block_group = next_block_group(info, block_group); + block_group = next_block_group(block_group); } if (!block_group) { if (last == 0) @@ -10660,7 +10510,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, struct btrfs_block_group_cache *cache; int ret; - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); if (!cache) @@ -10808,7 +10658,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * get the inode first so any iput calls done for the io_list * aren't the final iput (no unlinks allowed now) */ - inode = lookup_free_space_inode(fs_info, block_group, path); + inode = lookup_free_space_inode(block_group, path); mutex_lock(&trans->transaction->cache_write_mutex); /* @@ -10952,10 +10802,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); mutex_lock(&fs_info->chunk_mutex); - if (!list_empty(&em->list)) { - /* We're in the transaction->pending_chunks list. 
*/ - free_extent_map(em); - } spin_lock(&block_group->lock); block_group->removed = 1; /* @@ -10982,25 +10828,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * the transaction commit has completed. */ remove_em = (atomic_read(&block_group->trimming) == 0); - /* - * Make sure a trimmer task always sees the em in the pinned_chunks list - * if it sees block_group->removed == 1 (needs to lock block_group->lock - * before checking block_group->removed). - */ - if (!remove_em) { - /* - * Our em might be in trans->transaction->pending_chunks which - * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), - * and so is the fs_info->pinned_chunks list. - * - * So at this point we must be holding the chunk_mutex to avoid - * any races with chunk allocation (more specifically at - * volumes.c:contains_pending_extent()), to ensure it always - * sees the em, either in the pending_chunks list or in the - * pinned_chunks list. - */ - list_move_tail(&em->list, &fs_info->pinned_chunks); - } spin_unlock(&block_group->lock); if (remove_em) { @@ -11008,11 +10835,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, em_tree = &fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); - /* - * The em might be in the pending_chunks list, so make sure the - * chunk mutex is locked, since remove_extent_mapping() will - * delete us from that list. - */ remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); /* once for the tree */ @@ -11315,11 +11137,12 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, * held back allocations. */ static int btrfs_trim_free_extents(struct btrfs_device *device, - u64 minlen, u64 *trimmed) + struct fstrim_range *range, u64 *trimmed) { - u64 start = 0, len = 0; + u64 start, len = 0, end = 0; int ret; + start = max_t(u64, range->start, SZ_1M); *trimmed = 0; /* Discard not supported = nothing to do. */ @@ -11338,43 +11161,52 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, while (1) { struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_transaction *trans; u64 bytes; ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) break; - ret = down_read_killable(&fs_info->commit_root_sem); - if (ret) { + find_first_clear_extent_bit(&device->alloc_state, start, + &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + /* + * If find_first_clear_extent_bit find a range that spans the + * end of the device it will set end to -1, in this case it's up + * to the caller to trim the value to the size of the device. 
+ */ + end = min(end, device->total_bytes - 1); + len = end - start + 1; + + /* We didn't find any extents */ + if (!len) { mutex_unlock(&fs_info->chunk_mutex); + ret = 0; break; } - spin_lock(&fs_info->trans_lock); - trans = fs_info->running_transaction; - if (trans) - refcount_inc(&trans->use_count); - spin_unlock(&fs_info->trans_lock); - - if (!trans) - up_read(&fs_info->commit_root_sem); - - ret = find_free_dev_extent_start(trans, device, minlen, start, - &start, &len); - if (trans) { - up_read(&fs_info->commit_root_sem); - btrfs_put_transaction(trans); + /* Keep going until we satisfy minlen or reach end of space */ + if (len < range->minlen) { + mutex_unlock(&fs_info->chunk_mutex); + start += len; + continue; } - if (ret) { + /* If we are out of the passed range break */ + if (start > range->start + range->len - 1) { mutex_unlock(&fs_info->chunk_mutex); - if (ret == -ENOSPC) - ret = 0; break; } - ret = btrfs_issue_discard(device->bdev, start, len, &bytes); + start = max(range->start, start); + len = min(range->len, len); + + ret = btrfs_issue_discard(device->bdev, start, len, + &bytes); + if (!ret) + set_extent_bits(&device->alloc_state, start, + start + bytes - 1, + CHUNK_TRIMMED); mutex_unlock(&fs_info->chunk_mutex); if (ret) @@ -11383,6 +11215,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, start += len; *trimmed += bytes; + /* We've trimmed enough */ + if (*trimmed >= range->len) + break; + if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -11419,7 +11255,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) int ret = 0; cache = btrfs_lookup_first_block_group(fs_info, range->start); - for (; cache; cache = next_block_group(fs_info, cache)) { + for (; cache; cache = next_block_group(cache)) { if (cache->key.objectid >= (range->start + range->len)) { btrfs_put_block_group(cache); break; @@ -11466,8 +11302,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) mutex_lock(&fs_info->fs_devices->device_list_mutex); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { - ret = btrfs_trim_free_extents(device, range->minlen, - &group_trimmed); + ret = btrfs_trim_free_extents(device, range, &group_trimmed); if (ret) { dev_failed++; dev_ret = ret; |
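For the metadata side, a matching sketch (again hypothetical, modeled on the converted btrfs_free_tree_block() and do_walk_down() hunks above): the same struct btrfs_ref now carries enough information for btrfs_ref_tree_mod() and add_pinned_bytes(), which is why both lose their extra arguments in this diff.

/* Hypothetical caller; mirrors the tree-block drop pattern used above. */
static int example_drop_tree_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf, u64 parent)
{
	struct btrfs_ref ref = { 0 };

	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
			       buf->start, buf->len, parent);
	/* Metadata-specific part: block level and owning root */
	btrfs_init_tree_ref(&ref, btrfs_header_level(buf),
			    root->root_key.objectid);

	/*
	 * Queues a delayed ref drop; for tree log roots btrfs_free_extent()
	 * only pins the range instead of touching the extent tree.
	 */
	return btrfs_free_extent(trans, &ref);
}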