diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-03 04:59:25 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-03 04:59:25 +0200 |
commit | f3cdc8ae116e27d84e1f33c7a2995960cebb73ac (patch) | |
tree | db3dbbbbf82b76590f601b5caee5de3bef151c4b /fs/btrfs/block-group.c | |
parent | Merge tag 'vfs-5.8-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux (diff) | |
parent | btrfs: fix space_info bytes_may_use underflow during space cache writeout (diff) | |
download | linux-f3cdc8ae116e27d84e1f33c7a2995960cebb73ac.tar.xz linux-f3cdc8ae116e27d84e1f33c7a2995960cebb73ac.zip |
Merge tag 'for-5.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"Highlights:
- speed up dead root detection during orphan cleanup, e.g. when there
are many deleted subvolumes waiting to be cleaned, the trees are
now looked up in radix tree instead of an O(N^2) search
- snapshot creation with inherited qgroup will mark the qgroup
inconsistent, requires a rescan
- send will emit file capabilities after chown, this produces a
stream that does not need postprocessing to set the capabilities
again
- direct io ported to iomap infrastructure, cleaned up and simplified
code, notably removing last use of struct buffer_head in btrfs code
Core changes:
- factor out backreference iteration, to be used by ordinary
backreferences and relocation code
- improved global block reserve utilization
* better logic to serialize requests
* increased maximum available for unlink
* improved handling on large pages (64K)
- direct io cleanups and fixes
* simplify layering, where cloned bios were unnecessarily created
for some cases
* error handling fixes (submit, endio)
* remove repair worker thread, used to avoid deadlocks during
repair
- refactored block group reading code, preparatory work for new type
of block group storage that should improve mount time on large
filesystems
Cleanups:
- cleaned up (and slightly sped up) set/get helpers for metadata data
structure members
- root bit REF_COWS got renamed to SHAREABLE to reflect that the
blocks of the tree get shared either among subvolumes or with the
relocation trees
Fixes:
- when subvolume deletion fails due to ENOSPC, the filesystem is not
turned read-only
- device scan deals with devices from other filesystems that changed
ownership due to overwrite (mkfs)
- fix a race between scrub and block group removal/allocation
- fix long standing bug of a runaway balance operation, printing the
same line to the syslog, caused by a stale status bit on a reloc
tree that prevented progress
- fix corrupt log due to concurrent fsync of inodes with shared
extents
- fix space underflow for NODATACOW and buffered writes when it for
some reason needs to fall back to COW mode"
* tag 'for-5.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (133 commits)
btrfs: fix space_info bytes_may_use underflow during space cache writeout
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
btrfs: fix wrong file range cleanup after an error filling dealloc range
btrfs: remove redundant local variable in read_block_for_search
btrfs: open code key_search
btrfs: split btrfs_direct_IO to read and write part
btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK
fs: remove dio_end_io()
btrfs: switch to iomap_dio_rw() for dio
iomap: remove lockdep_assert_held()
iomap: add a filesystem hook for direct I/O bio submission
fs: export generic_file_buffered_read()
btrfs: turn space cache writeout failure messages into debug messages
btrfs: include error on messages about failure to write space/inode caches
btrfs: remove useless 'fail_unlock' label from btrfs_csum_file_blocks()
btrfs: do not ignore error from btrfs_next_leaf() when inserting checksums
btrfs: make checksum item extension more efficient
btrfs: fix corrupt log due to concurrent fsync of inodes with shared extents
btrfs: unexport btrfs_compress_set_level()
btrfs: simplify iget helpers
...
Diffstat (limited to 'fs/btrfs/block-group.c')
-rw-r--r-- | fs/btrfs/block-group.c | 233 |
1 files changed, 154 insertions, 79 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 696f47103cfc..176e8a292fd1 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -7,7 +7,6 @@ #include "disk-io.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "disk-io.h" #include "volumes.h" #include "transaction.h" #include "ref-verify.h" @@ -161,6 +160,8 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct rb_node *parent = NULL; struct btrfs_block_group *cache; + ASSERT(block_group->length != 0); + spin_lock(&info->block_group_cache_lock); p = &info->block_group_cache_tree.rb_node; @@ -863,11 +864,34 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) } } +static int remove_block_group_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; + struct btrfs_key key; + int ret; + + root = fs_info->extent_root; + key.objectid = block_group->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = block_group->length; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + return ret; + + ret = btrfs_del_item(trans, root, path); + return ret; +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 group_start, struct extent_map *em) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->extent_root; struct btrfs_path *path; struct btrfs_block_group *block_group; struct btrfs_free_cluster *cluster; @@ -1065,26 +1089,25 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->space_info->lock); - key.objectid = block_group->start; - key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - key.offset = block_group->length; - mutex_lock(&fs_info->chunk_mutex); spin_lock(&block_group->lock); block_group->removed = 1; /* - * At this point trimming can't start on 
this block group, because we - * removed the block group from the tree fs_info->block_group_cache_tree - * so no one can't find it anymore and even if someone already got this - * block group before we removed it from the rbtree, they have already - * incremented block_group->trimming - if they didn't, they won't find - * any free space entries because we already removed them all when we - * called btrfs_remove_free_space_cache(). + * At this point trimming or scrub can't start on this block group, + * because we removed the block group from the rbtree + * fs_info->block_group_cache_tree so no one can't find it anymore and + * even if someone already got this block group before we removed it + * from the rbtree, they have already incremented block_group->frozen - + * if they didn't, for the trimming case they won't find any free space + * entries because we already removed them all when we called + * btrfs_remove_free_space_cache(). * * And we must not remove the extent map from the fs_info->mapping_tree * to prevent the same logical address range and physical device space - * ranges from being reused for a new block group. This is because our - * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * ranges from being reused for a new block group. This is needed to + * avoid races with trimming and scrub. + * + * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is * completely transactionless, so while it is trimming a range the * currently running transaction might finish and a new one start, * allowing for new block groups to be created that can reuse the same @@ -1095,7 +1118,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * in place until the extents have been discarded completely when * the transaction commit has completed. 
*/ - remove_em = (atomic_read(&block_group->trimming) == 0); + remove_em = (atomic_read(&block_group->frozen) == 0); spin_unlock(&block_group->lock); mutex_unlock(&fs_info->chunk_mutex); @@ -1107,16 +1130,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, /* Once for the block groups rbtree */ btrfs_put_block_group(block_group); - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -EIO; + ret = remove_block_group_item(trans, path, block_group); if (ret < 0) goto out; - ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; - if (remove_em) { struct extent_map_tree *em_tree; @@ -1175,7 +1192,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( free_extent_map(em); return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, - num_items, 1); + num_items); } /* @@ -1284,25 +1301,17 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, EXTENT_DIRTY); if (ret) - goto err; + goto out; } ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, EXTENT_DIRTY); - if (ret) - goto err; +out: mutex_unlock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) btrfs_put_transaction(prev_trans); - return true; - -err: - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - if (prev_trans) - btrfs_put_transaction(prev_trans); - btrfs_dec_block_group_ro(bg); - return false; + return ret == 0; } /* @@ -1400,8 +1409,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * We could have pending pinned extents for this block group, * just delete them, we don't care about them anymore. 
*/ - if (!clean_pinned_extents(trans, block_group)) + if (!clean_pinned_extents(trans, block_group)) { + btrfs_dec_block_group_ro(block_group); goto end_trans; + } /* * At this point, the block_group is read only and should fail @@ -1450,7 +1461,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) /* Implicit trim during transaction commit. */ if (trimming) - btrfs_get_block_group_trimming(block_group); + btrfs_freeze_block_group(block_group); /* * Btrfs_remove_chunk will abort the transaction if things go @@ -1460,7 +1471,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (ret) { if (trimming) - btrfs_put_block_group_trimming(block_group); + btrfs_unfreeze_block_group(block_group); goto end_trans; } @@ -1774,7 +1785,7 @@ static void link_block_group(struct btrfs_block_group *cache) } static struct btrfs_block_group *btrfs_create_block_group_cache( - struct btrfs_fs_info *fs_info, u64 start, u64 size) + struct btrfs_fs_info *fs_info, u64 start) { struct btrfs_block_group *cache; @@ -1790,7 +1801,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( } cache->start = start; - cache->length = size; cache->fs_info = fs_info; cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); @@ -1809,7 +1819,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); - atomic_set(&cache->trimming, 0); + atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); @@ -1870,25 +1880,44 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } +static int read_block_group_item(struct btrfs_block_group *cache, + struct btrfs_path *path, + const struct btrfs_key *key) +{ + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_block_group_item bgi; + int slot = path->slots[0]; + + cache->length = key->offset; + 
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + cache->used = btrfs_stack_block_group_used(&bgi); + cache->flags = btrfs_stack_block_group_flags(&bgi); + + return 0; +} + static int read_one_block_group(struct btrfs_fs_info *info, struct btrfs_path *path, const struct btrfs_key *key, int need_clear) { - struct extent_buffer *leaf = path->nodes[0]; struct btrfs_block_group *cache; struct btrfs_space_info *space_info; - struct btrfs_block_group_item bgi; const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); - int slot = path->slots[0]; int ret; ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); - cache = btrfs_create_block_group_cache(info, key->objectid, key->offset); + cache = btrfs_create_block_group_cache(info, key->objectid); if (!cache) return -ENOMEM; + ret = read_block_group_item(cache, path, key); + if (ret < 0) + goto error; + if (need_clear) { /* * When we mount with old space cache, we need to @@ -1903,10 +1932,6 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (btrfs_test_opt(info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR; } - read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); - cache->used = btrfs_stack_block_group_used(&bgi); - cache->flags = btrfs_stack_block_group_flags(&bgi); if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { btrfs_err(info, @@ -1934,15 +1959,15 @@ static int read_one_block_group(struct btrfs_fs_info *info, * are empty, and we can just add all the space in and be done with it. * This saves us _a_lot_ of time, particularly in the full case. 
*/ - if (key->offset == cache->used) { + if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, key->objectid, - key->objectid + key->offset); + add_new_free_space(cache, cache->start, + cache->start + cache->length); btrfs_free_excluded_extents(cache); } @@ -1952,7 +1977,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } trace_btrfs_add_block_group(info, cache, 0); - btrfs_update_space_info(info, cache->flags, key->offset, + btrfs_update_space_info(info, cache->flags, cache->length, cache->used, cache->bytes_super, &space_info); cache->space_info = space_info; @@ -1991,7 +2016,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(info->super_copy); if (btrfs_test_opt(info, SPACE_CACHE) && @@ -2046,13 +2070,32 @@ error: return ret; } +static int insert_block_group_item(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_item bgi; + struct btrfs_root *root; + struct btrfs_key key; + + spin_lock(&block_group->lock); + btrfs_set_stack_block_group_used(&bgi, block_group->used); + btrfs_set_stack_block_group_chunk_objectid(&bgi, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_stack_block_group_flags(&bgi, block_group->flags); + key.objectid = block_group->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = block_group->length; + spin_unlock(&block_group->lock); + + root = fs_info->extent_root; + return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); +} + void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = 
trans->fs_info; struct btrfs_block_group *block_group; - struct btrfs_root *extent_root = fs_info->extent_root; - struct btrfs_block_group_item item; - struct btrfs_key key; int ret = 0; if (!trans->can_flush_pending_bgs) @@ -2065,21 +2108,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) goto next; - spin_lock(&block_group->lock); - btrfs_set_stack_block_group_used(&item, block_group->used); - btrfs_set_stack_block_group_chunk_objectid(&item, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); - btrfs_set_stack_block_group_flags(&item, block_group->flags); - key.objectid = block_group->start; - key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - key.offset = block_group->length; - spin_unlock(&block_group->lock); - - ret = btrfs_insert_item(trans, extent_root, &key, &item, - sizeof(item)); + ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); - ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); + ret = btrfs_finish_chunk_alloc(trans, block_group->start, + block_group->length); if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); @@ -2100,10 +2133,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, btrfs_set_log_full_commit(trans); - cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); + cache = btrfs_create_block_group_cache(fs_info, chunk_offset); if (!cache) return -ENOMEM; + cache->length = size; cache->used = bytes_used; cache->flags = type; cache->last_byte_to_unpin = (u64)-1; @@ -2314,13 +2348,13 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) spin_unlock(&sinfo->lock); } -static int write_one_cache_group(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_block_group *cache) +static int update_block_group_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = trans->fs_info; int 
ret; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *root = fs_info->extent_root; unsigned long bi; struct extent_buffer *leaf; struct btrfs_block_group_item bgi; @@ -2330,7 +2364,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = cache->length; - ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -2642,7 +2676,7 @@ again: } } if (!ret) { - ret = write_one_cache_group(trans, path, cache); + ret = update_block_group_item(trans, path, cache); /* * Our block group might still be attached to the list * of new block groups in the transaction handle of some @@ -2791,7 +2825,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) } } if (!ret) { - ret = write_one_cache_group(trans, path, cache); + ret = update_block_group_item(trans, path, cache); /* * One of the free space endio workers might have * created a new block group while updating a free space @@ -2808,7 +2842,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) if (ret == -ENOENT) { wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); - ret = write_one_cache_group(trans, path, cache); + ret = update_block_group_item(trans, path, cache); } if (ret) btrfs_abort_transaction(trans, ret); @@ -3384,3 +3418,44 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } return 0; } + +void btrfs_freeze_block_group(struct btrfs_block_group *cache) +{ + atomic_inc(&cache->frozen); +} + +void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct extent_map_tree *em_tree; + struct extent_map *em; + bool cleanup; + + spin_lock(&block_group->lock); + cleanup = (atomic_dec_and_test(&block_group->frozen) && + block_group->removed); + spin_unlock(&block_group->lock); + + if 
(cleanup) { + mutex_lock(&fs_info->chunk_mutex); + em_tree = &fs_info->mapping_tree; + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, block_group->start, + 1); + BUG_ON(!em); /* logic error, can't happen */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + mutex_unlock(&fs_info->chunk_mutex); + + /* once for us and once for the tree */ + free_extent_map(em); + free_extent_map(em); + + /* + * We may have left one free space entry and other possible + * tasks trimming this block group have left 1 entry each one. + * Free them if any. + */ + __btrfs_remove_free_space_cache(block_group->free_space_ctl); + } +} |