summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-04-04 22:03:38 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2018-04-04 22:03:38 +0200
commit94514bbe9e5c402c4232af158a295a8fdfd72a2c (patch)
treec990c722cbac5abe8a3b28e0564effa722b7c80e /fs/btrfs/extent-tree.c
parentMerge tag 'xfs-4.17-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux (diff)
parentbtrfs: lift errors from add_extent_changeset to the callers (diff)
downloadlinux-94514bbe9e5c402c4232af158a295a8fdfd72a2c.tar.xz
linux-94514bbe9e5c402c4232af158a295a8fdfd72a2c.zip
Merge tag 'for-4.17-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "There are several user visible changes, the rest is mostly invisible and continues to clean up the whole code base. User visible changes: - new mount option nossd_spread (pair for ssd_spread) - mount option subvolid will detect junk after the number and fail the mount - add message after cancelled device replace - direct module dependency on libcrc32, removed own crc wrappers - removed user space transaction ioctls - use lighter locking when reading /proc/self/mounts, RCU instead of mutex to avoid unnecessary contention Enhancements: - skip writeback of last page when truncating file to same size - send: do not issue unnecessary truncate operations - mount option token specifiers: use %u for unsigned values, more validation - selftests: more tree block validations qgroups: - preparatory work for splitting reservation types for data and metadata, this should allow for more accurate tracking and fix some issues with underflows or do further enhancements - split metadata reservations for started and joined transaction so they do not get mixed up and are accounted correctly at commit time - with the above, it's possible to revert patch that potentially deadlocks when trying to make more space by explicitly committing when the quota limit is hit - fix root item corruption when multiple same source snapshots are created with quota enabled RAID56: - make sure target is identical to source when raid56 rebuild fails after dev-replace - faster rebuild during scrub, batch by stripes and not block-by-block - make more use of cached data when rebuilding from a missing device Fixes: - null pointer deref when device replace target is missing - fix fsync after hole punching when using no-holes feature - fix lockdep splat when allocating percpu data with wrong GFP flags Cleanups, refactoring, core changes: - drop redundant parameters from various functions - kill and opencode trivial helpers - __cold/__exit function annotations - dead code 
removal - continued audit and documentation of memory barriers - error handling: handle removal from uuid tree - error handling: remove handling of impossible conditions - more debugging or error messages - updated tracepoints - one VLA use removal (and one still left)" * tag 'for-4.17-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (164 commits) btrfs: lift errors from add_extent_changeset to the callers Btrfs: print error messages when failing to read trees btrfs: user proper type for btrfs_mask_flags flags btrfs: split dev-replace locking helpers for read and write btrfs: remove stale comments about fs_mutex btrfs: use RCU in btrfs_show_devname for device list traversal btrfs: update barrier in should_cow_block btrfs: use lockdep_assert_held for mutexes btrfs: use lockdep_assert_held for spinlocks btrfs: Validate child tree block's level and first key btrfs: tests/qgroup: Fix wrong tree backref level Btrfs: fix copy_items() return value when logging an inode Btrfs: fix fsync after hole punching when using no-holes feature btrfs: use helper to set ulist aux from a qgroup Revert "btrfs: qgroups: Retry after commit on getting EDQUOT" btrfs: qgroup: Update trace events for metadata reservation btrfs: qgroup: Use root::qgroup_meta_rsv_* to record qgroup meta reserved space btrfs: delayed-inode: Use new qgroup meta rsv for delayed inode and item btrfs: qgroup: Use separate meta reservation type for delalloc btrfs: qgroup: Introduce function to convert META_PREALLOC into META_PERTRANS ...
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c317
1 files changed, 173 insertions, 144 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e0460d7b5622..e08d0d45af4f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -27,7 +27,7 @@
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
-#include "hash.h"
+#include <linux/crc32c.h>
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
@@ -535,13 +535,11 @@ static noinline void caching_thread(struct btrfs_work *work)
struct btrfs_block_group_cache *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_caching_control *caching_ctl;
- struct btrfs_root *extent_root;
int ret;
caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
- extent_root = fs_info->extent_root;
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
@@ -1203,11 +1201,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -2652,9 +2650,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
unsigned long nr)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *locked_ref = NULL;
@@ -2994,7 +2992,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
if (trans->transid > async->transid)
goto end;
- ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
+ ret = btrfs_run_delayed_refs(trans, async->count);
if (ret)
async->error = ret;
end:
@@ -3053,8 +3051,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
* Returns <0 on error and aborts the transaction
*/
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, unsigned long count)
+ unsigned long count)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
@@ -3078,7 +3077,7 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
trans->can_flush_pending_bgs = false;
- ret = __btrfs_run_delayed_refs(trans, fs_info, count);
+ ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3086,7 +3085,7 @@ again:
if (run_all) {
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
node = rb_first(&delayed_refs->href_root);
@@ -3660,9 +3659,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
* the commit latency by getting rid of the easy block groups while
* we're still allowing others to join the commit.
*/
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
@@ -3686,7 +3685,7 @@ again:
* make sure all the block groups on our dirty list actually
* exist
*/
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
if (!path) {
path = btrfs_alloc_path();
@@ -3741,8 +3740,9 @@ again:
should_put = 0;
/*
- * the cache_write_mutex is protecting
- * the io_list
+ * The cache_write_mutex is protecting the
+ * io_list, also refer to the definition of
+ * btrfs_transaction::io_bgs for more details
*/
list_add_tail(&cache->io_list, io);
} else {
@@ -3800,7 +3800,7 @@ again:
* go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
- ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+ ret = btrfs_run_delayed_refs(trans, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3882,7 +3882,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache_save_setup(cache, trans, path);
if (!ret)
- ret = btrfs_run_delayed_refs(trans, fs_info,
+ ret = btrfs_run_delayed_refs(trans,
(unsigned long) -1);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
@@ -3934,6 +3934,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
}
spin_unlock(&cur_trans->dirty_bgs_lock);
+ /*
+ * Refer to the definition of io_bgs member for details why it's safe
+ * to use it without any locking
+ */
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache,
io_list);
@@ -4332,8 +4336,7 @@ again:
/* commit the current transaction and try again */
commit_trans:
- if (need_commit &&
- !atomic_read(&fs_info->open_ioctl_trans)) {
+ if (need_commit) {
need_commit--;
if (need_commit > 0) {
@@ -4541,7 +4544,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
* Needed because we can end up allocating a system chunk and for an
* atomic and race free space reservation in the chunk block reserve.
*/
- ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+ lockdep_assert_held(&fs_info->chunk_mutex);
info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
@@ -4602,11 +4605,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
return -ENOSPC;
space_info = __find_space_info(fs_info, flags);
- if (!space_info) {
- ret = create_space_info(fs_info, flags, &space_info);
- if (ret)
- return ret;
- }
+ ASSERT(space_info);
again:
spin_lock(&space_info->lock);
@@ -4705,7 +4704,7 @@ out:
*/
if (trans->can_flush_pending_bgs &&
trans->chunk_bytes_reserved >= (u64)SZ_2M) {
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
}
return ret;
@@ -4826,7 +4825,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
long time_left;
unsigned long nr_pages;
int loops;
- enum btrfs_reserve_flush_enum flush;
/* Calc the number of the pages we need flush for space reservation */
items = calc_reclaim_items_nr(fs_info, to_reclaim);
@@ -4867,10 +4865,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
atomic_read(&fs_info->async_delalloc_pages) <=
(int)max_reclaim);
skip_async:
- if (!trans)
- flush = BTRFS_RESERVE_FLUSH_ALL;
- else
- flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -4993,7 +4987,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(trans);
break;
}
- ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
+ ret = btrfs_run_delayed_items_nr(trans, nr);
btrfs_end_transaction(trans);
break;
case FLUSH_DELALLOC:
@@ -5388,10 +5382,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
!block_rsv_use_bytes(global_rsv, orig_bytes))
ret = 0;
}
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
block_rsv->space_info->flags,
orig_bytes, 1);
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ dump_space_info(fs_info, block_rsv->space_info,
+ orig_bytes, 0);
+ }
return ret;
}
@@ -5760,6 +5759,9 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
if (num_bytes == 0)
return 0;
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ if (ret)
+ return ret;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 0);
@@ -5772,11 +5774,15 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
/**
* btrfs_inode_rsv_release - release any excessive reservation.
* @inode - the inode we need to release from.
+ * @qgroup_free - free or convert qgroup meta.
+ * Unlike normal operation, qgroup meta reservation needs to know if we are
+ * freeing qgroup reservation or just converting it into per-trans. Normally
+ * @qgroup_free is true for error handling, and false for normal release.
*
* This is the same as btrfs_block_rsv_release, except that it handles the
* tracepoint for the reservation.
*/
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -5792,6 +5798,10 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);
+ if (qgroup_free)
+ btrfs_qgroup_free_meta_prealloc(inode->root, released);
+ else
+ btrfs_qgroup_convert_reserved_meta(inode->root, released);
}
void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
@@ -5892,24 +5902,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
}
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- if (!trans->block_rsv) {
- ASSERT(!trans->bytes_reserved);
- return;
- }
-
- if (!trans->bytes_reserved)
- return;
-
- ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
- trace_btrfs_space_reservation(fs_info, "transaction",
- trans->transid, trans->bytes_reserved, 0);
- btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved);
- trans->bytes_reserved = 0;
-}
/*
* To be called after all the new block groups attached to the transaction
@@ -5951,7 +5943,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
*/
u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
+ trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
num_bytes, 1);
return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
}
@@ -5995,7 +5987,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * fs_info->nodesize;
- ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
if (ret)
return ret;
} else {
@@ -6014,7 +6006,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
if (ret && *qgroup_reserved)
- btrfs_qgroup_free_meta(root, *qgroup_reserved);
+ btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
return ret;
}
@@ -6051,7 +6043,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
unsigned nr_extents;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
@@ -6068,13 +6059,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
delalloc_lock = false;
- } else if (current->journal_info) {
- flush = BTRFS_RESERVE_FLUSH_LIMIT;
- }
+ } else {
+ if (current->journal_info)
+ flush = BTRFS_RESERVE_FLUSH_LIMIT;
- if (flush != BTRFS_RESERVE_NO_FLUSH &&
- btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
+ if (btrfs_transaction_in_commit(fs_info))
+ schedule_timeout(1);
+ }
if (delalloc_lock)
mutex_lock(&inode->delalloc_mutex);
@@ -6089,19 +6080,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- ret = btrfs_qgroup_reserve_meta(root,
- nr_extents * fs_info->nodesize, true);
- if (ret)
- goto out_fail;
- }
-
ret = btrfs_inode_rsv_refill(inode, flush);
- if (unlikely(ret)) {
- btrfs_qgroup_free_meta(root,
- nr_extents * fs_info->nodesize);
+ if (unlikely(ret))
goto out_fail;
- }
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
@@ -6115,7 +6096,7 @@ out_fail:
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- btrfs_inode_rsv_release(inode);
+ btrfs_inode_rsv_release(inode, true);
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
return ret;
@@ -6125,12 +6106,14 @@ out_fail:
* btrfs_delalloc_release_metadata - release a metadata reservation for an inode
* @inode: the inode to release the reservation for.
* @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
*
* This will release the metadata reservation for an inode. This can be called
* once we complete IO for a given set of bytes to release their metadata
* reservations, or on error for the same reason.
*/
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
@@ -6143,13 +6126,14 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
if (btrfs_is_testing(fs_info))
return;
- btrfs_inode_rsv_release(inode);
+ btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
* btrfs_delalloc_release_extents - release our outstanding_extents
* @inode: the inode to balance the reservation for.
* @num_bytes: the number of bytes we originally reserved with
+ * @qgroup_free: do we need to free qgroup meta reservation or convert them.
*
* When we reserve space we increase outstanding_extents for the extents we may
* add. Once we've set the range as delalloc or created our ordered extents we
@@ -6157,7 +6141,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
* temporarily tracked outstanding_extents. This _must_ be used in conjunction
* with btrfs_delalloc_reserve_metadata.
*/
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
unsigned num_extents;
@@ -6171,7 +6156,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
if (btrfs_is_testing(fs_info))
return;
- btrfs_inode_rsv_release(inode);
+ btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
@@ -6227,9 +6212,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
*/
void btrfs_delalloc_release_space(struct inode *inode,
struct extent_changeset *reserved,
- u64 start, u64 len)
+ u64 start, u64 len, bool qgroup_free)
{
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}
@@ -6783,9 +6768,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *block_group, *tmp;
struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
@@ -7351,29 +7336,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return ret;
}
-int __get_raid_index(u64 flags)
-{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
-int get_block_group_index(struct btrfs_block_group_cache *cache)
-{
- return __get_raid_index(cache->flags);
-}
-
static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = "raid10",
[BTRFS_RAID_RAID1] = "raid1",
@@ -7488,7 +7450,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = __get_raid_index(flags);
+ int index = btrfs_bg_flags_to_raid_index(flags);
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
@@ -7574,7 +7536,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
@@ -7584,7 +7547,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
}
search:
have_caching_bg = false;
- if (index == 0 || index == __get_raid_index(flags))
+ if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
@@ -7842,7 +7805,8 @@ checks:
loop:
failed_cluster_refill = false;
failed_alloc = false;
- BUG_ON(index != get_block_group_index(block_group));
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+ index);
btrfs_release_block_group(block_group, delalloc);
cond_resched();
}
@@ -7996,6 +7960,51 @@ again:
up_read(&info->groups_sem);
}
+/*
+ * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
+ * hole that is at least as big as @num_bytes.
+ *
+ * @root - The root that will contain this extent
+ *
+ * @ram_bytes - The amount of space in ram that @num_bytes take. This
+ * is used for accounting purposes. This value differs
+ * from @num_bytes only in the case of compressed extents.
+ *
+ * @num_bytes - Number of bytes to allocate on-disk.
+ *
+ * @min_alloc_size - Indicates the minimum amount of space that the
+ * allocator should try to satisfy. In some cases
+ * @num_bytes may be larger than what is required and if
+ * the filesystem is fragmented then allocation fails.
+ * However, the presence of @min_alloc_size gives a
+ * chance to try and satisfy the smaller allocation.
+ *
+ * @empty_size - A hint that you plan on doing more COW. This is the
+ * size in bytes the allocator should try to find free
+ * next to the block it returns. This is just a hint and
+ * may be ignored by the allocator.
+ *
+ * @hint_byte - Hint to the allocator to start searching above the byte
+ * address passed. It might be ignored.
+ *
+ * @ins - This key is modified to record the found hole. It will
+ * have the following values:
+ * ins->objectid == start position
+ * ins->flags = BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == the size of the hole.
+ *
+ * @is_data - Boolean flag indicating whether an extent is
+ * allocated for data (true) or metadata (false)
+ *
+ * @delalloc - Boolean flag indicating whether this allocation is for
+ * delalloc or not. If 'true' data_rwsem of block groups
+ * is going to be acquired.
+ *
+ *
+ * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
+ * case -ENOSPC is returned then @ins->offset will contain the size of the
+ * largest available hole the allocator managed to find.
+ */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
@@ -8699,6 +8708,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 parent;
u32 blocksize;
struct btrfs_key key;
+ struct btrfs_key first_key;
struct extent_buffer *next;
int level = wc->level;
int reada = 0;
@@ -8719,6 +8729,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+ btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+ path->slots[level]);
blocksize = fs_info->nodesize;
next = find_extent_buffer(fs_info, bytenr);
@@ -8783,7 +8795,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, generation);
+ next = read_tree_block(fs_info, bytenr, generation, level - 1,
+ &first_key);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -9648,7 +9661,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
*/
target = get_restripe_target(fs_info, block_group->flags);
if (target) {
- index = __get_raid_index(extended_to_chunk(target));
+ index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
} else {
/*
* this is just a balance, so if we were marked as full
@@ -9662,7 +9675,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
goto out;
}
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
}
if (index == BTRFS_RAID_RAID10) {
@@ -9911,10 +9924,40 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
return 0;
}
+/* link_block_group will queue up kobjects to add when we're reclaim-safe */
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+ struct raid_kobject *rkobj;
+ LIST_HEAD(list);
+ int index;
+ int ret = 0;
+
+ spin_lock(&fs_info->pending_raid_kobjs_lock);
+ list_splice_init(&fs_info->pending_raid_kobjs, &list);
+ spin_unlock(&fs_info->pending_raid_kobjs_lock);
+
+ list_for_each_entry(rkobj, &list, list) {
+ space_info = __find_space_info(fs_info, rkobj->flags);
+ index = btrfs_bg_flags_to_raid_index(rkobj->flags);
+
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+ "%s", get_raid_name(index));
+ if (ret) {
+ kobject_put(&rkobj->kobj);
+ break;
+ }
+ }
+ if (ret)
+ btrfs_warn(fs_info,
+ "failed to add kobject for block cache, ignoring");
+}
+
static void link_block_group(struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
- int index = get_block_group_index(cache);
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ int index = btrfs_bg_flags_to_raid_index(cache->flags);
bool first = false;
down_write(&space_info->groups_sem);
@@ -9924,27 +9967,20 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
up_write(&space_info->groups_sem);
if (first) {
- struct raid_kobject *rkobj;
- int ret;
-
- rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
- if (!rkobj)
- goto out_err;
- rkobj->raid_type = index;
- kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
- ret = kobject_add(&rkobj->kobj, &space_info->kobj,
- "%s", get_raid_name(index));
- if (ret) {
- kobject_put(&rkobj->kobj);
- goto out_err;
+ struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj) {
+ btrfs_warn(cache->fs_info,
+ "couldn't alloc memory for raid level kobject");
+ return;
}
+ rkobj->flags = cache->flags;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ spin_lock(&fs_info->pending_raid_kobjs_lock);
+ list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
+ spin_unlock(&fs_info->pending_raid_kobjs_lock);
space_info->block_group_kobjs[index] = &rkobj->kobj;
}
-
- return;
-out_err:
- btrfs_warn(cache->fs_info,
- "failed to add kobject for block cache, ignoring");
}
static struct btrfs_block_group_cache *
@@ -10160,6 +10196,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
inc_block_group_ro(cache, 1);
}
+ btrfs_add_raid_kobjects(info);
init_global_block_rsv(info);
ret = 0;
error:
@@ -10167,9 +10204,9 @@ error:
return ret;
}
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *block_group, *tmp;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
@@ -10254,15 +10291,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* with its ->space_info set.
*/
cache->space_info = __find_space_info(fs_info, cache->flags);
- if (!cache->space_info) {
- ret = create_space_info(fs_info, cache->flags,
- &cache->space_info);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- btrfs_put_block_group(cache);
- return ret;
- }
- }
+ ASSERT(cache->space_info);
ret = btrfs_add_block_group_cache(fs_info, cache);
if (ret) {
@@ -10334,7 +10363,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
block_group->key.offset);
memcpy(&key, &block_group->key, sizeof(key));
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))