From e7070be198b34c26f39bd9010a29ce6462dc4f3e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 16 Dec 2014 08:54:43 -0800 Subject: Btrfs: change how we track dirty roots I've been overloading root->dirty_list to keep track of dirty roots and which roots need to have their commit roots switched at transaction commit time. This could cause us to lose an update to the root which could corrupt the file system. To fix this use a state bit to know if the root is dirty, and if it isn't set we go ahead and move the root to the dirty list. This way if we re-dirty the root after adding it to the switch_commit list we make sure to update it. This also makes it so that the extent root is always the last root on the dirty list to try and keep the amount of churn down at this point in the commit. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a605d4e2f2bc..aa2219ebecc9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1020,6 +1020,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; u64 old_root_used; struct btrfs_root *tree_root = root->fs_info->tree_root; + bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID); old_root_used = btrfs_root_used(&root->root_item); btrfs_write_dirty_block_groups(trans, root); @@ -1038,7 +1039,12 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; old_root_used = btrfs_root_used(&root->root_item); - ret = btrfs_write_dirty_block_groups(trans, root); + if (extent_root) { + ret = btrfs_write_dirty_block_groups(trans, root); + if (ret) + return ret; + } + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; } @@ -1097,6 +1103,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, next = 
fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); + clear_bit(BTRFS_ROOT_DIRTY, &root->state); if (root != fs_info->extent_root) list_add_tail(&root->dirty_list, -- cgit v1.2.3 From ce93ec548cfa02f9cd6b70d546d5f36f4d160f57 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 17 Nov 2014 15:45:48 -0500 Subject: Btrfs: track dirty block groups on their own list Currently any time we try to update the block groups on disk we will walk _all_ block groups and check for the ->dirty flag to see if it is set. This function can get called several times during a commit. So if you have several terabytes of data you will be a very sad panda as we will loop through _all_ of the block groups several times, which makes the commit take a while which slows down the rest of the file system operations. This patch introduces a dirty list for the block groups that we get added to when we dirty the block group for the first time. Then we simply update any block groups that have been dirtied since the last time we called btrfs_write_dirty_block_groups. This allows us to clean up how we write the free space cache out so it is much cleaner. 
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 5 +- fs/btrfs/extent-tree.c | 167 ++++++++++++++------------------------------ fs/btrfs/free-space-cache.c | 8 ++- fs/btrfs/transaction.c | 14 ++-- fs/btrfs/transaction.h | 2 + 5 files changed, 72 insertions(+), 124 deletions(-) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 45ed4dc6a0ce..0b4683f560c8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1238,7 +1238,6 @@ enum btrfs_disk_cache_state { BTRFS_DC_ERROR = 1, BTRFS_DC_CLEAR = 2, BTRFS_DC_SETUP = 3, - BTRFS_DC_NEED_WRITE = 4, }; struct btrfs_caching_control { @@ -1276,7 +1275,6 @@ struct btrfs_block_group_cache { unsigned long full_stripe_len; unsigned int ro:1; - unsigned int dirty:1; unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; @@ -1314,6 +1312,9 @@ struct btrfs_block_group_cache { struct list_head ro_list; atomic_t trimming; + + /* For dirty block groups */ + struct list_head dirty_list; }; /* delayed seq elem */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 15116585e714..21c373fe256c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -74,8 +74,9 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -3315,120 +3316,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_group_cache *cache; - int err = 0; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; struct btrfs_path *path; - u64 last = 0; + + if (list_empty(&cur_trans->dirty_bgs)) + return 0; path = 
btrfs_alloc_path(); if (!path) return -ENOMEM; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - err = cache_save_setup(cache, trans, path); - last = cache->key.objectid + cache->key.offset; - btrfs_put_block_group(cache); - } - - while (1) { - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) { - btrfs_put_block_group(cache); - goto again; - } - - if (cache->dirty) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - if (cache->disk_cache_state == BTRFS_DC_SETUP) - cache->disk_cache_state = BTRFS_DC_NEED_WRITE; - cache->dirty = 0; - last = cache->key.objectid + cache->key.offset; - - err = write_one_cache_group(trans, root, path, cache); - btrfs_put_block_group(cache); - if (err) /* File system offline */ - goto out; - } - - while (1) { - /* - * I don't think this is needed since we're just marking our - * preallocated extent as written, but just in case it can't - * hurt. - */ - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - /* - * Really this shouldn't happen, but it could if we - * couldn't write the entire preallocated extent and - * splitting the extent resulted in a new block. 
- */ - if (cache->dirty) { - btrfs_put_block_group(cache); - goto again; - } - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - err = btrfs_write_out_cache(root, trans, cache, path); - - /* - * If we didn't have an error then the cache state is still - * NEED_WRITE, so we can set it to WRITTEN. - */ - if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - cache->disk_cache_state = BTRFS_DC_WRITTEN; - last = cache->key.objectid + cache->key.offset; + /* + * We don't need the lock here since we are protected by the transaction + * commit. We want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. + */ + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + list_del_init(&cache->dirty_list); + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, + (unsigned long) -1); + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) + btrfs_write_out_cache(root, trans, cache, path); + if (!ret) + ret = write_one_cache_group(trans, root, path, cache); btrfs_put_block_group(cache); } -out: btrfs_free_path(path); - return err; + return ret; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -5375,8 +5298,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc) +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; struct btrfs_fs_info *info = root->fs_info; @@ -5414,6 +5338,14 
@@ static int update_block_group(struct btrfs_root *root, if (!alloc && cache->cached == BTRFS_CACHE_NO) cache_block_group(cache, 1); + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -5424,7 +5356,6 @@ static int update_block_group(struct btrfs_root *root, cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; - cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { @@ -6103,7 +6034,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = update_block_group(root, bytenr, num_bytes, 0); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -7063,7 +6994,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(root, ins->objectid, ins->offset, 1); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -7152,7 +7083,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; } - ret = update_block_group(root, ins->objectid, root->nodesize, 1); + ret = update_block_group(trans, root, ins->objectid, root->nodesize, + 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -9005,6 +8937,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); 
INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); @@ -9068,9 +9001,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - cache->disk_cache_state = BTRFS_DC_CLEAR; if (btrfs_test_opt(root, SPACE_CACHE)) - cache->dirty = 1; + cache->disk_cache_state = BTRFS_DC_CLEAR; } read_extent_buffer(leaf, &cache->item, @@ -9461,6 +9393,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } } + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d6c03f7f136b..80a3141463e7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1243,6 +1243,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; + enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN; root = root->fs_info->tree_root; @@ -1266,9 +1267,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); if (ret) { - spin_lock(&block_group->lock); - block_group->disk_cache_state = BTRFS_DC_ERROR; - spin_unlock(&block_group->lock); + dcs = BTRFS_DC_ERROR; ret = 0; #ifdef DEBUG btrfs_err(root->fs_info, @@ -1277,6 +1276,9 @@ int btrfs_write_out_cache(struct btrfs_root *root, #endif } + spin_lock(&block_group->lock); + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); iput(inode); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c 
index aa2219ebecc9..e0faf803513a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -248,6 +248,8 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->pending_ordered); + INIT_LIST_HEAD(&cur_trans->dirty_bgs); + spin_lock_init(&cur_trans->dirty_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -1028,7 +1030,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start && - old_root_used == btrfs_root_used(&root->root_item)) + old_root_used == btrfs_root_used(&root->root_item) && + (!extent_root || + list_empty(&trans->transaction->dirty_bgs))) break; btrfs_set_root_node(&root->root_item, root->node); @@ -1047,6 +1051,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; } return 0; @@ -1067,10 +1074,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; - eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); @@ -1990,6 +1993,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, switch_commit_roots(cur_trans, root->fs_info); assert_qgroups_uptodate(trans); + ASSERT(list_empty(&cur_trans->dirty_bgs)); update_super_roots(root); btrfs_set_super_log_root(root->fs_info->super_copy, 0); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 00ed29c4b3f9..3305451451ca 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -58,6 +58,8 @@ struct 
btrfs_transaction { struct list_head pending_chunks; struct list_head pending_ordered; struct list_head switch_commits; + struct list_head dirty_bgs; + spinlock_t dirty_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; -- cgit v1.2.3 From 13212b54d18d5235fb97fbdcba8ae453fd2a3a51 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 12 Feb 2015 14:18:17 +0800 Subject: btrfs: Fix out-of-space bug Btrfs will report NO_SPACE when we create and remove files several times, and we can't write to the filesystem until we mount it again. Steps to reproduce: 1: Create a single-dev btrfs fs with default options 2: Write a file into it to take up most of the fs space 3: Delete the above file 4: Wait about 100s to let the chunk be removed 5: goto 2 The script is as follows: #!/bin/bash # Recommend 1.2G space, too large disk will make test slow DEV="/dev/sda16" MNT="/mnt/tmp" dev_size="$(lsblk -bn -o SIZE "$DEV")" || exit 2 file_size_m=$((dev_size * 75 / 100 / 1024 / 1024)) echo "Loop write ${file_size_m}M file on $((dev_size / 1024 / 1024))M dev" for ((i = 0; i < 10; i++)); do umount "$MNT" 2>/dev/null; done echo "mkfs $DEV" mkfs.btrfs -f "$DEV" >/dev/null || exit 2 echo "mount $DEV $MNT" mount "$DEV" "$MNT" || exit 2 for ((loop_i = 0; loop_i < 20; loop_i++)); do echo echo "loop $loop_i" echo "dd file..." cmd=(dd if=/dev/zero of="$MNT"/file0 bs=1M count="$file_size_m") "${cmd[@]}" 2>/dev/null || { # NO_SPACE error triggered echo "dd failed: ${cmd[*]}" exit 1 } echo "rm file..." rm -f "$MNT"/file0 || exit 2 for ((i = 0; i < 10; i++)); do df "$MNT" | tail -1 sleep 10 done done Reason: It is triggered by commit: 47ab2a6c689913db23ccae38349714edf8365e0a which is used to remove empty block groups automatically, but the root cause is not in that patch. The code before it worked well because btrfs didn't need to create and delete chunks so many times with such high complexity. The above bug has several causes, and any one of them can trigger it. 
Reason1: When we remove some contiguous chunks but leave other chunks after them, this disk space should be reused when chunks are recreated, but in the current code only the first create will succeed. Fixed by Forrest Liu in: Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole Reason2: contains_pending_extent() returns a wrong value in its calculation. Fixed by Forrest Liu in: Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole Reason3: btrfs_check_data_free_space() tries to commit the transaction and retry allocating a chunk when the first allocation fails, but space_info->full is set by the first allocation, and this prevents the second allocation in the retry. Fixed in this patch by clearing space_info->full when committing the transaction. Tested several times with the above script. Changelog v3->v4: use light weight int instead of atomic_t to record have_remove_bgs in transaction, suggested by: Josef Bacik Changelog v2->v3: v2 fixed the bug by adding more transaction commits, but we only need to reclaim space when we really have no space for a new chunk, noticed by: Filipe David Manana Actually, our code already has this type of commit-and-retry; we only need to make it work with removed bgs. v3 fixed the bug in the above way. Changelog v1->v2: v1 would introduce a new bug when deleting and creating a chunk in the same disk space in the same transaction, noticed by: Filipe David Manana V2 fixes this bug by committing the transaction after removing block groups. Reported-by: Tsutomu Itoh Suggested-by: Filipe David Manana Suggested-by: Josef Bacik Signed-off-by: Zhao Lei Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 4 ++++ fs/btrfs/transaction.h | 5 +++++ fs/btrfs/volumes.c | 2 ++ 3 files changed, 11 insertions(+) (limited to 'fs/btrfs/transaction.c') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e0faf803513a..038fcf6051e0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -220,6 +220,7 @@ loop: * commit the transaction. 
*/ atomic_set(&cur_trans->use_count, 2); + cur_trans->have_free_bgs = 0; cur_trans->start_time = get_seconds(); cur_trans->delayed_refs.href_root = RB_ROOT; @@ -2037,6 +2038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); + if (cur_trans->have_free_bgs) + btrfs_clear_space_info_full(root->fs_info); + root->fs_info->last_trans_committed = cur_trans->transid; /* * We needn't acquire the lock here because there is no other task diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 3305451451ca..937050a2b68e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,6 +47,11 @@ struct btrfs_transaction { atomic_t num_writers; atomic_t use_count; + /* + * true if there is free bgs operations in this transaction + */ + int have_free_bgs; + /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; struct list_head list; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2c4cab2dbd1a..cd4d1315aaa9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1310,6 +1310,8 @@ again: if (ret) { btrfs_error(root->fs_info, ret, "Failed to remove dev extent item"); + } else { + trans->transaction->have_free_bgs = 1; } out: btrfs_free_path(path); -- cgit v1.2.3