diff options
Diffstat (limited to 'fs/btrfs')
45 files changed, 2304 insertions, 1438 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 2e558227931a..273351ee4c46 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -38,9 +38,6 @@ config BTRFS_FS_POSIX_ACL POSIX Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N config BTRFS_FS_CHECK_INTEGRITY diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 6fe881d5cb38..0c4373628eb4 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -19,4 +19,4 @@ btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ - tests/free-space-tree-tests.o + tests/free-space-tree-tests.o tests/extent-map-tests.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7d0dc100a09a..e4054e533f6d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -216,7 +216,8 @@ static int prelim_ref_compare(struct prelim_ref *ref1, return 0; } -void update_share_count(struct share_check *sc, int oldcount, int newcount) +static void update_share_count(struct share_check *sc, int oldcount, + int newcount) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5982c8a71f02..07d049c0c20f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -33,7 +33,6 @@ #include <linux/bit_spinlock.h> #include <linux/slab.h> #include <linux/sched/mm.h> -#include <linux/sort.h> #include <linux/log2.h> #include "ctree.h" #include "disk-io.h" @@ -45,6 +44,21 @@ #include "extent_io.h" #include "extent_map.h" +static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; + +const char* btrfs_compress_type2str(enum btrfs_compression_type type) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: + case BTRFS_COMPRESS_LZO: + case BTRFS_COMPRESS_ZSTD: + case BTRFS_COMPRESS_NONE: + return btrfs_compress_types[type]; + } + + return NULL; +} + static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, @@ -348,8 +362,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, page->mapping = NULL; if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_get(bio); - /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -372,8 +384,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } - bio_put(bio); - bio = btrfs_bio_alloc(bdev, first_byte); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; @@ -389,7 +399,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, first_byte += PAGE_SIZE; cond_resched(); } - bio_get(bio); ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -405,13 +414,12 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } - bio_put(bio); return 0; } static u64 bio_end_offset(struct bio *bio) { - struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1]; + struct bio_vec *last = bio_last_bvec_all(bio); return page_offset(last->bv_page) + last->bv_len + last->bv_offset; } @@ -563,7 +571,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, - page_offset(bio->bi_io_vec->bv_page), + page_offset(bio_first_page_all(bio)), PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) @@ -638,8 +646,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -666,8 +672,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_endio(comp_bio); } - bio_put(comp_bio); - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); bio_set_op_attrs(comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; @@ -677,7 +681,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } cur_disk_byte += PAGE_SIZE; } - bio_get(comp_bio); ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -693,7 +696,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_endio(comp_bio); } - bio_put(comp_bio); return 0; fail2: @@ -752,6 +754,8 @@ struct heuristic_ws { u32 sample_size; /* Buckets store counters for each byte value */ struct bucket_item *bucket; + /* Sorting buffer */ + struct bucket_item *bucket_b; struct list_head list; }; @@ -763,6 +767,7 @@ static void free_heuristic_ws(struct list_head *ws) kvfree(workspace->sample); kfree(workspace->bucket); + kfree(workspace->bucket_b); kfree(workspace); } @@ -782,6 +787,10 @@ static struct list_head *alloc_heuristic_ws(void) if (!ws->bucket) goto fail; + ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL); + if (!ws->bucket_b) + goto fail; + INIT_LIST_HEAD(&ws->list); return &ws->list; fail: @@ -1278,13 +1287,103 @@ static u32 shannon_entropy(struct heuristic_ws *ws) return entropy_sum * 100 / entropy_max; } -/* Compare buckets by size, ascending */ -static int bucket_comp_rev(const void *lv, const void *rv) +#define RADIX_BASE 4U +#define COUNTERS_SIZE (1U << RADIX_BASE) + +static u8 get4bits(u64 num, int shift) { + u8 low4bits; + + num >>= shift; + /* Reverse order */ + low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE); + return low4bits; +} + +/* + * Use 4 bits as radix base + * Use 16 u32 counters for calculating new possition in buf array + * + * @array - array that will be sorted + * @array_buf - buffer array to store sorting results + * must be equal in size to @array + * @num - array size + */ +static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf, + int num) { - const struct bucket_item *l = (const struct bucket_item *)lv; - const struct bucket_item *r = (const struct bucket_item *)rv; + u64 max_num; + u64 buf_num; + u32 counters[COUNTERS_SIZE]; + u32 new_addr; + u32 addr; + int bitlen; + int shift; + int i; - return r->count - l->count; + /* + * Try avoid useless loop iterations for small numbers stored in big + * counters. Example: 48 33 4 ... in 64bit array + */ + max_num = array[0].count; + for (i = 1; i < num; i++) { + buf_num = array[i].count; + if (buf_num > max_num) + max_num = buf_num; + } + + buf_num = ilog2(max_num); + bitlen = ALIGN(buf_num, RADIX_BASE * 2); + + shift = 0; + while (shift < bitlen) { + memset(counters, 0, sizeof(counters)); + + for (i = 0; i < num; i++) { + buf_num = array[i].count; + addr = get4bits(buf_num, shift); + counters[addr]++; + } + + for (i = 1; i < COUNTERS_SIZE; i++) + counters[i] += counters[i - 1]; + + for (i = num - 1; i >= 0; i--) { + buf_num = array[i].count; + addr = get4bits(buf_num, shift); + counters[addr]--; + new_addr = counters[addr]; + array_buf[new_addr] = array[i]; + } + + shift += RADIX_BASE; + + /* + * Normal radix expects to move data from a temporary array, to + * the main one. But that requires some CPU time. Avoid that + * by doing another sort iteration to original array instead of + * memcpy() + */ + memset(counters, 0, sizeof(counters)); + + for (i = 0; i < num; i ++) { + buf_num = array_buf[i].count; + addr = get4bits(buf_num, shift); + counters[addr]++; + } + + for (i = 1; i < COUNTERS_SIZE; i++) + counters[i] += counters[i - 1]; + + for (i = num - 1; i >= 0; i--) { + buf_num = array_buf[i].count; + addr = get4bits(buf_num, shift); + counters[addr]--; + new_addr = counters[addr]; + array[new_addr] = array_buf[i]; + } + + shift += RADIX_BASE; + } } /* @@ -1314,7 +1413,7 @@ static int byte_core_set_size(struct heuristic_ws *ws) struct bucket_item *bucket = ws->bucket; /* Sort in reverse order */ - sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL); + radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE); for (i = 0; i < BYTE_CORE_SET_LOW; i++) coreset_sum += bucket[i].count; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 0868cc554f14..677fa4aa0bd7 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -75,7 +75,7 @@ struct compressed_bio { u32 sums; }; -void btrfs_init_compress(void); +void __init btrfs_init_compress(void); void btrfs_exit_compress(void); int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, @@ -137,6 +137,8 @@ extern const struct btrfs_compress_op btrfs_zlib_compress; extern const struct btrfs_compress_op btrfs_lzo_compress; extern const struct btrfs_compress_op btrfs_zstd_compress; +const char* btrfs_compress_type2str(enum btrfs_compression_type type); + int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1e74cf826532..b88a79e69ddf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1807,8 +1807,8 @@ static noinline int generic_bin_search(struct extent_buffer *eb, * simple bin_search frontend that does the right thing for * leaves vs nodes */ -static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int level, int *slot) +int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, + int level, int *slot) { if (level == 0) return generic_bin_search(eb, @@ -1824,12 +1824,6 @@ static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key, slot); } -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int level, int *slot) -{ - return bin_search(eb, key, level, slot); -} - static void root_add_used(struct btrfs_root *root, u32 size) { spin_lock(&root->accounting_lock); @@ -2614,7 +2608,7 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key, int level, int *prev_cmp, int *slot) { if (*prev_cmp != 0) { - *prev_cmp = bin_search(b, key, level, slot); + *prev_cmp = btrfs_bin_search(b, key, level, slot); return *prev_cmp; } @@ -2660,17 +2654,29 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, } /* - * look for key in the tree. path is filled in with nodes along the way - * if key is found, we return zero and you can find the item in the leaf - * level of the path (level 0) + * btrfs_search_slot - look for a key in a tree and perform necessary + * modifications to preserve tree invariants. + * + * @trans: Handle of transaction, used when modifying the tree + * @p: Holds all btree nodes along the search path + * @root: The root node of the tree + * @key: The key we are looking for + * @ins_len: Indicates purpose of search, for inserts it is 1, for + * deletions it's -1. 0 for plain searches + * @cow: boolean should CoW operations be performed. Must always be 1 + * when modifying the tree. + * + * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. + * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) * - * If the key isn't found, the path points to the slot where it should - * be inserted, and 1 is returned. If there are other errors during the - * search a negative error number is returned. + * If @key is found, 0 is returned and you can find the item in the leaf level + * of the path (level 0) * - * if ins_len > 0, nodes and leaves will be split as we walk down the - * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if - * possible) + * If @key isn't found, 1 is returned and the leaf level of the path (level 0) + * points to the slot where it should be inserted + * + * If an error is encountered while searching the tree a negative error number + * is returned */ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, @@ -2774,6 +2780,8 @@ again: * contention with the cow code */ if (cow) { + bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); + /* * if we don't really need to cow this block * then we don't want to set the path blocking, @@ -2798,9 +2806,13 @@ again: } btrfs_set_path_blocking(p); - err = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b); + if (last_level) + err = btrfs_cow_block(trans, root, b, NULL, 0, + &b); + else + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); if (err) { ret = err; goto done; @@ -5175,7 +5187,7 @@ again: while (1) { nritems = btrfs_header_nritems(cur); level = btrfs_header_level(cur); - sret = bin_search(cur, min_key, level, &slot); + sret = btrfs_bin_search(cur, min_key, level, &slot); /* at the lowest level, we're done, setup the path and exit */ if (level == path->lowest_level) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 13c260b525a1..1a462ab85c49 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -679,7 +679,6 @@ enum btrfs_orphan_cleanup_state { /* used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash { struct list_head hash_list; - wait_queue_head_t wait; spinlock_t lock; }; @@ -3060,15 +3059,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); -int verify_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, int slot, - struct btrfs_dir_item *dir_item); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len); -bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, - unsigned long start, u16 name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -3197,7 +3191,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); -int btrfs_init_cachep(void); +int __init btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, @@ -3248,7 +3242,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, struct file *dst_file, u64 dst_loff); /* file.c */ -int btrfs_auto_defrag_init(void); +int __init btrfs_auto_defrag_init(void); void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); @@ -3283,7 +3277,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* sysfs.c */ -int btrfs_init_sysfs(void); +int __init btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 056276101c63..0530f6f2e4ba 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -18,6 +18,7 @@ */ #include <linux/slab.h> +#include <linux/iversion.h> #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" @@ -1302,40 +1303,42 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) if (!path) goto out; -again: - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2) - goto free_path; + do { + if (atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND / 2) + break; - delayed_node = btrfs_first_prepared_delayed_node(delayed_root); - if (!delayed_node) - goto free_path; + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + break; - path->leave_spinning = 1; - root = delayed_node->root; + path->leave_spinning = 1; + root = delayed_node->root; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto release_path; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_release_path(path); + btrfs_release_prepared_delayed_node(delayed_node); + total_done++; + continue; + } - block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->delayed_block_rsv; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; - __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - trans->block_rsv = block_rsv; - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty_nodelay(root->fs_info); + trans->block_rsv = block_rsv; + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty_nodelay(root->fs_info); -release_path: - btrfs_release_path(path); - total_done++; + btrfs_release_path(path); + btrfs_release_prepared_delayed_node(delayed_node); + total_done++; - btrfs_release_prepared_delayed_node(delayed_node); - if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) || - total_done < async_work->nr) - goto again; + } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) + || total_done < async_work->nr); -free_path: btrfs_free_path(path); out: wake_up(&delayed_root->wait); @@ -1348,10 +1351,6 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, { struct btrfs_async_delayed_work *async_work; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || - btrfs_workqueue_normal_congested(fs_info->delayed_workers)) - return 0; - async_work = kmalloc(sizeof(*async_work), GFP_NOFS); if (!async_work) return -ENOMEM; @@ -1387,7 +1386,8 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) { struct btrfs_delayed_root *delayed_root = fs_info->delayed_root; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) || + btrfs_workqueue_normal_congested(fs_info->delayed_workers)) return; if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { @@ -1633,28 +1633,18 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index) { - struct btrfs_delayed_item *curr, *next; - int ret; - - if (list_empty(del_list)) - return 0; + struct btrfs_delayed_item *curr; + int ret = 0; - list_for_each_entry_safe(curr, next, del_list, readdir_list) { + list_for_each_entry(curr, del_list, readdir_list) { if (curr->key.offset > index) break; - - list_del(&curr->readdir_list); - ret = (curr->key.offset == index); - - if (refcount_dec_and_test(&curr->refs)) - kfree(curr); - - if (ret) - return 1; - else - continue; + if (curr->key.offset == index) { + ret = 1; + break; + } } - return 0; + return ret; } /* @@ -1723,7 +1713,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); btrfs_set_stack_inode_generation(inode_item, BTRFS_I(inode)->generation); - btrfs_set_stack_inode_sequence(inode_item, inode->i_version); + btrfs_set_stack_inode_sequence(inode_item, + inode_peek_iversion(inode)); btrfs_set_stack_inode_transid(inode_item, trans->transid); btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); @@ -1777,7 +1768,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); - inode->i_version = btrfs_stack_inode_sequence(inode_item); + inode_set_iversion_queried(inode, + btrfs_stack_inode_sequence(inode_item)); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 83be8f9fd906..a1a40cf382e3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -937,7 +937,7 @@ void btrfs_delayed_ref_exit(void) kmem_cache_destroy(btrfs_delayed_extent_op_cachep); } -int btrfs_delayed_ref_init(void) +int __init btrfs_delayed_ref_init(void) { btrfs_delayed_ref_head_cachep = kmem_cache_create( "btrfs_delayed_ref_head", diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a43af432f859..c4f625e5a691 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -203,7 +203,7 @@ extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; extern struct kmem_cache *btrfs_delayed_data_ref_cachep; extern struct kmem_cache *btrfs_delayed_extent_op_cachep; -int btrfs_delayed_ref_init(void); +int __init btrfs_delayed_ref_init(void); void btrfs_delayed_ref_exit(void); static inline struct btrfs_delayed_extent_op * diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7c655f9a7a50..7efbc4d1128b 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -172,7 +172,8 @@ no_valid_dev_replace_entry_found: dev_replace->tgtdev->commit_bytes_used = dev_replace->srcdev->commit_bytes_used; } - dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; + set_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &dev_replace->tgtdev->dev_state); btrfs_init_dev_replace_tgtdev_for_resume(fs_info, dev_replace->tgtdev); } @@ -304,6 +305,14 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } +static char* btrfs_dev_name(struct btrfs_device *device) +{ + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + return "<missing disk>"; + else + return rcu_str_deref(device->name); +} + int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) @@ -363,8 +372,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_info_in_rcu(fs_info, "dev_replace from %s (devid %llu) to %s started", - src_device->missing ? "<missing disk>" : - rcu_str_deref(src_device->name), + btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name)); @@ -538,8 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } else { btrfs_err_in_rcu(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", - src_device->missing ? "<missing disk>" : - rcu_str_deref(src_device->name), + btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace, 1); @@ -557,11 +564,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_info_in_rcu(fs_info, "dev_replace from %s (devid %llu) to %s finished", - src_device->missing ? "<missing disk>" : - rcu_str_deref(src_device->name), + btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name)); - tgt_device->is_tgtdev_for_dev_replace = 0; + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state); tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); @@ -814,12 +820,10 @@ static int btrfs_dev_replace_kthread(void *data) progress = btrfs_dev_replace_progress(fs_info); progress = div_u64(progress, 10); btrfs_info_in_rcu(fs_info, - "continuing dev_replace from %s (devid %llu) to %s @%u%%", - dev_replace->srcdev->missing ? "<missing disk>" - : rcu_str_deref(dev_replace->srcdev->name), + "continuing dev_replace from %s (devid %llu) to target %s @%u%%", + btrfs_dev_name(dev_replace->srcdev), dev_replace->srcdev->devid, - dev_replace->tgtdev ? rcu_str_deref(dev_replace->tgtdev->name) - : "<missing target disk>", + btrfs_dev_name(dev_replace->tgtdev), (unsigned int)progress); btrfs_dev_replace_continue_on_mount(fs_info); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 41cb9196eaa8..cbe421605cd5 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -403,8 +403,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); - if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item)) - return NULL; if (btrfs_dir_name_len(leaf, dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; @@ -450,109 +448,3 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, } return ret; } - -int verify_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, - int slot, - struct btrfs_dir_item *dir_item) -{ - u16 namelen = BTRFS_NAME_LEN; - int ret; - u8 type = btrfs_dir_type(leaf, dir_item); - - if (type >= BTRFS_FT_MAX) { - btrfs_crit(fs_info, "invalid dir item type: %d", (int)type); - return 1; - } - - if (type == BTRFS_FT_XATTR) - namelen = XATTR_NAME_MAX; - - if (btrfs_dir_name_len(leaf, dir_item) > namelen) { - btrfs_crit(fs_info, "invalid dir item name len: %u", - (unsigned)btrfs_dir_name_len(leaf, dir_item)); - return 1; - } - - namelen = btrfs_dir_name_len(leaf, dir_item); - ret = btrfs_is_name_len_valid(leaf, slot, - (unsigned long)(dir_item + 1), namelen); - if (!ret) - return 1; - - /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ - if ((btrfs_dir_data_len(leaf, dir_item) + - btrfs_dir_name_len(leaf, dir_item)) > - BTRFS_MAX_XATTR_SIZE(fs_info)) { - btrfs_crit(fs_info, "invalid dir item name + data len: %u + %u", - (unsigned)btrfs_dir_name_len(leaf, dir_item), - (unsigned)btrfs_dir_data_len(leaf, dir_item)); - return 1; - } - - return 0; -} - -bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, - unsigned long start, u16 name_len) -{ - struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_key key; - u32 read_start; - u32 read_end; - u32 item_start; - u32 item_end; - u32 size; - bool ret = true; - - ASSERT(start > BTRFS_LEAF_DATA_OFFSET); - - read_start = start - BTRFS_LEAF_DATA_OFFSET; - read_end = read_start + name_len; - item_start = btrfs_item_offset_nr(leaf, slot); - item_end = btrfs_item_end_nr(leaf, slot); - - btrfs_item_key_to_cpu(leaf, &key, slot); - - switch (key.type) { - case BTRFS_DIR_ITEM_KEY: - case BTRFS_XATTR_ITEM_KEY: - case BTRFS_DIR_INDEX_KEY: - size = sizeof(struct btrfs_dir_item); - break; - case BTRFS_INODE_REF_KEY: - size = sizeof(struct btrfs_inode_ref); - break; - case BTRFS_INODE_EXTREF_KEY: - size = sizeof(struct btrfs_inode_extref); - break; - case BTRFS_ROOT_REF_KEY: - case BTRFS_ROOT_BACKREF_KEY: - size = sizeof(struct btrfs_root_ref); - break; - default: - ret = false; - goto out; - } - - if (read_start < item_start) { - ret = false; - goto out; - } - if (read_end > item_end) { - ret = false; - goto out; - } - - /* there shall be item(s) before name */ - if (read_start - item_start < size) { - ret = false; - goto out; - } - -out: - if (!ret) - btrfs_crit(fs_info, "invalid dir item name len: %u", - (unsigned int)name_len); - return ret; -} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a8ecccfc36de..21f34ad0d411 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -30,6 +30,7 @@ #include <linux/ratelimit.h> #include <linux/uuid.h> #include <linux/semaphore.h> +#include <linux/error-injection.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@ -61,7 +62,8 @@ BTRFS_HEADER_FLAG_RELOC |\ BTRFS_SUPER_FLAG_ERROR |\ BTRFS_SUPER_FLAG_SEEDING |\ - BTRFS_SUPER_FLAG_METADUMP) + BTRFS_SUPER_FLAG_METADUMP |\ + BTRFS_SUPER_FLAG_METADUMP_V2) static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -220,7 +222,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, * extents on the btree inode are pretty simple, there's one extent * that covers the entire device */ -static struct extent_map *btree_get_extent(struct btrfs_inode *inode, +struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) { @@ -285,7 +287,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, int verify) { u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - char *result = NULL; + char result[BTRFS_CSUM_SIZE]; unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; @@ -294,7 +296,6 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, unsigned long map_len; int err; u32 crc = ~(u32)0; - unsigned long inline_result; len = buf->len - offset; while (len > 0) { @@ -308,13 +309,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, len -= cur_len; offset += cur_len; } - if (csum_size > sizeof(inline_result)) { - result = kzalloc(csum_size, GFP_NOFS); - if (!result) - return -ENOMEM; - } else { - result = (char *)&inline_result; - } + memset(result, 0, BTRFS_CSUM_SIZE); btrfs_csum_final(crc, result); @@ -329,15 +324,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, "%s checksum verify failed on %llu wanted %X found %X level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); - if (result != (char *)&inline_result) - kfree(result); return -EUCLEAN; } } else { write_extent_buffer(buf, result, 0, csum_size); } - if (result != (char *)&inline_result) - kfree(result); + return 0; } @@ -391,7 +383,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state, GFP_NOFS); + &cached_state); if (need_lock) btrfs_tree_read_unlock_blocking(eb); return ret; @@ -455,7 +447,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, - btree_get_extent, mirror_num); + mirror_num); if (!ret) { if (!verify_parent_transid(io_tree, eb, parent_transid, 0)) @@ -1012,7 +1004,7 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, WAIT_NONE, btree_get_extent, 0); + buf, WAIT_NONE, 0); free_extent_buffer(buf); } @@ -1031,7 +1023,7 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, - btree_get_extent, mirror_num); + mirror_num); if (ret) { free_extent_buffer(buf); return ret; @@ -1243,7 +1235,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct btrfs_key key; int ret = 0; - uuid_le uuid; + uuid_le uuid = NULL_UUID_LE; root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) @@ -1284,7 +1276,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); - uuid_le_gen(&uuid); + if (is_fstree(objectid)) + uuid_le_gen(&uuid); memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); root->root_item.drop_level = 0; @@ -2875,7 +2868,7 @@ retry_root_backup: goto fail_sysfs; } - if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info)) { + if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writeable mount is not allowed due to too many missing devices"); goto fail_sysfs; @@ -3123,6 +3116,7 @@ recovery_tree_root: goto fail_block_groups; goto retry_root_backup; } +ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { @@ -3357,7 +3351,7 @@ static void write_dev_flush(struct btrfs_device *device) bio->bi_private = &device->flush_wait; btrfsic_submit_bio(bio); - device->flush_bio_sent = 1; + set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } /* @@ -3367,10 +3361,10 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) { struct bio *bio = device->flush_bio; - if (!device->flush_bio_sent) + if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK; - device->flush_bio_sent = 0; + clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); wait_for_completion_io(&device->flush_wait); return bio->bi_status; @@ -3378,7 +3372,7 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) static int check_barrier_error(struct btrfs_fs_info *fs_info) { - if (!btrfs_check_rw_degradable(fs_info)) + if (!btrfs_check_rw_degradable(fs_info, NULL)) return -EIO; return 0; } @@ -3394,14 +3388,16 @@ static int barrier_all_devices(struct btrfs_fs_info *info) int errors_wait = 0; blk_status_t ret; + lockdep_assert_held(&info->fs_devices->device_list_mutex); /* send down all the barriers */ head = &info->fs_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - if (dev->missing) + list_for_each_entry(dev, head, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) continue; - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; write_dev_flush(dev); @@ -3409,14 +3405,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info) } /* wait for all the barriers */ - list_for_each_entry_rcu(dev, head, dev_list) { - if (dev->missing) + list_for_each_entry(dev, head, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) { errors_wait++; continue; } - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; ret = wait_dev_flush(dev); @@ -3508,12 +3505,13 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) } } - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; } - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; btrfs_set_stack_device_generation(dev_item, 0); @@ -3549,10 +3547,11 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) } total_errors = 0; - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) continue; - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; ret = wait_dev_supers(dev, max_mirrors); @@ -3910,9 +3909,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } - if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) - btrfs_warn(fs_info, "unrecognized super flag: %llu", + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { + btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 7f7c35d6347a..301151a50ac1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -149,6 +149,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); +struct extent_map *btree_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, u64 start, u64 len, + int create); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 3aeb5770f896..ddaccad469f8 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -283,11 +283,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, name_len = btrfs_inode_ref_name_len(leaf, iref); } - ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len); - if (!ret) { - btrfs_free_path(path); - return -EIO; - } read_extent_buffer(leaf, name, name_ptr, name_len); btrfs_free_path(path); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2f4328511ac8..05751a677da4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2145,7 +2145,10 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; - if (!stripe->dev->can_discard) + struct request_queue *req_q; + + req_q = bdev_get_queue(stripe->dev->bdev); + if (!blk_queue_discard(req_q)) continue; ret = btrfs_issue_discard(stripe->dev->bdev, @@ -2894,7 +2897,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; - u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; + unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs; u64 num_bytes, num_dirty_bgs_bytes; int ret = 0; @@ -4945,12 +4948,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, bytes = 0; else bytes -= delayed_rsv->size; + spin_unlock(&delayed_rsv->lock); + if (percpu_counter_compare(&space_info->total_bytes_pinned, bytes) < 0) { - spin_unlock(&delayed_rsv->lock); return -ENOSPC; } - spin_unlock(&delayed_rsv->lock); commit: trans = btrfs_join_transaction(fs_info->extent_root); @@ -5738,8 +5741,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, * or return if we already have enough space. This will also handle the resreve * tracepoint for the reserved amount. */ -int btrfs_inode_rsv_refill(struct btrfs_inode *inode, - enum btrfs_reserve_flush_enum flush) +static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, + enum btrfs_reserve_flush_enum flush) { struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; @@ -5770,7 +5773,7 @@ int btrfs_inode_rsv_refill(struct btrfs_inode *inode, * This is the same as btrfs_block_rsv_release, except that it handles the * tracepoint for the reservation. */ -void btrfs_inode_rsv_release(struct btrfs_inode *inode) +static void btrfs_inode_rsv_release(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; @@ -9690,7 +9693,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) * space to fit our block group in. */ if (device->total_bytes > device->bytes_used + min_free && - !device->is_tgtdev_for_dev_replace) { + !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = find_free_dev_extent(trans, device, min_free, &dev_offset, NULL); if (!ret) @@ -10875,7 +10878,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, *trimmed = 0; /* Not writeable = nothing to do. */ - if (!device->writeable) + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; /* No free space = nothing to do. */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 012d63870b99..dfeb74a0be77 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -21,6 +21,7 @@ #include "locking.h" #include "rcu-string.h" #include "backref.h" +#include "disk-io.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -109,8 +110,6 @@ struct tree_entry { struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; - get_extent_t *get_extent; - /* tells writepage not to lock the state bits for this range * it still does the unlocking */ @@ -139,7 +138,8 @@ static void add_extent_changeset(struct extent_state *state, unsigned bits, BUG_ON(ret < 0); } -static noinline void flush_write_bio(void *data); +static void flush_write_bio(struct extent_page_data *epd); + static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { @@ -581,7 +581,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * * This takes the tree lock, and returns 0 on success and < 0 on error. */ -static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask, struct extent_changeset *changeset) @@ -1295,10 +1295,10 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, - struct extent_state **cached, gfp_t mask) + struct extent_state **cached) { return __clear_extent_bit(tree, start, end, bits, wake, delete, - cached, mask, NULL); + cached, GFP_NOFS, NULL); } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -1348,7 +1348,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); + EXTENT_LOCKED, 1, 0, NULL); return 0; } return 1; @@ -1648,7 +1648,7 @@ again: EXTENT_DELALLOC, 1, cached_state); if (!ret) { unlock_extent_cached(tree, delalloc_start, delalloc_end, - &cached_state, GFP_NOFS); + &cached_state); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); @@ -1744,7 +1744,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, unsigned long page_ops) { clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, - NULL, GFP_NOFS); + NULL); __process_pages_contig(inode->i_mapping, locked_page, start >> PAGE_SHIFT, end >> PAGE_SHIFT, @@ -2027,7 +2027,8 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio->bi_iter.bi_sector = sector; dev = bbio->stripes[bbio->mirror_num - 1].dev; btrfs_put_bbio(bbio); - if (!dev || !dev->bdev || !dev->writeable) { + if (!dev || !dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; @@ -2257,7 +2258,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, return 0; } -bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, +bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, struct io_failure_record *failrec, int failed_mirror) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2281,7 +2282,7 @@ bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, * a) deliver good data to the caller * b) correct the bad sectors on disk */ - if (failed_bio->bi_vcnt > 1) { + if (failed_bio_pages > 1) { /* * to fulfill b), we need to know the exact failing sectors, as * we don't want to rewrite any more than the failed ones. thus, @@ -2374,6 +2375,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, int read_mode = 0; blk_status_t status; int ret; + unsigned failed_bio_pages = bio_pages_all(failed_bio); BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2381,13 +2383,13 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, if (ret) return ret; - if (!btrfs_check_repairable(inode, failed_bio, failrec, + if (!btrfs_check_repairable(inode, failed_bio_pages, failrec, failed_mirror)) { free_io_failure(failure_tree, tree, failrec); return -EIO; } - if (failed_bio->bi_vcnt > 1) + if (failed_bio_pages > 1) read_mode |= REQ_FAILFAST_DEV; phy_offset >>= inode->i_sb->s_blocksize_bits; @@ -2492,7 +2494,7 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, if (uptodate && tree->track_uptodate) set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); - unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(tree, start, end, &cached); } /* @@ -2724,7 +2726,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { blk_status_t ret = 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio_last_bvec_all(bio); struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; u64 start; @@ -2732,7 +2734,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; - bio_get(bio); if (tree->ops) ret = tree->ops->submit_bio_hook(tree->private_data, bio, @@ -2740,7 +2741,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, else btrfsic_submit_bio(bio); - bio_put(bio); return blk_status_to_errno(ret); } @@ -2942,8 +2942,7 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + cur + iosize - 1, &cached); break; } em = __get_extent_map(inode, page, pg_offset, cur, @@ -3036,8 +3035,7 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + cur + iosize - 1, &cached); cur = cur + iosize; pg_offset += iosize; continue; @@ -3092,9 +3090,8 @@ out: static inline void __do_contiguous_readpages(struct extent_io_tree *tree, struct page *pages[], int nr_pages, u64 start, u64 end, - get_extent_t *get_extent, struct extent_map **em_cached, - struct bio **bio, int mirror_num, + struct bio **bio, unsigned long *bio_flags, u64 *prev_em_start) { @@ -3115,18 +3112,17 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, } for (index = 0; index < nr_pages; index++) { - __do_readpage(tree, pages[index], get_extent, em_cached, bio, - mirror_num, bio_flags, 0, prev_em_start); + __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, + bio, 0, bio_flags, 0, prev_em_start); put_page(pages[index]); } } static void __extent_readpages(struct extent_io_tree *tree, struct page *pages[], - int nr_pages, get_extent_t *get_extent, + int nr_pages, struct extent_map **em_cached, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, + struct bio **bio, unsigned long *bio_flags, u64 *prev_em_start) { u64 start = 0; @@ -3146,8 +3142,8 @@ static void __extent_readpages(struct extent_io_tree *tree, } else { __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, - end, get_extent, em_cached, - bio, mirror_num, bio_flags, + end, em_cached, + bio, bio_flags, prev_em_start); start = page_start; end = start + PAGE_SIZE - 1; @@ -3158,9 +3154,8 @@ static void __extent_readpages(struct extent_io_tree *tree, if (end) __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, - end, get_extent, em_cached, bio, - mirror_num, bio_flags, - prev_em_start); + end, em_cached, bio, + bio_flags, prev_em_start); } static int __extent_read_full_page(struct extent_io_tree *tree, @@ -3375,7 +3370,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, NULL, 1); break; } - em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur, + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, end - cur + 1, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); @@ -3458,10 +3453,9 @@ done: * and the end_io handler clears the writeback ranges */ static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) + struct extent_page_data *epd) { struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; u64 start = page_offset(page); u64 page_end = start + PAGE_SIZE - 1; int ret; @@ -3895,8 +3889,7 @@ retry: * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function + * @data: data passed to __extent_writepage function * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, @@ -3908,8 +3901,7 @@ retry: */ static int extent_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, - writepage_t writepage, void *data, - void (*flush_fn)(void *)) + struct extent_page_data *epd) { struct inode *inode = mapping->host; int ret = 0; @@ -3973,7 +3965,7 @@ retry: * mapping */ if (!trylock_page(page)) { - flush_fn(data); + flush_write_bio(epd); lock_page(page); } @@ -3984,7 +3976,7 @@ retry: if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) - flush_fn(data); + flush_write_bio(epd); wait_on_page_writeback(page); } @@ -3994,7 +3986,7 @@ retry: continue; } - ret = (*writepage)(page, wbc, data); + ret = __extent_writepage(page, wbc, epd); if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { unlock_page(page); @@ -4042,7 +4034,7 @@ retry: return ret; } -static void flush_epd_write_bio(struct extent_page_data *epd) +static void flush_write_bio(struct extent_page_data *epd) { if (epd->bio) { int ret; @@ -4053,37 +4045,28 @@ static void flush_epd_write_bio(struct extent_page_data *epd) } } -static noinline void flush_write_bio(void *data) -{ - struct extent_page_data *epd = data; - flush_epd_write_bio(epd); -} - -int extent_write_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, - struct writeback_control *wbc) +int extent_write_full_page(struct page *page, struct writeback_control *wbc) { int ret; struct extent_page_data epd = { .bio = NULL, - .tree = tree, - .get_extent = get_extent, + .tree = &BTRFS_I(page->mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; ret = __extent_writepage(page, wbc, &epd); - flush_epd_write_bio(&epd); + flush_write_bio(&epd); return ret; } -int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, - u64 start, u64 end, get_extent_t *get_extent, +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode) { int ret = 0; struct address_space *mapping = inode->i_mapping; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *page; unsigned long nr_pages = (end - start + PAGE_SIZE) >> PAGE_SHIFT; @@ -4091,7 +4074,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, struct extent_page_data epd = { .bio = NULL, .tree = tree, - .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, }; @@ -4117,34 +4099,30 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, start += PAGE_SIZE; } - flush_epd_write_bio(&epd); + flush_write_bio(&epd); return ret; } int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, - get_extent_t *get_extent, struct writeback_control *wbc) { int ret = 0; struct extent_page_data epd = { .bio = NULL, .tree = tree, - .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; - ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, - flush_write_bio); - flush_epd_write_bio(&epd); + ret = extent_write_cache_pages(mapping, wbc, &epd); + flush_write_bio(&epd); return ret; } int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent) + struct list_head *pages, unsigned nr_pages) { struct bio *bio = NULL; unsigned page_idx; @@ -4170,13 +4148,13 @@ int extent_readpages(struct extent_io_tree *tree, pagepool[nr++] = page; if (nr < ARRAY_SIZE(pagepool)) continue; - __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, &prev_em_start); + __extent_readpages(tree, pagepool, nr, &em_cached, &bio, + &bio_flags, &prev_em_start); nr = 0; } if (nr) - __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, &prev_em_start); + __extent_readpages(tree, pagepool, nr, &em_cached, &bio, + &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); @@ -4209,7 +4187,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, - 1, 1, &cached_state, GFP_NOFS); + 1, 1, &cached_state); return 0; } @@ -4234,9 +4212,9 @@ static int try_release_extent_state(struct extent_map_tree *map, * at this point we can safely clear everything except the * locked bit and the nodatasum bit */ - ret = clear_extent_bit(tree, start, end, + ret = __clear_extent_bit(tree, start, end, ~(EXTENT_LOCKED | EXTENT_NODATASUM), - 0, 0, NULL, mask); + 0, 0, NULL, mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -4302,9 +4280,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, * This maps until we find something past 'last' */ static struct extent_map *get_extent_skip_holes(struct inode *inode, - u64 offset, - u64 last, - get_extent_t *get_extent) + u64 offset, u64 last) { u64 sectorsize = btrfs_inode_sectorsize(inode); struct extent_map *em; @@ -4318,15 +4294,14 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0); + em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset, + len, 0); if (IS_ERR_OR_NULL(em)) return em; /* if this isn't a hole return it */ - if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && - em->block_start != EXTENT_MAP_HOLE) { + if (em->block_start != EXTENT_MAP_HOLE) return em; - } /* this is a hole, advance to the next extent */ offset = extent_map_end(em); @@ -4451,7 +4426,7 @@ static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, } int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, get_extent_t *get_extent) + __u64 start, __u64 len) { int ret = 0; u64 off = start; @@ -4533,8 +4508,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); - em = get_extent_skip_holes(inode, start, last_for_get_extent, - get_extent); + em = get_extent_skip_holes(inode, start, last_for_get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -4622,8 +4596,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(inode, off, last_for_get_extent, - get_extent); + em = get_extent_skip_holes(inode, off, last_for_get_extent); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -4647,7 +4620,7 @@ out_free: out: btrfs_free_path(path); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, - &cached_state, GFP_NOFS); + &cached_state); return ret; } @@ -5263,8 +5236,7 @@ int extent_buffer_uptodate(struct extent_buffer *eb) } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, int wait, - get_extent_t *get_extent, int mirror_num) + struct extent_buffer *eb, int wait, int mirror_num) { unsigned long i; struct page *page; @@ -5324,7 +5296,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ClearPageError(page); err = __extent_read_full_page(tree, page, - get_extent, &bio, + btree_get_extent, &bio, mirror_num, &bio_flags, REQ_META); if (err) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 93dcae0c3183..a7a850abd600 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -300,19 +300,29 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, - struct extent_state **cached, gfp_t mask); + struct extent_state **cached); +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask, + struct extent_changeset *changeset); static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - GFP_NOFS); + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL); } static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached, gfp_t mask) + u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + GFP_NOFS, NULL); +} + +static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree, + u64 start, u64 end, struct extent_state **cached) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - mask); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + GFP_ATOMIC, NULL); } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, @@ -323,8 +333,7 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, if (bits & EXTENT_LOCKED) wake = 1; - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, - GFP_NOFS); + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -340,10 +349,10 @@ static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, mask); + return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, GFP_NOFS, NULL); } static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -358,7 +367,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + EXTENT_DO_ACCOUNTING, 0, 0, NULL); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -401,24 +410,19 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state **cached_state); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); -int extent_write_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, - struct writeback_control *wbc); -int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, - u64 start, u64 end, get_extent_t *get_extent, +int extent_write_full_page(struct page *page, struct writeback_control *wbc); +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, - get_extent_t *get_extent, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent); + struct list_head *pages, unsigned nr_pages); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, get_extent_t *get_extent); + __u64 start, __u64 len); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, @@ -437,7 +441,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, int wait, - get_extent_t *get_extent, int mirror_num); + int mirror_num); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); static inline unsigned long num_extent_pages(u64 start, u64 len) @@ -540,7 +544,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, struct io_failure_record **failrec_ret); -bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, +bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, struct io_failure_record *failrec, int fail_mirror); struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2e348fb0b280..d3bd02105d1c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -454,3 +454,135 @@ void replace_extent_mapping(struct extent_map_tree *tree, setup_extent_mapping(tree, new, modified); } + +static struct extent_map *next_extent_map(struct extent_map *em) +{ + struct rb_node *next; + + next = rb_next(&em->rb_node); + if (!next) + return NULL; + return container_of(next, struct extent_map, rb_node); +} + +static struct extent_map *prev_extent_map(struct extent_map *em) +{ + struct rb_node *prev; + + prev = rb_prev(&em->rb_node); + if (!prev) + return NULL; + return container_of(prev, struct extent_map, rb_node); +} + +/* helper for btfs_get_extent. Given an existing extent in the tree, + * the existing extent is the nearest extent to map_start, + * and an extent that you want to insert, deal with overlap and insert + * the best fitted new extent into the tree. + */ +static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start) +{ + struct extent_map *prev; + struct extent_map *next; + u64 start; + u64 end; + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + + if (existing->start > map_start) { + next = existing; + prev = prev_extent_map(next); + } else { + prev = existing; + next = next_extent_map(prev); + } + + start = prev ? extent_map_end(prev) : em->start; + start = max_t(u64, start, em->start); + end = next ? next->start : extent_map_end(em); + end = min_t(u64, end, extent_map_end(em)); + start_diff = start - em->start; + em->start = start; + em->len = end - start; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len = em->len; + } + return add_extent_mapping(em_tree, em, 0); +} + +/** + * btrfs_add_extent_mapping - add extent mapping into em_tree + * @em_tree - the extent tree into which we want to insert the extent mapping + * @em_in - extent we are inserting + * @start - start of the logical range btrfs_get_extent() is requesting + * @len - length of the logical range btrfs_get_extent() is requesting + * + * Note that @em_in's range may be different from [start, start+len), + * but they must be overlapped. + * + * Insert @em_in into @em_tree. In case there is an overlapping range, handle + * the -EEXIST by either: + * a) Returning the existing extent in @em_in if @start is within the + * existing em. + * b) Merge the existing extent with @em_in passed in. + * + * Return 0 on success, otherwise -EEXIST. + * + */ +int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map **em_in, u64 start, u64 len) +{ + int ret; + struct extent_map *em = *em_in; + + ret = add_extent_mapping(em_tree, em, 0); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = search_extent_mapping(em_tree, start, len); + /* + * existing will always be non-NULL, since there must be + * extent causing the -EEXIST. + */ + if (start >= existing->start && + start < extent_map_end(existing)) { + free_extent_map(em); + *em_in = existing; + ret = 0; + } else { + u64 orig_start = em->start; + u64 orig_len = em->len; + + /* + * The existing extent map is the one nearest to + * the [start, start + len) range which overlaps + */ + ret = merge_extent_mapping(em_tree, existing, + em, start); + if (ret) { + free_extent_map(em); + *em_in = NULL; + WARN_ONCE(ret, +"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n", + ret, existing->start, existing->len, + orig_start, orig_len); + } + free_extent_map(existing); + } + } + + ASSERT(ret == 0 || ret == -EEXIST); + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 64365bbc9b16..b29f77bc0732 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -13,7 +13,6 @@ /* bits for the flags field */ #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ #define EXTENT_FLAG_COMPRESSED 1 -#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ @@ -92,4 +91,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); +int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map **em_in, u64 start, u64 len); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index eb1bac7c8553..41ab9073d1d4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -31,6 +31,7 @@ #include <linux/slab.h> #include <linux/btrfs.h> #include <linux/uio.h> +#include <linux/iversion.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1504,7 +1505,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, ordered->file_offset + ordered->len > start_pos && ordered->file_offset <= last_pos) { unlock_extent_cached(&inode->io_tree, start_pos, - last_pos, cached_state, GFP_NOFS); + last_pos, cached_state); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); @@ -1519,7 +1520,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, clear_extent_bit(&inode->io_tree, start_pos, last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state, GFP_NOFS); + 0, 0, cached_state); *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -1755,11 +1756,10 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, - pos, copied, NULL); + pos, copied, &cached_state); if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state, - GFP_NOFS); + lockstart, lockend, &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { btrfs_drop_pages(pages, num_pages); @@ -2019,10 +2019,19 @@ int btrfs_release_file(struct inode *inode, struct file *filp) static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) { int ret; + struct blk_plug plug; + /* + * This is only called in fsync, which would do synchronous writes, so + * a plug can merge adjacent IOs as much as possible. Esp. in case of + * multiple disks using raid profile, a large IO can be split to + * several segments of stripe length (currently 64K). + */ + blk_start_plug(&plug); atomic_inc(&BTRFS_I(inode)->sync_writers); ret = btrfs_fdatawrite_range(inode, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); + blk_finish_plug(&plug); return ret; } @@ -2450,6 +2459,46 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) return ret; } +static int btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, + const u64 lockend, + struct extent_state **cached_state) +{ + while (1) { + struct btrfs_ordered_extent *ordered; + int ret; + + truncate_pagecache_range(inode, lockstart, lockend); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + + /* + * We need to make sure we have no ordered extents in this range + * and nobody raced in and read a page in this range, if we did + * we need to try again. + */ + if ((!ordered || + (ordered->file_offset + ordered->len <= lockstart || + ordered->file_offset > lockend)) && + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, cached_state); + ret = btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (ret) + return ret; + } + return 0; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2566,38 +2615,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - while (1) { - struct btrfs_ordered_extent *ordered; - - truncate_pagecache_range(inode, lockstart, lockend); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, lockend); - - /* - * We need to make sure we have no ordered extents in this range - * and nobody raced in and read a page in this range, if we did - * we need to try again. - */ - if ((!ordered || - (ordered->file_offset + ordered->len <= lockstart || - ordered->file_offset > lockend)) && - !btrfs_page_exists_in_range(inode, lockstart, lockend)) { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state, GFP_NOFS); - ret = btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (ret) { - inode_unlock(inode); - return ret; - } + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); + if (ret) { + inode_unlock(inode); + goto out_only_mutex; } path = btrfs_alloc_path(); @@ -2742,7 +2764,7 @@ out_free: btrfs_free_block_rsv(fs_info, rsv); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state, GFP_NOFS); + &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret && !err) { /* @@ -2806,6 +2828,234 @@ insert: return 0; } +static int btrfs_fallocate_update_isize(struct inode *inode, + const u64 end, + const int mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + int ret2; + + if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) + return 0; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + inode->i_ctime = current_time(inode); + i_size_write(inode, end); + btrfs_ordered_update_i_size(inode, end, NULL); + ret = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_end_transaction(trans); + + return ret ? ret : ret2; +} + +enum { + RANGE_BOUNDARY_WRITTEN_EXTENT = 0, + RANGE_BOUNDARY_PREALLOC_EXTENT = 1, + RANGE_BOUNDARY_HOLE = 2, +}; + +static int btrfs_zero_range_check_range_boundary(struct inode *inode, + u64 offset) +{ + const u64 sectorsize = btrfs_inode_sectorsize(inode); + struct extent_map *em; + int ret; + + offset = round_down(offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start == EXTENT_MAP_HOLE) + ret = RANGE_BOUNDARY_HOLE; + else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + ret = RANGE_BOUNDARY_PREALLOC_EXTENT; + else + ret = RANGE_BOUNDARY_WRITTEN_EXTENT; + + free_extent_map(em); + return ret; +} + +static int btrfs_zero_range(struct inode *inode, + loff_t offset, + loff_t len, + const int mode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_map *em; + struct extent_changeset *data_reserved = NULL; + int ret; + u64 alloc_hint = 0; + const u64 sectorsize = btrfs_inode_sectorsize(inode); + u64 alloc_start = round_down(offset, sectorsize); + u64 alloc_end = round_up(offset + len, sectorsize); + u64 bytes_to_reserve = 0; + bool space_reserved = false; + + inode_dio_wait(inode); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + alloc_start, alloc_end - alloc_start, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + /* + * Avoid hole punching and extent allocation for some cases. More cases + * could be considered, but these are unlikely common and we keep things + * as simple as possible for now. Also, intentionally, if the target + * range contains one or more prealloc extents together with regular + * extents and holes, we drop all the existing extents and allocate a + * new prealloc extent, so that we get a larger contiguous disk extent. + */ + if (em->start <= alloc_start && + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + const u64 em_end = em->start + em->len; + + if (em_end >= offset + len) { + /* + * The whole range is already a prealloc extent, + * do nothing except updating the inode's i_size if + * needed. + */ + free_extent_map(em); + ret = btrfs_fallocate_update_isize(inode, offset + len, + mode); + goto out; + } + /* + * Part of the range is already a prealloc extent, so operate + * only on the remaining part of the range. + */ + alloc_start = em_end; + ASSERT(IS_ALIGNED(alloc_start, sectorsize)); + len = offset + len - alloc_start; + offset = alloc_start; + alloc_hint = em->block_start + em->len; + } + free_extent_map(em); + + if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == + BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + alloc_start, sectorsize, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + free_extent_map(em); + ret = btrfs_fallocate_update_isize(inode, offset + len, + mode); + goto out; + } + if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { + free_extent_map(em); + ret = btrfs_truncate_block(inode, offset, len, 0); + if (!ret) + ret = btrfs_fallocate_update_isize(inode, + offset + len, + mode); + return ret; + } + free_extent_map(em); + alloc_start = round_down(offset, sectorsize); + alloc_end = alloc_start + sectorsize; + goto reserve_space; + } + + alloc_start = round_up(offset, sectorsize); + alloc_end = round_down(offset + len, sectorsize); + + /* + * For unaligned ranges, check the pages at the boundaries, they might + * map to an extent, in which case we need to partially zero them, or + * they might map to a hole, in which case we need our allocation range + * to cover them. + */ + if (!IS_ALIGNED(offset, sectorsize)) { + ret = btrfs_zero_range_check_range_boundary(inode, offset); + if (ret < 0) + goto out; + if (ret == RANGE_BOUNDARY_HOLE) { + alloc_start = round_down(offset, sectorsize); + ret = 0; + } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { + ret = btrfs_truncate_block(inode, offset, 0, 0); + if (ret) + goto out; + } else { + ret = 0; + } + } + + if (!IS_ALIGNED(offset + len, sectorsize)) { + ret = btrfs_zero_range_check_range_boundary(inode, + offset + len); + if (ret < 0) + goto out; + if (ret == RANGE_BOUNDARY_HOLE) { + alloc_end = round_up(offset + len, sectorsize); + ret = 0; + } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { + ret = btrfs_truncate_block(inode, offset + len, 0, 1); + if (ret) + goto out; + } else { + ret = 0; + } + } + +reserve_space: + if (alloc_start < alloc_end) { + struct extent_state *cached_state = NULL; + const u64 lockstart = alloc_start; + const u64 lockend = alloc_end - 1; + + bytes_to_reserve = alloc_end - alloc_start; + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + bytes_to_reserve); + if (ret < 0) + goto out; + space_reserved = true; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + alloc_start, bytes_to_reserve); + if (ret) + goto out; + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); + if (ret) + goto out; + ret = btrfs_prealloc_file_range(inode, mode, alloc_start, + alloc_end - alloc_start, + i_blocksize(inode), + offset + len, &alloc_hint); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); + /* btrfs_prealloc_file_range releases reserved space on error */ + if (ret) { + space_reserved = false; + goto out; + } + } + ret = btrfs_fallocate_update_isize(inode, offset + len, mode); + out: + if (ret && space_reserved) + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, bytes_to_reserve); + extent_changeset_free(data_reserved); + + return ret; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -2831,7 +3081,8 @@ static long btrfs_fallocate(struct file *file, int mode, cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -2842,10 +3093,12 @@ static long btrfs_fallocate(struct file *file, int mode, * * For qgroup space, it will be checked later. */ - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), - alloc_end - alloc_start); - if (ret < 0) - return ret; + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + alloc_end - alloc_start); + if (ret < 0) + return ret; + } inode_lock(inode); @@ -2887,6 +3140,12 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret) goto out; + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = btrfs_zero_range(inode, offset, len, mode); + inode_unlock(inode); + return ret; + } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -2896,15 +3155,15 @@ static long btrfs_fallocate(struct file *file, int mode, */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); + ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); + if (ordered && ordered->file_offset + ordered->len > alloc_start && ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_KERNEL); + &cached_state); /* * we can't wait on the range with the transaction * running or with the extent lock held @@ -2922,7 +3181,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); - while (1) { + while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset, 0); if (IS_ERR(em)) { @@ -2958,8 +3217,6 @@ static long btrfs_fallocate(struct file *file, int mode, } free_extent_map(em); cur_offset = last_byte; - if (cur_offset >= alloc_end) - break; } /* @@ -2982,37 +3239,18 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) goto out_unlock; - if (actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We didn't need to allocate any more space, but we - * still extended the size of the file so we need to - * update i_size and the inode item. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - inode->i_ctime = current_time(inode); - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, NULL); - ret = btrfs_update_inode(trans, root, inode); - if (ret) - btrfs_end_transaction(trans); - else - ret = btrfs_end_transaction(trans); - } - } + /* + * We didn't need to allocate any more space, but we still extended the + * size of the file so we need to update i_size and the inode item. + */ + ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_KERNEL); + &cached_state); out: inode_unlock(inode); /* Let go of our reservation. */ - if (ret != 0) + if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(inode, data_reserved, alloc_start, alloc_end - cur_offset); extent_changeset_free(data_reserved); @@ -3081,7 +3319,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) *offset = min_t(loff_t, start, inode->i_size); } unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state, GFP_NOFS); + &cached_state); return ret; } @@ -3145,7 +3383,7 @@ void btrfs_auto_defrag_exit(void) kmem_cache_destroy(btrfs_inode_defrag_cachep); } -int btrfs_auto_defrag_init(void) +int __init btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", sizeof(struct inode_defrag), 0, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4426d1c73e50..a9f22ac50d6a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -22,6 +22,7 @@ #include <linux/slab.h> #include <linux/math64.h> #include <linux/ratelimit.h> +#include <linux/error-injection.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -332,6 +333,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return 0; } +ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { @@ -993,8 +995,7 @@ update_cache_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); goto fail; } leaf = path->nodes[0]; @@ -1008,7 +1009,7 @@ update_cache_item(struct btrfs_trans_handle *trans, clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL, GFP_NOFS); + NULL); btrfs_release_path(path); goto fail; } @@ -1105,8 +1106,7 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); return ret; } @@ -1127,8 +1127,7 @@ cleanup_write_cache_enospc(struct inode *inode, { io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, cached_state, - GFP_NOFS); + i_size_read(inode) - 1, cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, @@ -1322,7 +1321,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); + i_size_read(inode) - 1, &cached_state); /* * at this point the pages are under IO and we're happy, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e1a7f3cb5be9..53ca025655fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -43,6 +43,7 @@ #include <linux/posix_acl_xattr.h> #include <linux/uio.h> #include <linux/magic.h> +#include <linux/iversion.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -536,9 +537,14 @@ again: * * If the compression fails for any reason, we set the pages * dirty again later on. + * + * Note that the remaining part is redirtied, the start pointer + * has moved, the end is the original one. */ - extent_range_clear_dirty_for_io(inode, start, end); - redirty = 1; + if (!redirty) { + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; + } /* Compression level is applied here and only here */ ret = btrfs_compress_pages( @@ -765,11 +771,10 @@ retry: * all those pages down to the drive. */ if (!page_started && !ret) - extent_write_locked_range(io_tree, - inode, async_extent->start, + extent_write_locked_range(inode, + async_extent->start, async_extent->start + async_extent->ram_size - 1, - btrfs_get_extent, WB_SYNC_ALL); else if (ret) unlock_page(async_cow->locked_page); @@ -1203,7 +1208,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, - 1, 0, NULL, GFP_NOFS); + 1, 0, NULL); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ @@ -1951,7 +1956,21 @@ static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio, /* * extent_io.c submission hook. This does the right thing for csum calculation - * on write, or reading the csums from the tree before a read + * on write, or reading the csums from the tree before a read. + * + * Rules about async/sync submit, + * a) read: sync submit + * + * b) write without checksum: sync submit + * + * c) write with checksum: + * c-1) if bio is issued by fsync: sync submit + * (sync_writers != 0) + * + * c-2) if root is reloc root: sync submit + * (only in case of buffered IO) + * + * c-3) otherwise: async submit */ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, @@ -2023,10 +2042,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { - trans->adding_csums = 1; + trans->adding_csums = true; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); - trans->adding_csums = 0; + trans->adding_csums = false; } return 0; } @@ -2082,7 +2101,7 @@ again: PAGE_SIZE); if (ordered) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, - page_end, &cached_state, GFP_NOFS); + page_end, &cached_state); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); @@ -2098,14 +2117,21 @@ again: goto out; } - btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state, - 0); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, + &cached_state, 0); + if (ret) { + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + ClearPageChecked(page); + goto out; + } + ClearPageChecked(page); set_page_dirty(page); btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, - &cached_state, GFP_NOFS); + &cached_state); out_page: unlock_page(page); put_page(page); @@ -2697,7 +2723,7 @@ out_free_path: btrfs_end_transaction(trans); out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - &cached, GFP_NOFS); + &cached); iput(inode); return ret; } @@ -2986,7 +3012,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) clear_extent_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); + EXTENT_DEFRAG, 0, 0, &cached_state); } if (nolock) @@ -3056,7 +3082,7 @@ out: ordered_extent->len - 1, clear_bits, (clear_bits & EXTENT_LOCKED) ? 1 : 0, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); } if (trans) @@ -3070,7 +3096,7 @@ out: else start = ordered_extent->file_offset; end = ordered_extent->file_offset + ordered_extent->len - 1; - clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); + clear_extent_uptodate(io_tree, start, end, NULL); /* Drop the cache for the part of the extent we didn't write. */ btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); @@ -3777,7 +3803,8 @@ static int btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); - inode->i_version = btrfs_inode_sequence(leaf, inode_item); + inode_set_iversion_queried(inode, + btrfs_inode_sequence(leaf, inode_item)); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); @@ -3945,7 +3972,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, &token); btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, &token); - btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode), + &token); btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); @@ -4744,8 +4772,8 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, u64 block_start; u64 block_end; - if ((offset & (blocksize - 1)) == 0 && - (!len || ((len & (blocksize - 1)) == 0))) + if (IS_ALIGNED(offset, blocksize) && + (!len || IS_ALIGNED(len, blocksize))) goto out; block_start = round_down(from, blocksize); @@ -4787,7 +4815,7 @@ again: ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent_cached(io_tree, block_start, block_end, - &cached_state, GFP_NOFS); + &cached_state); unlock_page(page); put_page(page); btrfs_start_ordered_extent(inode, ordered, 1); @@ -4798,13 +4826,13 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); + 0, 0, &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, - &cached_state, GFP_NOFS); + &cached_state); goto out_unlock; } @@ -4823,8 +4851,7 @@ again: } ClearPageChecked(page); set_page_dirty(page); - unlock_extent_cached(io_tree, block_start, block_end, &cached_state, - GFP_NOFS); + unlock_extent_cached(io_tree, block_start, block_end, &cached_state); out_unlock: if (ret) @@ -4925,7 +4952,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (!ordered) break; unlock_extent_cached(io_tree, hole_start, block_end - 1, - &cached_state, GFP_NOFS); + &cached_state); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } @@ -4990,8 +5017,7 @@ next: break; } free_extent_map(em); - unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, - GFP_NOFS); + unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state); return err; } @@ -5234,8 +5260,7 @@ static void evict_inode_truncate_pages(struct inode *inode) clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); + EXTENT_DEFRAG, 1, 1, &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5894,7 +5919,6 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_file_private *private = file->private_data; struct btrfs_dir_item *di; @@ -5962,9 +5986,6 @@ again: if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) goto next; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, slot, di)) - goto next; - name_len = btrfs_dir_name_len(leaf, di); if ((total_len + sizeof(struct dir_entry) + name_len) >= PAGE_SIZE) { @@ -6104,19 +6125,20 @@ static int btrfs_update_time(struct inode *inode, struct timespec *now, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; + bool dirty = flags & ~S_VERSION; if (btrfs_root_readonly(root)) return -EROFS; if (flags & S_VERSION) - inode_inc_iversion(inode); + dirty |= inode_maybe_inc_iversion(inode, dirty); if (flags & S_CTIME) inode->i_ctime = *now; if (flags & S_MTIME) inode->i_mtime = *now; if (flags & S_ATIME) inode->i_atime = *now; - return btrfs_dirty_inode(inode); + return dirty ? btrfs_dirty_inode(inode) : 0; } /* @@ -6297,7 +6319,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, } /* * index_cnt is ignored for everything but a dir, - * btrfs_get_inode_index_count has an explanation for the magic + * btrfs_set_inode_index_count has an explanation for the magic * number */ BTRFS_I(inode)->index_cnt = 2; @@ -6560,7 +6582,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, out_unlock: btrfs_end_transaction(trans); - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); if (drop_inode) { inode_dec_link_count(inode); @@ -6641,7 +6662,6 @@ out_unlock: inode_dec_link_count(inode); iput(inode); } - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); return err; @@ -6716,7 +6736,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } - btrfs_balance_delayed_items(fs_info); fail: if (trans) btrfs_end_transaction(trans); @@ -6794,7 +6813,6 @@ out_fail: inode_dec_link_count(inode); iput(inode); } - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); return err; @@ -6803,68 +6821,6 @@ out_fail_inode: goto out_fail; } -/* Find next extent map of a given extent map, caller needs to ensure locks */ -static struct extent_map *next_extent_map(struct extent_map *em) -{ - struct rb_node *next; - - next = rb_next(&em->rb_node); - if (!next) - return NULL; - return container_of(next, struct extent_map, rb_node); -} - -static struct extent_map *prev_extent_map(struct extent_map *em) -{ - struct rb_node *prev; - - prev = rb_prev(&em->rb_node); - if (!prev) - return NULL; - return container_of(prev, struct extent_map, rb_node); -} - -/* helper for btfs_get_extent. Given an existing extent in the tree, - * the existing extent is the nearest extent to map_start, - * and an extent that you want to insert, deal with overlap and insert - * the best fitted new extent into the tree. - */ -static int merge_extent_mapping(struct extent_map_tree *em_tree, - struct extent_map *existing, - struct extent_map *em, - u64 map_start) -{ - struct extent_map *prev; - struct extent_map *next; - u64 start; - u64 end; - u64 start_diff; - - BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); - - if (existing->start > map_start) { - next = existing; - prev = prev_extent_map(next); - } else { - prev = existing; - next = next_extent_map(prev); - } - - start = prev ? extent_map_end(prev) : em->start; - start = max_t(u64, start, em->start); - end = next ? next->start : extent_map_end(em); - end = min_t(u64, end, extent_map_end(em)); - start_diff = start - em->start; - em->start = start; - em->len = end - start; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - em->block_start += start_diff; - em->block_len -= start_diff; - } - return add_extent_mapping(em_tree, em, 0); -} - static noinline int uncompress_inline(struct btrfs_path *path, struct page *page, size_t pg_offset, u64 extent_offset, @@ -6939,10 +6895,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_io_tree *io_tree = &inode->io_tree; - struct btrfs_trans_handle *trans = NULL; const bool new_inline = !page || create; -again: read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) @@ -6981,8 +6935,7 @@ again: path->reada = READA_FORWARD; } - ret = btrfs_lookup_file_extent(trans, root, path, - objectid, start, trans != NULL); + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { err = ret; goto out; @@ -7083,7 +7036,7 @@ next: em->orig_block_len = em->len; em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; - if (create == 0 && !PageUptodate(page)) { + if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, @@ -7104,25 +7057,6 @@ next: kunmap(page); } flush_dcache_page(page); - } else if (create && PageUptodate(page)) { - BUG(); - if (!trans) { - kunmap(page); - free_extent_map(em); - em = NULL; - - btrfs_release_path(path); - trans = btrfs_join_transaction(root); - - if (IS_ERR(trans)) - return ERR_CAST(trans); - goto again; - } - map = kmap(page); - write_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); - kunmap(page); - btrfs_mark_buffer_dirty(leaf); } set_extent_uptodate(io_tree, em->start, extent_map_end(em) - 1, NULL, GFP_NOFS); @@ -7134,7 +7068,6 @@ not_found: em->len = len; not_found_em: em->block_start = EXTENT_MAP_HOLE; - set_bit(EXTENT_FLAG_VACANCY, &em->flags); insert: btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { @@ -7147,62 +7080,13 @@ insert: err = 0; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - /* it is possible that someone inserted the extent into the tree - * while we had the lock dropped. It is also possible that - * an overlapping map exists in the tree - */ - if (ret == -EEXIST) { - struct extent_map *existing; - - ret = 0; - - existing = search_extent_mapping(em_tree, start, len); - /* - * existing will always be non-NULL, since there must be - * extent causing the -EEXIST. - */ - if (existing->start == em->start && - extent_map_end(existing) >= extent_map_end(em) && - em->block_start == existing->block_start) { - /* - * The existing extent map already encompasses the - * entire extent map we tried to add. - */ - free_extent_map(em); - em = existing; - err = 0; - - } else if (start >= extent_map_end(existing) || - start <= existing->start) { - /* - * The existing extent map is the one nearest to - * the [start, start + len) range which overlaps - */ - err = merge_extent_mapping(em_tree, existing, - em, start); - free_extent_map(existing); - if (err) { - free_extent_map(em); - em = NULL; - } - } else { - free_extent_map(em); - em = existing; - err = 0; - } - } + err = btrfs_add_extent_mapping(em_tree, &em, start, len); write_unlock(&em_tree->lock); out: trace_btrfs_get_extent(root, inode, em); btrfs_free_path(path); - if (trans) { - ret = btrfs_end_transaction(trans); - if (!err) - err = ret; - } if (err) { free_extent_map(em); return ERR_PTR(err); @@ -7324,7 +7208,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, em->block_start = EXTENT_MAP_DELALLOC; em->block_len = found; } - } else if (hole_em) { + } else { return hole_em; } out: @@ -7641,7 +7525,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, break; unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state, GFP_NOFS); + cached_state); if (ordered) { /* @@ -7926,7 +7810,7 @@ unlock: if (lockstart < lockend) { clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, - &cached_state, GFP_NOFS); + &cached_state); } else { free_extent_state(cached_state); } @@ -7937,7 +7821,7 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state, GFP_NOFS); + unlock_bits, 1, 0, &cached_state); err: if (dio_data) current->journal_info = dio_data; @@ -7953,15 +7837,12 @@ static inline blk_status_t submit_dio_repair_bio(struct inode *inode, BUG_ON(bio_op(bio) == REQ_OP_WRITE); - bio_get(bio); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR); if (ret) - goto err; + return ret; ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); -err: - bio_put(bio); + return ret; } @@ -8015,6 +7896,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, int segs; int ret; blk_status_t status; + struct bio_vec bvec; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -8030,8 +7912,9 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, } segs = bio_segments(failed_bio); + bio_get_first_bvec(failed_bio, &bvec); if (segs > 1 || - (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) + (bvec.bv_len > btrfs_inode_sectorsize(inode))) read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; @@ -8074,7 +7957,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) ASSERT(bio->bi_vcnt == 1); io_tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); + ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode)); done->uptodate = 1; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -8164,7 +8047,7 @@ static void btrfs_retry_endio(struct bio *bio) uptodate = 1; ASSERT(bio->bi_vcnt == 1); - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); + ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode)); io_tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; @@ -8460,11 +8343,10 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, bool write = bio_op(bio) == REQ_OP_WRITE; blk_status_t ret; + /* Check btrfs_submit_bio_hook() for rules about async submit. */ if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); - bio_get(bio); - if (!write) { ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); if (ret) @@ -8497,7 +8379,6 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, map: ret = btrfs_map_bio(fs_info, bio, 0, 0); err: - bio_put(bio); return ret; } @@ -8854,7 +8735,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); + return extent_fiemap(inode, fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) @@ -8866,7 +8747,6 @@ int btrfs_readpage(struct file *file, struct page *page) static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { - struct extent_io_tree *tree; struct inode *inode = page->mapping->host; int ret; @@ -8885,8 +8765,7 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc) redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; } - tree = &BTRFS_I(page->mapping->host)->io_tree; - ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc); + ret = extent_write_full_page(page, wbc); btrfs_add_delayed_iput(inode); return ret; } @@ -8897,7 +8776,7 @@ static int btrfs_writepages(struct address_space *mapping, struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; - return extent_writepages(tree, mapping, btrfs_get_extent, wbc); + return extent_writepages(tree, mapping, wbc); } static int @@ -8906,8 +8785,7 @@ btrfs_readpages(struct file *file, struct address_space *mapping, { struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; - return extent_readpages(tree, mapping, pages, nr_pages, - btrfs_get_extent); + return extent_readpages(tree, mapping, pages, nr_pages); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { @@ -8978,8 +8856,7 @@ again: EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state, - GFP_NOFS); + EXTENT_DEFRAG, 1, 0, &cached_state); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -9036,7 +8913,7 @@ again: EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); + &cached_state); __btrfs_releasepage(page, GFP_NOFS); } @@ -9137,7 +9014,7 @@ again: PAGE_SIZE); if (ordered) { unlock_extent_cached(io_tree, page_start, page_end, - &cached_state, GFP_NOFS); + &cached_state); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); @@ -9164,13 +9041,13 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); + 0, 0, &cached_state); ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, page_start, page_end, - &cached_state, GFP_NOFS); + &cached_state); ret = VM_FAULT_SIGBUS; goto out_unlock; } @@ -9196,7 +9073,7 @@ again: BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; - unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state); out_unlock: if (!ret) { @@ -9421,7 +9298,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_inode *ei; struct inode *inode; - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; @@ -9573,7 +9450,7 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_free_space_cachep); } -int btrfs_init_cachep(void) +int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, @@ -10688,7 +10565,6 @@ out: btrfs_end_transaction(trans); if (ret) iput(inode); - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2ef8acaac688..111ee282b777 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,7 @@ #include <linux/uuid.h> #include <linux/btrfs.h> #include <linux/uaccess.h> +#include <linux/iversion.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -307,12 +308,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags |= BTRFS_INODE_COMPRESS; ip->flags &= ~BTRFS_INODE_NOCOMPRESS; - if (fs_info->compress_type == BTRFS_COMPRESS_LZO) - comp = "lzo"; - else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB) - comp = "zlib"; - else - comp = "zstd"; + comp = btrfs_compress_type2str(fs_info->compress_type); + if (!comp || comp[0] == 0) + comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); + ret = btrfs_set_prop(inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) @@ -979,7 +978,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) /* get the big lock and read metadata off disk */ lock_extent_bits(io_tree, start, end, &cached); em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); - unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); + unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -1130,7 +1129,7 @@ again: ordered = btrfs_lookup_ordered_extent(inode, page_start); unlock_extent_cached(tree, page_start, page_end, - &cached_state, GFP_NOFS); + &cached_state); if (!ordered) break; @@ -1190,7 +1189,7 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - &cached_state, GFP_NOFS); + &cached_state); if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); @@ -1206,8 +1205,7 @@ again: &cached_state); unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state, - GFP_NOFS); + page_start, page_end - 1, &cached_state); for (i = 0; i < i_done; i++) { clear_page_dirty_for_io(pages[i]); @@ -1503,7 +1501,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - if (!device->writeable) { + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_info(fs_info, "resizer unable to apply on readonly device %llu", devid); @@ -1528,7 +1526,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } } - if (device->is_tgtdev_for_dev_replace) { + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -EPERM; goto out_free; } @@ -2675,14 +2673,12 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - mutex_lock(&fs_info->volume_mutex); if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); } else { vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); if (!ret) { @@ -2726,9 +2722,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - mutex_lock(&fs_info->volume_mutex); ret = btrfs_rm_device(fs_info, vol_args->name, 0); - mutex_unlock(&fs_info->volume_mutex); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); @@ -2753,16 +2747,16 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, if (!fi_args) return -ENOMEM; - mutex_lock(&fs_devices->device_list_mutex); + rcu_read_lock(); fi_args->num_devices = fs_devices->num_devices; - memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; } - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); + memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); fi_args->nodesize = fs_info->nodesize; fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; @@ -2779,7 +2773,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, { struct btrfs_ioctl_dev_info_args *di_args; struct btrfs_device *dev; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; int ret = 0; char *s_uuid = NULL; @@ -2790,7 +2783,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, if (!btrfs_is_empty_uuid(di_args->uuid)) s_uuid = di_args->uuid; - mutex_lock(&fs_devices->device_list_mutex); + rcu_read_lock(); dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL); if (!dev) { @@ -2805,17 +2798,15 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, if (dev->name) { struct rcu_string *name; - rcu_read_lock(); name = rcu_dereference(dev->name); - strncpy(di_args->path, name->str, sizeof(di_args->path)); - rcu_read_unlock(); + strncpy(di_args->path, name->str, sizeof(di_args->path) - 1); di_args->path[sizeof(di_args->path) - 1] = 0; } else { di_args->path[0] = '\0'; } out: - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) ret = -EFAULT; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index f6a05f836629..b30a056963ab 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -164,7 +164,6 @@ static int iterate_object_props(struct btrfs_root *root, size_t), void *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *name_buf = NULL; char *value_buf = NULL; @@ -215,12 +214,6 @@ static int iterate_object_props(struct btrfs_root *root, name_ptr = (unsigned long)(di + 1); data_ptr = name_ptr + name_len; - if (verify_dir_item(fs_info, leaf, - path->slots[0], di)) { - ret = -EIO; - goto out; - } - if (name_len <= XATTR_BTRFS_PREFIX_LEN || memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, name_ptr, @@ -430,11 +423,11 @@ static const char *prop_compression_extract(struct inode *inode) { switch (BTRFS_I(inode)->prop_compress) { case BTRFS_COMPRESS_ZLIB: - return "zlib"; case BTRFS_COMPRESS_LZO: - return "lzo"; case BTRFS_COMPRESS_ZSTD: - return "zstd"; + return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + default: + break; } return NULL; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 168fd03ca3ac..9e61dd624f7b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2883,8 +2883,7 @@ cleanup: ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, - unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, - GFP_NOFS); + unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); extent_changeset_release(reserved); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a7f79254ecca..dec0907dfb8a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -231,7 +231,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); - init_waitqueue_head(&cur->wait); } x = cmpxchg(&info->stripe_hash_table, NULL, table); @@ -595,14 +594,31 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, * bio list here, anyone else that wants to * change this stripe needs to do their own rmw. */ - if (last->operation == BTRFS_RBIO_PARITY_SCRUB || - cur->operation == BTRFS_RBIO_PARITY_SCRUB) + if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING || - cur->operation == BTRFS_RBIO_REBUILD_MISSING) + if (last->operation == BTRFS_RBIO_REBUILD_MISSING) return 0; + if (last->operation == BTRFS_RBIO_READ_REBUILD) { + int fa = last->faila; + int fb = last->failb; + int cur_fa = cur->faila; + int cur_fb = cur->failb; + + if (last->faila >= last->failb) { + fa = last->failb; + fb = last->faila; + } + + if (cur->faila >= cur->failb) { + cur_fa = cur->failb; + cur_fb = cur->faila; + } + + if (fa != cur_fa || fb != cur_fb) + return 0; + } return 1; } @@ -670,7 +686,6 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; unsigned long flags; - DEFINE_WAIT(wait); struct btrfs_raid_bio *freeit = NULL; struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; @@ -816,15 +831,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } goto done_nolock; - /* - * The barrier for this waitqueue_active is not needed, - * we're protected by h->lock and can't miss a wakeup. - */ - } else if (waitqueue_active(&h->wait)) { - spin_unlock(&rbio->bio_list_lock); - spin_unlock_irqrestore(&h->lock, flags); - wake_up(&h->wait); - goto done_nolock; } } done: @@ -858,10 +864,17 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) kfree(rbio); } -static void free_raid_bio(struct btrfs_raid_bio *rbio) +static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) { - unlock_stripe(rbio); - __free_raid_bio(rbio); + struct bio *next; + + while (cur) { + next = cur->bi_next; + cur->bi_next = NULL; + cur->bi_status = err; + bio_endio(cur); + cur = next; + } } /* @@ -871,20 +884,26 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); - struct bio *next; + struct bio *extra; if (rbio->generic_bio_cnt) btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); - free_raid_bio(rbio); + /* + * At this moment, rbio->bio_list is empty, however since rbio does not + * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the + * hash list, rbio may be merged with others so that rbio->bio_list + * becomes non-empty. + * Once unlock_stripe() is done, rbio->bio_list will not be updated any + * more and we can call bio_endio() on all queued bios. + */ + unlock_stripe(rbio); + extra = bio_list_get(&rbio->bio_list); + __free_raid_bio(rbio); - while (cur) { - next = cur->bi_next; - cur->bi_next = NULL; - cur->bi_status = err; - bio_endio(cur); - cur = next; - } + rbio_endio_bio_list(cur, err); + if (extra) + rbio_endio_bio_list(extra, err); } /* @@ -1435,14 +1454,13 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct bio *bio) { - struct bio_vec bvec; - struct bvec_iter iter; + struct bio_vec *bvec; + int i; - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_io_bio(bio)->iter; + ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment(bvec, bio, iter) - SetPageUptodate(bvec.bv_page); + bio_for_each_segment_all(bvec, bio, i) + SetPageUptodate(bvec->bv_page); } /* @@ -1969,7 +1987,22 @@ cleanup: cleanup_io: if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - if (err == BLK_STS_OK) + /* + * - In case of two failures, where rbio->failb != -1: + * + * Do not cache this rbio since the above read reconstruction + * (raid6_datap_recov() or raid6_2data_recov()) may have + * changed some content of stripes which are not identical to + * on-disk content any more, otherwise, a later write/recover + * may steal stripe_pages from this rbio and end up with + * corruptions or rebuild failures. + * + * - In case of single failure, where rbio->failb == -1: + * + * Cache this rbio iff the above read reconstruction is + * excuted without problems. + */ + if (err == BLK_STS_OK && rbio->failb < 0) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -2170,11 +2203,21 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, } /* - * reconstruct from the q stripe if they are - * asking for mirror 3 + * Loop retry: + * for 'mirror == 2', reconstruct from all other stripes. + * for 'mirror_num > 2', select a stripe to fail on every retry. */ - if (mirror_num == 3) - rbio->failb = rbio->real_stripes - 2; + if (mirror_num > 2) { + /* + * 'mirror == 3' is to fail the p stripe and + * reconstruct from the q stripe. 'mirror > 3' is to + * fail a data stripe and reconstruct from p+q stripe. + */ + rbio->failb = rbio->real_stripes - (mirror_num - 1); + ASSERT(rbio->failb > 0); + if (rbio->failb <= rbio->faila) + rbio->failb--; + } ret = lock_stripe_add(rbio); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 34878699d363..171f3cce30e6 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -606,8 +606,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, } /* Walk up to the next node that needs to be processed */ -static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, - int *level) +static int walk_up_tree(struct btrfs_path *path, int *level) { int l; @@ -984,7 +983,6 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_path *path; - struct btrfs_root *root; struct extent_buffer *eb; u64 bytenr = 0, num_bytes = 0; int ret, level; @@ -1014,7 +1012,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) &bytenr, &num_bytes); if (ret) break; - ret = walk_up_tree(root, path, &level); + ret = walk_up_tree(path, &level); if (ret < 0) break; if (ret > 0) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3338407ef0f0..aab0194efe46 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -387,13 +387,6 @@ again: WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ptr = (unsigned long)(ref + 1); - ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr, - name_len); - if (!ret) { - err = -EIO; - goto out; - } - WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); *sequence = btrfs_root_ref_sequence(leaf, ref); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b2f871d80982..ec56f33feea9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -301,6 +301,11 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_put_ctx(struct scrub_ctx *sctx); +static inline int scrub_is_page_on_raid56(struct scrub_page *page) +{ + return page->recover && + (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); +} static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { @@ -1323,15 +1328,34 @@ nodatasum_case: * could happen otherwise that a correct page would be * overwritten by a bad one). */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { + for (mirror_index = 0; ;mirror_index++) { struct scrub_block *sblock_other; if (mirror_index == failed_mirror_index) continue; - sblock_other = sblocks_for_recheck + mirror_index; + + /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ + if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + if (mirror_index >= BTRFS_MAX_MIRRORS) + break; + if (!sblocks_for_recheck[mirror_index].page_count) + break; + + sblock_other = sblocks_for_recheck + mirror_index; + } else { + struct scrub_recover *r = sblock_bad->pagev[0]->recover; + int max_allowed = r->bbio->num_stripes - + r->bbio->num_tgtdevs; + + if (mirror_index >= max_allowed) + break; + if (!sblocks_for_recheck[1].page_count) + break; + + ASSERT(failed_mirror_index == 0); + sblock_other = sblocks_for_recheck + 1; + sblock_other->pagev[0]->mirror_num = 1 + mirror_index; + } /* build and submit the bios, check checksums */ scrub_recheck_block(fs_info, sblock_other, 0); @@ -1666,49 +1690,32 @@ leave_nomem: return 0; } -struct scrub_bio_ret { - struct completion event; - blk_status_t status; -}; - static void scrub_bio_wait_endio(struct bio *bio) { - struct scrub_bio_ret *ret = bio->bi_private; - - ret->status = bio->bi_status; - complete(&ret->event); -} - -static inline int scrub_is_page_on_raid56(struct scrub_page *page) -{ - return page->recover && - (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + complete(bio->bi_private); } static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct bio *bio, struct scrub_page *page) { - struct scrub_bio_ret done; + DECLARE_COMPLETION_ONSTACK(done); int ret; + int mirror_num; - init_completion(&done.event); - done.status = 0; bio->bi_iter.bi_sector = page->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; + mirror_num = page->sblock->pagev[0]->mirror_num; ret = raid56_parity_recover(fs_info, bio, page->recover->bbio, page->recover->map_length, - page->mirror_num, 0); + mirror_num, 0); if (ret) return ret; - wait_for_completion_io(&done.event); - if (done.status) - return -EIO; - - return 0; + wait_for_completion_io(&done); + return blk_status_to_errno(bio->bi_status); } /* @@ -2535,7 +2542,7 @@ leave_nomem: } WARN_ON(sblock->page_count == 0); - if (dev->missing) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { /* * This case should only be hit for RAID 5/6 device replace. See * the comment in scrub_missing_raid56_pages() for details. @@ -2870,7 +2877,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, u8 csum[BTRFS_CSUM_SIZE]; u32 blocksize; - if (dev->missing) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { scrub_parity_mark_sectors_error(sparity, logical, len); return 0; } @@ -4112,12 +4119,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); - if (!dev || (dev->missing && !is_dev_replace)) { + if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && + !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -ENODEV; } - if (!is_dev_replace && !readonly && !dev->writeable) { + if (!is_dev_replace && !readonly && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); rcu_read_lock(); name = rcu_dereference(dev->name); @@ -4128,14 +4137,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } mutex_lock(&fs_info->scrub_lock); - if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -EIO; } btrfs_dev_replace_lock(&fs_info->dev_replace, 0); - if (dev->scrub_device || + if (dev->scrub_ctx || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); @@ -4160,7 +4170,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return PTR_ERR(sctx); } sctx->readonly = readonly; - dev->scrub_device = sctx; + dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* @@ -4195,7 +4205,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_lock(&fs_info->scrub_lock); - dev->scrub_device = NULL; + dev->scrub_ctx = NULL; scrub_workers_put(fs_info); mutex_unlock(&fs_info->scrub_lock); @@ -4252,16 +4262,16 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); - sctx = dev->scrub_device; + sctx = dev->scrub_ctx; if (!sctx) { mutex_unlock(&fs_info->scrub_lock); return -ENOTCONN; } atomic_inc(&sctx->cancel_req); - while (dev->scrub_device) { + while (dev->scrub_ctx) { mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait, - dev->scrub_device == NULL); + dev->scrub_ctx == NULL); mutex_lock(&fs_info->scrub_lock); } mutex_unlock(&fs_info->scrub_lock); @@ -4278,7 +4288,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (dev) - sctx = dev->scrub_device; + sctx = dev->scrub_ctx; if (sctx) memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4478,8 +4488,7 @@ static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len, free_extent_map(em); out_unlock: - unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, - GFP_NOFS); + unlock_extent_cached(io_tree, lockstart, lockend, &cached_state); return ret; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 20d3300bd268..f306c608dc28 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1059,12 +1059,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } } - ret = btrfs_is_name_len_valid(eb, path->slots[0], - (unsigned long)(di + 1), name_len + data_len); - if (!ret) { - ret = -EIO; - goto out; - } if (name_len + data_len > buf_len) { buf_len = name_len + data_len; if (is_vmalloc_addr(buf)) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3a4dce153645..6e71a2a78363 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -61,12 +61,21 @@ #include "tests/btrfs-tests.h" #include "qgroup.h" -#include "backref.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; + +/* + * Types for mounting the default subvolume and a subvolume explicitly + * requested by subvol=/path. That way the callchain is straightforward and we + * don't have to play tricks with the mount options and recursive calls to + * btrfs_mount. + * + * The new btrfs_root_fs_type also servers as a tag for the bdev_holder. + */ static struct file_system_type btrfs_fs_type; +static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); @@ -98,30 +107,6 @@ const char *btrfs_decode_error(int errno) return errstr; } -/* btrfs handle error by forcing the filesystem readonly */ -static void btrfs_handle_error(struct btrfs_fs_info *fs_info) -{ - struct super_block *sb = fs_info->sb; - - if (sb_rdonly(sb)) - return; - - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { - sb->s_flags |= SB_RDONLY; - btrfs_info(fs_info, "forced readonly"); - /* - * Note that a running device replace operation is not - * canceled here although there is no way to update - * the progress. It would add the risk of a deadlock, - * therefore the canceling is omitted. The only penalty - * is that some I/O remains active until the procedure - * completes. The next time when the filesystem is - * mounted writeable again, the device replace - * operation continues. - */ - } -} - /* * __btrfs_handle_fs_error decodes expected errors from the caller and * invokes the approciate error response. @@ -168,8 +153,23 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); /* Don't go through full error handling during mount */ - if (sb->s_flags & SB_BORN) - btrfs_handle_error(fs_info); + if (!(sb->s_flags & SB_BORN)) + return; + + if (sb_rdonly(sb)) + return; + + /* btrfs handle error by forcing the filesystem readonly */ + sb->s_flags |= SB_RDONLY; + btrfs_info(fs_info, "forced readonly"); + /* + * Note that a running device replace operation is not canceled here + * although there is no way to update the progress. It would add the + * risk of a deadlock, therefore the canceling is omitted. The only + * penalty is that some I/O remains active until the procedure + * completes. The next time when the filesystem is mounted writeable + * again, the device replace operation continues. + */ } #ifdef CONFIG_PRINTK @@ -405,7 +405,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags) { substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig = NULL; + char *p, *num; u64 cache_gen; int intarg; int ret = 0; @@ -428,16 +428,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, if (!options) goto check; - /* - * strsep changes the string, duplicate it because parse_options - * gets called twice - */ - options = kstrdup(options, GFP_KERNEL); - if (!options) - return -ENOMEM; - - orig = options; - while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) @@ -454,7 +444,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_subvolrootid: case Opt_device: /* - * These are parsed by btrfs_parse_early_options + * These are parsed by btrfs_parse_subvol_options + * and btrfs_parse_early_options * and can be happily ignored here. */ break; @@ -877,7 +868,6 @@ out: btrfs_info(info, "disk space caching is enabled"); if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) btrfs_info(info, "using free space tree"); - kfree(orig); return ret; } @@ -888,11 +878,60 @@ out: * only when we need to allocate a new super block. */ static int btrfs_parse_early_options(const char *options, fmode_t flags, - void *holder, char **subvol_name, u64 *subvol_objectid, - struct btrfs_fs_devices **fs_devices) + void *holder, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; + int error = 0; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because btrfs_parse_options + * gets called later + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + if (token == Opt_device) { + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, + flags, holder, fs_devices); + kfree(device_name); + if (error) + goto out; + } + } + +out: + kfree(orig); + return error; +} + +/* + * Parse mount options that are related to subvolume id + * + * The value is later passed to mount_subvol() + */ +static int btrfs_parse_subvol_options(const char *options, fmode_t flags, + char **subvol_name, u64 *subvol_objectid) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *orig, *p; char *num = NULL; int error = 0; @@ -900,8 +939,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, return 0; /* - * strsep changes the string, duplicate it because parse_options - * gets called twice + * strsep changes the string, duplicate it because + * btrfs_parse_early_options gets called later */ opts = kstrdup(options, GFP_KERNEL); if (!opts) @@ -940,18 +979,6 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, case Opt_subvolrootid: pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n"); break; - case Opt_device: - device_name = match_strdup(&args[0]); - if (!device_name) { - error = -ENOMEM; - goto out; - } - error = btrfs_scan_one_device(device_name, - flags, holder, fs_devices); - kfree(device_name); - if (error) - goto out; - break; default: break; } @@ -1243,7 +1270,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); - char *compress_type; + const char *compress_type; if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); @@ -1259,12 +1286,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); if (btrfs_test_opt(info, COMPRESS)) { - if (info->compress_type == BTRFS_COMPRESS_ZLIB) - compress_type = "zlib"; - else if (info->compress_type == BTRFS_COMPRESS_LZO) - compress_type = "lzo"; - else - compress_type = "zstd"; + compress_type = btrfs_compress_type2str(info->compress_type); if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else @@ -1365,86 +1387,12 @@ static inline int is_subvolume_inode(struct inode *inode) return 0; } -/* - * This will add subvolid=0 to the argument string while removing any subvol= - * and subvolid= arguments to make sure we get the top-level root for path - * walking to the subvol we want. - */ -static char *setup_root_args(char *args) -{ - char *buf, *dst, *sep; - - if (!args) - return kstrdup("subvolid=0", GFP_KERNEL); - - /* The worst case is that we add ",subvolid=0" to the end. */ - buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, - GFP_KERNEL); - if (!buf) - return NULL; - - while (1) { - sep = strchrnul(args, ','); - if (!strstarts(args, "subvol=") && - !strstarts(args, "subvolid=")) { - memcpy(dst, args, sep - args); - dst += sep - args; - *dst++ = ','; - } - if (*sep) - args = sep + 1; - else - break; - } - strcpy(dst, "subvolid=0"); - - return buf; -} - static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, - int flags, const char *device_name, - char *data) + const char *device_name, struct vfsmount *mnt) { struct dentry *root; - struct vfsmount *mnt = NULL; - char *newargs; int ret; - newargs = setup_root_args(data); - if (!newargs) { - root = ERR_PTR(-ENOMEM); - goto out; - } - - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); - if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { - if (flags & SB_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~SB_RDONLY, - device_name, newargs); - } else { - mnt = vfs_kern_mount(&btrfs_fs_type, flags | SB_RDONLY, - device_name, newargs); - if (IS_ERR(mnt)) { - root = ERR_CAST(mnt); - mnt = NULL; - goto out; - } - - down_write(&mnt->mnt_sb->s_umount); - ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); - up_write(&mnt->mnt_sb->s_umount); - if (ret < 0) { - root = ERR_PTR(ret); - goto out; - } - } - } - if (IS_ERR(mnt)) { - root = ERR_CAST(mnt); - mnt = NULL; - goto out; - } - if (!subvol_name) { if (!subvol_objectid) { ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), @@ -1500,7 +1448,6 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, out: mntput(mnt); - kfree(newargs); kfree(subvol_name); return root; } @@ -1558,11 +1505,11 @@ static int setup_security_options(struct btrfs_fs_info *fs_info, /* * Find a superblock for the given device / mount point. * - * Note: This is based on get_sb_bdev from fs/super.c with a few additions - * for multiple device setup. Make sure to keep it in sync. + * Note: This is based on mount_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. */ -static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, - const char *device_name, void *data) +static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, + int flags, const char *device_name, void *data) { struct block_device *bdev = NULL; struct super_block *s; @@ -1570,27 +1517,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; fmode_t mode = FMODE_READ; - char *subvol_name = NULL; - u64 subvol_objectid = 0; int error = 0; if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; error = btrfs_parse_early_options(data, mode, fs_type, - &subvol_name, &subvol_objectid, &fs_devices); if (error) { - kfree(subvol_name); return ERR_PTR(error); } - if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { - /* mount_subvol() will free subvol_name. */ - return mount_subvol(subvol_name, subvol_objectid, flags, - device_name, data); - } - security_init_mnt_opts(&new_sec_opts); if (data) { error = parse_security_options(data, &new_sec_opts); @@ -1674,6 +1611,84 @@ error_sec_opts: return ERR_PTR(error); } +/* + * Mount function which is called by VFS layer. + * + * In order to allow mounting a subvolume directly, btrfs uses mount_subtree() + * which needs vfsmount* of device's root (/). This means device's root has to + * be mounted internally in any case. + * + * Operation flow: + * 1. Parse subvol id related options for later use in mount_subvol(). + * + * 2. Mount device's root (/) by calling vfs_kern_mount(). + * + * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the + * first place. In order to avoid calling btrfs_mount() again, we use + * different file_system_type which is not registered to VFS by + * register_filesystem() (btrfs_root_fs_type). As a result, + * btrfs_mount_root() is called. The return value will be used by + * mount_subtree() in mount_subvol(). + * + * 3. Call mount_subvol() to get the dentry of subvolume. Since there is + * "btrfs subvolume set-default", mount_subvol() is called always. + */ +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, + const char *device_name, void *data) +{ + struct vfsmount *mnt_root; + struct dentry *root; + fmode_t mode = FMODE_READ; + char *subvol_name = NULL; + u64 subvol_objectid = 0; + int error = 0; + + if (!(flags & SB_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_subvol_options(data, mode, + &subvol_name, &subvol_objectid); + if (error) { + kfree(subvol_name); + return ERR_PTR(error); + } + + /* mount device's root (/) */ + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data); + if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) { + if (flags & SB_RDONLY) { + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, + flags & ~SB_RDONLY, device_name, data); + } else { + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, + flags | SB_RDONLY, device_name, data); + if (IS_ERR(mnt_root)) { + root = ERR_CAST(mnt_root); + goto out; + } + + down_write(&mnt_root->mnt_sb->s_umount); + error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL); + up_write(&mnt_root->mnt_sb->s_umount); + if (error < 0) { + root = ERR_PTR(error); + mntput(mnt_root); + goto out; + } + } + } + if (IS_ERR(mnt_root)) { + root = ERR_CAST(mnt_root); + goto out; + } + + /* mount_subvol() will free subvol_name and mnt_root */ + root = mount_subvol(subvol_name, subvol_objectid, device_name, mnt_root); + +out: + return root; +} + static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, int new_pool_size, int old_pool_size) { @@ -1820,7 +1835,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } - if (!btrfs_check_rw_degradable(fs_info)) { + if (!btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "too many missing devices, writeable remount is not allowed"); ret = -EACCES; @@ -1972,8 +1987,10 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (!device->in_fs_metadata || !device->bdev || - device->is_tgtdev_for_dev_replace) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) || + !device->bdev || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; if (i >= nr_devices) @@ -2174,6 +2191,15 @@ static struct file_system_type btrfs_fs_type = { .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, }; + +static struct file_system_type btrfs_root_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .mount = btrfs_mount_root, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, +}; + MODULE_ALIAS_FS("btrfs"); static int btrfs_control_open(struct inode *inode, struct file *file) @@ -2207,11 +2233,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case BTRFS_IOC_SCAN_DEV: ret = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_fs_type, &fs_devices); + &btrfs_root_fs_type, &fs_devices); break; case BTRFS_IOC_DEVICES_READY: ret = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_fs_type, &fs_devices); + &btrfs_root_fs_type, &fs_devices); if (ret) break; ret = !(fs_devices->num_devices == fs_devices->total_devices); @@ -2269,7 +2295,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) while (cur_devices) { head = &cur_devices->devices; list_for_each_entry(dev, head, dev_list) { - if (dev->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->name) continue; @@ -2324,7 +2350,7 @@ static struct miscdevice btrfs_misc = { MODULE_ALIAS_MISCDEV(BTRFS_MINOR); MODULE_ALIAS("devname:btrfs-control"); -static int btrfs_interface_init(void) +static int __init btrfs_interface_init(void) { return misc_register(&btrfs_misc); } @@ -2334,7 +2360,7 @@ static void btrfs_interface_exit(void) misc_deregister(&btrfs_misc); } -static void btrfs_print_mod_info(void) +static void __init btrfs_print_mod_info(void) { pr_info("Btrfs loaded, crc32c=%s" #ifdef CONFIG_BTRFS_DEBUG diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a28bba801264..a8bafed931f4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -897,7 +897,7 @@ static int btrfs_init_debugfs(void) return 0; } -int btrfs_init_sysfs(void) +int __init btrfs_init_sysfs(void) { int ret; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d3f25376a0f8..9786d8cd0aa6 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -277,6 +277,9 @@ int btrfs_run_sanity_tests(void) goto out; } } + ret = btrfs_test_extent_map(); + if (ret) + goto out; out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 266f1e3d1784..bc0615bac3cc 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -33,6 +33,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_map(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c new file mode 100644 index 000000000000..70c993f01670 --- /dev/null +++ b/fs/btrfs/tests/extent-map-tests.c @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2017 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/types.h> +#include "btrfs-tests.h" +#include "../ctree.h" + +static void free_extent_map_tree(struct extent_map_tree *em_tree) +{ + struct extent_map *em; + struct rb_node *node; + + while (!RB_EMPTY_ROOT(&em_tree->map)) { + node = rb_first(&em_tree->map); + em = rb_entry(node, struct extent_map, rb_node); + remove_extent_mapping(em_tree, em); + +#ifdef CONFIG_BTRFS_DEBUG + if (refcount_read(&em->refs) != 1) { + test_msg( +"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d\n", + em->start, em->len, em->block_start, + em->block_len, refcount_read(&em->refs)); + + refcount_set(&em->refs, 1); + } +#endif + free_extent_map(em); + } +} + +/* + * Test scenario: + * + * Suppose that no extent map has been loaded into memory yet, there is a file + * extent [0, 16K), followed by another file extent [16K, 20K), two dio reads + * are entering btrfs_get_extent() concurrently, t1 is reading [8K, 16K), t2 is + * reading [0, 8K) + * + * t1 t2 + * btrfs_get_extent() btrfs_get_extent() + * -> lookup_extent_mapping() ->lookup_extent_mapping() + * -> add_extent_mapping(0, 16K) + * -> return em + * ->add_extent_mapping(0, 16K) + * -> #handle -EEXIST + */ +static void test_case_1(struct extent_map_tree *em_tree) +{ + struct extent_map *em; + u64 start = 0; + u64 len = SZ_8K; + int ret; + + em = alloc_extent_map(); + if (!em) + /* Skip the test on error. */ + return; + + /* Add [0, 16K) */ + em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_16K; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + /* Add [16K, 20K) following [0, 16K) */ + em = alloc_extent_map(); + if (!em) + goto out; + + em->start = SZ_16K; + em->len = SZ_4K; + em->block_start = SZ_32K; /* avoid merging */ + em->block_len = SZ_4K; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) + goto out; + + /* Add [0, 8K), should return [0, 16K) instead. */ + em->start = start; + em->len = len; + em->block_start = start; + em->block_len = len; + ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len); + if (ret) + test_msg("case1 [%llu %llu]: ret %d\n", start, start + len, ret); + if (em && + (em->start != 0 || extent_map_end(em) != SZ_16K || + em->block_start != 0 || em->block_len != SZ_16K)) + test_msg( +"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n", + start, start + len, ret, em->start, em->len, + em->block_start, em->block_len); + free_extent_map(em); +out: + /* free memory */ + free_extent_map_tree(em_tree); +} + +/* + * Test scenario: + * + * Reading the inline ending up with EEXIST, ie. read an inline + * extent and discard page cache and read it again. + */ +static void test_case_2(struct extent_map_tree *em_tree) +{ + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) + /* Skip the test on error. */ + return; + + /* Add [0, 1K) */ + em->start = 0; + em->len = SZ_1K; + em->block_start = EXTENT_MAP_INLINE; + em->block_len = (u64)-1; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + /* Add [4K, 4K) following [0, 1K) */ + em = alloc_extent_map(); + if (!em) + goto out; + + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_4K; + em->block_len = SZ_4K; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) + goto out; + + /* Add [0, 1K) */ + em->start = 0; + em->len = SZ_1K; + em->block_start = EXTENT_MAP_INLINE; + em->block_len = (u64)-1; + ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len); + if (ret) + test_msg("case2 [0 1K]: ret %d\n", ret); + if (em && + (em->start != 0 || extent_map_end(em) != SZ_1K || + em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) + test_msg( +"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n", + ret, em->start, em->len, em->block_start, + em->block_len); + free_extent_map(em); +out: + /* free memory */ + free_extent_map_tree(em_tree); +} + +static void __test_case_3(struct extent_map_tree *em_tree, u64 start) +{ + struct extent_map *em; + u64 len = SZ_4K; + int ret; + + em = alloc_extent_map(); + if (!em) + /* Skip this test on error. */ + return; + + /* Add [4K, 8K) */ + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_4K; + em->block_len = SZ_4K; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) + goto out; + + /* Add [0, 16K) */ + em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_16K; + ret = btrfs_add_extent_mapping(em_tree, &em, start, len); + if (ret) + test_msg("case3 [0x%llx 0x%llx): ret %d\n", + start, start + len, ret); + /* + * Since bytes within em are contiguous, em->block_start is identical to + * em->start. + */ + if (em && + (start < em->start || start + len > extent_map_end(em) || + em->start != em->block_start || em->len != em->block_len)) + test_msg( +"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n", + start, start + len, ret, em->start, em->len, + em->block_start, em->block_len); + free_extent_map(em); +out: + /* free memory */ + free_extent_map_tree(em_tree); +} + +/* + * Test scenario: + * + * Suppose that no extent map has been loaded into memory yet. + * There is a file extent [0, 16K), two jobs are running concurrently + * against it, t1 is buffered writing to [4K, 8K) and t2 is doing dio + * read from [0, 4K) or [8K, 12K) or [12K, 16K). + * + * t1 goes ahead of t2 and adds em [4K, 8K) into tree. + * + * t1 t2 + * cow_file_range() btrfs_get_extent() + * -> lookup_extent_mapping() + * -> add_extent_mapping() + * -> add_extent_mapping() + */ +static void test_case_3(struct extent_map_tree *em_tree) +{ + __test_case_3(em_tree, 0); + __test_case_3(em_tree, SZ_8K); + __test_case_3(em_tree, (12 * 1024ULL)); +} + +static void __test_case_4(struct extent_map_tree *em_tree, u64 start) +{ + struct extent_map *em; + u64 len = SZ_4K; + int ret; + + em = alloc_extent_map(); + if (!em) + /* Skip this test on error. */ + return; + + /* Add [0K, 8K) */ + em->start = 0; + em->len = SZ_8K; + em->block_start = 0; + em->block_len = SZ_8K; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) + goto out; + + /* Add [8K, 24K) */ + em->start = SZ_8K; + em->len = 24 * 1024ULL; + em->block_start = SZ_16K; /* avoid merging */ + em->block_len = 24 * 1024ULL; + ret = add_extent_mapping(em_tree, em, 0); + ASSERT(ret == 0); + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) + goto out; + /* Add [0K, 32K) */ + em->start = 0; + em->len = SZ_32K; + em->block_start = 0; + em->block_len = SZ_32K; + ret = btrfs_add_extent_mapping(em_tree, &em, start, len); + if (ret) + test_msg("case4 [0x%llx 0x%llx): ret %d\n", + start, len, ret); + if (em && + (start < em->start || start + len > extent_map_end(em))) + test_msg( +"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n", + start, len, ret, em->start, em->len, em->block_start, + em->block_len); + free_extent_map(em); +out: + /* free memory */ + free_extent_map_tree(em_tree); +} + +/* + * Test scenario: + * + * Suppose that no extent map has been loaded into memory yet. + * There is a file extent [0, 32K), two jobs are running concurrently + * against it, t1 is doing dio write to [8K, 32K) and t2 is doing dio + * read from [0, 4K) or [4K, 8K). + * + * t1 goes ahead of t2 and splits em [0, 32K) to em [0K, 8K) and [8K 32K). + * + * t1 t2 + * btrfs_get_blocks_direct() btrfs_get_blocks_direct() + * -> btrfs_get_extent() -> btrfs_get_extent() + * -> lookup_extent_mapping() + * -> add_extent_mapping() -> lookup_extent_mapping() + * # load [0, 32K) + * -> btrfs_new_extent_direct() + * -> btrfs_drop_extent_cache() + * # split [0, 32K) + * -> add_extent_mapping() + * # add [8K, 32K) + * -> add_extent_mapping() + * # handle -EEXIST when adding + * # [0, 32K) + */ +static void test_case_4(struct extent_map_tree *em_tree) +{ + __test_case_4(em_tree, 0); + __test_case_4(em_tree, SZ_4K); +} + +int btrfs_test_extent_map() +{ + struct extent_map_tree *em_tree; + + test_msg("Running extent_map tests\n"); + + em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL); + if (!em_tree) + /* Skip the test on error. */ + return 0; + + extent_map_tree_init(em_tree); + + test_case_1(em_tree); + test_case_2(em_tree); + test_case_3(em_tree); + test_case_4(em_tree); + + kfree(em_tree); + return 0; +} diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 30affb60da51..13420cd19ef0 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -288,10 +288,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - test_msg("Vacancy flag wasn't set properly\n"); - goto out; - } free_extent_map(em); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); @@ -1001,8 +997,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1070,8 +1065,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1104,8 +1098,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1121,8 +1114,7 @@ out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -1134,7 +1126,6 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) int ret; set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); - set_bit(EXTENT_FLAG_VACANCY, &vacancy_only); set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); test_msg("Running btrfs_get_extent tests\n"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5a8c2649af2f..04f07144b45c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -495,8 +495,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; - h->use_count++; - WARN_ON(h->use_count > 2); + refcount_inc(&h->use_count); + WARN_ON(refcount_read(&h->use_count) > 2); h->orig_rsv = h->block_rsv; h->block_rsv = NULL; goto got_it; @@ -567,7 +567,7 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; h->root = root; - h->use_count = 1; + refcount_set(&h->use_count, 1); h->fs_info = root->fs_info; h->type = type; @@ -837,8 +837,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int err = 0; int must_run_delayed_refs = 0; - if (trans->use_count > 1) { - trans->use_count--; + if (refcount_read(&trans->use_count) > 1) { + refcount_dec(&trans->use_count); trans->block_rsv = trans->orig_rsv; return 0; } @@ -1016,8 +1016,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * it's safe to do it (through clear_btree_io_tree()). */ err = clear_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, - 0, 0, &cached_state, GFP_NOFS); + EXTENT_NEED_WAIT, 0, 0, &cached_state); if (err == -ENOMEM) err = 0; if (!err) @@ -1869,7 +1868,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; DEFINE_WAIT(wait); - WARN_ON(trans->use_count > 1); + WARN_ON(refcount_read(&trans->use_count) > 1); btrfs_abort_transaction(trans, err); @@ -2266,16 +2265,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } ret = write_all_supers(fs_info, 0); - if (ret) { - mutex_unlock(&fs_info->tree_log_mutex); - goto scrub_continue; - } - /* * the super is written, we can safely allow the tree-loggers * to go about their business */ mutex_unlock(&fs_info->tree_log_mutex); + if (ret) + goto scrub_continue; btrfs_finish_extent_commit(trans, fs_info); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index c55e44560103..6beee072b1bd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -58,6 +58,7 @@ struct btrfs_transaction { /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; + int aborted; struct list_head list; struct extent_io_tree dirty_pages; unsigned long start_time; @@ -70,7 +71,6 @@ struct btrfs_transaction { struct list_head dirty_bgs; struct list_head io_bgs; struct list_head dropped_roots; - u64 num_dirty_bgs; /* * we need to make sure block group deletion doesn't race with @@ -79,11 +79,11 @@ struct btrfs_transaction { */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; + unsigned int num_dirty_bgs; /* Protected by spin lock fs_info->unused_bgs_lock. */ struct list_head deleted_bgs; spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; - int aborted; struct btrfs_fs_info *fs_info; }; @@ -111,20 +111,19 @@ struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; u64 chunk_bytes_reserved; - unsigned long use_count; - unsigned long blocks_reserved; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; + refcount_t use_count; + unsigned int type; short aborted; - short adding_csums; + bool adding_csums; bool allocating_chunk; bool can_flush_pending_bgs; bool reloc_reserved; bool sync; bool dirty; - unsigned int type; struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct list_head new_bgs; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ce4ed6ec8f39..c3c8d48f6618 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -30,6 +30,7 @@ #include "tree-checker.h" #include "disk-io.h" #include "compression.h" +#include "hash.h" /* * Error message should follow the following format: @@ -223,6 +224,142 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, } /* + * Customized reported for dir_item, only important new info is key->objectid, + * which represents inode number + */ +__printf(4, 5) +static void dir_item_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, + btrfs_header_bytenr(eb), slot, key.objectid, &vaf); + va_end(args); +} + +static int check_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_dir_item *di; + u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 cur = 0; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + while (cur < item_size) { + u32 name_len; + u32 data_len; + u32 max_name_len; + u32 total_size; + u32 name_hash; + u8 dir_type; + + /* header itself should not cross item boundary */ + if (cur + sizeof(*di) > item_size) { + dir_item_err(root, leaf, slot, + "dir item header crosses item boundary, have %zu boundary %u", + cur + sizeof(*di), item_size); + return -EUCLEAN; + } + + /* dir type check */ + dir_type = btrfs_dir_type(leaf, di); + if (dir_type >= BTRFS_FT_MAX) { + dir_item_err(root, leaf, slot, + "invalid dir item type, have %u expect [0, %u)", + dir_type, BTRFS_FT_MAX); + return -EUCLEAN; + } + + if (key->type == BTRFS_XATTR_ITEM_KEY && + dir_type != BTRFS_FT_XATTR) { + dir_item_err(root, leaf, slot, + "invalid dir item type for XATTR key, have %u expect %u", + dir_type, BTRFS_FT_XATTR); + return -EUCLEAN; + } + if (dir_type == BTRFS_FT_XATTR && + key->type != BTRFS_XATTR_ITEM_KEY) { + dir_item_err(root, leaf, slot, + "xattr dir type found for non-XATTR key"); + return -EUCLEAN; + } + if (dir_type == BTRFS_FT_XATTR) + max_name_len = XATTR_NAME_MAX; + else + max_name_len = BTRFS_NAME_LEN; + + /* Name/data length check */ + name_len = btrfs_dir_name_len(leaf, di); + data_len = btrfs_dir_data_len(leaf, di); + if (name_len > max_name_len) { + dir_item_err(root, leaf, slot, + "dir item name len too long, have %u max %u", + name_len, max_name_len); + return -EUCLEAN; + } + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) { + dir_item_err(root, leaf, slot, + "dir item name and data len too long, have %u max %u", + name_len + data_len, + BTRFS_MAX_XATTR_SIZE(root->fs_info)); + return -EUCLEAN; + } + + if (data_len && dir_type != BTRFS_FT_XATTR) { + dir_item_err(root, leaf, slot, + "dir item with invalid data len, have %u expect 0", + data_len); + return -EUCLEAN; + } + + total_size = sizeof(*di) + name_len + data_len; + + /* header and name/data should not cross item boundary */ + if (cur + total_size > item_size) { + dir_item_err(root, leaf, slot, + "dir item data crosses item boundary, have %u boundary %u", + cur + total_size, item_size); + return -EUCLEAN; + } + + /* + * Special check for XATTR/DIR_ITEM, as key->offset is name + * hash, should match its name + */ + if (key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_XATTR_ITEM_KEY) { + char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; + + read_extent_buffer(leaf, namebuf, + (unsigned long)(di + 1), name_len); + name_hash = btrfs_name_hash(namebuf, name_len); + if (key->offset != name_hash) { + dir_item_err(root, leaf, slot, + "name hash mismatch with key, have 0x%016x expect 0x%016llx", + name_hash, key->offset); + return -EUCLEAN; + } + } + cur += total_size; + di = (struct btrfs_dir_item *)((void *)di + total_size); + } + return 0; +} + +/* * Common point to switch the item-specific validation. */ static int check_leaf_item(struct btrfs_root *root, @@ -238,6 +375,11 @@ static int check_leaf_item(struct btrfs_root *root, case BTRFS_EXTENT_CSUM_KEY: ret = check_csum_item(root, leaf, key, slot); break; + case BTRFS_DIR_ITEM_KEY: + case BTRFS_DIR_INDEX_KEY: + case BTRFS_XATTR_ITEM_KEY: + ret = check_dir_item(root, leaf, key, slot); + break; } return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7bf9b31561db..afadaadab18e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/list_sort.h> +#include <linux/iversion.h> #include "tree-log.h" #include "disk-io.h" #include "locking.h" @@ -1173,19 +1174,15 @@ next: return 0; } -static int extref_get_fields(struct extent_buffer *eb, int slot, - unsigned long ref_ptr, u32 *namelen, char **name, - u64 *index, u64 *parent_objectid) +static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index, + u64 *parent_objectid) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ref_ptr; *namelen = btrfs_inode_extref_name_len(eb, extref); - if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name, - *namelen)) - return -EIO; - *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1200,19 +1197,14 @@ static int extref_get_fields(struct extent_buffer *eb, int slot, return 0; } -static int ref_get_fields(struct extent_buffer *eb, int slot, - unsigned long ref_ptr, u32 *namelen, char **name, - u64 *index) +static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ref_ptr; *namelen = btrfs_inode_ref_name_len(eb, ref); - if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1), - *namelen)) - return -EIO; - *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1287,8 +1279,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, slot, ref_ptr, &namelen, - &name, &ref_index, &parent_objectid); + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index, &parent_objectid); /* * parent object can change from one array * item to another. @@ -1300,8 +1292,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, slot, ref_ptr, &namelen, - &name, &ref_index); + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index); } if (ret) goto out; @@ -1835,7 +1827,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; u32 item_size = btrfs_item_size_nr(eb, slot); struct btrfs_dir_item *di; @@ -1848,8 +1839,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, slot, di)) - return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); if (ret < 0) @@ -2024,11 +2013,6 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, slot, di)) { - ret = -EIO; - goto out; - } - name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); if (!name) { @@ -2109,7 +2093,6 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, struct btrfs_path *path, const u64 ino) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key search_key; struct btrfs_path *log_path; int i; @@ -2151,11 +2134,6 @@ process_leaf: u32 this_len = sizeof(*di) + name_len + data_len; char *name; - ret = verify_dir_item(fs_info, path->nodes[0], i, di); - if (ret) { - ret = -EIO; - goto out; - } name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; @@ -3609,7 +3587,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), &token); - btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_sequence(leaf, item, + inode_peek_iversion(inode), &token); btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); @@ -4572,12 +4551,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, this_len = sizeof(*extref) + this_name_len; } - ret = btrfs_is_name_len_valid(eb, slot, name_ptr, - this_name_len); - if (!ret) { - ret = -EIO; - goto out; - } if (this_name_len > name_len) { char *new_name; @@ -5432,11 +5405,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *parent, const loff_t start, const loff_t end, - int exists_only, + int inode_only, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; - int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; struct dentry *old_parent = NULL; int ret = 0; @@ -5602,7 +5574,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, int ret; ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), - parent, start, end, 0, ctx); + parent, start, end, LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -5865,6 +5837,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, return 0; return btrfs_log_inode_parent(trans, root, inode, parent, 0, - LLONG_MAX, 1, NULL); + LLONG_MAX, LOG_INODE_EXISTS, NULL); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a25684287501..b5036bd69e6a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -145,6 +145,71 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map); +/* + * Device locking + * ============== + * + * There are several mutexes that protect manipulation of devices and low-level + * structures like chunks but not block groups, extents or files + * + * uuid_mutex (global lock) + * ------------------------ + * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from + * the SCAN_DEV ioctl registration or from mount either implicitly (the first + * device) or requested by the device= mount option + * + * the mutex can be very coarse and can cover long-running operations + * + * protects: updates to fs_devices counters like missing devices, rw devices, + * seeding, structure cloning, openning/closing devices at mount/umount time + * + * global::fs_devs - add, remove, updates to the global list + * + * does not protect: manipulation of the fs_devices::devices list! + * + * btrfs_device::name - renames (write side), read is RCU + * + * fs_devices::device_list_mutex (per-fs, with RCU) + * ------------------------------------------------ + * protects updates to fs_devices::devices, ie. adding and deleting + * + * simple list traversal with read-only actions can be done with RCU protection + * + * may be used to exclude some operations from running concurrently without any + * modifications to the list (see write_all_supers) + * + * volume_mutex + * ------------ + * coarse lock owned by a mounted filesystem; used to exclude some operations + * that cannot run in parallel and affect the higher-level properties of the + * filesystem like: device add/deleting/resize/replace, or balance + * + * balance_mutex + * ------------- + * protects balance structures (status, state) and context accessed from + * several places (internally, ioctl) + * + * chunk_mutex + * ----------- + * protects chunks, adding or removing during allocation, trim or when a new + * device is added/removed + * + * cleaner_mutex + * ------------- + * a big lock that is held by the cleaner thread and prevents running subvolume + * cleaning together with relocation or delayed iputs + * + * + * Lock nesting + * ============ + * + * uuid_mutex + * volume_mutex + * device_list_mutex + * chunk_mutex + * balance_mutex + */ + DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); struct list_head *btrfs_get_fs_uuids(void) @@ -180,6 +245,13 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) return fs_devs; } +static void free_device(struct btrfs_device *device) +{ + rcu_string_free(device->name); + bio_put(device->flush_bio); + kfree(device); +} + static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; @@ -188,9 +260,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); + free_device(device); } kfree(fs_devices); } @@ -220,6 +290,11 @@ void btrfs_cleanup_fs_uuids(void) } } +/* + * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. + * Returned struct is not linked onto any lists and must be destroyed using + * free_device. + */ static struct btrfs_device *__alloc_device(void) { struct btrfs_device *dev; @@ -244,7 +319,6 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->io_lock); - spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); @@ -530,45 +604,42 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } - -static void btrfs_free_stale_device(struct btrfs_device *cur_dev) +/* + * Search and remove all stale (devices which are not mounted) devices. + * When both inputs are NULL, it will search and release all stale devices. + * path: Optional. When provided will it release all unmounted devices + * matching this path only. + * skip_dev: Optional. Will skip this device when searching for the stale + * devices. + */ +static void btrfs_free_stale_devices(const char *path, + struct btrfs_device *skip_dev) { - struct btrfs_fs_devices *fs_devs; - struct btrfs_device *dev; - - if (!cur_dev->name) - return; + struct btrfs_fs_devices *fs_devs, *tmp_fs_devs; + struct btrfs_device *dev, *tmp_dev; - list_for_each_entry(fs_devs, &fs_uuids, list) { - int del = 1; + list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) { if (fs_devs->opened) continue; - if (fs_devs->seeding) - continue; - list_for_each_entry(dev, &fs_devs->devices, dev_list) { + list_for_each_entry_safe(dev, tmp_dev, + &fs_devs->devices, dev_list) { + int not_found = 0; - if (dev == cur_dev) + if (skip_dev && skip_dev == dev) continue; - if (!dev->name) + if (path && !dev->name) continue; - /* - * Todo: This won't be enough. What if the same device - * comes back (with new uuid and) with its mapper path? - * But for now, this does help as mostly an admin will - * either use mapper or non mapper path throughout. - */ rcu_read_lock(); - del = strcmp(rcu_str_deref(dev->name), - rcu_str_deref(cur_dev->name)); + if (path) + not_found = strcmp(rcu_str_deref(dev->name), + path); rcu_read_unlock(); - if (!del) - break; - } + if (not_found) + continue; - if (!del) { /* delete the stale device */ if (fs_devs->num_devices == 1) { btrfs_sysfs_remove_fsid(fs_devs); @@ -577,38 +648,99 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev) } else { fs_devs->num_devices--; list_del(&dev->dev_list); - rcu_string_free(dev->name); - bio_put(dev->flush_bio); - kfree(dev); + free_device(dev); } - break; } } } +static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device, fmode_t flags, + void *holder) +{ + struct request_queue *q; + struct block_device *bdev; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 devid; + int ret; + + if (device->bdev) + return -EINVAL; + if (!device->name) + return -EINVAL; + + ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh); + if (ret) + return ret; + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + fs_devices->seeding = 1; + } else { + if (bdev_read_only(bdev)) + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + else + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + } + + q = bdev_get_queue(bdev); + if (!blk_queue_nonrot(q)) + fs_devices->rotating = 1; + + device->bdev = bdev; + clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + device->mode = flags; + + fs_devices->open_devices++; + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, &fs_devices->alloc_list); + } + brelse(bh); + + return 0; + +error_brelse: + brelse(bh); + blkdev_put(bdev, flags); + + return -EINVAL; +} + /* * Add new device to list of registered devices * * Returns: - * 1 - first time device is seen - * 0 - device already known - * < 0 - error + * device pointer which was just added or updated when successful + * error pointer when failed */ -static noinline int device_list_add(const char *path, - struct btrfs_super_block *disk_super, - u64 devid, struct btrfs_fs_devices **fs_devices_ret) +static noinline struct btrfs_device *device_list_add(const char *path, + struct btrfs_super_block *disk_super) { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; struct rcu_string *name; - int ret = 0; u64 found_transid = btrfs_super_generation(disk_super); + u64 devid = btrfs_stack_device_id(&disk_super->dev_item); fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { fs_devices = alloc_fs_devices(disk_super->fsid); if (IS_ERR(fs_devices)) - return PTR_ERR(fs_devices); + return ERR_CAST(fs_devices); list_add(&fs_devices->list, &fs_uuids); @@ -620,20 +752,19 @@ static noinline int device_list_add(const char *path, if (!device) { if (fs_devices->opened) - return -EBUSY; + return ERR_PTR(-EBUSY); device = btrfs_alloc_device(NULL, &devid, disk_super->dev_item.uuid); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ - return PTR_ERR(device); + return device; } name = rcu_string_strdup(path, GFP_NOFS); if (!name) { - bio_put(device->flush_bio); - kfree(device); - return -ENOMEM; + free_device(device); + return ERR_PTR(-ENOMEM); } rcu_assign_pointer(device->name, name); @@ -642,8 +773,16 @@ static noinline int device_list_add(const char *path, fs_devices->num_devices++; mutex_unlock(&fs_devices->device_list_mutex); - ret = 1; device->fs_devices = fs_devices; + btrfs_free_stale_devices(path, device); + + if (disk_super->label[0]) + pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", + disk_super->label, devid, found_transid, path); + else + pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n", + disk_super->fsid, devid, found_transid, path); + } else if (!device->name || strcmp(device->name->str, path)) { /* * When FS is already mounted. @@ -679,17 +818,17 @@ static noinline int device_list_add(const char *path, * with larger generation number or the last-in if * generation are equal. */ - return -EEXIST; + return ERR_PTR(-EEXIST); } name = rcu_string_strdup(path, GFP_NOFS); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); rcu_string_free(device->name); rcu_assign_pointer(device->name, name); - if (device->missing) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { fs_devices->missing_devices--; - device->missing = 0; + clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } } @@ -702,16 +841,9 @@ static noinline int device_list_add(const char *path, if (!fs_devices->opened) device->generation = found_transid; - /* - * if there is new btrfs on an already registered device, - * then remove the stale device entry. - */ - if (ret > 0) - btrfs_free_stale_device(device); - - *fs_devices_ret = fs_devices; + fs_devices->total_devices = btrfs_super_num_devices(disk_super); - return ret; + return device; } static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) @@ -744,8 +876,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { - bio_put(device->flush_bio); - kfree(device); + free_device(device); goto error; } rcu_assign_pointer(device->name, name); @@ -772,10 +903,12 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->in_fs_metadata) { - if (!device->is_tgtdev_for_dev_replace && - (!latest_dev || - device->generation > latest_dev->generation)) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state)) { + if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state) && + (!latest_dev || + device->generation > latest_dev->generation)) { latest_dev = device; } continue; @@ -792,7 +925,8 @@ again: * not, which means whether this device is * used or whether it should be removed. */ - if (step == 0 || device->is_tgtdev_for_dev_replace) { + if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state)) { continue; } } @@ -801,17 +935,16 @@ again: device->bdev = NULL; fs_devices->open_devices--; } - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { list_del_init(&device->dev_alloc_list); - device->writeable = 0; - if (!device->is_tgtdev_for_dev_replace) + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state)) fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); + free_device(device); } if (fs_devices->seed) { @@ -824,35 +957,25 @@ again: mutex_unlock(&uuid_mutex); } -static void __free_device(struct work_struct *work) -{ - struct btrfs_device *device; - - device = container_of(work, struct btrfs_device, rcu_work); - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); -} - -static void free_device(struct rcu_head *head) +static void free_device_rcu(struct rcu_head *head) { struct btrfs_device *device; device = container_of(head, struct btrfs_device, rcu); - - INIT_WORK(&device->rcu_work, __free_device); - schedule_work(&device->rcu_work); + free_device(device); } static void btrfs_close_bdev(struct btrfs_device *device) { - if (device->bdev && device->writeable) { + if (!device->bdev) + return; + + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } - if (device->bdev) - blkdev_put(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); } static void btrfs_prepare_close_one_device(struct btrfs_device *device) @@ -864,13 +987,13 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device) if (device->bdev) fs_devices->open_devices--; - if (device->writeable && + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } - if (device->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) fs_devices->missing_devices--; new_device = btrfs_alloc_device(NULL, &device->devid, @@ -916,7 +1039,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device); + call_rcu(&device->rcu, free_device_rcu); } WARN_ON(fs_devices->open_devices); @@ -946,93 +1069,32 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) __btrfs_close_devices(fs_devices); free_fs_devices(fs_devices); } - /* - * Wait for rcu kworkers under __btrfs_close_devices - * to finish all blkdev_puts so device is really - * free when umount is done. - */ - rcu_barrier(); return ret; } static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { - struct request_queue *q; - struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - u64 devid; - int seeding = 1; int ret = 0; flags |= FMODE_EXCL; list_for_each_entry(device, head, dev_list) { - if (device->bdev) - continue; - if (!device->name) - continue; - /* Just open everything we can; ignore failures here */ - if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &bh)) + if (btrfs_open_one_device(fs_devices, device, flags, holder)) continue; - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - if (devid != device->devid) - goto error_brelse; - - if (memcmp(device->uuid, disk_super->dev_item.uuid, - BTRFS_UUID_SIZE)) - goto error_brelse; - - device->generation = btrfs_super_generation(disk_super); if (!latest_dev || device->generation > latest_dev->generation) latest_dev = device; - - if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { - device->writeable = 0; - } else { - device->writeable = !bdev_read_only(bdev); - seeding = 0; - } - - q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; - if (!blk_queue_nonrot(q)) - fs_devices->rotating = 1; - - device->bdev = bdev; - device->in_fs_metadata = 0; - device->mode = flags; - - fs_devices->open_devices++; - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - fs_devices->rw_devices++; - list_add(&device->dev_alloc_list, - &fs_devices->alloc_list); - } - brelse(bh); - continue; - -error_brelse: - brelse(bh); - blkdev_put(bdev, flags); - continue; } if (fs_devices->open_devices == 0) { ret = -EINVAL; goto out; } - fs_devices->seeding = seeding; fs_devices->opened = 1; fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; @@ -1116,12 +1178,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret) { struct btrfs_super_block *disk_super; + struct btrfs_device *device; struct block_device *bdev; struct page *page; - int ret = -EINVAL; - u64 devid; - u64 transid; - u64 total_devices; + int ret = 0; u64 bytenr; /* @@ -1140,26 +1200,16 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error; } - if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) + if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { + ret = -EINVAL; goto error_bdev_put; - - devid = btrfs_stack_device_id(&disk_super->dev_item); - transid = btrfs_super_generation(disk_super); - total_devices = btrfs_super_num_devices(disk_super); - - ret = device_list_add(path, disk_super, devid, fs_devices_ret); - if (ret > 0) { - if (disk_super->label[0]) { - pr_info("BTRFS: device label %s ", disk_super->label); - } else { - pr_info("BTRFS: device fsid %pU ", disk_super->fsid); - } - - pr_cont("devid %llu transid %llu %s\n", devid, transid, path); - ret = 0; } - if (!ret && fs_devices_ret) - (*fs_devices_ret)->total_devices = total_devices; + + device = device_list_add(path, disk_super); + if (IS_ERR(device)) + ret = PTR_ERR(device); + else + *fs_devices_ret = device->fs_devices; btrfs_release_disk_super(page); @@ -1185,7 +1235,8 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, *length = 0; - if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) + if (start >= device->total_bytes || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) return 0; path = btrfs_alloc_path(); @@ -1363,7 +1414,8 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, max_hole_size = 0; again: - if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { + if (search_start >= search_end || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -ENOSPC; goto out; } @@ -1570,8 +1622,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; - WARN_ON(!device->in_fs_metadata); - WARN_ON(device->is_tgtdev_for_dev_replace); + WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1661,7 +1713,7 @@ error: * the device information is stored in the chunk root * the btrfs_device struct should be fully filled in */ -static int btrfs_add_device(struct btrfs_trans_handle *trans, +static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_device *device) { @@ -1817,7 +1869,8 @@ static struct btrfs_device * btrfs_find_next_active_device( list_for_each_entry(next_device, &fs_devs->devices, dev_list) { if (next_device != device && - !next_device->missing && next_device->bdev) + !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) + && next_device->bdev) return next_device; } @@ -1858,6 +1911,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 num_devices; int ret = 0; + mutex_lock(&fs_info->volume_mutex); mutex_lock(&uuid_mutex); num_devices = fs_info->fs_devices->num_devices; @@ -1877,17 +1931,18 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (ret) goto out; - if (device->is_tgtdev_for_dev_replace) { + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto out; } - if (device->writeable && fs_info->fs_devices->rw_devices == 1) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + fs_info->fs_devices->rw_devices == 1) { ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; goto out; } - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; @@ -1909,7 +1964,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (ret) goto error_undo; - device->in_fs_metadata = 0; + clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(fs_info, device); /* @@ -1929,7 +1984,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, device->fs_devices->num_devices--; device->fs_devices->total_devices--; - if (device->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) device->fs_devices->missing_devices--; btrfs_assign_next_active_device(fs_info, device, NULL); @@ -1949,11 +2004,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * the devices list. All that's left is to zero out the old * supers and free the device. */ - if (device->writeable) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) btrfs_scratch_superblocks(device->bdev, device->name->str); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device); + call_rcu(&device->rcu, free_device_rcu); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1972,10 +2027,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, out: mutex_unlock(&uuid_mutex); + mutex_unlock(&fs_info->volume_mutex); return ret; error_undo: - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, &fs_info->fs_devices->alloc_list); @@ -2003,10 +2059,10 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, list_del_rcu(&srcdev->dev_list); list_del(&srcdev->dev_alloc_list); fs_devices->num_devices--; - if (srcdev->missing) + if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) fs_devices->missing_devices--; - if (srcdev->writeable) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) fs_devices->rw_devices--; if (srcdev->bdev) @@ -2018,13 +2074,13 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; - if (srcdev->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { /* zero out the old super if it is writable */ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); } btrfs_close_bdev(srcdev); - call_rcu(&srcdev->rcu, free_device); + call_rcu(&srcdev->rcu, free_device_rcu); /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { @@ -2083,7 +2139,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); btrfs_close_bdev(tgtdev); - call_rcu(&tgtdev->rcu, free_device); + call_rcu(&tgtdev->rcu, free_device_rcu); } static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, @@ -2128,7 +2184,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, * is held by the caller. */ list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && !tmp->bdev) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &tmp->dev_state) && !tmp->bdev) { *device = tmp; break; } @@ -2357,26 +2414,19 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { - bio_put(device->flush_bio); - kfree(device); ret = -ENOMEM; - goto error; + goto error_free_device; } rcu_assign_pointer(device->name, name); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); ret = PTR_ERR(trans); - goto error; + goto error_free_device; } q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; - device->writeable = 1; + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = trans->transid; device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; @@ -2387,8 +2437,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->commit_total_bytes = device->total_bytes; device->fs_info = fs_info; device->bdev = bdev; - device->in_fs_metadata = 1; - device->is_tgtdev_for_dev_replace = 0; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); @@ -2449,7 +2499,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } } - ret = btrfs_add_device(trans, fs_info, device); + ret = btrfs_add_dev_item(trans, fs_info, device); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; @@ -2510,9 +2560,8 @@ error_trans: sb->s_flags |= SB_RDONLY; if (trans) btrfs_end_transaction(trans); - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); +error_free_device: + free_device(device); error: blkdev_put(bdev, FMODE_EXCL); if (seeding_dev && !unlocked) { @@ -2527,7 +2576,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device **device_out) { - struct request_queue *q; struct btrfs_device *device; struct block_device *bdev; struct list_head *devices; @@ -2578,18 +2626,14 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { - bio_put(device->flush_bio); - kfree(device); + free_device(device); ret = -ENOMEM; goto error; } rcu_assign_pointer(device->name, name); - q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; mutex_lock(&fs_info->fs_devices->device_list_mutex); - device->writeable = 1; + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; @@ -2602,8 +2646,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; - device->in_fs_metadata = 1; - device->is_tgtdev_for_dev_replace = 1; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); @@ -2631,7 +2675,7 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, tgtdev->io_align = sectorsize; tgtdev->sector_size = sectorsize; tgtdev->fs_info = fs_info; - tgtdev->in_fs_metadata = 1; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tgtdev->dev_state); } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, @@ -2689,7 +2733,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, u64 old_total; u64 diff; - if (!device->writeable) + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return -EACCES; new_size = round_down(new_size, fs_info->sectorsize); @@ -2699,7 +2743,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); if (new_size <= device->total_bytes || - device->is_tgtdev_for_dev_replace) { + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { mutex_unlock(&fs_info->chunk_mutex); return -EINVAL; } @@ -3043,6 +3087,48 @@ error: return ret; } +/* + * return 1 : allocate a data chunk successfully, + * return <0: errors during allocating a data chunk, + * return 0 : no need to allocate a data chunk. + */ +static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, + u64 chunk_offset) +{ + struct btrfs_block_group_cache *cache; + u64 bytes_used; + u64 chunk_type; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + ASSERT(cache); + chunk_type = cache->flags; + btrfs_put_block_group(cache); + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); + + if (!bytes_used) { + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_force_chunk_alloc(trans, fs_info, + BTRFS_BLOCK_GROUP_DATA); + btrfs_end_transaction(trans); + if (ret < 0) + return ret; + + return 1; + } + } + return 0; +} + static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl) { @@ -3501,7 +3587,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; - u64 bytes_used = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3509,10 +3594,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); size_to_free = min_t(u64, size_to_free, SZ_1M); - if (!device->writeable || + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || btrfs_device_get_total_bytes(device) - btrfs_device_get_bytes_used(device) > size_to_free || - device->is_tgtdev_for_dev_replace) + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; ret = btrfs_shrink_device(device, old_size - size_to_free); @@ -3660,28 +3745,21 @@ again: goto loop; } - ASSERT(fs_info->data_sinfo); - spin_lock(&fs_info->data_sinfo->lock); - bytes_used = fs_info->data_sinfo->bytes_used; - spin_unlock(&fs_info->data_sinfo->lock); - - if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && - !chunk_reserved && !bytes_used) { - trans = btrfs_start_transaction(chunk_root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - ret = PTR_ERR(trans); - goto error; - } - - ret = btrfs_force_chunk_alloc(trans, fs_info, - BTRFS_BLOCK_GROUP_DATA); - btrfs_end_transaction(trans); + if (!chunk_reserved) { + /* + * We may be relocating the only data chunk we have, + * which could potentially end up with losing data's + * raid profile, so lets allocate an empty one in + * advance. + */ + ret = btrfs_may_alloc_data_chunk(fs_info, + found_key.offset); if (ret < 0) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto error; + } else if (ret == 1) { + chunk_reserved = 1; } - chunk_reserved = 1; } ret = btrfs_relocate_chunk(fs_info, found_key.offset); @@ -4380,7 +4458,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) new_size = round_down(new_size, fs_info->sectorsize); diff = round_down(old_size - new_size, fs_info->sectorsize); - if (device->is_tgtdev_for_dev_replace) + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) return -EINVAL; path = btrfs_alloc_path(); @@ -4392,7 +4470,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, new_size); - if (device->writeable) { + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes -= diff; atomic64_sub(diff, &fs_info->free_chunk_space); } @@ -4444,6 +4522,18 @@ again: chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); + /* + * We may be relocating the only data chunk we have, + * which could potentially end up with losing data's + * raid profile, so lets allocate an empty one in + * advance. + */ + ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); + if (ret < 0) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + goto done; + } + ret = btrfs_relocate_chunk(fs_info, chunk_offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) @@ -4517,7 +4607,7 @@ done: if (ret) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, old_size); - if (device->writeable) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) device->fs_devices->total_rw_bytes += diff; atomic64_add(diff, &fs_info->free_chunk_space); mutex_unlock(&fs_info->chunk_mutex); @@ -4677,14 +4767,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 max_avail; u64 dev_offset; - if (!device->writeable) { + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { WARN(1, KERN_ERR "BTRFS: read-only device in alloc_list\n"); continue; } - if (!device->in_fs_metadata || - device->is_tgtdev_for_dev_replace) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) continue; if (device->total_bytes > device->bytes_used) @@ -5032,12 +5123,13 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { - if (map->stripes[i].dev->missing) { + if (test_bit(BTRFS_DEV_STATE_MISSING, + &map->stripes[i].dev->dev_state)) { miss_ndevs++; continue; } - - if (!map->stripes[i].dev->writeable) { + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &map->stripes[i].dev->dev_state)) { readonly = 1; goto end; } @@ -5103,7 +5195,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) else if (map->type & BTRFS_BLOCK_GROUP_RAID5) ret = 2; else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - ret = 3; + /* + * There could be two corrupted data stripes, we need + * to loop retry in order to rebuild the correct data. + * + * Fail a stripe at a time on every retry except the + * stripe under reconstruction. + */ + ret = map->num_stripes; else ret = 1; free_extent_map(em); @@ -6003,15 +6102,14 @@ static void btrfs_end_bio(struct bio *bio) dev = bbio->stripes[stripe_index].dev; if (dev->bdev) { if (bio_op(bio) == REQ_OP_WRITE) - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); else - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); - btrfs_dev_stat_print_on_error(dev); } } } @@ -6061,16 +6159,15 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, int should_queue = 1; struct btrfs_pending_bios *pending_bios; - if (device->missing || !device->bdev) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || + !device->bdev) { bio_io_error(bio); return; } /* don't bother with additional async steps for reads, right now */ if (bio_op(bio) == REQ_OP_READ) { - bio_get(bio); btrfsic_submit_bio(bio); - bio_put(bio); return; } @@ -6207,7 +6304,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || - (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) { + (bio_op(first_bio) == REQ_OP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); continue; } @@ -6256,7 +6354,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, device->fs_devices = fs_devices; fs_devices->num_devices++; - device->missing = 1; + set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices++; return device; @@ -6272,8 +6370,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * is generated. * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() - * on error. Returned struct is not linked onto any lists and can be - * destroyed with kfree() right away. + * on error. Returned struct is not linked onto any lists and must be + * destroyed with free_device. */ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, @@ -6296,8 +6394,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { - bio_put(dev->flush_bio); - kfree(dev); + free_device(dev); return ERR_PTR(ret); } } @@ -6476,7 +6573,9 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, } btrfs_report_missing_device(fs_info, devid, uuid, false); } - map->stripes[i].dev->in_fs_metadata = 1; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &(map->stripes[i].dev->dev_state)); + } write_lock(&map_tree->map_tree.lock); @@ -6505,7 +6604,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); - device->is_tgtdev_for_dev_replace = 0; + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); ptr = btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); @@ -6617,7 +6716,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, dev_uuid, false); } - if(!device->bdev && !device->missing) { + if (!device->bdev && + !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { /* * this happens when a device that was properly setup * in the device info lists suddenly goes bad. @@ -6625,12 +6725,13 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, * device->missing to one here */ device->fs_devices->missing_devices++; - device->missing = 1; + set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } /* Move the device to its own fs_devices */ if (device->fs_devices != fs_devices) { - ASSERT(device->missing); + ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, + &device->dev_state)); list_move(&device->dev_list, &fs_devices->devices); device->fs_devices->num_devices--; @@ -6644,15 +6745,16 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, } if (device->fs_devices != fs_info->fs_devices) { - BUG_ON(device->writeable); + BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); if (device->generation != btrfs_device_generation(leaf, dev_item)) return -EINVAL; } fill_device_from_item(leaf, dev_item, device); - device->in_fs_metadata = 1; - if (device->writeable && !device->is_tgtdev_for_dev_replace) { + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && + !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { device->fs_devices->total_rw_bytes += device->total_bytes; atomic64_add(device->total_bytes - device->bytes_used, &fs_info->free_chunk_space); @@ -6784,10 +6886,13 @@ out_short_read: /* * Check if all chunks in the fs are OK for read-write degraded mount * + * If the @failing_dev is specified, it's accounted as missing. + * * Return true if all chunks meet the minimal RW mount requirements. * Return false if any chunk doesn't meet the minimal RW mount requirements. */ -bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info) +bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, + struct btrfs_device *failing_dev) { struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; @@ -6815,12 +6920,16 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info) for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; - if (!dev || !dev->bdev || dev->missing || + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || dev->last_flush_error) missing++; + else if (failing_dev && failing_dev == dev) + missing++; } if (missing > max_tolerated) { - btrfs_warn(fs_info, + if (!failing_dev) + btrfs_warn(fs_info, "chunk %llu missing %d devices, max tolerance is %d for writeable mount", em->start, missing, max_tolerated); free_extent_map(em); @@ -7091,10 +7200,24 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device)) + stats_cnt = atomic_read(&device->dev_stats_ccnt); + if (!device->dev_stats_valid || stats_cnt == 0) continue; - stats_cnt = atomic_read(&device->dev_stats_ccnt); + + /* + * There is a LOAD-LOAD control dependency between the value of + * dev_stats_ccnt and updating the on-disk values which requires + * reading the in-memory counters. Such control dependencies + * require explicit read memory barriers. + * + * This memory barriers pairs with smp_mb__before_atomic in + * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full + * barrier implied by atomic_xchg in + * btrfs_dev_stats_read_and_reset + */ + smp_rmb(); + ret = update_dev_stat_item(trans, fs_info, device); if (!ret) atomic_sub(stats_cnt, &device->dev_stats_ccnt); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ff15208344a7..28c28eeadff3 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -47,6 +47,12 @@ struct btrfs_pending_bios { #define btrfs_device_data_ordered_init(device) do { } while (0) #endif +#define BTRFS_DEV_STATE_WRITEABLE (0) +#define BTRFS_DEV_STATE_IN_FS_METADATA (1) +#define BTRFS_DEV_STATE_MISSING (2) +#define BTRFS_DEV_STATE_REPLACE_TGT (3) +#define BTRFS_DEV_STATE_FLUSH_SENT (4) + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; @@ -69,11 +75,7 @@ struct btrfs_device { /* the mode sent to blkdev_get */ fmode_t mode; - int writeable; - int in_fs_metadata; - int missing; - int can_discard; - int is_tgtdev_for_dev_replace; + unsigned long dev_state; blk_status_t last_flush_error; int flush_bio_sent; @@ -129,14 +131,12 @@ struct btrfs_device { struct completion flush_wait; /* per-device scrub information */ - struct scrub_ctx *scrub_device; + struct scrub_ctx *scrub_ctx; struct btrfs_work work; struct rcu_head rcu; - struct work_struct rcu_work; /* readahead state */ - spinlock_t reada_lock; atomic_t reada_in_flight; u64 reada_next; struct reada_zone *reada_curr_zone; @@ -489,15 +489,16 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 chunk_offset); -static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev) -{ - return atomic_read(&dev->dev_stats_ccnt); -} - static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { atomic_inc(dev->dev_stat_values + index); + /* + * This memory barrier orders stores updating statistics before stores + * updating dev_stats_ccnt. + * + * It pairs with smp_rmb() in btrfs_run_dev_stats(). + */ smp_mb__before_atomic(); atomic_inc(&dev->dev_stats_ccnt); } @@ -514,7 +515,13 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, int ret; ret = atomic_xchg(dev->dev_stat_values + index, 0); - smp_mb__before_atomic(); + /* + * atomic_xchg implies a full memory barriers as per atomic_t.txt: + * - RMW operations that have a return value are fully ordered; + * + * This implicit memory barriers is paired with the smp_rmb in + * btrfs_run_dev_stats + */ atomic_inc(&dev->dev_stats_ccnt); return ret; } @@ -523,6 +530,12 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, int index, unsigned long val) { atomic_set(dev->dev_stat_values + index, val); + /* + * This memory barrier orders stores updating statistics before stores + * updating dev_stats_ccnt. + * + * It pairs with smp_rmb() in btrfs_run_dev_stats(). + */ smp_mb__before_atomic(); atomic_inc(&dev->dev_stats_ccnt); } @@ -540,7 +553,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info, struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); - -bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info); +bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, + struct btrfs_device *failing_dev); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 2c7e53f9ff1b..de7d072c78ef 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -23,6 +23,7 @@ #include <linux/xattr.h> #include <linux/security.h> #include <linux/posix_acl_xattr.h> +#include <linux/iversion.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -267,7 +268,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct btrfs_key key; struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; int ret = 0; @@ -336,11 +336,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) u32 this_len = sizeof(*di) + name_len + data_len; unsigned long name_ptr = (unsigned long)(di + 1); - if (verify_dir_item(fs_info, leaf, slot, di)) { - ret = -EIO; - goto err; - } - total_size += name_len + 1; /* * We are just looking for how big our buffer needs to diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 17f2dd8fddb8..01a4eab602a3 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -43,6 +43,8 @@ struct workspace { size_t size; char *buf; struct list_head list; + ZSTD_inBuffer in_buf; + ZSTD_outBuffer out_buf; }; static void zstd_free_workspace(struct list_head *ws) @@ -94,8 +96,6 @@ static int zstd_compress_pages(struct list_head *ws, int nr_pages = 0; struct page *in_page = NULL; /* The current page to read */ struct page *out_page = NULL; /* The current page to write to */ - ZSTD_inBuffer in_buf = { NULL, 0, 0 }; - ZSTD_outBuffer out_buf = { NULL, 0, 0 }; unsigned long tot_in = 0; unsigned long tot_out = 0; unsigned long len = *total_out; @@ -118,9 +118,9 @@ static int zstd_compress_pages(struct list_head *ws, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - in_buf.src = kmap(in_page); - in_buf.pos = 0; - in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.src = kmap(in_page); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); /* Allocate and map in the output buffer */ @@ -130,14 +130,15 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } pages[nr_pages++] = out_page; - out_buf.dst = kmap(out_page); - out_buf.pos = 0; - out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); while (1) { size_t ret2; - ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf); + ret2 = ZSTD_compressStream(stream, &workspace->out_buf, + &workspace->in_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_compressStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -146,22 +147,22 @@ static int zstd_compress_pages(struct list_head *ws, } /* Check to see if we are making it bigger */ - if (tot_in + in_buf.pos > 8192 && - tot_in + in_buf.pos < - tot_out + out_buf.pos) { + if (tot_in + workspace->in_buf.pos > 8192 && + tot_in + workspace->in_buf.pos < + tot_out + workspace->out_buf.pos) { ret = -E2BIG; goto out; } /* We've reached the end of our output range */ - if (out_buf.pos >= max_out) { - tot_out += out_buf.pos; + if (workspace->out_buf.pos >= max_out) { + tot_out += workspace->out_buf.pos; ret = -E2BIG; goto out; } /* Check if we need more output space */ - if (out_buf.pos == out_buf.size) { + if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; kunmap(out_page); @@ -176,19 +177,20 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } pages[nr_pages++] = out_page; - out_buf.dst = kmap(out_page); - out_buf.pos = 0; - out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, + PAGE_SIZE); } /* We've reached the end of the input */ - if (in_buf.pos >= len) { - tot_in += in_buf.pos; + if (workspace->in_buf.pos >= len) { + tot_in += workspace->in_buf.pos; break; } /* Check if we need more input */ - if (in_buf.pos == in_buf.size) { + if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; kunmap(in_page); put_page(in_page); @@ -196,15 +198,15 @@ static int zstd_compress_pages(struct list_head *ws, start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - in_buf.src = kmap(in_page); - in_buf.pos = 0; - in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.src = kmap(in_page); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } } while (1) { size_t ret2; - ret2 = ZSTD_endStream(stream, &out_buf); + ret2 = ZSTD_endStream(stream, &workspace->out_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_endStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -212,11 +214,11 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } if (ret2 == 0) { - tot_out += out_buf.pos; + tot_out += workspace->out_buf.pos; break; } - if (out_buf.pos >= max_out) { - tot_out += out_buf.pos; + if (workspace->out_buf.pos >= max_out) { + tot_out += workspace->out_buf.pos; ret = -E2BIG; goto out; } @@ -235,9 +237,9 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } pages[nr_pages++] = out_page; - out_buf.dst = kmap(out_page); - out_buf.pos = 0; - out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } if (tot_out >= tot_in) { @@ -273,8 +275,6 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long total_out = 0; - ZSTD_inBuffer in_buf = { NULL, 0, 0 }; - ZSTD_outBuffer out_buf = { NULL, 0, 0 }; stream = ZSTD_initDStream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); @@ -284,18 +284,19 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - in_buf.src = kmap(pages_in[page_in_index]); - in_buf.pos = 0; - in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); - out_buf.dst = workspace->buf; - out_buf.pos = 0; - out_buf.size = PAGE_SIZE; + workspace->out_buf.dst = workspace->buf; + workspace->out_buf.pos = 0; + workspace->out_buf.size = PAGE_SIZE; while (1) { size_t ret2; - ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + ret2 = ZSTD_decompressStream(stream, &workspace->out_buf, + &workspace->in_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -303,38 +304,38 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } buf_start = total_out; - total_out += out_buf.pos; - out_buf.pos = 0; + total_out += workspace->out_buf.pos; + workspace->out_buf.pos = 0; - ret = btrfs_decompress_buf2page(out_buf.dst, buf_start, - total_out, disk_start, orig_bio); + ret = btrfs_decompress_buf2page(workspace->out_buf.dst, + buf_start, total_out, disk_start, orig_bio); if (ret == 0) break; - if (in_buf.pos >= srclen) + if (workspace->in_buf.pos >= srclen) break; /* Check if we've hit the end of a frame */ if (ret2 == 0) break; - if (in_buf.pos == in_buf.size) { + if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap(pages_in[page_in_index++]); if (page_in_index >= total_pages_in) { - in_buf.src = NULL; + workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - in_buf.src = kmap(pages_in[page_in_index]); - in_buf.pos = 0; - in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } } ret = 0; zero_fill_bio(orig_bio); done: - if (in_buf.src) + if (workspace->in_buf.src) kunmap(pages_in[page_in_index]); return ret; } @@ -348,8 +349,6 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, ZSTD_DStream *stream; int ret = 0; size_t ret2; - ZSTD_inBuffer in_buf = { NULL, 0, 0 }; - ZSTD_outBuffer out_buf = { NULL, 0, 0 }; unsigned long total_out = 0; unsigned long pg_offset = 0; char *kaddr; @@ -364,16 +363,17 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, destlen = min_t(size_t, destlen, PAGE_SIZE); - in_buf.src = data_in; - in_buf.pos = 0; - in_buf.size = srclen; + workspace->in_buf.src = data_in; + workspace->in_buf.pos = 0; + workspace->in_buf.size = srclen; - out_buf.dst = workspace->buf; - out_buf.pos = 0; - out_buf.size = PAGE_SIZE; + workspace->out_buf.dst = workspace->buf; + workspace->out_buf.pos = 0; + workspace->out_buf.size = PAGE_SIZE; ret2 = 1; - while (pg_offset < destlen && in_buf.pos < in_buf.size) { + while (pg_offset < destlen + && workspace->in_buf.pos < workspace->in_buf.size) { unsigned long buf_start; unsigned long buf_offset; unsigned long bytes; @@ -384,7 +384,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, ret = -EIO; goto finish; } - ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + ret2 = ZSTD_decompressStream(stream, &workspace->out_buf, + &workspace->in_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -393,8 +394,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, } buf_start = total_out; - total_out += out_buf.pos; - out_buf.pos = 0; + total_out += workspace->out_buf.pos; + workspace->out_buf.pos = 0; if (total_out <= start_byte) continue; @@ -405,10 +406,11 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, buf_offset = 0; bytes = min_t(unsigned long, destlen - pg_offset, - out_buf.size - buf_offset); + workspace->out_buf.size - buf_offset); kaddr = kmap_atomic(dest_page); - memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes); + memcpy(kaddr + pg_offset, workspace->out_buf.dst + buf_offset, + bytes); kunmap_atomic(kaddr); pg_offset += bytes; |