summary | refs | log | tree | commit | diff | stats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- fs/btrfs/Makefile 2
-rw-r--r-- fs/btrfs/backref.c 3
-rw-r--r-- fs/btrfs/compression.c 141
-rw-r--r-- fs/btrfs/compression.h 4
-rw-r--r-- fs/btrfs/ctree.c 56
-rw-r--r-- fs/btrfs/ctree.h 12
-rw-r--r-- fs/btrfs/delayed-inode.c 137
-rw-r--r-- fs/btrfs/delayed-ref.c 2
-rw-r--r-- fs/btrfs/delayed-ref.h 2
-rw-r--r-- fs/btrfs/dev-replace.c 30
-rw-r--r-- fs/btrfs/dir-item.c 108
-rw-r--r-- fs/btrfs/disk-io.c 75
-rw-r--r-- fs/btrfs/disk-io.h 3
-rw-r--r-- fs/btrfs/export.c 5
-rw-r--r-- fs/btrfs/extent-tree.c 21
-rw-r--r-- fs/btrfs/extent_io.c 154
-rw-r--r-- fs/btrfs/extent_io.h 50
-rw-r--r-- fs/btrfs/extent_map.c 132
-rw-r--r-- fs/btrfs/extent_map.h 3
-rw-r--r-- fs/btrfs/file.c 392
-rw-r--r-- fs/btrfs/free-space-cache.c 13
-rw-r--r-- fs/btrfs/inode.c 298
-rw-r--r-- fs/btrfs/ioctl.c 45
-rw-r--r-- fs/btrfs/props.c 13
-rw-r--r-- fs/btrfs/qgroup.c 3
-rw-r--r-- fs/btrfs/raid56.c 119
-rw-r--r-- fs/btrfs/ref-verify.c 6
-rw-r--r-- fs/btrfs/root-tree.c 7
-rw-r--r-- fs/btrfs/scrub.c 95
-rw-r--r-- fs/btrfs/send.c 6
-rw-r--r-- fs/btrfs/super.c 348
-rw-r--r-- fs/btrfs/sysfs.c 2
-rw-r--r-- fs/btrfs/tests/btrfs-tests.c 3
-rw-r--r-- fs/btrfs/tests/btrfs-tests.h 1
-rw-r--r-- fs/btrfs/tests/extent-map-tests.c 366
-rw-r--r-- fs/btrfs/tests/inode-tests.c 17
-rw-r--r-- fs/btrfs/transaction.c 22
-rw-r--r-- fs/btrfs/transaction.h 11
-rw-r--r-- fs/btrfs/tree-checker.c 142
-rw-r--r-- fs/btrfs/tree-log.c 58
-rw-r--r-- fs/btrfs/volumes.c 690
-rw-r--r-- fs/btrfs/volumes.h 45
-rw-r--r-- fs/btrfs/xattr.c 7
-rw-r--r-- fs/btrfs/zstd.c 132
44 files changed, 2334 insertions, 1447 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6fe881d5cb38..0c4373628eb4 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -19,4 +19,4 @@ btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
- tests/free-space-tree-tests.o
+ tests/free-space-tree-tests.o tests/extent-map-tests.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7d0dc100a09a..e4054e533f6d 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -216,7 +216,8 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
return 0;
}
-void update_share_count(struct share_check *sc, int oldcount, int newcount)
+static void update_share_count(struct share_check *sc, int oldcount,
+ int newcount)
{
if ((!sc) || (oldcount == 0 && newcount < 1))
return;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 5982c8a71f02..07d049c0c20f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -33,7 +33,6 @@
#include <linux/bit_spinlock.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
-#include <linux/sort.h>
#include <linux/log2.h>
#include "ctree.h"
#include "disk-io.h"
@@ -45,6 +44,21 @@
#include "extent_io.h"
#include "extent_map.h"
+/*
+ * Printable names of the compression types. The table is indexed directly by
+ * the enum btrfs_compression_type value (presumably NONE, ZLIB, LZO, ZSTD map
+ * to 0..3 in that order - confirm against the enum definition).
+ */
+static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
+
+/*
+ * Translate a compression type to its printable name.
+ *
+ * Returns the btrfs_compress_types[] entry for the four known types and NULL
+ * for any other value of @type.
+ */
+const char* btrfs_compress_type2str(enum btrfs_compression_type type)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB:
+ case BTRFS_COMPRESS_LZO:
+ case BTRFS_COMPRESS_ZSTD:
+ case BTRFS_COMPRESS_NONE:
+ return btrfs_compress_types[type];
+ }
+
+ return NULL;
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
@@ -348,8 +362,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
page->mapping = NULL;
if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
PAGE_SIZE) {
- bio_get(bio);
-
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
@@ -372,8 +384,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_endio(bio);
}
- bio_put(bio);
-
bio = btrfs_bio_alloc(bdev, first_byte);
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
@@ -389,7 +399,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
first_byte += PAGE_SIZE;
cond_resched();
}
- bio_get(bio);
ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@@ -405,13 +414,12 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_endio(bio);
}
- bio_put(bio);
return 0;
}
static u64 bio_end_offset(struct bio *bio)
{
- struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ struct bio_vec *last = bio_last_bvec_all(bio);
return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}
@@ -563,7 +571,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree,
- page_offset(bio->bi_io_vec->bv_page),
+ page_offset(bio_first_page_all(bio)),
PAGE_SIZE);
read_unlock(&em_tree->lock);
if (!em)
@@ -638,8 +646,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page->mapping = NULL;
if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
PAGE_SIZE) {
- bio_get(comp_bio);
-
ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@@ -666,8 +672,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_endio(comp_bio);
}
- bio_put(comp_bio);
-
comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
bio_set_op_attrs(comp_bio, REQ_OP_READ, 0);
comp_bio->bi_private = cb;
@@ -677,7 +681,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
cur_disk_byte += PAGE_SIZE;
}
- bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@@ -693,7 +696,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_endio(comp_bio);
}
- bio_put(comp_bio);
return 0;
fail2:
@@ -752,6 +754,8 @@ struct heuristic_ws {
u32 sample_size;
/* Buckets store counters for each byte value */
struct bucket_item *bucket;
+ /* Sorting buffer */
+ struct bucket_item *bucket_b;
struct list_head list;
};
@@ -763,6 +767,7 @@ static void free_heuristic_ws(struct list_head *ws)
kvfree(workspace->sample);
kfree(workspace->bucket);
+ kfree(workspace->bucket_b);
kfree(workspace);
}
@@ -782,6 +787,10 @@ static struct list_head *alloc_heuristic_ws(void)
if (!ws->bucket)
goto fail;
+ ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
+ if (!ws->bucket_b)
+ goto fail;
+
INIT_LIST_HEAD(&ws->list);
return &ws->list;
fail:
@@ -1278,13 +1287,103 @@ static u32 shannon_entropy(struct heuristic_ws *ws)
return entropy_sum * 100 / entropy_max;
}
-/* Compare buckets by size, ascending */
-static int bucket_comp_rev(const void *lv, const void *rv)
+#define RADIX_BASE 4U
+#define COUNTERS_SIZE (1U << RADIX_BASE)
+
+/*
+ * Extract the 4-bit digit of @num that starts at bit @shift and return it
+ * inverted, i.e. (COUNTERS_SIZE - 1) - digit. Larger digits therefore map to
+ * smaller counter indexes.
+ */
+static u8 get4bits(u64 num, int shift) {
+ u8 low4bits;
+
+ num >>= shift;
+ /* Reverse order */
+ low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
+ return low4bits;
+}
+
+/*
+ * Sort @array by ->count in descending order with a radix sort that
+ * processes the least significant digit first.
+ *
+ * Use 4 bits as radix base
+ * Use 16 u32 counters for calculating new position in buf array
+ *
+ * get4bits() returns inverted digits, which is what makes the result
+ * descending instead of ascending.
+ *
+ * @array - array that will be sorted
+ * @array_buf - buffer array to store sorting results
+ * must be equal in size to @array
+ * @num - array size
+ */
+static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
+ int num)
{
- const struct bucket_item *l = (const struct bucket_item *)lv;
- const struct bucket_item *r = (const struct bucket_item *)rv;
+ u64 max_num;
+ u64 buf_num;
+ u32 counters[COUNTERS_SIZE];
+ u32 new_addr;
+ u32 addr;
+ int bitlen;
+ int shift;
+ int i;
- return r->count - l->count;
+ /*
+ * Try to avoid useless loop iterations for small numbers stored in big
+ * counters. Example: 48 33 4 ... in 64bit array
+ */
+ max_num = array[0].count;
+ for (i = 1; i < num; i++) {
+ buf_num = array[i].count;
+ if (buf_num > max_num)
+ max_num = buf_num;
+ }
+
+ /*
+ * ilog2() gives the index of the highest set bit, so the number of
+ * significant bits is ilog2() + 1. Without the +1 the top bit is never
+ * sorted whenever ilog2() is a multiple of RADIX_BASE * 2 (e.g. a
+ * maximum of 1, or of 256..511). ilog2(0) is undefined, so an all-zero
+ * array is treated as needing no passes.
+ *
+ * Each loop iteration below runs two passes (array -> array_buf and
+ * back), hence the alignment to RADIX_BASE * 2.
+ */
+ buf_num = max_num ? ilog2(max_num) + 1 : 0;
+ bitlen = ALIGN(buf_num, RADIX_BASE * 2);
+
+ shift = 0;
+ while (shift < bitlen) {
+ /* First pass: array -> array_buf, ordered by the digit at @shift */
+ memset(counters, 0, sizeof(counters));
+
+ for (i = 0; i < num; i++) {
+ buf_num = array[i].count;
+ addr = get4bits(buf_num, shift);
+ counters[addr]++;
+ }
+
+ for (i = 1; i < COUNTERS_SIZE; i++)
+ counters[i] += counters[i - 1];
+
+ for (i = num - 1; i >= 0; i--) {
+ buf_num = array[i].count;
+ addr = get4bits(buf_num, shift);
+ counters[addr]--;
+ new_addr = counters[addr];
+ array_buf[new_addr] = array[i];
+ }
+
+ shift += RADIX_BASE;
+
+ /*
+ * Normal radix expects to move data from a temporary array, to
+ * the main one. But that requires some CPU time. Avoid that
+ * by doing another sort iteration to original array instead of
+ * memcpy()
+ */
+ memset(counters, 0, sizeof(counters));
+
+ for (i = 0; i < num; i++) {
+ buf_num = array_buf[i].count;
+ addr = get4bits(buf_num, shift);
+ counters[addr]++;
+ }
+
+ for (i = 1; i < COUNTERS_SIZE; i++)
+ counters[i] += counters[i - 1];
+
+ for (i = num - 1; i >= 0; i--) {
+ buf_num = array_buf[i].count;
+ addr = get4bits(buf_num, shift);
+ counters[addr]--;
+ new_addr = counters[addr];
+ array[new_addr] = array_buf[i];
+ }
+
+ shift += RADIX_BASE;
+ }
}
/*
@@ -1314,7 +1413,7 @@ static int byte_core_set_size(struct heuristic_ws *ws)
struct bucket_item *bucket = ws->bucket;
/* Sort in reverse order */
- sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL);
+ radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
for (i = 0; i < BYTE_CORE_SET_LOW; i++)
coreset_sum += bucket[i].count;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 0868cc554f14..677fa4aa0bd7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -75,7 +75,7 @@ struct compressed_bio {
u32 sums;
};
-void btrfs_init_compress(void);
+void __init btrfs_init_compress(void);
void btrfs_exit_compress(void);
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
@@ -137,6 +137,8 @@ extern const struct btrfs_compress_op btrfs_zlib_compress;
extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
+const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1e74cf826532..b88a79e69ddf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1807,8 +1807,8 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
* simple bin_search frontend that does the right thing for
* leaves vs nodes
*/
-static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int level, int *slot)
+int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
+ int level, int *slot)
{
if (level == 0)
return generic_bin_search(eb,
@@ -1824,12 +1824,6 @@ static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
slot);
}
-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int level, int *slot)
-{
- return bin_search(eb, key, level, slot);
-}
-
static void root_add_used(struct btrfs_root *root, u32 size)
{
spin_lock(&root->accounting_lock);
@@ -2614,7 +2608,7 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
int level, int *prev_cmp, int *slot)
{
if (*prev_cmp != 0) {
- *prev_cmp = bin_search(b, key, level, slot);
+ *prev_cmp = btrfs_bin_search(b, key, level, slot);
return *prev_cmp;
}
@@ -2660,17 +2654,29 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
}
/*
- * look for key in the tree. path is filled in with nodes along the way
- * if key is found, we return zero and you can find the item in the leaf
- * level of the path (level 0)
+ * btrfs_search_slot - look for a key in a tree and perform necessary
+ * modifications to preserve tree invariants.
+ *
+ * @trans: Handle of transaction, used when modifying the tree
+ * @p: Holds all btree nodes along the search path
+ * @root: The root node of the tree
+ * @key: The key we are looking for
+ * @ins_len: Indicates purpose of search, for inserts it is 1, for
+ * deletions it's -1. 0 for plain searches
+ * @cow: boolean should CoW operations be performed. Must always be 1
+ * when modifying the tree.
+ *
+ * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
+ * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
*
- * If the key isn't found, the path points to the slot where it should
- * be inserted, and 1 is returned. If there are other errors during the
- * search a negative error number is returned.
+ * If @key is found, 0 is returned and you can find the item in the leaf level
+ * of the path (level 0)
*
- * if ins_len > 0, nodes and leaves will be split as we walk down the
- * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
- * possible)
+ * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
+ * points to the slot where it should be inserted
+ *
+ * If an error is encountered while searching the tree a negative error number
+ * is returned
*/
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
@@ -2774,6 +2780,8 @@ again:
* contention with the cow code
*/
if (cow) {
+ bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
+
/*
* if we don't really need to cow this block
* then we don't want to set the path blocking,
@@ -2798,9 +2806,13 @@ again:
}
btrfs_set_path_blocking(p);
- err = btrfs_cow_block(trans, root, b,
- p->nodes[level + 1],
- p->slots[level + 1], &b);
+ if (last_level)
+ err = btrfs_cow_block(trans, root, b, NULL, 0,
+ &b);
+ else
+ err = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b);
if (err) {
ret = err;
goto done;
@@ -5175,7 +5187,7 @@ again:
while (1) {
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
- sret = bin_search(cur, min_key, level, &slot);
+ sret = btrfs_bin_search(cur, min_key, level, &slot);
/* at the lowest level, we're done, setup the path and exit */
if (level == path->lowest_level) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 13c260b525a1..1a462ab85c49 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -679,7 +679,6 @@ enum btrfs_orphan_cleanup_state {
/* used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
struct list_head hash_list;
- wait_queue_head_t wait;
spinlock_t lock;
};
@@ -3060,15 +3059,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
const char *name, u16 name_len,
int mod);
-int verify_dir_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf, int slot,
- struct btrfs_dir_item *dir_item);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const char *name,
int name_len);
-bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
- unsigned long start, u16 name_len);
/* orphan.c */
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3197,7 +3191,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
-int btrfs_init_cachep(void);
+int __init btrfs_init_cachep(void);
void btrfs_destroy_cachep(void);
long btrfs_ioctl_trans_end(struct file *file);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
@@ -3248,7 +3242,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
struct file *dst_file, u64 dst_loff);
/* file.c */
-int btrfs_auto_defrag_init(void);
+int __init btrfs_auto_defrag_init(void);
void btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
@@ -3283,7 +3277,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
/* sysfs.c */
-int btrfs_init_sysfs(void);
+int __init btrfs_init_sysfs(void);
void btrfs_exit_sysfs(void);
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5d73f79ded8b..0530f6f2e4ba 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -18,6 +18,7 @@
*/
#include <linux/slab.h>
+#include <linux/iversion.h>
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
@@ -87,6 +88,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
spin_lock(&root->inode_lock);
node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+
if (node) {
if (btrfs_inode->delayed_node) {
refcount_inc(&node->refs); /* can be accessed */
@@ -94,9 +96,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
spin_unlock(&root->inode_lock);
return node;
}
- btrfs_inode->delayed_node = node;
- /* can be accessed and cached in the inode */
- refcount_add(2, &node->refs);
+
+ /*
+ * It's possible that we're racing into the middle of removing
+ * this node from the radix tree. In this case, the refcount
+ * was zero and it should never go back to one. Just return
+ * NULL like it was never in the radix at all; our release
+ * function is in the process of removing it.
+ *
+ * Some implementations of refcount_inc refuse to bump the
+ * refcount once it has hit zero. If we don't do this dance
+ * here, refcount_inc() may decide to just WARN_ONCE() instead
+ * of actually bumping the refcount.
+ *
+ * If this node is properly in the radix, we want to bump the
+ * refcount twice, once for the inode and once for this get
+ * operation.
+ */
+ if (refcount_inc_not_zero(&node->refs)) {
+ refcount_inc(&node->refs);
+ btrfs_inode->delayed_node = node;
+ } else {
+ node = NULL;
+ }
+
spin_unlock(&root->inode_lock);
return node;
}
@@ -254,17 +277,18 @@ static void __btrfs_release_delayed_node(
mutex_unlock(&delayed_node->mutex);
if (refcount_dec_and_test(&delayed_node->refs)) {
- bool free = false;
struct btrfs_root *root = delayed_node->root;
+
spin_lock(&root->inode_lock);
- if (refcount_read(&delayed_node->refs) == 0) {
- radix_tree_delete(&root->delayed_nodes_tree,
- delayed_node->inode_id);
- free = true;
- }
+ /*
+ * Once our refcount goes to zero, nobody is allowed to bump it
+ * back up. We can delete it now.
+ */
+ ASSERT(refcount_read(&delayed_node->refs) == 0);
+ radix_tree_delete(&root->delayed_nodes_tree,
+ delayed_node->inode_id);
spin_unlock(&root->inode_lock);
- if (free)
- kmem_cache_free(delayed_node_cache, delayed_node);
+ kmem_cache_free(delayed_node_cache, delayed_node);
}
}
@@ -1279,40 +1303,42 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
if (!path)
goto out;
-again:
- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2)
- goto free_path;
+ do {
+ if (atomic_read(&delayed_root->items) <
+ BTRFS_DELAYED_BACKGROUND / 2)
+ break;
- delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
- if (!delayed_node)
- goto free_path;
+ delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+ if (!delayed_node)
+ break;
- path->leave_spinning = 1;
- root = delayed_node->root;
+ path->leave_spinning = 1;
+ root = delayed_node->root;
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- goto release_path;
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_release_path(path);
+ btrfs_release_prepared_delayed_node(delayed_node);
+ total_done++;
+ continue;
+ }
- block_rsv = trans->block_rsv;
- trans->block_rsv = &root->fs_info->delayed_block_rsv;
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
- __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+ __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
- trans->block_rsv = block_rsv;
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty_nodelay(root->fs_info);
+ trans->block_rsv = block_rsv;
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty_nodelay(root->fs_info);
-release_path:
- btrfs_release_path(path);
- total_done++;
+ btrfs_release_path(path);
+ btrfs_release_prepared_delayed_node(delayed_node);
+ total_done++;
- btrfs_release_prepared_delayed_node(delayed_node);
- if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) ||
- total_done < async_work->nr)
- goto again;
+ } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
+ || total_done < async_work->nr);
-free_path:
btrfs_free_path(path);
out:
wake_up(&delayed_root->wait);
@@ -1325,10 +1351,6 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
{
struct btrfs_async_delayed_work *async_work;
- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND ||
- btrfs_workqueue_normal_congested(fs_info->delayed_workers))
- return 0;
-
async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
if (!async_work)
return -ENOMEM;
@@ -1364,7 +1386,8 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
{
struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
+ btrfs_workqueue_normal_congested(fs_info->delayed_workers))
return;
if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
@@ -1610,28 +1633,18 @@ void btrfs_readdir_put_delayed_items(struct inode *inode,
int btrfs_should_delete_dir_index(struct list_head *del_list,
u64 index)
{
- struct btrfs_delayed_item *curr, *next;
- int ret;
-
- if (list_empty(del_list))
- return 0;
+ struct btrfs_delayed_item *curr;
+ int ret = 0;
- list_for_each_entry_safe(curr, next, del_list, readdir_list) {
+ list_for_each_entry(curr, del_list, readdir_list) {
if (curr->key.offset > index)
break;
-
- list_del(&curr->readdir_list);
- ret = (curr->key.offset == index);
-
- if (refcount_dec_and_test(&curr->refs))
- kfree(curr);
-
- if (ret)
- return 1;
- else
- continue;
+ if (curr->key.offset == index) {
+ ret = 1;
+ break;
+ }
}
- return 0;
+ return ret;
}
/*
@@ -1700,7 +1713,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
btrfs_set_stack_inode_generation(inode_item,
BTRFS_I(inode)->generation);
- btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
+ btrfs_set_stack_inode_sequence(inode_item,
+ inode_peek_iversion(inode));
btrfs_set_stack_inode_transid(inode_item, trans->transid);
btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1754,7 +1768,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
- inode->i_version = btrfs_stack_inode_sequence(inode_item);
+ inode_set_iversion_queried(inode,
+ btrfs_stack_inode_sequence(inode_item));
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 83be8f9fd906..a1a40cf382e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -937,7 +937,7 @@ void btrfs_delayed_ref_exit(void)
kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}
-int btrfs_delayed_ref_init(void)
+int __init btrfs_delayed_ref_init(void)
{
btrfs_delayed_ref_head_cachep = kmem_cache_create(
"btrfs_delayed_ref_head",
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a43af432f859..c4f625e5a691 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -203,7 +203,7 @@ extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
-int btrfs_delayed_ref_init(void);
+int __init btrfs_delayed_ref_init(void);
void btrfs_delayed_ref_exit(void);
static inline struct btrfs_delayed_extent_op *
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7c655f9a7a50..7efbc4d1128b 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -172,7 +172,8 @@ no_valid_dev_replace_entry_found:
dev_replace->tgtdev->commit_bytes_used =
dev_replace->srcdev->commit_bytes_used;
}
- dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+ set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &dev_replace->tgtdev->dev_state);
btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
dev_replace->tgtdev);
}
@@ -304,6 +305,14 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
dev_replace->cursor_left_last_write_of_item;
}
+/*
+ * Return a printable name for @device: "<missing disk>" when @device is NULL
+ * or has BTRFS_DEV_STATE_MISSING set in ->dev_state, otherwise the string
+ * obtained from rcu_str_deref(device->name).
+ *
+ * The NULL check is needed because btrfs_dev_replace_kthread() passes
+ * dev_replace->tgtdev here, and the code this helper replaces guarded
+ * against that pointer being NULL.
+ */
+static char* btrfs_dev_name(struct btrfs_device *device)
+{
+ if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ return "<missing disk>";
+ else
+ return rcu_str_deref(device->name);
+}
+
int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
@@ -363,8 +372,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s started",
- src_device->missing ? "<missing disk>" :
- rcu_str_deref(src_device->name),
+ btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name));
@@ -538,8 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
} else {
btrfs_err_in_rcu(fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
- src_device->missing ? "<missing disk>" :
- rcu_str_deref(src_device->name),
+ btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace, 1);
@@ -557,11 +564,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s finished",
- src_device->missing ? "<missing disk>" :
- rcu_str_deref(src_device->name),
+ btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name));
- tgt_device->is_tgtdev_for_dev_replace = 0;
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
@@ -814,12 +820,10 @@ static int btrfs_dev_replace_kthread(void *data)
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
btrfs_info_in_rcu(fs_info,
- "continuing dev_replace from %s (devid %llu) to %s @%u%%",
- dev_replace->srcdev->missing ? "<missing disk>"
- : rcu_str_deref(dev_replace->srcdev->name),
+ "continuing dev_replace from %s (devid %llu) to target %s @%u%%",
+ btrfs_dev_name(dev_replace->srcdev),
dev_replace->srcdev->devid,
- dev_replace->tgtdev ? rcu_str_deref(dev_replace->tgtdev->name)
- : "<missing target disk>",
+ btrfs_dev_name(dev_replace->tgtdev),
(unsigned int)progress);
btrfs_dev_replace_continue_on_mount(fs_info);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 41cb9196eaa8..cbe421605cd5 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -403,8 +403,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
btrfs_dir_data_len(leaf, dir_item);
name_ptr = (unsigned long)(dir_item + 1);
- if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item))
- return NULL;
if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
return dir_item;
@@ -450,109 +448,3 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
}
return ret;
}
-
-int verify_dir_item(struct btrfs_fs_info *fs_info,
- struct extent_buffer *leaf,
- int slot,
- struct btrfs_dir_item *dir_item)
-{
- u16 namelen = BTRFS_NAME_LEN;
- int ret;
- u8 type = btrfs_dir_type(leaf, dir_item);
-
- if (type >= BTRFS_FT_MAX) {
- btrfs_crit(fs_info, "invalid dir item type: %d", (int)type);
- return 1;
- }
-
- if (type == BTRFS_FT_XATTR)
- namelen = XATTR_NAME_MAX;
-
- if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
- btrfs_crit(fs_info, "invalid dir item name len: %u",
- (unsigned)btrfs_dir_name_len(leaf, dir_item));
- return 1;
- }
-
- namelen = btrfs_dir_name_len(leaf, dir_item);
- ret = btrfs_is_name_len_valid(leaf, slot,
- (unsigned long)(dir_item + 1), namelen);
- if (!ret)
- return 1;
-
- /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
- if ((btrfs_dir_data_len(leaf, dir_item) +
- btrfs_dir_name_len(leaf, dir_item)) >
- BTRFS_MAX_XATTR_SIZE(fs_info)) {
- btrfs_crit(fs_info, "invalid dir item name + data len: %u + %u",
- (unsigned)btrfs_dir_name_len(leaf, dir_item),
- (unsigned)btrfs_dir_data_len(leaf, dir_item));
- return 1;
- }
-
- return 0;
-}
-
-bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
- unsigned long start, u16 name_len)
-{
- struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct btrfs_key key;
- u32 read_start;
- u32 read_end;
- u32 item_start;
- u32 item_end;
- u32 size;
- bool ret = true;
-
- ASSERT(start > BTRFS_LEAF_DATA_OFFSET);
-
- read_start = start - BTRFS_LEAF_DATA_OFFSET;
- read_end = read_start + name_len;
- item_start = btrfs_item_offset_nr(leaf, slot);
- item_end = btrfs_item_end_nr(leaf, slot);
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
-
- switch (key.type) {
- case BTRFS_DIR_ITEM_KEY:
- case BTRFS_XATTR_ITEM_KEY:
- case BTRFS_DIR_INDEX_KEY:
- size = sizeof(struct btrfs_dir_item);
- break;
- case BTRFS_INODE_REF_KEY:
- size = sizeof(struct btrfs_inode_ref);
- break;
- case BTRFS_INODE_EXTREF_KEY:
- size = sizeof(struct btrfs_inode_extref);
- break;
- case BTRFS_ROOT_REF_KEY:
- case BTRFS_ROOT_BACKREF_KEY:
- size = sizeof(struct btrfs_root_ref);
- break;
- default:
- ret = false;
- goto out;
- }
-
- if (read_start < item_start) {
- ret = false;
- goto out;
- }
- if (read_end > item_end) {
- ret = false;
- goto out;
- }
-
- /* there shall be item(s) before name */
- if (read_start - item_start < size) {
- ret = false;
- goto out;
- }
-
-out:
- if (!ret)
- btrfs_crit(fs_info, "invalid dir item name len: %u",
- (unsigned int)name_len);
- return ret;
-}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a8ecccfc36de..ed095202942f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -61,7 +61,8 @@
BTRFS_HEADER_FLAG_RELOC |\
BTRFS_SUPER_FLAG_ERROR |\
BTRFS_SUPER_FLAG_SEEDING |\
- BTRFS_SUPER_FLAG_METADUMP)
+ BTRFS_SUPER_FLAG_METADUMP |\
+ BTRFS_SUPER_FLAG_METADUMP_V2)
static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -220,7 +221,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
* extents on the btree inode are pretty simple, there's one extent
* that covers the entire device
*/
-static struct extent_map *btree_get_extent(struct btrfs_inode *inode,
+struct extent_map *btree_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset, u64 start, u64 len,
int create)
{
@@ -285,7 +286,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
int verify)
{
u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- char *result = NULL;
+ char result[BTRFS_CSUM_SIZE];
unsigned long len;
unsigned long cur_len;
unsigned long offset = BTRFS_CSUM_SIZE;
@@ -294,7 +295,6 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
unsigned long map_len;
int err;
u32 crc = ~(u32)0;
- unsigned long inline_result;
len = buf->len - offset;
while (len > 0) {
@@ -308,13 +308,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
len -= cur_len;
offset += cur_len;
}
- if (csum_size > sizeof(inline_result)) {
- result = kzalloc(csum_size, GFP_NOFS);
- if (!result)
- return -ENOMEM;
- } else {
- result = (char *)&inline_result;
- }
+ memset(result, 0, BTRFS_CSUM_SIZE);
btrfs_csum_final(crc, result);
@@ -329,15 +323,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
"%s checksum verify failed on %llu wanted %X found %X level %d",
fs_info->sb->s_id, buf->start,
val, found, btrfs_header_level(buf));
- if (result != (char *)&inline_result)
- kfree(result);
return -EUCLEAN;
}
} else {
write_extent_buffer(buf, result, 0, csum_size);
}
- if (result != (char *)&inline_result)
- kfree(result);
+
return 0;
}
@@ -391,7 +382,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state, GFP_NOFS);
+ &cached_state);
if (need_lock)
btrfs_tree_read_unlock_blocking(eb);
return ret;
@@ -455,7 +446,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
while (1) {
ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
- btree_get_extent, mirror_num);
+ mirror_num);
if (!ret) {
if (!verify_parent_transid(io_tree, eb,
parent_transid, 0))
@@ -1012,7 +1003,7 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
if (IS_ERR(buf))
return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
- buf, WAIT_NONE, btree_get_extent, 0);
+ buf, WAIT_NONE, 0);
free_extent_buffer(buf);
}
@@ -1031,7 +1022,7 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
- btree_get_extent, mirror_num);
+ mirror_num);
if (ret) {
free_extent_buffer(buf);
return ret;
@@ -1243,7 +1234,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root;
struct btrfs_key key;
int ret = 0;
- uuid_le uuid;
+ uuid_le uuid = NULL_UUID_LE;
root = btrfs_alloc_root(fs_info, GFP_KERNEL);
if (!root)
@@ -1284,7 +1275,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item, leaf->len);
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
- uuid_le_gen(&uuid);
+ if (is_fstree(objectid))
+ uuid_le_gen(&uuid);
memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
root->root_item.drop_level = 0;
@@ -2875,7 +2867,7 @@ retry_root_backup:
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info)) {
+ if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writeable mount is not allowed due to too many missing devices");
goto fail_sysfs;
@@ -3357,7 +3349,7 @@ static void write_dev_flush(struct btrfs_device *device)
bio->bi_private = &device->flush_wait;
btrfsic_submit_bio(bio);
- device->flush_bio_sent = 1;
+ set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
/*
@@ -3367,10 +3359,10 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device)
{
struct bio *bio = device->flush_bio;
- if (!device->flush_bio_sent)
+ if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
return BLK_STS_OK;
- device->flush_bio_sent = 0;
+ clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
wait_for_completion_io(&device->flush_wait);
return bio->bi_status;
@@ -3378,7 +3370,7 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device)
static int check_barrier_error(struct btrfs_fs_info *fs_info)
{
- if (!btrfs_check_rw_degradable(fs_info))
+ if (!btrfs_check_rw_degradable(fs_info, NULL))
return -EIO;
return 0;
}
@@ -3394,14 +3386,16 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
int errors_wait = 0;
blk_status_t ret;
+ lockdep_assert_held(&info->fs_devices->device_list_mutex);
/* send down all the barriers */
head = &info->fs_devices->devices;
- list_for_each_entry_rcu(dev, head, dev_list) {
- if (dev->missing)
+ list_for_each_entry(dev, head, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->bdev)
continue;
- if (!dev->in_fs_metadata || !dev->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
write_dev_flush(dev);
@@ -3409,14 +3403,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
}
/* wait for all the barriers */
- list_for_each_entry_rcu(dev, head, dev_list) {
- if (dev->missing)
+ list_for_each_entry(dev, head, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->bdev) {
errors_wait++;
continue;
}
- if (!dev->in_fs_metadata || !dev->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
ret = wait_dev_flush(dev);
@@ -3508,12 +3503,13 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
}
}
- list_for_each_entry_rcu(dev, head, dev_list) {
+ list_for_each_entry(dev, head, dev_list) {
if (!dev->bdev) {
total_errors++;
continue;
}
- if (!dev->in_fs_metadata || !dev->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
btrfs_set_stack_device_generation(dev_item, 0);
@@ -3549,10 +3545,11 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
}
total_errors = 0;
- list_for_each_entry_rcu(dev, head, dev_list) {
+ list_for_each_entry(dev, head, dev_list) {
if (!dev->bdev)
continue;
- if (!dev->in_fs_metadata || !dev->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
ret = wait_dev_supers(dev, max_mirrors);
@@ -3910,9 +3907,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "no valid FS found");
ret = -EINVAL;
}
- if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)
- btrfs_warn(fs_info, "unrecognized super flag: %llu",
+ if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
+ btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ ret = -EINVAL;
+ }
if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "tree_root level too big: %d >= %d",
btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 7f7c35d6347a..301151a50ac1 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -149,6 +149,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
+struct extent_map *btree_get_extent(struct btrfs_inode *inode,
+ struct page *page, size_t pg_offset, u64 start, u64 len,
+ int create);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
void btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 3aeb5770f896..ddaccad469f8 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -283,11 +283,6 @@ static int btrfs_get_name(struct dentry *parent, char *name,
name_len = btrfs_inode_ref_name_len(leaf, iref);
}
- ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len);
- if (!ret) {
- btrfs_free_path(path);
- return -EIO;
- }
read_extent_buffer(leaf, name, name_ptr, name_len);
btrfs_free_path(path);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2f4328511ac8..05751a677da4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2145,7 +2145,10 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
- if (!stripe->dev->can_discard)
+ struct request_queue *req_q;
+
+ req_q = bdev_get_queue(stripe->dev->bdev);
+ if (!blk_queue_discard(req_q))
continue;
ret = btrfs_issue_discard(stripe->dev->bdev,
@@ -2894,7 +2897,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
- u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+ unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
u64 num_bytes, num_dirty_bgs_bytes;
int ret = 0;
@@ -4945,12 +4948,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
bytes = 0;
else
bytes -= delayed_rsv->size;
+ spin_unlock(&delayed_rsv->lock);
+
if (percpu_counter_compare(&space_info->total_bytes_pinned,
bytes) < 0) {
- spin_unlock(&delayed_rsv->lock);
return -ENOSPC;
}
- spin_unlock(&delayed_rsv->lock);
commit:
trans = btrfs_join_transaction(fs_info->extent_root);
@@ -5738,8 +5741,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
* or return if we already have enough space. This will also handle the resreve
* tracepoint for the reserved amount.
*/
-int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
- enum btrfs_reserve_flush_enum flush)
+static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
+ enum btrfs_reserve_flush_enum flush)
{
struct btrfs_root *root = inode->root;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
@@ -5770,7 +5773,7 @@ int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
* This is the same as btrfs_block_rsv_release, except that it handles the
* tracepoint for the reservation.
*/
-void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -9690,7 +9693,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
* space to fit our block group in.
*/
if (device->total_bytes > device->bytes_used + min_free &&
- !device->is_tgtdev_for_dev_replace) {
+ !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = find_free_dev_extent(trans, device, min_free,
&dev_offset, NULL);
if (!ret)
@@ -10875,7 +10878,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
*trimmed = 0;
/* Not writeable = nothing to do. */
- if (!device->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
/* No free space = nothing to do. */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 012d63870b99..dfeb74a0be77 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -21,6 +21,7 @@
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
+#include "disk-io.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -109,8 +110,6 @@ struct tree_entry {
struct extent_page_data {
struct bio *bio;
struct extent_io_tree *tree;
- get_extent_t *get_extent;
-
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
@@ -139,7 +138,8 @@ static void add_extent_changeset(struct extent_state *state, unsigned bits,
BUG_ON(ret < 0);
}
-static noinline void flush_write_bio(void *data);
+static void flush_write_bio(struct extent_page_data *epd);
+
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
@@ -581,7 +581,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
-static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached_state,
gfp_t mask, struct extent_changeset *changeset)
@@ -1295,10 +1295,10 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask)
+ struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, bits, wake, delete,
- cached, mask, NULL);
+ cached, GFP_NOFS, NULL);
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1348,7 +1348,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+ EXTENT_LOCKED, 1, 0, NULL);
return 0;
}
return 1;
@@ -1648,7 +1648,7 @@ again:
EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
unlock_extent_cached(tree, delalloc_start, delalloc_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
@@ -1744,7 +1744,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
unsigned long page_ops)
{
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
- NULL, GFP_NOFS);
+ NULL);
__process_pages_contig(inode->i_mapping, locked_page,
start >> PAGE_SHIFT, end >> PAGE_SHIFT,
@@ -2027,7 +2027,8 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio->bi_iter.bi_sector = sector;
dev = bbio->stripes[bbio->mirror_num - 1].dev;
btrfs_put_bbio(bbio);
- if (!dev || !dev->bdev || !dev->writeable) {
+ if (!dev || !dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
@@ -2257,7 +2258,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
return 0;
}
-bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
struct io_failure_record *failrec, int failed_mirror)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2281,7 +2282,7 @@ bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
* a) deliver good data to the caller
* b) correct the bad sectors on disk
*/
- if (failed_bio->bi_vcnt > 1) {
+ if (failed_bio_pages > 1) {
/*
* to fulfill b), we need to know the exact failing sectors, as
* we don't want to rewrite any more than the failed ones. thus,
@@ -2374,6 +2375,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
int read_mode = 0;
blk_status_t status;
int ret;
+ unsigned failed_bio_pages = bio_pages_all(failed_bio);
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2381,13 +2383,13 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
if (ret)
return ret;
- if (!btrfs_check_repairable(inode, failed_bio, failrec,
+ if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
failed_mirror)) {
free_io_failure(failure_tree, tree, failrec);
return -EIO;
}
- if (failed_bio->bi_vcnt > 1)
+ if (failed_bio_pages > 1)
read_mode |= REQ_FAILFAST_DEV;
phy_offset >>= inode->i_sb->s_blocksize_bits;
@@ -2492,7 +2494,7 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
if (uptodate && tree->track_uptodate)
set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
- unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
+ unlock_extent_cached_atomic(tree, start, end, &cached);
}
/*
@@ -2724,7 +2726,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags)
{
blk_status_t ret = 0;
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
struct page *page = bvec->bv_page;
struct extent_io_tree *tree = bio->bi_private;
u64 start;
@@ -2732,7 +2734,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
start = page_offset(page) + bvec->bv_offset;
bio->bi_private = NULL;
- bio_get(bio);
if (tree->ops)
ret = tree->ops->submit_bio_hook(tree->private_data, bio,
@@ -2740,7 +2741,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
else
btrfsic_submit_bio(bio);
- bio_put(bio);
return blk_status_to_errno(ret);
}
@@ -2942,8 +2942,7 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
- cur + iosize - 1,
- &cached, GFP_NOFS);
+ cur + iosize - 1, &cached);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
@@ -3036,8 +3035,7 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
- cur + iosize - 1,
- &cached, GFP_NOFS);
+ cur + iosize - 1, &cached);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3092,9 +3090,8 @@ out:
static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
struct page *pages[], int nr_pages,
u64 start, u64 end,
- get_extent_t *get_extent,
struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
+ struct bio **bio,
unsigned long *bio_flags,
u64 *prev_em_start)
{
@@ -3115,18 +3112,17 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
}
for (index = 0; index < nr_pages; index++) {
- __do_readpage(tree, pages[index], get_extent, em_cached, bio,
- mirror_num, bio_flags, 0, prev_em_start);
+ __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
+ bio, 0, bio_flags, 0, prev_em_start);
put_page(pages[index]);
}
}
static void __extent_readpages(struct extent_io_tree *tree,
struct page *pages[],
- int nr_pages, get_extent_t *get_extent,
+ int nr_pages,
struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags,
+ struct bio **bio, unsigned long *bio_flags,
u64 *prev_em_start)
{
u64 start = 0;
@@ -3146,8 +3142,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
} else {
__do_contiguous_readpages(tree, &pages[first_index],
index - first_index, start,
- end, get_extent, em_cached,
- bio, mirror_num, bio_flags,
+ end, em_cached,
+ bio, bio_flags,
prev_em_start);
start = page_start;
end = start + PAGE_SIZE - 1;
@@ -3158,9 +3154,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
if (end)
__do_contiguous_readpages(tree, &pages[first_index],
index - first_index, start,
- end, get_extent, em_cached, bio,
- mirror_num, bio_flags,
- prev_em_start);
+ end, em_cached, bio,
+ bio_flags, prev_em_start);
}
static int __extent_read_full_page(struct extent_io_tree *tree,
@@ -3375,7 +3370,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page_end, NULL, 1);
break;
}
- em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur,
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
end - cur + 1, 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
@@ -3458,10 +3453,9 @@ done:
* and the end_io handler clears the writeback ranges
*/
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+ struct extent_page_data *epd)
{
struct inode *inode = page->mapping->host;
- struct extent_page_data *epd = data;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
int ret;
@@ -3895,8 +3889,7 @@ retry:
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
+ * @epd: data passed to __extent_writepage function
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
@@ -3908,8 +3901,7 @@ retry:
*/
static int extent_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc,
- writepage_t writepage, void *data,
- void (*flush_fn)(void *))
+ struct extent_page_data *epd)
{
struct inode *inode = mapping->host;
int ret = 0;
@@ -3973,7 +3965,7 @@ retry:
* mapping
*/
if (!trylock_page(page)) {
- flush_fn(data);
+ flush_write_bio(epd);
lock_page(page);
}
@@ -3984,7 +3976,7 @@ retry:
if (wbc->sync_mode != WB_SYNC_NONE) {
if (PageWriteback(page))
- flush_fn(data);
+ flush_write_bio(epd);
wait_on_page_writeback(page);
}
@@ -3994,7 +3986,7 @@ retry:
continue;
}
- ret = (*writepage)(page, wbc, data);
+ ret = __extent_writepage(page, wbc, epd);
if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
unlock_page(page);
@@ -4042,7 +4034,7 @@ retry:
return ret;
}
-static void flush_epd_write_bio(struct extent_page_data *epd)
+static void flush_write_bio(struct extent_page_data *epd)
{
if (epd->bio) {
int ret;
@@ -4053,37 +4045,28 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
}
}
-static noinline void flush_write_bio(void *data)
-{
- struct extent_page_data *epd = data;
- flush_epd_write_bio(epd);
-}
-
-int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent,
- struct writeback_control *wbc)
+int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
- .get_extent = get_extent,
+ .tree = &BTRFS_I(page->mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
ret = __extent_writepage(page, wbc, &epd);
- flush_epd_write_bio(&epd);
+ flush_write_bio(&epd);
return ret;
}
-int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
- u64 start, u64 end, get_extent_t *get_extent,
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode)
{
int ret = 0;
struct address_space *mapping = inode->i_mapping;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *page;
unsigned long nr_pages = (end - start + PAGE_SIZE) >>
PAGE_SHIFT;
@@ -4091,7 +4074,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
struct extent_page_data epd = {
.bio = NULL,
.tree = tree,
- .get_extent = get_extent,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
@@ -4117,34 +4099,30 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
start += PAGE_SIZE;
}
- flush_epd_write_bio(&epd);
+ flush_write_bio(&epd);
return ret;
}
int extent_writepages(struct extent_io_tree *tree,
struct address_space *mapping,
- get_extent_t *get_extent,
struct writeback_control *wbc)
{
int ret = 0;
struct extent_page_data epd = {
.bio = NULL,
.tree = tree,
- .get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
- ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd,
- flush_write_bio);
- flush_epd_write_bio(&epd);
+ ret = extent_write_cache_pages(mapping, wbc, &epd);
+ flush_write_bio(&epd);
return ret;
}
int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages,
- get_extent_t get_extent)
+ struct list_head *pages, unsigned nr_pages)
{
struct bio *bio = NULL;
unsigned page_idx;
@@ -4170,13 +4148,13 @@ int extent_readpages(struct extent_io_tree *tree,
pagepool[nr++] = page;
if (nr < ARRAY_SIZE(pagepool))
continue;
- __extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
- &bio, 0, &bio_flags, &prev_em_start);
+ __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
+ &bio_flags, &prev_em_start);
nr = 0;
}
if (nr)
- __extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
- &bio, 0, &bio_flags, &prev_em_start);
+ __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
+ &bio_flags, &prev_em_start);
if (em_cached)
free_extent_map(em_cached);
@@ -4209,7 +4187,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING,
- 1, 1, &cached_state, GFP_NOFS);
+ 1, 1, &cached_state);
return 0;
}
@@ -4234,9 +4212,9 @@ static int try_release_extent_state(struct extent_map_tree *map,
* at this point we can safely clear everything except the
* locked bit and the nodatasum bit
*/
- ret = clear_extent_bit(tree, start, end,
+ ret = __clear_extent_bit(tree, start, end,
~(EXTENT_LOCKED | EXTENT_NODATASUM),
- 0, 0, NULL, mask);
+ 0, 0, NULL, mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -4302,9 +4280,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
* This maps until we find something past 'last'
*/
static struct extent_map *get_extent_skip_holes(struct inode *inode,
- u64 offset,
- u64 last,
- get_extent_t *get_extent)
+ u64 offset, u64 last)
{
u64 sectorsize = btrfs_inode_sectorsize(inode);
struct extent_map *em;
@@ -4318,15 +4294,14 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
+ len, 0);
if (IS_ERR_OR_NULL(em))
return em;
/* if this isn't a hole return it */
- if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
- em->block_start != EXTENT_MAP_HOLE) {
+ if (em->block_start != EXTENT_MAP_HOLE)
return em;
- }
/* this is a hole, advance to the next extent */
offset = extent_map_end(em);
@@ -4451,7 +4426,7 @@ static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
}
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len, get_extent_t *get_extent)
+ __u64 start, __u64 len)
{
int ret = 0;
u64 off = start;
@@ -4533,8 +4508,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state);
- em = get_extent_skip_holes(inode, start, last_for_get_extent,
- get_extent);
+ em = get_extent_skip_holes(inode, start, last_for_get_extent);
if (!em)
goto out;
if (IS_ERR(em)) {
@@ -4622,8 +4596,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
/* now scan forward to see if this is really the last extent. */
- em = get_extent_skip_holes(inode, off, last_for_get_extent,
- get_extent);
+ em = get_extent_skip_holes(inode, off, last_for_get_extent);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -4647,7 +4620,7 @@ out_free:
out:
btrfs_free_path(path);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
- &cached_state, GFP_NOFS);
+ &cached_state);
return ret;
}
@@ -5263,8 +5236,7 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
}
int read_extent_buffer_pages(struct extent_io_tree *tree,
- struct extent_buffer *eb, int wait,
- get_extent_t *get_extent, int mirror_num)
+ struct extent_buffer *eb, int wait, int mirror_num)
{
unsigned long i;
struct page *page;
@@ -5324,7 +5296,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
ClearPageError(page);
err = __extent_read_full_page(tree, page,
- get_extent, &bio,
+ btree_get_extent, &bio,
mirror_num, &bio_flags,
REQ_META);
if (err) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 93dcae0c3183..a7a850abd600 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -300,19 +300,29 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask);
+ struct extent_state **cached);
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
- GFP_NOFS);
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
}
static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached, gfp_t mask)
+ u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ GFP_NOFS, NULL);
+}
+
+static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
+ u64 start, u64 end, struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- mask);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ GFP_ATOMIC, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
@@ -323,8 +333,7 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
if (bits & EXTENT_LOCKED)
wake = 1;
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL,
- GFP_NOFS);
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -340,10 +349,10 @@ static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
+ u64 end, struct extent_state **cached_state)
{
- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, mask);
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -358,7 +367,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -401,24 +410,19 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
-int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent,
- struct writeback_control *wbc);
-int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
- u64 start, u64 end, get_extent_t *get_extent,
+int extent_write_full_page(struct page *page, struct writeback_control *wbc);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
int extent_writepages(struct extent_io_tree *tree,
struct address_space *mapping,
- get_extent_t *get_extent,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages,
- get_extent_t get_extent);
+ struct list_head *pages, unsigned nr_pages);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len, get_extent_t *get_extent);
+ __u64 start, __u64 len);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -437,7 +441,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, int wait,
- get_extent_t *get_extent, int mirror_num);
+ int mirror_num);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
static inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -540,7 +544,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
u64 end);
int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
struct io_failure_record **failrec_ret);
-bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
struct io_failure_record *failrec, int fail_mirror);
struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
struct io_failure_record *failrec,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2e348fb0b280..d3bd02105d1c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -454,3 +454,135 @@ void replace_extent_mapping(struct extent_map_tree *tree,
setup_extent_mapping(tree, new, modified);
}
+
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+ struct rb_node *next;
+
+ next = rb_next(&em->rb_node);
+ if (!next)
+ return NULL;
+ return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+ struct rb_node *prev;
+
+ prev = rb_prev(&em->rb_node);
+ if (!prev)
+ return NULL;
+ return container_of(prev, struct extent_map, rb_node);
+}
+
+/* helper for btrfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the best fitted new extent into the tree.
+ */
+static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
+ struct extent_map *existing,
+ struct extent_map *em,
+ u64 map_start)
+{
+ struct extent_map *prev;
+ struct extent_map *next;
+ u64 start;
+ u64 end;
+ u64 start_diff;
+
+ BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+
+ if (existing->start > map_start) {
+ next = existing;
+ prev = prev_extent_map(next);
+ } else {
+ prev = existing;
+ next = next_extent_map(prev);
+ }
+
+ start = prev ? extent_map_end(prev) : em->start;
+ start = max_t(u64, start, em->start);
+ end = next ? next->start : extent_map_end(em);
+ end = min_t(u64, end, extent_map_end(em));
+ start_diff = start - em->start;
+ em->start = start;
+ em->len = end - start;
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ em->block_start += start_diff;
+ em->block_len = em->len;
+ }
+ return add_extent_mapping(em_tree, em, 0);
+}
+
+/**
+ * btrfs_add_extent_mapping - add extent mapping into em_tree
+ * @em_tree - the extent tree into which we want to insert the extent mapping
+ * @em_in - extent we are inserting
+ * @start - start of the logical range btrfs_get_extent() is requesting
+ * @len - length of the logical range btrfs_get_extent() is requesting
+ *
+ * Note that @em_in's range may be different from [start, start+len),
+ * but the two ranges must overlap.
+ *
+ * Insert @em_in into @em_tree. In case there is an overlapping range, handle
+ * the -EEXIST by either:
+ * a) Returning the existing extent in @em_in if @start is within the
+ * existing em.
+ * b) Merge the existing extent with @em_in passed in.
+ *
+ * Return 0 on success, otherwise -EEXIST.
+ *
+ */
+int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+ struct extent_map **em_in, u64 start, u64 len)
+{
+ int ret;
+ struct extent_map *em = *em_in;
+
+ ret = add_extent_mapping(em_tree, em, 0);
+ /* it is possible that someone inserted the extent into the tree
+ * while we had the lock dropped. It is also possible that
+ * an overlapping map exists in the tree
+ */
+ if (ret == -EEXIST) {
+ struct extent_map *existing;
+
+ ret = 0;
+
+ existing = search_extent_mapping(em_tree, start, len);
+ /*
+ * existing will always be non-NULL, since there must be
+ * an extent causing the -EEXIST.
+ */
+ if (start >= existing->start &&
+ start < extent_map_end(existing)) {
+ free_extent_map(em);
+ *em_in = existing;
+ ret = 0;
+ } else {
+ u64 orig_start = em->start;
+ u64 orig_len = em->len;
+
+ /*
+ * The existing extent map is the one nearest to
+ * the [start, start + len) range which overlaps
+ */
+ ret = merge_extent_mapping(em_tree, existing,
+ em, start);
+ if (ret) {
+ free_extent_map(em);
+ *em_in = NULL;
+ WARN_ONCE(ret,
+"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
+ ret, existing->start, existing->len,
+ orig_start, orig_len);
+ }
+ free_extent_map(existing);
+ }
+ }
+
+ ASSERT(ret == 0 || ret == -EEXIST);
+ return ret;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 64365bbc9b16..b29f77bc0732 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,7 +13,6 @@
/* bits for the flags field */
#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
#define EXTENT_FLAG_COMPRESSED 1
-#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
@@ -92,4 +91,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
+int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+ struct extent_map **em_in, u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index eb1bac7c8553..41ab9073d1d4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -31,6 +31,7 @@
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
+#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -1504,7 +1505,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
ordered->file_offset + ordered->len > start_pos &&
ordered->file_offset <= last_pos) {
unlock_extent_cached(&inode->io_tree, start_pos,
- last_pos, cached_state, GFP_NOFS);
+ last_pos, cached_state);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
@@ -1519,7 +1520,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
clear_extent_bit(&inode->io_tree, start_pos, last_pos,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached_state, GFP_NOFS);
+ 0, 0, cached_state);
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
@@ -1755,11 +1756,10 @@ again:
if (copied > 0)
ret = btrfs_dirty_pages(inode, pages, dirty_pages,
- pos, copied, NULL);
+ pos, copied, &cached_state);
if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state,
- GFP_NOFS);
+ lockstart, lockend, &cached_state);
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
btrfs_drop_pages(pages, num_pages);
@@ -2019,10 +2019,19 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
{
int ret;
+ struct blk_plug plug;
+ /*
+ * This is only called in fsync, which would do synchronous writes, so
+ * a plug can merge adjacent IOs as much as possible. Especially with
+ * multiple disks using a RAID profile, a large IO can be split into
+ * several segments of stripe length (currently 64K).
+ */
+ blk_start_plug(&plug);
atomic_inc(&BTRFS_I(inode)->sync_writers);
ret = btrfs_fdatawrite_range(inode, start, end);
atomic_dec(&BTRFS_I(inode)->sync_writers);
+ blk_finish_plug(&plug);
return ret;
}
@@ -2450,6 +2459,46 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
return ret;
}
+static int btrfs_punch_hole_lock_range(struct inode *inode,
+ const u64 lockstart,
+ const u64 lockend,
+ struct extent_state **cached_state)
+{
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+ int ret;
+
+ truncate_pagecache_range(inode, lockstart, lockend);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
+ ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+ /*
+ * We need to make sure we have no ordered extents in this range
+ * and nobody raced in and read a page in this range, if we did
+ * we need to try again.
+ */
+ if ((!ordered ||
+ (ordered->file_offset + ordered->len <= lockstart ||
+ ordered->file_offset > lockend)) &&
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, cached_state);
+ ret = btrfs_wait_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2566,38 +2615,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- truncate_pagecache_range(inode, lockstart, lockend);
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
-
- /*
- * We need to make sure we have no ordered extents in this range
- * and nobody raced in and read a page in this range, if we did
- * we need to try again.
- */
- if ((!ordered ||
- (ordered->file_offset + ordered->len <= lockstart ||
- ordered->file_offset > lockend)) &&
- !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state, GFP_NOFS);
- ret = btrfs_wait_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
- if (ret) {
- inode_unlock(inode);
- return ret;
- }
+ ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+ &cached_state);
+ if (ret) {
+ inode_unlock(inode);
+ goto out_only_mutex;
}
path = btrfs_alloc_path();
@@ -2742,7 +2764,7 @@ out_free:
btrfs_free_block_rsv(fs_info, rsv);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state, GFP_NOFS);
+ &cached_state);
out_only_mutex:
if (!updated_inode && truncated_block && !ret && !err) {
/*
@@ -2806,6 +2828,234 @@ insert:
return 0;
}
+static int btrfs_fallocate_update_isize(struct inode *inode,
+ const u64 end,
+ const int mode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+ int ret2;
+
+ if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
+ return 0;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ inode->i_ctime = current_time(inode);
+ i_size_write(inode, end);
+ btrfs_ordered_update_i_size(inode, end, NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_end_transaction(trans);
+
+ return ret ? ret : ret2;
+}
+
+enum {
+ RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
+ RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
+ RANGE_BOUNDARY_HOLE = 2,
+};
+
+static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+ u64 offset)
+{
+ const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ struct extent_map *em;
+ int ret;
+
+ offset = round_down(offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start == EXTENT_MAP_HOLE)
+ ret = RANGE_BOUNDARY_HOLE;
+ else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
+ else
+ ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
+
+ free_extent_map(em);
+ return ret;
+}
+
+static int btrfs_zero_range(struct inode *inode,
+ loff_t offset,
+ loff_t len,
+ const int mode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct extent_map *em;
+ struct extent_changeset *data_reserved = NULL;
+ int ret;
+ u64 alloc_hint = 0;
+ const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ u64 alloc_start = round_down(offset, sectorsize);
+ u64 alloc_end = round_up(offset + len, sectorsize);
+ u64 bytes_to_reserve = 0;
+ bool space_reserved = false;
+
+ inode_dio_wait(inode);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+ alloc_start, alloc_end - alloc_start, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ /*
+ * Avoid hole punching and extent allocation for some cases. More cases
+ * could be considered, but these are unlikely common and we keep things
+ * as simple as possible for now. Also, intentionally, if the target
+ * range contains one or more prealloc extents together with regular
+ * extents and holes, we drop all the existing extents and allocate a
+ * new prealloc extent, so that we get a larger contiguous disk extent.
+ */
+ if (em->start <= alloc_start &&
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ const u64 em_end = em->start + em->len;
+
+ if (em_end >= offset + len) {
+ /*
+ * The whole range is already a prealloc extent,
+ * do nothing except updating the inode's i_size if
+ * needed.
+ */
+ free_extent_map(em);
+ ret = btrfs_fallocate_update_isize(inode, offset + len,
+ mode);
+ goto out;
+ }
+ /*
+ * Part of the range is already a prealloc extent, so operate
+ * only on the remaining part of the range.
+ */
+ alloc_start = em_end;
+ ASSERT(IS_ALIGNED(alloc_start, sectorsize));
+ len = offset + len - alloc_start;
+ offset = alloc_start;
+ alloc_hint = em->block_start + em->len;
+ }
+ free_extent_map(em);
+
+ if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
+ BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+ alloc_start, sectorsize, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ free_extent_map(em);
+ ret = btrfs_fallocate_update_isize(inode, offset + len,
+ mode);
+ goto out;
+ }
+ if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+ free_extent_map(em);
+ ret = btrfs_truncate_block(inode, offset, len, 0);
+ if (!ret)
+ ret = btrfs_fallocate_update_isize(inode,
+ offset + len,
+ mode);
+ return ret;
+ }
+ free_extent_map(em);
+ alloc_start = round_down(offset, sectorsize);
+ alloc_end = alloc_start + sectorsize;
+ goto reserve_space;
+ }
+
+ alloc_start = round_up(offset, sectorsize);
+ alloc_end = round_down(offset + len, sectorsize);
+
+ /*
+ * For unaligned ranges, check the pages at the boundaries, they might
+ * map to an extent, in which case we need to partially zero them, or
+ * they might map to a hole, in which case we need our allocation range
+ * to cover them.
+ */
+ if (!IS_ALIGNED(offset, sectorsize)) {
+ ret = btrfs_zero_range_check_range_boundary(inode, offset);
+ if (ret < 0)
+ goto out;
+ if (ret == RANGE_BOUNDARY_HOLE) {
+ alloc_start = round_down(offset, sectorsize);
+ ret = 0;
+ } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
+ ret = btrfs_truncate_block(inode, offset, 0, 0);
+ if (ret)
+ goto out;
+ } else {
+ ret = 0;
+ }
+ }
+
+ if (!IS_ALIGNED(offset + len, sectorsize)) {
+ ret = btrfs_zero_range_check_range_boundary(inode,
+ offset + len);
+ if (ret < 0)
+ goto out;
+ if (ret == RANGE_BOUNDARY_HOLE) {
+ alloc_end = round_up(offset + len, sectorsize);
+ ret = 0;
+ } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
+ ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+ if (ret)
+ goto out;
+ } else {
+ ret = 0;
+ }
+ }
+
+reserve_space:
+ if (alloc_start < alloc_end) {
+ struct extent_state *cached_state = NULL;
+ const u64 lockstart = alloc_start;
+ const u64 lockend = alloc_end - 1;
+
+ bytes_to_reserve = alloc_end - alloc_start;
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+ bytes_to_reserve);
+ if (ret < 0)
+ goto out;
+ space_reserved = true;
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+ alloc_start, bytes_to_reserve);
+ if (ret)
+ goto out;
+ ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+ &cached_state);
+ if (ret)
+ goto out;
+ ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+ alloc_end - alloc_start,
+ i_blocksize(inode),
+ offset + len, &alloc_hint);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
+ /* btrfs_prealloc_file_range releases reserved space on error */
+ if (ret) {
+ space_reserved = false;
+ goto out;
+ }
+ }
+ ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
+ out:
+ if (ret && space_reserved)
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ alloc_start, bytes_to_reserve);
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
@@ -2831,7 +3081,8 @@ static long btrfs_fallocate(struct file *file, int mode,
cur_offset = alloc_start;
/* Make sure we aren't being give some crap mode */
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -2842,10 +3093,12 @@ static long btrfs_fallocate(struct file *file, int mode,
*
* For qgroup space, it will be checked later.
*/
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
- alloc_end - alloc_start);
- if (ret < 0)
- return ret;
+ if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+ alloc_end - alloc_start);
+ if (ret < 0)
+ return ret;
+ }
inode_lock(inode);
@@ -2887,6 +3140,12 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret)
goto out;
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = btrfs_zero_range(inode, offset, len, mode);
+ inode_unlock(inode);
+ return ret;
+ }
+
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -2896,15 +3155,15 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- alloc_end - 1);
+ ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
+
if (ordered &&
ordered->file_offset + ordered->len > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
alloc_start, locked_end,
- &cached_state, GFP_KERNEL);
+ &cached_state);
/*
* we can't wait on the range with the transaction
* running or with the extent lock held
@@ -2922,7 +3181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
- while (1) {
+ while (cur_offset < alloc_end) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
if (IS_ERR(em)) {
@@ -2958,8 +3217,6 @@ static long btrfs_fallocate(struct file *file, int mode,
}
free_extent_map(em);
cur_offset = last_byte;
- if (cur_offset >= alloc_end)
- break;
}
/*
@@ -2982,37 +3239,18 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret < 0)
goto out_unlock;
- if (actual_end > inode->i_size &&
- !(mode & FALLOC_FL_KEEP_SIZE)) {
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- /*
- * We didn't need to allocate any more space, but we
- * still extended the size of the file so we need to
- * update i_size and the inode item.
- */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- } else {
- inode->i_ctime = current_time(inode);
- i_size_write(inode, actual_end);
- btrfs_ordered_update_i_size(inode, actual_end, NULL);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- btrfs_end_transaction(trans);
- else
- ret = btrfs_end_transaction(trans);
- }
- }
+ /*
+ * We didn't need to allocate any more space, but we still extended the
+ * size of the file so we need to update i_size and the inode item.
+ */
+ ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state, GFP_KERNEL);
+ &cached_state);
out:
inode_unlock(inode);
/* Let go of our reservation. */
- if (ret != 0)
+ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
btrfs_free_reserved_data_space(inode, data_reserved,
alloc_start, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
@@ -3081,7 +3319,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
*offset = min_t(loff_t, start, inode->i_size);
}
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state, GFP_NOFS);
+ &cached_state);
return ret;
}
@@ -3145,7 +3383,7 @@ void btrfs_auto_defrag_exit(void)
kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
-int btrfs_auto_defrag_init(void)
+int __init btrfs_auto_defrag_init(void)
{
btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
sizeof(struct inode_defrag), 0,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4426d1c73e50..014f3c090231 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -993,8 +993,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
- GFP_NOFS);
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
goto fail;
}
leaf = path->nodes[0];
@@ -1008,7 +1007,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
inode->i_size - 1,
EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
- NULL, GFP_NOFS);
+ NULL);
btrfs_release_path(path);
goto fail;
}
@@ -1105,8 +1104,7 @@ static int flush_dirty_cache(struct inode *inode)
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
- GFP_NOFS);
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
return ret;
}
@@ -1127,8 +1125,7 @@ cleanup_write_cache_enospc(struct inode *inode,
{
io_ctl_drop_pages(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, cached_state,
- GFP_NOFS);
+ i_size_read(inode) - 1, cached_state);
}
static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1322,7 +1319,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_drop_pages(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ i_size_read(inode) - 1, &cached_state);
/*
* at this point the pages are under IO and we're happy,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e1a7f3cb5be9..53ca025655fc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,6 +43,7 @@
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
+#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -536,9 +537,14 @@ again:
*
* If the compression fails for any reason, we set the pages
* dirty again later on.
+ *
+ * Note that the remaining part is redirtied, the start pointer
+ * has moved, the end is the original one.
*/
- extent_range_clear_dirty_for_io(inode, start, end);
- redirty = 1;
+ if (!redirty) {
+ extent_range_clear_dirty_for_io(inode, start, end);
+ redirty = 1;
+ }
/* Compression level is applied here and only here */
ret = btrfs_compress_pages(
@@ -765,11 +771,10 @@ retry:
* all those pages down to the drive.
*/
if (!page_started && !ret)
- extent_write_locked_range(io_tree,
- inode, async_extent->start,
+ extent_write_locked_range(inode,
+ async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
- btrfs_get_extent,
WB_SYNC_ALL);
else if (ret)
unlock_page(async_cow->locked_page);
@@ -1203,7 +1208,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
u64 cur_end;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
- 1, 0, NULL, GFP_NOFS);
+ 1, 0, NULL);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
@@ -1951,7 +1956,21 @@ static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
/*
* extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read
+ * on write, or reading the csums from the tree before a read.
+ *
+ * Rules about async/sync submit,
+ * a) read: sync submit
+ *
+ * b) write without checksum: sync submit
+ *
+ * c) write with checksum:
+ * c-1) if bio is issued by fsync: sync submit
+ * (sync_writers != 0)
+ *
+ * c-2) if root is reloc root: sync submit
+ * (only in case of buffered IO)
+ *
+ * c-3) otherwise: async submit
*/
static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
int mirror_num, unsigned long bio_flags,
@@ -2023,10 +2042,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum *sum;
list_for_each_entry(sum, list, list) {
- trans->adding_csums = 1;
+ trans->adding_csums = true;
btrfs_csum_file_blocks(trans,
BTRFS_I(inode)->root->fs_info->csum_root, sum);
- trans->adding_csums = 0;
+ trans->adding_csums = false;
}
return 0;
}
@@ -2082,7 +2101,7 @@ again:
PAGE_SIZE);
if (ordered) {
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
- page_end, &cached_state, GFP_NOFS);
+ page_end, &cached_state);
unlock_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
@@ -2098,14 +2117,21 @@ again:
goto out;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state,
- 0);
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
+ &cached_state, 0);
+ if (ret) {
+ mapping_set_error(page->mapping, ret);
+ end_extent_writepage(page, ret, page_start, page_end);
+ ClearPageChecked(page);
+ goto out;
+ }
+
ClearPageChecked(page);
set_page_dirty(page);
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
out_page:
unlock_page(page);
put_page(page);
@@ -2697,7 +2723,7 @@ out_free_path:
btrfs_end_transaction(trans);
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- &cached, GFP_NOFS);
+ &cached);
iput(inode);
return ret;
}
@@ -2986,7 +3012,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
clear_extent_bit(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
+ EXTENT_DEFRAG, 0, 0, &cached_state);
}
if (nolock)
@@ -3056,7 +3082,7 @@ out:
ordered_extent->len - 1,
clear_bits,
(clear_bits & EXTENT_LOCKED) ? 1 : 0,
- 0, &cached_state, GFP_NOFS);
+ 0, &cached_state);
}
if (trans)
@@ -3070,7 +3096,7 @@ out:
else
start = ordered_extent->file_offset;
end = ordered_extent->file_offset + ordered_extent->len - 1;
- clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
+ clear_extent_uptodate(io_tree, start, end, NULL);
/* Drop the cache for the part of the extent we didn't write. */
btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
@@ -3777,7 +3803,8 @@ static int btrfs_read_locked_inode(struct inode *inode)
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
- inode->i_version = btrfs_inode_sequence(leaf, inode_item);
+ inode_set_iversion_queried(inode,
+ btrfs_inode_sequence(leaf, inode_item));
inode->i_generation = BTRFS_I(inode)->generation;
inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -3945,7 +3972,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
&token);
btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
&token);
- btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
+ &token);
btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
@@ -4744,8 +4772,8 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
u64 block_start;
u64 block_end;
- if ((offset & (blocksize - 1)) == 0 &&
- (!len || ((len & (blocksize - 1)) == 0)))
+ if (IS_ALIGNED(offset, blocksize) &&
+ (!len || IS_ALIGNED(len, blocksize)))
goto out;
block_start = round_down(from, blocksize);
@@ -4787,7 +4815,7 @@ again:
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
unlock_page(page);
put_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
@@ -4798,13 +4826,13 @@ again:
clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state, GFP_NOFS);
+ 0, 0, &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state, 0);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
goto out_unlock;
}
@@ -4823,8 +4851,7 @@ again:
}
ClearPageChecked(page);
set_page_dirty(page);
- unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
- GFP_NOFS);
+ unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
out_unlock:
if (ret)
@@ -4925,7 +4952,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (!ordered)
break;
unlock_extent_cached(io_tree, hole_start, block_end - 1,
- &cached_state, GFP_NOFS);
+ &cached_state);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
@@ -4990,8 +5017,7 @@ next:
break;
}
free_extent_map(em);
- unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
- GFP_NOFS);
+ unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
return err;
}
@@ -5234,8 +5260,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 1,
- &cached_state, GFP_NOFS);
+ EXTENT_DEFRAG, 1, 1, &cached_state);
cond_resched();
spin_lock(&io_tree->lock);
@@ -5894,7 +5919,6 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_file_private *private = file->private_data;
struct btrfs_dir_item *di;
@@ -5962,9 +5986,6 @@ again:
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
goto next;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
- if (verify_dir_item(fs_info, leaf, slot, di))
- goto next;
-
name_len = btrfs_dir_name_len(leaf, di);
if ((total_len + sizeof(struct dir_entry) + name_len) >=
PAGE_SIZE) {
@@ -6104,19 +6125,20 @@ static int btrfs_update_time(struct inode *inode, struct timespec *now,
int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ bool dirty = flags & ~S_VERSION;
if (btrfs_root_readonly(root))
return -EROFS;
if (flags & S_VERSION)
- inode_inc_iversion(inode);
+ dirty |= inode_maybe_inc_iversion(inode, dirty);
if (flags & S_CTIME)
inode->i_ctime = *now;
if (flags & S_MTIME)
inode->i_mtime = *now;
if (flags & S_ATIME)
inode->i_atime = *now;
- return btrfs_dirty_inode(inode);
+ return dirty ? btrfs_dirty_inode(inode) : 0;
}
/*
@@ -6297,7 +6319,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
}
/*
* index_cnt is ignored for everything but a dir,
- * btrfs_get_inode_index_count has an explanation for the magic
+ * btrfs_set_inode_index_count has an explanation for the magic
* number
*/
BTRFS_I(inode)->index_cnt = 2;
@@ -6560,7 +6582,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
out_unlock:
btrfs_end_transaction(trans);
- btrfs_balance_delayed_items(fs_info);
btrfs_btree_balance_dirty(fs_info);
if (drop_inode) {
inode_dec_link_count(inode);
@@ -6641,7 +6662,6 @@ out_unlock:
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_balance_delayed_items(fs_info);
btrfs_btree_balance_dirty(fs_info);
return err;
@@ -6716,7 +6736,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
- btrfs_balance_delayed_items(fs_info);
fail:
if (trans)
btrfs_end_transaction(trans);
@@ -6794,7 +6813,6 @@ out_fail:
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_balance_delayed_items(fs_info);
btrfs_btree_balance_dirty(fs_info);
return err;
@@ -6803,68 +6821,6 @@ out_fail_inode:
goto out_fail;
}
-/* Find next extent map of a given extent map, caller needs to ensure locks */
-static struct extent_map *next_extent_map(struct extent_map *em)
-{
- struct rb_node *next;
-
- next = rb_next(&em->rb_node);
- if (!next)
- return NULL;
- return container_of(next, struct extent_map, rb_node);
-}
-
-static struct extent_map *prev_extent_map(struct extent_map *em)
-{
- struct rb_node *prev;
-
- prev = rb_prev(&em->rb_node);
- if (!prev)
- return NULL;
- return container_of(prev, struct extent_map, rb_node);
-}
-
-/* helper for btfs_get_extent. Given an existing extent in the tree,
- * the existing extent is the nearest extent to map_start,
- * and an extent that you want to insert, deal with overlap and insert
- * the best fitted new extent into the tree.
- */
-static int merge_extent_mapping(struct extent_map_tree *em_tree,
- struct extent_map *existing,
- struct extent_map *em,
- u64 map_start)
-{
- struct extent_map *prev;
- struct extent_map *next;
- u64 start;
- u64 end;
- u64 start_diff;
-
- BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
-
- if (existing->start > map_start) {
- next = existing;
- prev = prev_extent_map(next);
- } else {
- prev = existing;
- next = next_extent_map(prev);
- }
-
- start = prev ? extent_map_end(prev) : em->start;
- start = max_t(u64, start, em->start);
- end = next ? next->start : extent_map_end(em);
- end = min_t(u64, end, extent_map_end(em));
- start_diff = start - em->start;
- em->start = start;
- em->len = end - start;
- if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- em->block_start += start_diff;
- em->block_len -= start_diff;
- }
- return add_extent_mapping(em_tree, em, 0);
-}
-
static noinline int uncompress_inline(struct btrfs_path *path,
struct page *page,
size_t pg_offset, u64 extent_offset,
@@ -6939,10 +6895,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_io_tree *io_tree = &inode->io_tree;
- struct btrfs_trans_handle *trans = NULL;
const bool new_inline = !page || create;
-again:
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em)
@@ -6981,8 +6935,7 @@ again:
path->reada = READA_FORWARD;
}
- ret = btrfs_lookup_file_extent(trans, root, path,
- objectid, start, trans != NULL);
+ ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
err = ret;
goto out;
@@ -7083,7 +7036,7 @@ next:
em->orig_block_len = em->len;
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
- if (create == 0 && !PageUptodate(page)) {
+ if (!PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, page, pg_offset,
@@ -7104,25 +7057,6 @@ next:
kunmap(page);
}
flush_dcache_page(page);
- } else if (create && PageUptodate(page)) {
- BUG();
- if (!trans) {
- kunmap(page);
- free_extent_map(em);
- em = NULL;
-
- btrfs_release_path(path);
- trans = btrfs_join_transaction(root);
-
- if (IS_ERR(trans))
- return ERR_CAST(trans);
- goto again;
- }
- map = kmap(page);
- write_extent_buffer(leaf, map + pg_offset, ptr,
- copy_size);
- kunmap(page);
- btrfs_mark_buffer_dirty(leaf);
}
set_extent_uptodate(io_tree, em->start,
extent_map_end(em) - 1, NULL, GFP_NOFS);
@@ -7134,7 +7068,6 @@ not_found:
em->len = len;
not_found_em:
em->block_start = EXTENT_MAP_HOLE;
- set_bit(EXTENT_FLAG_VACANCY, &em->flags);
insert:
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
@@ -7147,62 +7080,13 @@ insert:
err = 0;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- /* it is possible that someone inserted the extent into the tree
- * while we had the lock dropped. It is also possible that
- * an overlapping map exists in the tree
- */
- if (ret == -EEXIST) {
- struct extent_map *existing;
-
- ret = 0;
-
- existing = search_extent_mapping(em_tree, start, len);
- /*
- * existing will always be non-NULL, since there must be
- * extent causing the -EEXIST.
- */
- if (existing->start == em->start &&
- extent_map_end(existing) >= extent_map_end(em) &&
- em->block_start == existing->block_start) {
- /*
- * The existing extent map already encompasses the
- * entire extent map we tried to add.
- */
- free_extent_map(em);
- em = existing;
- err = 0;
-
- } else if (start >= extent_map_end(existing) ||
- start <= existing->start) {
- /*
- * The existing extent map is the one nearest to
- * the [start, start + len) range which overlaps
- */
- err = merge_extent_mapping(em_tree, existing,
- em, start);
- free_extent_map(existing);
- if (err) {
- free_extent_map(em);
- em = NULL;
- }
- } else {
- free_extent_map(em);
- em = existing;
- err = 0;
- }
- }
+ err = btrfs_add_extent_mapping(em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
trace_btrfs_get_extent(root, inode, em);
btrfs_free_path(path);
- if (trans) {
- ret = btrfs_end_transaction(trans);
- if (!err)
- err = ret;
- }
if (err) {
free_extent_map(em);
return ERR_PTR(err);
@@ -7324,7 +7208,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
em->block_start = EXTENT_MAP_DELALLOC;
em->block_len = found;
}
- } else if (hole_em) {
+ } else {
return hole_em;
}
out:
@@ -7641,7 +7525,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
break;
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state, GFP_NOFS);
+ cached_state);
if (ordered) {
/*
@@ -7926,7 +7810,7 @@ unlock:
if (lockstart < lockend) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, unlock_bits, 1, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
} else {
free_extent_state(cached_state);
}
@@ -7937,7 +7821,7 @@ unlock:
unlock_err:
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+ unlock_bits, 1, 0, &cached_state);
err:
if (dio_data)
current->journal_info = dio_data;
@@ -7953,15 +7837,12 @@ static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- bio_get(bio);
-
ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
if (ret)
- goto err;
+ return ret;
ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
-err:
- bio_put(bio);
+
return ret;
}
@@ -8015,6 +7896,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
int segs;
int ret;
blk_status_t status;
+ struct bio_vec bvec;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -8030,8 +7912,9 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
}
segs = bio_segments(failed_bio);
+ bio_get_first_bvec(failed_bio, &bvec);
if (segs > 1 ||
- (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode)))
+ (bvec.bv_len > btrfs_inode_sectorsize(inode)))
read_mode |= REQ_FAILFAST_DEV;
isector = start - btrfs_io_bio(failed_bio)->logical;
@@ -8074,7 +7957,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
ASSERT(bio->bi_vcnt == 1);
io_tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
- ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
+ ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
done->uptodate = 1;
ASSERT(!bio_flagged(bio, BIO_CLONED));
@@ -8164,7 +8047,7 @@ static void btrfs_retry_endio(struct bio *bio)
uptodate = 1;
ASSERT(bio->bi_vcnt == 1);
- ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
+ ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
io_tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -8460,11 +8343,10 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
bool write = bio_op(bio) == REQ_OP_WRITE;
blk_status_t ret;
+ /* Check btrfs_submit_bio_hook() for rules about async submit. */
if (async_submit)
async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
- bio_get(bio);
-
if (!write) {
ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
if (ret)
@@ -8497,7 +8379,6 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
map:
ret = btrfs_map_bio(fs_info, bio, 0, 0);
err:
- bio_put(bio);
return ret;
}
@@ -8854,7 +8735,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
+ return extent_fiemap(inode, fieinfo, start, len);
}
int btrfs_readpage(struct file *file, struct page *page)
@@ -8866,7 +8747,6 @@ int btrfs_readpage(struct file *file, struct page *page)
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
- struct extent_io_tree *tree;
struct inode *inode = page->mapping->host;
int ret;
@@ -8885,8 +8765,7 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
redirty_page_for_writepage(wbc, page);
return AOP_WRITEPAGE_ACTIVATE;
}
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ ret = extent_write_full_page(page, wbc);
btrfs_add_delayed_iput(inode);
return ret;
}
@@ -8897,7 +8776,7 @@ static int btrfs_writepages(struct address_space *mapping,
struct extent_io_tree *tree;
tree = &BTRFS_I(mapping->host)->io_tree;
- return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
+ return extent_writepages(tree, mapping, wbc);
}
static int
@@ -8906,8 +8785,7 @@ btrfs_readpages(struct file *file, struct address_space *mapping,
{
struct extent_io_tree *tree;
tree = &BTRFS_I(mapping->host)->io_tree;
- return extent_readpages(tree, mapping, pages, nr_pages,
- btrfs_get_extent);
+ return extent_readpages(tree, mapping, pages, nr_pages);
}
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
@@ -8978,8 +8856,7 @@ again:
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 0, &cached_state,
- GFP_NOFS);
+ EXTENT_DEFRAG, 1, 0, &cached_state);
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
@@ -9036,7 +8913,7 @@ again:
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
- &cached_state, GFP_NOFS);
+ &cached_state);
__btrfs_releasepage(page, GFP_NOFS);
}
@@ -9137,7 +9014,7 @@ again:
PAGE_SIZE);
if (ordered) {
unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
unlock_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
@@ -9164,13 +9041,13 @@ again:
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state, GFP_NOFS);
+ 0, 0, &cached_state);
ret = btrfs_set_extent_delalloc(inode, page_start, end, 0,
&cached_state, 0);
if (ret) {
unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
@@ -9196,7 +9073,7 @@ again:
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
+ unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
out_unlock:
if (!ret) {
@@ -9421,7 +9298,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_inode *ei;
struct inode *inode;
- ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+ ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -9573,7 +9450,7 @@ void btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_free_space_cachep);
}
-int btrfs_init_cachep(void)
+int __init btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
@@ -10688,7 +10565,6 @@ out:
btrfs_end_transaction(trans);
if (ret)
iput(inode);
- btrfs_balance_delayed_items(fs_info);
btrfs_btree_balance_dirty(fs_info);
return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2ef8acaac688..111ee282b777 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -43,6 +43,7 @@
#include <linux/uuid.h>
#include <linux/btrfs.h>
#include <linux/uaccess.h>
+#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -307,12 +308,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip->flags |= BTRFS_INODE_COMPRESS;
ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
- if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
- comp = "lzo";
- else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB)
- comp = "zlib";
- else
- comp = "zstd";
+ comp = btrfs_compress_type2str(fs_info->compress_type);
+ if (!comp || comp[0] == 0)
+ comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
+
ret = btrfs_set_prop(inode, "btrfs.compression",
comp, strlen(comp), 0);
if (ret)
@@ -979,7 +978,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
/* get the big lock and read metadata off disk */
lock_extent_bits(io_tree, start, end, &cached);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
- unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
+ unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -1130,7 +1129,7 @@ again:
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
unlock_extent_cached(tree, page_start, page_end,
- &cached_state, GFP_NOFS);
+ &cached_state);
if (!ordered)
break;
@@ -1190,7 +1189,7 @@ again:
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
- &cached_state, GFP_NOFS);
+ &cached_state);
if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
@@ -1206,8 +1205,7 @@ again:
&cached_state);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state,
- GFP_NOFS);
+ page_start, page_end - 1, &cached_state);
for (i = 0; i < i_done; i++) {
clear_page_dirty_for_io(pages[i]);
@@ -1503,7 +1501,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
goto out_free;
}
- if (!device->writeable) {
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_info(fs_info,
"resizer unable to apply on readonly device %llu",
devid);
@@ -1528,7 +1526,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
}
- if (device->is_tgtdev_for_dev_replace) {
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -EPERM;
goto out_free;
}
@@ -2675,14 +2673,12 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto out;
}
- mutex_lock(&fs_info->volume_mutex);
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
} else {
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
- mutex_unlock(&fs_info->volume_mutex);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
if (!ret) {
@@ -2726,9 +2722,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- mutex_lock(&fs_info->volume_mutex);
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
- mutex_unlock(&fs_info->volume_mutex);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
@@ -2753,16 +2747,16 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
if (!fi_args)
return -ENOMEM;
- mutex_lock(&fs_devices->device_list_mutex);
+ rcu_read_lock();
fi_args->num_devices = fs_devices->num_devices;
- memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->devid > fi_args->max_id)
fi_args->max_id = device->devid;
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
+ memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
fi_args->nodesize = fs_info->nodesize;
fi_args->sectorsize = fs_info->sectorsize;
fi_args->clone_alignment = fs_info->sectorsize;
@@ -2779,7 +2773,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
{
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int ret = 0;
char *s_uuid = NULL;
@@ -2790,7 +2783,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
if (!btrfs_is_empty_uuid(di_args->uuid))
s_uuid = di_args->uuid;
- mutex_lock(&fs_devices->device_list_mutex);
+ rcu_read_lock();
dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
if (!dev) {
@@ -2805,17 +2798,15 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
if (dev->name) {
struct rcu_string *name;
- rcu_read_lock();
name = rcu_dereference(dev->name);
- strncpy(di_args->path, name->str, sizeof(di_args->path));
- rcu_read_unlock();
+ strncpy(di_args->path, name->str, sizeof(di_args->path) - 1);
di_args->path[sizeof(di_args->path) - 1] = 0;
} else {
di_args->path[0] = '\0';
}
out:
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
ret = -EFAULT;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index f6a05f836629..b30a056963ab 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -164,7 +164,6 @@ static int iterate_object_props(struct btrfs_root *root,
size_t),
void *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *name_buf = NULL;
char *value_buf = NULL;
@@ -215,12 +214,6 @@ static int iterate_object_props(struct btrfs_root *root,
name_ptr = (unsigned long)(di + 1);
data_ptr = name_ptr + name_len;
- if (verify_dir_item(fs_info, leaf,
- path->slots[0], di)) {
- ret = -EIO;
- goto out;
- }
-
if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
name_ptr,
@@ -430,11 +423,11 @@ static const char *prop_compression_extract(struct inode *inode)
{
switch (BTRFS_I(inode)->prop_compress) {
case BTRFS_COMPRESS_ZLIB:
- return "zlib";
case BTRFS_COMPRESS_LZO:
- return "lzo";
case BTRFS_COMPRESS_ZSTD:
- return "zstd";
+ return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
+ default:
+ break;
}
return NULL;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 168fd03ca3ac..9e61dd624f7b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2883,8 +2883,7 @@ cleanup:
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(&reserved->range_changed, &uiter)))
clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
- unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
- GFP_NOFS);
+ unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
extent_changeset_release(reserved);
return ret;
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a7f79254ecca..dec0907dfb8a 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -231,7 +231,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
cur = h + i;
INIT_LIST_HEAD(&cur->hash_list);
spin_lock_init(&cur->lock);
- init_waitqueue_head(&cur->wait);
}
x = cmpxchg(&info->stripe_hash_table, NULL, table);
@@ -595,14 +594,31 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
* bio list here, anyone else that wants to
* change this stripe needs to do their own rmw.
*/
- if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
- cur->operation == BTRFS_RBIO_PARITY_SCRUB)
+ if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
- if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
- cur->operation == BTRFS_RBIO_REBUILD_MISSING)
+ if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
return 0;
+ if (last->operation == BTRFS_RBIO_READ_REBUILD) {
+ int fa = last->faila;
+ int fb = last->failb;
+ int cur_fa = cur->faila;
+ int cur_fb = cur->failb;
+
+ if (last->faila >= last->failb) {
+ fa = last->failb;
+ fb = last->faila;
+ }
+
+ if (cur->faila >= cur->failb) {
+ cur_fa = cur->failb;
+ cur_fb = cur->faila;
+ }
+
+ if (fa != cur_fa || fb != cur_fb)
+ return 0;
+ }
return 1;
}
@@ -670,7 +686,6 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *pending;
unsigned long flags;
- DEFINE_WAIT(wait);
struct btrfs_raid_bio *freeit = NULL;
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
@@ -816,15 +831,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
}
goto done_nolock;
- /*
- * The barrier for this waitqueue_active is not needed,
- * we're protected by h->lock and can't miss a wakeup.
- */
- } else if (waitqueue_active(&h->wait)) {
- spin_unlock(&rbio->bio_list_lock);
- spin_unlock_irqrestore(&h->lock, flags);
- wake_up(&h->wait);
- goto done_nolock;
}
}
done:
@@ -858,10 +864,17 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
kfree(rbio);
}
-static void free_raid_bio(struct btrfs_raid_bio *rbio)
+static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
- unlock_stripe(rbio);
- __free_raid_bio(rbio);
+ struct bio *next;
+
+ while (cur) {
+ next = cur->bi_next;
+ cur->bi_next = NULL;
+ cur->bi_status = err;
+ bio_endio(cur);
+ cur = next;
+ }
}
/*
@@ -871,20 +884,26 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio)
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
- struct bio *next;
+ struct bio *extra;
if (rbio->generic_bio_cnt)
btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
- free_raid_bio(rbio);
+ /*
+ * At this moment, rbio->bio_list is empty, however since rbio does not
+ * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
+ * hash list, rbio may be merged with others so that rbio->bio_list
+ * becomes non-empty.
+ * Once unlock_stripe() is done, rbio->bio_list will not be updated any
+ * more and we can call bio_endio() on all queued bios.
+ */
+ unlock_stripe(rbio);
+ extra = bio_list_get(&rbio->bio_list);
+ __free_raid_bio(rbio);
- while (cur) {
- next = cur->bi_next;
- cur->bi_next = NULL;
- cur->bi_status = err;
- bio_endio(cur);
- cur = next;
- }
+ rbio_endio_bio_list(cur, err);
+ if (extra)
+ rbio_endio_bio_list(extra, err);
}
/*
@@ -1435,14 +1454,13 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
*/
static void set_bio_pages_uptodate(struct bio *bio)
{
- struct bio_vec bvec;
- struct bvec_iter iter;
+ struct bio_vec *bvec;
+ int i;
- if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_io_bio(bio)->iter;
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment(bvec, bio, iter)
- SetPageUptodate(bvec.bv_page);
+ bio_for_each_segment_all(bvec, bio, i)
+ SetPageUptodate(bvec->bv_page);
}
/*
@@ -1969,7 +1987,22 @@ cleanup:
cleanup_io:
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- if (err == BLK_STS_OK)
+ /*
+ * - In case of two failures, where rbio->failb != -1:
+ *
+ * Do not cache this rbio since the above read reconstruction
+ * (raid6_datap_recov() or raid6_2data_recov()) may have
+ * changed some content of stripes which are not identical to
+ * on-disk content any more, otherwise, a later write/recover
+ * may steal stripe_pages from this rbio and end up with
+ * corruptions or rebuild failures.
+ *
+ * - In case of single failure, where rbio->failb == -1:
+ *
+ * Cache this rbio iff the above read reconstruction is
+ * executed without problems.
+ */
+ if (err == BLK_STS_OK && rbio->failb < 0)
cache_rbio_pages(rbio);
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2170,11 +2203,21 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
}
/*
- * reconstruct from the q stripe if they are
- * asking for mirror 3
+ * Loop retry:
+ * for 'mirror_num == 2', reconstruct from all other stripes.
+ * for 'mirror_num > 2', select a stripe to fail on every retry.
*/
- if (mirror_num == 3)
- rbio->failb = rbio->real_stripes - 2;
+ if (mirror_num > 2) {
+ /*
+ * 'mirror_num == 3' is to fail the p stripe and
+ * reconstruct from the q stripe. 'mirror_num > 3' is to
+ * fail a data stripe and reconstruct from p+q stripe.
+ */
+ rbio->failb = rbio->real_stripes - (mirror_num - 1);
+ ASSERT(rbio->failb > 0);
+ if (rbio->failb <= rbio->faila)
+ rbio->failb--;
+ }
ret = lock_stripe_add(rbio);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 34878699d363..171f3cce30e6 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -606,8 +606,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
}
/* Walk up to the next node that needs to be processed */
-static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
- int *level)
+static int walk_up_tree(struct btrfs_path *path, int *level)
{
int l;
@@ -984,7 +983,6 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_path *path;
- struct btrfs_root *root;
struct extent_buffer *eb;
u64 bytenr = 0, num_bytes = 0;
int ret, level;
@@ -1014,7 +1012,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
&bytenr, &num_bytes);
if (ret)
break;
- ret = walk_up_tree(root, path, &level);
+ ret = walk_up_tree(path, &level);
if (ret < 0)
break;
if (ret > 0) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 3338407ef0f0..aab0194efe46 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -387,13 +387,6 @@ again:
WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
ptr = (unsigned long)(ref + 1);
- ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr,
- name_len);
- if (!ret) {
- err = -EIO;
- goto out;
- }
-
WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
*sequence = btrfs_root_ref_sequence(leaf, ref);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b2f871d80982..ec56f33feea9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -301,6 +301,11 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+ return page->recover &&
+ (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
@@ -1323,15 +1328,34 @@ nodatasum_case:
* could happen otherwise that a correct page would be
* overwritten by a bad one).
*/
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
- mirror_index++) {
+ for (mirror_index = 0; ;mirror_index++) {
struct scrub_block *sblock_other;
if (mirror_index == failed_mirror_index)
continue;
- sblock_other = sblocks_for_recheck + mirror_index;
+
+ /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
+ if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ if (mirror_index >= BTRFS_MAX_MIRRORS)
+ break;
+ if (!sblocks_for_recheck[mirror_index].page_count)
+ break;
+
+ sblock_other = sblocks_for_recheck + mirror_index;
+ } else {
+ struct scrub_recover *r = sblock_bad->pagev[0]->recover;
+ int max_allowed = r->bbio->num_stripes -
+ r->bbio->num_tgtdevs;
+
+ if (mirror_index >= max_allowed)
+ break;
+ if (!sblocks_for_recheck[1].page_count)
+ break;
+
+ ASSERT(failed_mirror_index == 0);
+ sblock_other = sblocks_for_recheck + 1;
+ sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
+ }
/* build and submit the bios, check checksums */
scrub_recheck_block(fs_info, sblock_other, 0);
@@ -1666,49 +1690,32 @@ leave_nomem:
return 0;
}
-struct scrub_bio_ret {
- struct completion event;
- blk_status_t status;
-};
-
static void scrub_bio_wait_endio(struct bio *bio)
{
- struct scrub_bio_ret *ret = bio->bi_private;
-
- ret->status = bio->bi_status;
- complete(&ret->event);
-}
-
-static inline int scrub_is_page_on_raid56(struct scrub_page *page)
-{
- return page->recover &&
- (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ complete(bio->bi_private);
}
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
struct scrub_page *page)
{
- struct scrub_bio_ret done;
+ DECLARE_COMPLETION_ONSTACK(done);
int ret;
+ int mirror_num;
- init_completion(&done.event);
- done.status = 0;
bio->bi_iter.bi_sector = page->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
+ mirror_num = page->sblock->pagev[0]->mirror_num;
ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
page->recover->map_length,
- page->mirror_num, 0);
+ mirror_num, 0);
if (ret)
return ret;
- wait_for_completion_io(&done.event);
- if (done.status)
- return -EIO;
-
- return 0;
+ wait_for_completion_io(&done);
+ return blk_status_to_errno(bio->bi_status);
}
/*
@@ -2535,7 +2542,7 @@ leave_nomem:
}
WARN_ON(sblock->page_count == 0);
- if (dev->missing) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
/*
* This case should only be hit for RAID 5/6 device replace. See
* the comment in scrub_missing_raid56_pages() for details.
@@ -2870,7 +2877,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
- if (dev->missing) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
scrub_parity_mark_sectors_error(sparity, logical, len);
return 0;
}
@@ -4112,12 +4119,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info, devid, NULL, NULL);
- if (!dev || (dev->missing && !is_dev_replace)) {
+ if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
+ !is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return -ENODEV;
}
- if (!is_dev_replace && !readonly && !dev->writeable) {
+ if (!is_dev_replace && !readonly &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
rcu_read_lock();
name = rcu_dereference(dev->name);
@@ -4128,14 +4137,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
mutex_lock(&fs_info->scrub_lock);
- if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return -EIO;
}
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
- if (dev->scrub_device ||
+ if (dev->scrub_ctx ||
(!is_dev_replace &&
btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
@@ -4160,7 +4170,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return PTR_ERR(sctx);
}
sctx->readonly = readonly;
- dev->scrub_device = sctx;
+ dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/*
@@ -4195,7 +4205,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
memcpy(progress, &sctx->stat, sizeof(*progress));
mutex_lock(&fs_info->scrub_lock);
- dev->scrub_device = NULL;
+ dev->scrub_ctx = NULL;
scrub_workers_put(fs_info);
mutex_unlock(&fs_info->scrub_lock);
@@ -4252,16 +4262,16 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
struct scrub_ctx *sctx;
mutex_lock(&fs_info->scrub_lock);
- sctx = dev->scrub_device;
+ sctx = dev->scrub_ctx;
if (!sctx) {
mutex_unlock(&fs_info->scrub_lock);
return -ENOTCONN;
}
atomic_inc(&sctx->cancel_req);
- while (dev->scrub_device) {
+ while (dev->scrub_ctx) {
mutex_unlock(&fs_info->scrub_lock);
wait_event(fs_info->scrub_pause_wait,
- dev->scrub_device == NULL);
+ dev->scrub_ctx == NULL);
mutex_lock(&fs_info->scrub_lock);
}
mutex_unlock(&fs_info->scrub_lock);
@@ -4278,7 +4288,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info, devid, NULL, NULL);
if (dev)
- sctx = dev->scrub_device;
+ sctx = dev->scrub_ctx;
if (sctx)
memcpy(progress, &sctx->stat, sizeof(*progress));
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4478,8 +4488,7 @@ static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
free_extent_map(em);
out_unlock:
- unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
- GFP_NOFS);
+ unlock_extent_cached(io_tree, lockstart, lockend, &cached_state);
return ret;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 20d3300bd268..f306c608dc28 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1059,12 +1059,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
}
- ret = btrfs_is_name_len_valid(eb, path->slots[0],
- (unsigned long)(di + 1), name_len + data_len);
- if (!ret) {
- ret = -EIO;
- goto out;
- }
if (name_len + data_len > buf_len) {
buf_len = name_len + data_len;
if (is_vmalloc_addr(buf)) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3a4dce153645..6e71a2a78363 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,12 +61,21 @@
#include "tests/btrfs-tests.h"
#include "qgroup.h"
-#include "backref.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
+
+/*
+ * Types for mounting the default subvolume and a subvolume explicitly
+ * requested by subvol=/path. That way the callchain is straightforward and we
+ * don't have to play tricks with the mount options and recursive calls to
+ * btrfs_mount.
+ *
+ * The new btrfs_root_fs_type also serves as a tag for the bdev_holder.
+ */
static struct file_system_type btrfs_fs_type;
+static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
@@ -98,30 +107,6 @@ const char *btrfs_decode_error(int errno)
return errstr;
}
-/* btrfs handle error by forcing the filesystem readonly */
-static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
-{
- struct super_block *sb = fs_info->sb;
-
- if (sb_rdonly(sb))
- return;
-
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
- sb->s_flags |= SB_RDONLY;
- btrfs_info(fs_info, "forced readonly");
- /*
- * Note that a running device replace operation is not
- * canceled here although there is no way to update
- * the progress. It would add the risk of a deadlock,
- * therefore the canceling is omitted. The only penalty
- * is that some I/O remains active until the procedure
- * completes. The next time when the filesystem is
- * mounted writeable again, the device replace
- * operation continues.
- */
- }
-}
-
/*
* __btrfs_handle_fs_error decodes expected errors from the caller and
* invokes the approciate error response.
@@ -168,8 +153,23 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
/* Don't go through full error handling during mount */
- if (sb->s_flags & SB_BORN)
- btrfs_handle_error(fs_info);
+ if (!(sb->s_flags & SB_BORN))
+ return;
+
+ if (sb_rdonly(sb))
+ return;
+
+ /* btrfs handle error by forcing the filesystem readonly */
+ sb->s_flags |= SB_RDONLY;
+ btrfs_info(fs_info, "forced readonly");
+ /*
+ * Note that a running device replace operation is not canceled here
+ * although there is no way to update the progress. It would add the
+ * risk of a deadlock, therefore the canceling is omitted. The only
+ * penalty is that some I/O remains active until the procedure
+ * completes. The next time when the filesystem is mounted writeable
+ * again, the device replace operation continues.
+ */
}
#ifdef CONFIG_PRINTK
@@ -405,7 +405,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags)
{
substring_t args[MAX_OPT_ARGS];
- char *p, *num, *orig = NULL;
+ char *p, *num;
u64 cache_gen;
int intarg;
int ret = 0;
@@ -428,16 +428,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
if (!options)
goto check;
- /*
- * strsep changes the string, duplicate it because parse_options
- * gets called twice
- */
- options = kstrdup(options, GFP_KERNEL);
- if (!options)
- return -ENOMEM;
-
- orig = options;
-
while ((p = strsep(&options, ",")) != NULL) {
int token;
if (!*p)
@@ -454,7 +444,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_subvolrootid:
case Opt_device:
/*
- * These are parsed by btrfs_parse_early_options
+ * These are parsed by btrfs_parse_subvol_options
+ * and btrfs_parse_early_options
* and can be happily ignored here.
*/
break;
@@ -877,7 +868,6 @@ out:
btrfs_info(info, "disk space caching is enabled");
if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
btrfs_info(info, "using free space tree");
- kfree(orig);
return ret;
}
@@ -888,11 +878,60 @@ out:
* only when we need to allocate a new super block.
*/
static int btrfs_parse_early_options(const char *options, fmode_t flags,
- void *holder, char **subvol_name, u64 *subvol_objectid,
- struct btrfs_fs_devices **fs_devices)
+ void *holder, struct btrfs_fs_devices **fs_devices)
{
substring_t args[MAX_OPT_ARGS];
char *device_name, *opts, *orig, *p;
+ int error = 0;
+
+ if (!options)
+ return 0;
+
+ /*
+ * strsep changes the string, duplicate it because btrfs_parse_options
+ * gets called later
+ */
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+ orig = opts;
+
+ while ((p = strsep(&opts, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ if (token == Opt_device) {
+ device_name = match_strdup(&args[0]);
+ if (!device_name) {
+ error = -ENOMEM;
+ goto out;
+ }
+ error = btrfs_scan_one_device(device_name,
+ flags, holder, fs_devices);
+ kfree(device_name);
+ if (error)
+ goto out;
+ }
+ }
+
+out:
+ kfree(orig);
+ return error;
+}
+
+/*
+ * Parse mount options that are related to subvolume id
+ *
+ * The value is later passed to mount_subvol()
+ */
+static int btrfs_parse_subvol_options(const char *options, fmode_t flags,
+ char **subvol_name, u64 *subvol_objectid)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *opts, *orig, *p;
char *num = NULL;
int error = 0;
@@ -900,8 +939,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
return 0;
/*
- * strsep changes the string, duplicate it because parse_options
- * gets called twice
+ * strsep changes the string, duplicate it because
+ * btrfs_parse_early_options gets called later
*/
opts = kstrdup(options, GFP_KERNEL);
if (!opts)
@@ -940,18 +979,6 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
case Opt_subvolrootid:
pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n");
break;
- case Opt_device:
- device_name = match_strdup(&args[0]);
- if (!device_name) {
- error = -ENOMEM;
- goto out;
- }
- error = btrfs_scan_one_device(device_name,
- flags, holder, fs_devices);
- kfree(device_name);
- if (error)
- goto out;
- break;
default:
break;
}
@@ -1243,7 +1270,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
- char *compress_type;
+ const char *compress_type;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1259,12 +1286,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
num_online_cpus() + 2, 8))
seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
if (btrfs_test_opt(info, COMPRESS)) {
- if (info->compress_type == BTRFS_COMPRESS_ZLIB)
- compress_type = "zlib";
- else if (info->compress_type == BTRFS_COMPRESS_LZO)
- compress_type = "lzo";
- else
- compress_type = "zstd";
+ compress_type = btrfs_compress_type2str(info->compress_type);
if (btrfs_test_opt(info, FORCE_COMPRESS))
seq_printf(seq, ",compress-force=%s", compress_type);
else
@@ -1365,86 +1387,12 @@ static inline int is_subvolume_inode(struct inode *inode)
return 0;
}
-/*
- * This will add subvolid=0 to the argument string while removing any subvol=
- * and subvolid= arguments to make sure we get the top-level root for path
- * walking to the subvol we want.
- */
-static char *setup_root_args(char *args)
-{
- char *buf, *dst, *sep;
-
- if (!args)
- return kstrdup("subvolid=0", GFP_KERNEL);
-
- /* The worst case is that we add ",subvolid=0" to the end. */
- buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1,
- GFP_KERNEL);
- if (!buf)
- return NULL;
-
- while (1) {
- sep = strchrnul(args, ',');
- if (!strstarts(args, "subvol=") &&
- !strstarts(args, "subvolid=")) {
- memcpy(dst, args, sep - args);
- dst += sep - args;
- *dst++ = ',';
- }
- if (*sep)
- args = sep + 1;
- else
- break;
- }
- strcpy(dst, "subvolid=0");
-
- return buf;
-}
-
static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
- int flags, const char *device_name,
- char *data)
+ const char *device_name, struct vfsmount *mnt)
{
struct dentry *root;
- struct vfsmount *mnt = NULL;
- char *newargs;
int ret;
- newargs = setup_root_args(data);
- if (!newargs) {
- root = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
- if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
- if (flags & SB_RDONLY) {
- mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~SB_RDONLY,
- device_name, newargs);
- } else {
- mnt = vfs_kern_mount(&btrfs_fs_type, flags | SB_RDONLY,
- device_name, newargs);
- if (IS_ERR(mnt)) {
- root = ERR_CAST(mnt);
- mnt = NULL;
- goto out;
- }
-
- down_write(&mnt->mnt_sb->s_umount);
- ret = btrfs_remount(mnt->mnt_sb, &flags, NULL);
- up_write(&mnt->mnt_sb->s_umount);
- if (ret < 0) {
- root = ERR_PTR(ret);
- goto out;
- }
- }
- }
- if (IS_ERR(mnt)) {
- root = ERR_CAST(mnt);
- mnt = NULL;
- goto out;
- }
-
if (!subvol_name) {
if (!subvol_objectid) {
ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
@@ -1500,7 +1448,6 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
out:
mntput(mnt);
- kfree(newargs);
kfree(subvol_name);
return root;
}
@@ -1558,11 +1505,11 @@ static int setup_security_options(struct btrfs_fs_info *fs_info,
/*
* Find a superblock for the given device / mount point.
*
- * Note: This is based on get_sb_bdev from fs/super.c with a few additions
- * for multiple device setup. Make sure to keep it in sync.
+ * Note: This is based on mount_bdev from fs/super.c with a few additions
+ * for multiple device setup. Make sure to keep it in sync.
*/
-static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
- const char *device_name, void *data)
+static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
+ int flags, const char *device_name, void *data)
{
struct block_device *bdev = NULL;
struct super_block *s;
@@ -1570,27 +1517,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
struct btrfs_fs_info *fs_info = NULL;
struct security_mnt_opts new_sec_opts;
fmode_t mode = FMODE_READ;
- char *subvol_name = NULL;
- u64 subvol_objectid = 0;
int error = 0;
if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
error = btrfs_parse_early_options(data, mode, fs_type,
- &subvol_name, &subvol_objectid,
&fs_devices);
if (error) {
- kfree(subvol_name);
return ERR_PTR(error);
}
- if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
- /* mount_subvol() will free subvol_name. */
- return mount_subvol(subvol_name, subvol_objectid, flags,
- device_name, data);
- }
-
security_init_mnt_opts(&new_sec_opts);
if (data) {
error = parse_security_options(data, &new_sec_opts);
@@ -1674,6 +1611,84 @@ error_sec_opts:
return ERR_PTR(error);
}
+/*
+ * Mount function which is called by VFS layer.
+ *
+ * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
+ * which needs vfsmount* of device's root (/). This means device's root has to
+ * be mounted internally in any case.
+ *
+ * Operation flow:
+ * 1. Parse subvol id related options for later use in mount_subvol().
+ *
+ * 2. Mount device's root (/) by calling vfs_kern_mount().
+ *
+ * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
+ * first place. In order to avoid calling btrfs_mount() again, we use
+ * different file_system_type which is not registered to VFS by
+ * register_filesystem() (btrfs_root_fs_type). As a result,
+ * btrfs_mount_root() is called. The return value will be used by
+ * mount_subtree() in mount_subvol().
+ *
+ * 3. Call mount_subvol() to get the dentry of subvolume. Since there is
+ * "btrfs subvolume set-default", mount_subvol() is called always.
+ */
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
+ const char *device_name, void *data)
+{
+ struct vfsmount *mnt_root;
+ struct dentry *root;
+ fmode_t mode = FMODE_READ;
+ char *subvol_name = NULL;
+ u64 subvol_objectid = 0;
+ int error = 0;
+
+ if (!(flags & SB_RDONLY))
+ mode |= FMODE_WRITE;
+
+ error = btrfs_parse_subvol_options(data, mode,
+ &subvol_name, &subvol_objectid);
+ if (error) {
+ kfree(subvol_name);
+ return ERR_PTR(error);
+ }
+
+ /* mount device's root (/) */
+ mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
+ if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
+ if (flags & SB_RDONLY) {
+ mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
+ flags & ~SB_RDONLY, device_name, data);
+ } else {
+ mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
+ flags | SB_RDONLY, device_name, data);
+ if (IS_ERR(mnt_root)) {
+ root = ERR_CAST(mnt_root);
+ goto out;
+ }
+
+ down_write(&mnt_root->mnt_sb->s_umount);
+ error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
+ up_write(&mnt_root->mnt_sb->s_umount);
+ if (error < 0) {
+ root = ERR_PTR(error);
+ mntput(mnt_root);
+ goto out;
+ }
+ }
+ }
+ if (IS_ERR(mnt_root)) {
+ root = ERR_CAST(mnt_root);
+ goto out;
+ }
+
+ /* mount_subvol() will free subvol_name and mnt_root */
+ root = mount_subvol(subvol_name, subvol_objectid, device_name, mnt_root);
+
+out:
+ return root;
+}
+
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
int new_pool_size, int old_pool_size)
{
@@ -1820,7 +1835,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
}
- if (!btrfs_check_rw_degradable(fs_info)) {
+ if (!btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"too many missing devices, writeable remount is not allowed");
ret = -EACCES;
@@ -1972,8 +1987,10 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (!device->in_fs_metadata || !device->bdev ||
- device->is_tgtdev_for_dev_replace)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) ||
+ !device->bdev ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
if (i >= nr_devices)
@@ -2174,6 +2191,15 @@ static struct file_system_type btrfs_fs_type = {
.kill_sb = btrfs_kill_super,
.fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
+
+static struct file_system_type btrfs_root_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .mount = btrfs_mount_root,
+ .kill_sb = btrfs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+};
+
MODULE_ALIAS_FS("btrfs");
static int btrfs_control_open(struct inode *inode, struct file *file)
@@ -2207,11 +2233,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
ret = btrfs_scan_one_device(vol->name, FMODE_READ,
- &btrfs_fs_type, &fs_devices);
+ &btrfs_root_fs_type, &fs_devices);
break;
case BTRFS_IOC_DEVICES_READY:
ret = btrfs_scan_one_device(vol->name, FMODE_READ,
- &btrfs_fs_type, &fs_devices);
+ &btrfs_root_fs_type, &fs_devices);
if (ret)
break;
ret = !(fs_devices->num_devices == fs_devices->total_devices);
@@ -2269,7 +2295,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
while (cur_devices) {
head = &cur_devices->devices;
list_for_each_entry(dev, head, dev_list) {
- if (dev->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->name)
continue;
@@ -2324,7 +2350,7 @@ static struct miscdevice btrfs_misc = {
MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
MODULE_ALIAS("devname:btrfs-control");
-static int btrfs_interface_init(void)
+static int __init btrfs_interface_init(void)
{
return misc_register(&btrfs_misc);
}
@@ -2334,7 +2360,7 @@ static void btrfs_interface_exit(void)
misc_deregister(&btrfs_misc);
}
-static void btrfs_print_mod_info(void)
+static void __init btrfs_print_mod_info(void)
{
pr_info("Btrfs loaded, crc32c=%s"
#ifdef CONFIG_BTRFS_DEBUG
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a28bba801264..a8bafed931f4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -897,7 +897,7 @@ static int btrfs_init_debugfs(void)
return 0;
}
-int btrfs_init_sysfs(void)
+int __init btrfs_init_sysfs(void)
{
int ret;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index d3f25376a0f8..9786d8cd0aa6 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -277,6 +277,9 @@ int btrfs_run_sanity_tests(void)
goto out;
}
}
+ ret = btrfs_test_extent_map();
+ if (ret)
+ goto out;
out:
btrfs_destroy_test_fs();
return ret;
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 266f1e3d1784..bc0615bac3cc 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -33,6 +33,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
+int btrfs_test_extent_map(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
new file mode 100644
index 000000000000..70c993f01670
--- /dev/null
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2017 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/types.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+
+static void free_extent_map_tree(struct extent_map_tree *em_tree)
+{
+ struct extent_map *em;
+ struct rb_node *node;
+
+ while (!RB_EMPTY_ROOT(&em_tree->map)) {
+ node = rb_first(&em_tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ remove_extent_mapping(em_tree, em);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (refcount_read(&em->refs) != 1) {
+ test_msg(
+"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d\n",
+ em->start, em->len, em->block_start,
+ em->block_len, refcount_read(&em->refs));
+
+ refcount_set(&em->refs, 1);
+ }
+#endif
+ free_extent_map(em);
+ }
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet, there is a file
+ * extent [0, 16K), followed by another file extent [16K, 20K), two dio reads
+ * are entering btrfs_get_extent() concurrently, t1 is reading [8K, 16K), t2 is
+ * reading [0, 8K)
+ *
+ * t1 t2
+ * btrfs_get_extent() btrfs_get_extent()
+ * -> lookup_extent_mapping() ->lookup_extent_mapping()
+ * -> add_extent_mapping(0, 16K)
+ * -> return em
+ * ->add_extent_mapping(0, 16K)
+ * -> #handle -EEXIST
+ */
+static void test_case_1(struct extent_map_tree *em_tree)
+{
+ struct extent_map *em;
+ u64 start = 0;
+ u64 len = SZ_8K;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em)
+ /* Skip the test on error. */
+ return;
+
+ /* Add [0, 16K) */
+ em->start = 0;
+ em->len = SZ_16K;
+ em->block_start = 0;
+ em->block_len = SZ_16K;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ /* Add [16K, 20K) following [0, 16K) */
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ em->start = SZ_16K;
+ em->len = SZ_4K;
+ em->block_start = SZ_32K; /* avoid merging */
+ em->block_len = SZ_4K;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ /* Add [0, 8K), should return [0, 16K) instead. */
+ em->start = start;
+ em->len = len;
+ em->block_start = start;
+ em->block_len = len;
+ ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+ if (ret)
+ test_msg("case1 [%llu %llu]: ret %d\n", start, start + len, ret);
+ if (em &&
+ (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ em->block_start != 0 || em->block_len != SZ_16K))
+ test_msg(
+"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+ start, start + len, ret, em->start, em->len,
+ em->block_start, em->block_len);
+ free_extent_map(em);
+out:
+ /* free memory */
+ free_extent_map_tree(em_tree);
+}
+
+/*
+ * Test scenario:
+ *
+ * Reading the inline ending up with EEXIST, ie. read an inline
+ * extent and discard page cache and read it again.
+ */
+static void test_case_2(struct extent_map_tree *em_tree)
+{
+ struct extent_map *em;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em)
+ /* Skip the test on error. */
+ return;
+
+ /* Add [0, 1K) */
+ em->start = 0;
+ em->len = SZ_1K;
+ em->block_start = EXTENT_MAP_INLINE;
+ em->block_len = (u64)-1;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ /* Add [4K, 8K) following [0, 1K) */
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ em->start = SZ_4K;
+ em->len = SZ_4K;
+ em->block_start = SZ_4K;
+ em->block_len = SZ_4K;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ /* Add [0, 1K) */
+ em->start = 0;
+ em->len = SZ_1K;
+ em->block_start = EXTENT_MAP_INLINE;
+ em->block_len = (u64)-1;
+ ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+ if (ret)
+ test_msg("case2 [0 1K]: ret %d\n", ret);
+ if (em &&
+ (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1))
+ test_msg(
+"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+ ret, em->start, em->len, em->block_start,
+ em->block_len);
+ free_extent_map(em);
+out:
+ /* free memory */
+ free_extent_map_tree(em_tree);
+}
+
+static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
+{
+ struct extent_map *em;
+ u64 len = SZ_4K;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em)
+ /* Skip this test on error. */
+ return;
+
+ /* Add [4K, 8K) */
+ em->start = SZ_4K;
+ em->len = SZ_4K;
+ em->block_start = SZ_4K;
+ em->block_len = SZ_4K;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ /* Add [0, 16K) */
+ em->start = 0;
+ em->len = SZ_16K;
+ em->block_start = 0;
+ em->block_len = SZ_16K;
+ ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+ if (ret)
+ test_msg("case3 [0x%llx 0x%llx): ret %d\n",
+ start, start + len, ret);
+ /*
+ * Since bytes within em are contiguous, em->block_start is identical to
+ * em->start.
+ */
+ if (em &&
+ (start < em->start || start + len > extent_map_end(em) ||
+ em->start != em->block_start || em->len != em->block_len))
+ test_msg(
+"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+ start, start + len, ret, em->start, em->len,
+ em->block_start, em->block_len);
+ free_extent_map(em);
+out:
+ /* free memory */
+ free_extent_map_tree(em_tree);
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet.
+ * There is a file extent [0, 16K), two jobs are running concurrently
+ * against it, t1 is buffered writing to [4K, 8K) and t2 is doing dio
+ * read from [0, 4K) or [8K, 12K) or [12K, 16K).
+ *
+ * t1 goes ahead of t2 and adds em [4K, 8K) into tree.
+ *
+ * t1 t2
+ * cow_file_range() btrfs_get_extent()
+ * -> lookup_extent_mapping()
+ * -> add_extent_mapping()
+ * -> add_extent_mapping()
+ */
+static void test_case_3(struct extent_map_tree *em_tree)
+{
+ __test_case_3(em_tree, 0);
+ __test_case_3(em_tree, SZ_8K);
+ __test_case_3(em_tree, (12 * 1024ULL));
+}
+
+static void __test_case_4(struct extent_map_tree *em_tree, u64 start)
+{
+ struct extent_map *em;
+ u64 len = SZ_4K;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em)
+ /* Skip this test on error. */
+ return;
+
+ /* Add [0K, 8K) */
+ em->start = 0;
+ em->len = SZ_8K;
+ em->block_start = 0;
+ em->block_len = SZ_8K;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+
+ /* Add [8K, 32K) */
+ em->start = SZ_8K;
+ em->len = 24 * 1024ULL;
+ em->block_start = SZ_16K; /* avoid merging */
+ em->block_len = 24 * 1024ULL;
+ ret = add_extent_mapping(em_tree, em, 0);
+ ASSERT(ret == 0);
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em)
+ goto out;
+ /* Add [0K, 32K) */
+ em->start = 0;
+ em->len = SZ_32K;
+ em->block_start = 0;
+ em->block_len = SZ_32K;
+ ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+ if (ret)
+ test_msg("case4 [0x%llx 0x%llx): ret %d\n",
+ start, len, ret);
+ if (em &&
+ (start < em->start || start + len > extent_map_end(em)))
+ test_msg(
+"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+ start, len, ret, em->start, em->len, em->block_start,
+ em->block_len);
+ free_extent_map(em);
+out:
+ /* free memory */
+ free_extent_map_tree(em_tree);
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet.
+ * There is a file extent [0, 32K), two jobs are running concurrently
+ * against it, t1 is doing dio write to [8K, 32K) and t2 is doing dio
+ * read from [0, 4K) or [4K, 8K).
+ *
+ * t1 goes ahead of t2 and splits em [0, 32K) to em [0K, 8K) and [8K 32K).
+ *
+ * t1 t2
+ * btrfs_get_blocks_direct() btrfs_get_blocks_direct()
+ * -> btrfs_get_extent() -> btrfs_get_extent()
+ * -> lookup_extent_mapping()
+ * -> add_extent_mapping() -> lookup_extent_mapping()
+ * # load [0, 32K)
+ * -> btrfs_new_extent_direct()
+ * -> btrfs_drop_extent_cache()
+ * # split [0, 32K)
+ * -> add_extent_mapping()
+ * # add [8K, 32K)
+ * -> add_extent_mapping()
+ * # handle -EEXIST when adding
+ * # [0, 32K)
+ */
+static void test_case_4(struct extent_map_tree *em_tree)
+{
+ __test_case_4(em_tree, 0);
+ __test_case_4(em_tree, SZ_4K);
+}
+
+int btrfs_test_extent_map()
+{
+ struct extent_map_tree *em_tree;
+
+ test_msg("Running extent_map tests\n");
+
+ em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
+ if (!em_tree)
+ /* Skip the test on error. */
+ return 0;
+
+ extent_map_tree_init(em_tree);
+
+ test_case_1(em_tree);
+ test_case_2(em_tree);
+ test_case_3(em_tree);
+ test_case_4(em_tree);
+
+ kfree(em_tree);
+ return 0;
+}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 30affb60da51..13420cd19ef0 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -288,10 +288,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_msg("Expected a hole, got %llu\n", em->block_start);
goto out;
}
- if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
- test_msg("Vacancy flag wasn't set properly\n");
- goto out;
- }
free_extent_map(em);
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
@@ -1001,8 +997,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DIRTY |
- EXTENT_UPTODATE, 0, 0,
- NULL, GFP_KERNEL);
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1070,8 +1065,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_UPTODATE, 0, 0,
- NULL, GFP_KERNEL);
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1104,8 +1098,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_UPTODATE, 0, 0,
- NULL, GFP_KERNEL);
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1121,8 +1114,7 @@ out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_UPTODATE, 0, 0,
- NULL, GFP_KERNEL);
+ EXTENT_UPTODATE, 0, 0, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
@@ -1134,7 +1126,6 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
int ret;
set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
- set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
test_msg("Running btrfs_get_extent tests\n");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5a8c2649af2f..04f07144b45c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -495,8 +495,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (current->journal_info) {
WARN_ON(type & TRANS_EXTWRITERS);
h = current->journal_info;
- h->use_count++;
- WARN_ON(h->use_count > 2);
+ refcount_inc(&h->use_count);
+ WARN_ON(refcount_read(&h->use_count) > 2);
h->orig_rsv = h->block_rsv;
h->block_rsv = NULL;
goto got_it;
@@ -567,7 +567,7 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
h->root = root;
- h->use_count = 1;
+ refcount_set(&h->use_count, 1);
h->fs_info = root->fs_info;
h->type = type;
@@ -837,8 +837,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int err = 0;
int must_run_delayed_refs = 0;
- if (trans->use_count > 1) {
- trans->use_count--;
+ if (refcount_read(&trans->use_count) > 1) {
+ refcount_dec(&trans->use_count);
trans->block_rsv = trans->orig_rsv;
return 0;
}
@@ -1016,8 +1016,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* it's safe to do it (through clear_btree_io_tree()).
*/
err = clear_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT,
- 0, 0, &cached_state, GFP_NOFS);
+ EXTENT_NEED_WAIT, 0, 0, &cached_state);
if (err == -ENOMEM)
err = 0;
if (!err)
@@ -1869,7 +1868,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
struct btrfs_transaction *cur_trans = trans->transaction;
DEFINE_WAIT(wait);
- WARN_ON(trans->use_count > 1);
+ WARN_ON(refcount_read(&trans->use_count) > 1);
btrfs_abort_transaction(trans, err);
@@ -2266,16 +2265,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
ret = write_all_supers(fs_info, 0);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- goto scrub_continue;
- }
-
/*
* the super is written, we can safely allow the tree-loggers
* to go about their business
*/
mutex_unlock(&fs_info->tree_log_mutex);
+ if (ret)
+ goto scrub_continue;
btrfs_finish_extent_commit(trans, fs_info);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index c55e44560103..6beee072b1bd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -58,6 +58,7 @@ struct btrfs_transaction {
/* Be protected by fs_info->trans_lock when we want to change it. */
enum btrfs_trans_state state;
+ int aborted;
struct list_head list;
struct extent_io_tree dirty_pages;
unsigned long start_time;
@@ -70,7 +71,6 @@ struct btrfs_transaction {
struct list_head dirty_bgs;
struct list_head io_bgs;
struct list_head dropped_roots;
- u64 num_dirty_bgs;
/*
* we need to make sure block group deletion doesn't race with
@@ -79,11 +79,11 @@ struct btrfs_transaction {
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
+ unsigned int num_dirty_bgs;
/* Protected by spin lock fs_info->unused_bgs_lock. */
struct list_head deleted_bgs;
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
- int aborted;
struct btrfs_fs_info *fs_info;
};
@@ -111,20 +111,19 @@ struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
u64 chunk_bytes_reserved;
- unsigned long use_count;
- unsigned long blocks_reserved;
unsigned long delayed_ref_updates;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
+ refcount_t use_count;
+ unsigned int type;
short aborted;
- short adding_csums;
+ bool adding_csums;
bool allocating_chunk;
bool can_flush_pending_bgs;
bool reloc_reserved;
bool sync;
bool dirty;
- unsigned int type;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ce4ed6ec8f39..c3c8d48f6618 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -30,6 +30,7 @@
#include "tree-checker.h"
#include "disk-io.h"
#include "compression.h"
+#include "hash.h"
/*
* Error message should follow the following format:
@@ -223,6 +224,142 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
}
/*
+ * Customized reporter for dir_item; the only important new info is key->objectid,
+ * which represents the inode number
+ */
+__printf(4, 5)
+static void dir_item_err(const struct btrfs_root *root,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(root->fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
+ btrfs_header_bytenr(eb), slot, key.objectid, &vaf);
+ va_end(args);
+}
+
+static int check_dir_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_dir_item *di;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 cur = 0;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ while (cur < item_size) {
+ u32 name_len;
+ u32 data_len;
+ u32 max_name_len;
+ u32 total_size;
+ u32 name_hash;
+ u8 dir_type;
+
+ /* header itself should not cross item boundary */
+ if (cur + sizeof(*di) > item_size) {
+ dir_item_err(root, leaf, slot,
+ "dir item header crosses item boundary, have %zu boundary %u",
+ cur + sizeof(*di), item_size);
+ return -EUCLEAN;
+ }
+
+ /* dir type check */
+ dir_type = btrfs_dir_type(leaf, di);
+ if (dir_type >= BTRFS_FT_MAX) {
+ dir_item_err(root, leaf, slot,
+ "invalid dir item type, have %u expect [0, %u)",
+ dir_type, BTRFS_FT_MAX);
+ return -EUCLEAN;
+ }
+
+ if (key->type == BTRFS_XATTR_ITEM_KEY &&
+ dir_type != BTRFS_FT_XATTR) {
+ dir_item_err(root, leaf, slot,
+ "invalid dir item type for XATTR key, have %u expect %u",
+ dir_type, BTRFS_FT_XATTR);
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR &&
+ key->type != BTRFS_XATTR_ITEM_KEY) {
+ dir_item_err(root, leaf, slot,
+ "xattr dir type found for non-XATTR key");
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR)
+ max_name_len = XATTR_NAME_MAX;
+ else
+ max_name_len = BTRFS_NAME_LEN;
+
+ /* Name/data length check */
+ name_len = btrfs_dir_name_len(leaf, di);
+ data_len = btrfs_dir_data_len(leaf, di);
+ if (name_len > max_name_len) {
+ dir_item_err(root, leaf, slot,
+ "dir item name len too long, have %u max %u",
+ name_len, max_name_len);
+ return -EUCLEAN;
+ }
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
+ dir_item_err(root, leaf, slot,
+ "dir item name and data len too long, have %u max %u",
+ name_len + data_len,
+ BTRFS_MAX_XATTR_SIZE(root->fs_info));
+ return -EUCLEAN;
+ }
+
+ if (data_len && dir_type != BTRFS_FT_XATTR) {
+ dir_item_err(root, leaf, slot,
+ "dir item with invalid data len, have %u expect 0",
+ data_len);
+ return -EUCLEAN;
+ }
+
+ total_size = sizeof(*di) + name_len + data_len;
+
+ /* header and name/data should not cross item boundary */
+ if (cur + total_size > item_size) {
+ dir_item_err(root, leaf, slot,
+ "dir item data crosses item boundary, have %u boundary %u",
+ cur + total_size, item_size);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Special check for XATTR/DIR_ITEM: key->offset is the name
+ * hash, so it must match the hash of the item's name
+ */
+ if (key->type == BTRFS_DIR_ITEM_KEY ||
+ key->type == BTRFS_XATTR_ITEM_KEY) {
+ char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
+
+ read_extent_buffer(leaf, namebuf,
+ (unsigned long)(di + 1), name_len);
+ name_hash = btrfs_name_hash(namebuf, name_len);
+ if (key->offset != name_hash) {
+ dir_item_err(root, leaf, slot,
+ "name hash mismatch with key, have 0x%016x expect 0x%016llx",
+ name_hash, key->offset);
+ return -EUCLEAN;
+ }
+ }
+ cur += total_size;
+ di = (struct btrfs_dir_item *)((void *)di + total_size);
+ }
+ return 0;
+}
+
+/*
* Common point to switch the item-specific validation.
*/
static int check_leaf_item(struct btrfs_root *root,
@@ -238,6 +375,11 @@ static int check_leaf_item(struct btrfs_root *root,
case BTRFS_EXTENT_CSUM_KEY:
ret = check_csum_item(root, leaf, key, slot);
break;
+ case BTRFS_DIR_ITEM_KEY:
+ case BTRFS_DIR_INDEX_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ ret = check_dir_item(root, leaf, key, slot);
+ break;
}
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7bf9b31561db..afadaadab18e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
+#include <linux/iversion.h>
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
@@ -1173,19 +1174,15 @@ next:
return 0;
}
-static int extref_get_fields(struct extent_buffer *eb, int slot,
- unsigned long ref_ptr, u32 *namelen, char **name,
- u64 *index, u64 *parent_objectid)
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ u32 *namelen, char **name, u64 *index,
+ u64 *parent_objectid)
{
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ref_ptr;
*namelen = btrfs_inode_extref_name_len(eb, extref);
- if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
- *namelen))
- return -EIO;
-
*name = kmalloc(*namelen, GFP_NOFS);
if (*name == NULL)
return -ENOMEM;
@@ -1200,19 +1197,14 @@ static int extref_get_fields(struct extent_buffer *eb, int slot,
return 0;
}
-static int ref_get_fields(struct extent_buffer *eb, int slot,
- unsigned long ref_ptr, u32 *namelen, char **name,
- u64 *index)
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ u32 *namelen, char **name, u64 *index)
{
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ref_ptr;
*namelen = btrfs_inode_ref_name_len(eb, ref);
- if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
- *namelen))
- return -EIO;
-
*name = kmalloc(*namelen, GFP_NOFS);
if (*name == NULL)
return -ENOMEM;
@@ -1287,8 +1279,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
while (ref_ptr < ref_end) {
if (log_ref_ver) {
- ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
- &name, &ref_index, &parent_objectid);
+ ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ &ref_index, &parent_objectid);
/*
* parent object can change from one array
* item to another.
@@ -1300,8 +1292,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
} else {
- ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
- &name, &ref_index);
+ ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+ &ref_index);
}
if (ret)
goto out;
@@ -1835,7 +1827,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
u32 item_size = btrfs_item_size_nr(eb, slot);
struct btrfs_dir_item *di;
@@ -1848,8 +1839,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(fs_info, eb, slot, di))
- return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
if (ret < 0)
@@ -2024,11 +2013,6 @@ again:
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(fs_info, eb, slot, di)) {
- ret = -EIO;
- goto out;
- }
-
name_len = btrfs_dir_name_len(eb, di);
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
@@ -2109,7 +2093,6 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const u64 ino)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key search_key;
struct btrfs_path *log_path;
int i;
@@ -2151,11 +2134,6 @@ process_leaf:
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
- ret = verify_dir_item(fs_info, path->nodes[0], i, di);
- if (ret) {
- ret = -EIO;
- goto out;
- }
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
@@ -3609,7 +3587,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
&token);
- btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_sequence(leaf, item,
+ inode_peek_iversion(inode), &token);
btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
@@ -4572,12 +4551,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
this_len = sizeof(*extref) + this_name_len;
}
- ret = btrfs_is_name_len_valid(eb, slot, name_ptr,
- this_name_len);
- if (!ret) {
- ret = -EIO;
- goto out;
- }
if (this_name_len > name_len) {
char *new_name;
@@ -5432,11 +5405,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct dentry *parent,
const loff_t start,
const loff_t end,
- int exists_only,
+ int inode_only,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
struct dentry *old_parent = NULL;
int ret = 0;
@@ -5602,7 +5574,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)),
- parent, start, end, 0, ctx);
+ parent, start, end, LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -5865,6 +5837,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
return 0;
return btrfs_log_inode_parent(trans, root, inode, parent, 0,
- LLONG_MAX, 1, NULL);
+ LLONG_MAX, LOG_INODE_EXISTS, NULL);
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 49810b70afd3..b5036bd69e6a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -145,6 +145,71 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
struct btrfs_bio **bbio_ret,
int mirror_num, int need_raid_map);
+/*
+ * Device locking
+ * ==============
+ *
+ * There are several mutexes that protect manipulation of devices and low-level
+ * structures like chunks but not block groups, extents or files
+ *
+ * uuid_mutex (global lock)
+ * ------------------------
+ * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
+ * the SCAN_DEV ioctl registration or from mount either implicitly (the first
+ * device) or requested by the device= mount option
+ *
+ * the mutex can be very coarse and can cover long-running operations
+ *
+ * protects: updates to fs_devices counters like missing devices, rw devices,
+ * seeding, structure cloning, opening/closing devices at mount/umount time
+ *
+ * global::fs_devs - add, remove, updates to the global list
+ *
+ * does not protect: manipulation of the fs_devices::devices list!
+ *
+ * btrfs_device::name - renames (write side), read is RCU
+ *
+ * fs_devices::device_list_mutex (per-fs, with RCU)
+ * ------------------------------------------------
+ * protects updates to fs_devices::devices, i.e. adding and deleting
+ *
+ * simple list traversal with read-only actions can be done with RCU protection
+ *
+ * may be used to exclude some operations from running concurrently without any
+ * modifications to the list (see write_all_supers)
+ *
+ * volume_mutex
+ * ------------
+ * coarse lock owned by a mounted filesystem; used to exclude some operations
+ * that cannot run in parallel and affect the higher-level properties of the
+ * filesystem like: device add/deleting/resize/replace, or balance
+ *
+ * balance_mutex
+ * -------------
+ * protects balance structures (status, state) and context accessed from
+ * several places (internally, ioctl)
+ *
+ * chunk_mutex
+ * -----------
+ * protects chunks, adding or removing during allocation, trim or when a new
+ * device is added/removed
+ *
+ * cleaner_mutex
+ * -------------
+ * a big lock that is held by the cleaner thread and prevents running subvolume
+ * cleaning together with relocation or delayed iputs
+ *
+ *
+ * Lock nesting
+ * ============
+ *
+ * uuid_mutex
+ * volume_mutex
+ * device_list_mutex
+ * chunk_mutex
+ * balance_mutex
+ */
+
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
@@ -180,6 +245,13 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
return fs_devs;
}
+static void free_device(struct btrfs_device *device)
+{
+ rcu_string_free(device->name);
+ bio_put(device->flush_bio);
+ kfree(device);
+}
+
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
@@ -188,9 +260,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
- rcu_string_free(device->name);
- bio_put(device->flush_bio);
- kfree(device);
+ free_device(device);
}
kfree(fs_devices);
}
@@ -220,6 +290,11 @@ void btrfs_cleanup_fs_uuids(void)
}
}
+/*
+ * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
+ * Returned struct is not linked onto any lists and must be destroyed using
+ * free_device.
+ */
static struct btrfs_device *__alloc_device(void)
{
struct btrfs_device *dev;
@@ -237,7 +312,6 @@ static struct btrfs_device *__alloc_device(void)
kfree(dev);
return ERR_PTR(-ENOMEM);
}
- bio_get(dev->flush_bio);
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
@@ -245,7 +319,6 @@ static struct btrfs_device *__alloc_device(void)
spin_lock_init(&dev->io_lock);
- spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
@@ -531,45 +604,42 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
-
-static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+/*
+ * Search and remove all stale devices (devices which are not mounted).
+ * When both inputs are NULL, it will search and release all stale devices.
+ * path: Optional. When provided, it will release all unmounted devices
+ * matching this path only.
+ * skip_dev: Optional. Will skip this device when searching for the stale
+ * devices.
+ */
+static void btrfs_free_stale_devices(const char *path,
+ struct btrfs_device *skip_dev)
{
- struct btrfs_fs_devices *fs_devs;
- struct btrfs_device *dev;
-
- if (!cur_dev->name)
- return;
+ struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
+ struct btrfs_device *dev, *tmp_dev;
- list_for_each_entry(fs_devs, &fs_uuids, list) {
- int del = 1;
+ list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
if (fs_devs->opened)
continue;
- if (fs_devs->seeding)
- continue;
- list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+ list_for_each_entry_safe(dev, tmp_dev,
+ &fs_devs->devices, dev_list) {
+ int not_found = 0;
- if (dev == cur_dev)
+ if (skip_dev && skip_dev == dev)
continue;
- if (!dev->name)
+ if (path && !dev->name)
continue;
- /*
- * Todo: This won't be enough. What if the same device
- * comes back (with new uuid and) with its mapper path?
- * But for now, this does help as mostly an admin will
- * either use mapper or non mapper path throughout.
- */
rcu_read_lock();
- del = strcmp(rcu_str_deref(dev->name),
- rcu_str_deref(cur_dev->name));
+ if (path)
+ not_found = strcmp(rcu_str_deref(dev->name),
+ path);
rcu_read_unlock();
- if (!del)
- break;
- }
+ if (not_found)
+ continue;
- if (!del) {
/* delete the stale device */
if (fs_devs->num_devices == 1) {
btrfs_sysfs_remove_fsid(fs_devs);
@@ -578,38 +648,99 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
} else {
fs_devs->num_devices--;
list_del(&dev->dev_list);
- rcu_string_free(dev->name);
- bio_put(dev->flush_bio);
- kfree(dev);
+ free_device(dev);
}
- break;
}
}
}
+static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device *device, fmode_t flags,
+ void *holder)
+{
+ struct request_queue *q;
+ struct block_device *bdev;
+ struct buffer_head *bh;
+ struct btrfs_super_block *disk_super;
+ u64 devid;
+ int ret;
+
+ if (device->bdev)
+ return -EINVAL;
+ if (!device->name)
+ return -EINVAL;
+
+ ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ &bdev, &bh);
+ if (ret)
+ return ret;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
+ if (devid != device->devid)
+ goto error_brelse;
+
+ if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
+ goto error_brelse;
+
+ device->generation = btrfs_super_generation(disk_super);
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ fs_devices->seeding = 1;
+ } else {
+ if (bdev_read_only(bdev))
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ else
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ }
+
+ q = bdev_get_queue(bdev);
+ if (!blk_queue_nonrot(q))
+ fs_devices->rotating = 1;
+
+ device->bdev = bdev;
+ clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ device->mode = flags;
+
+ fs_devices->open_devices++;
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ fs_devices->rw_devices++;
+ list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+ }
+ brelse(bh);
+
+ return 0;
+
+error_brelse:
+ brelse(bh);
+ blkdev_put(bdev, flags);
+
+ return -EINVAL;
+}
+
/*
* Add new device to list of registered devices
*
* Returns:
- * 1 - first time device is seen
- * 0 - device already known
- * < 0 - error
+ * device pointer which was just added or updated when successful
+ * error pointer when failed
*/
-static noinline int device_list_add(const char *path,
- struct btrfs_super_block *disk_super,
- u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+static noinline struct btrfs_device *device_list_add(const char *path,
+ struct btrfs_super_block *disk_super)
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
struct rcu_string *name;
- int ret = 0;
u64 found_transid = btrfs_super_generation(disk_super);
+ u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
fs_devices = alloc_fs_devices(disk_super->fsid);
if (IS_ERR(fs_devices))
- return PTR_ERR(fs_devices);
+ return ERR_CAST(fs_devices);
list_add(&fs_devices->list, &fs_uuids);
@@ -621,20 +752,19 @@ static noinline int device_list_add(const char *path,
if (!device) {
if (fs_devices->opened)
- return -EBUSY;
+ return ERR_PTR(-EBUSY);
device = btrfs_alloc_device(NULL, &devid,
disk_super->dev_item.uuid);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
- return PTR_ERR(device);
+ return device;
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
- bio_put(device->flush_bio);
- kfree(device);
- return -ENOMEM;
+ free_device(device);
+ return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
@@ -643,8 +773,16 @@ static noinline int device_list_add(const char *path,
fs_devices->num_devices++;
mutex_unlock(&fs_devices->device_list_mutex);
- ret = 1;
device->fs_devices = fs_devices;
+ btrfs_free_stale_devices(path, device);
+
+ if (disk_super->label[0])
+ pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
+ disk_super->label, devid, found_transid, path);
+ else
+ pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
+ disk_super->fsid, devid, found_transid, path);
+
} else if (!device->name || strcmp(device->name->str, path)) {
/*
* When FS is already mounted.
@@ -680,17 +818,17 @@ static noinline int device_list_add(const char *path,
* with larger generation number or the last-in if
* generation are equal.
*/
- return -EEXIST;
+ return ERR_PTR(-EEXIST);
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
rcu_string_free(device->name);
rcu_assign_pointer(device->name, name);
- if (device->missing) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
fs_devices->missing_devices--;
- device->missing = 0;
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
}
@@ -703,16 +841,9 @@ static noinline int device_list_add(const char *path,
if (!fs_devices->opened)
device->generation = found_transid;
- /*
- * if there is new btrfs on an already registered device,
- * then remove the stale device entry.
- */
- if (ret > 0)
- btrfs_free_stale_device(device);
-
- *fs_devices_ret = fs_devices;
+ fs_devices->total_devices = btrfs_super_num_devices(disk_super);
- return ret;
+ return device;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -745,8 +876,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
name = rcu_string_strdup(orig_dev->name->str,
GFP_KERNEL);
if (!name) {
- bio_put(device->flush_bio);
- kfree(device);
+ free_device(device);
goto error;
}
rcu_assign_pointer(device->name, name);
@@ -773,10 +903,12 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (device->in_fs_metadata) {
- if (!device->is_tgtdev_for_dev_replace &&
- (!latest_dev ||
- device->generation > latest_dev->generation)) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state)) {
+ if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &device->dev_state) &&
+ (!latest_dev ||
+ device->generation > latest_dev->generation)) {
latest_dev = device;
}
continue;
@@ -793,7 +925,8 @@ again:
* not, which means whether this device is
* used or whether it should be removed.
*/
- if (step == 0 || device->is_tgtdev_for_dev_replace) {
+ if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &device->dev_state)) {
continue;
}
}
@@ -802,17 +935,16 @@ again:
device->bdev = NULL;
fs_devices->open_devices--;
}
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
- device->writeable = 0;
- if (!device->is_tgtdev_for_dev_replace)
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &device->dev_state))
fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
- rcu_string_free(device->name);
- bio_put(device->flush_bio);
- kfree(device);
+ free_device(device);
}
if (fs_devices->seed) {
@@ -825,35 +957,25 @@ again:
mutex_unlock(&uuid_mutex);
}
-static void __free_device(struct work_struct *work)
-{
- struct btrfs_device *device;
-
- device = container_of(work, struct btrfs_device, rcu_work);
- rcu_string_free(device->name);
- bio_put(device->flush_bio);
- kfree(device);
-}
-
-static void free_device(struct rcu_head *head)
+static void free_device_rcu(struct rcu_head *head)
{
struct btrfs_device *device;
device = container_of(head, struct btrfs_device, rcu);
-
- INIT_WORK(&device->rcu_work, __free_device);
- schedule_work(&device->rcu_work);
+ free_device(device);
}
static void btrfs_close_bdev(struct btrfs_device *device)
{
- if (device->bdev && device->writeable) {
+ if (!device->bdev)
+ return;
+
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
sync_blockdev(device->bdev);
invalidate_bdev(device->bdev);
}
- if (device->bdev)
- blkdev_put(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
}
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
@@ -865,13 +987,13 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device)
if (device->bdev)
fs_devices->open_devices--;
- if (device->writeable &&
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
- if (device->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
fs_devices->missing_devices--;
new_device = btrfs_alloc_device(NULL, &device->devid,
@@ -917,7 +1039,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
struct btrfs_device, dev_list);
list_del(&device->dev_list);
btrfs_close_bdev(device);
- call_rcu(&device->rcu, free_device);
+ call_rcu(&device->rcu, free_device_rcu);
}
WARN_ON(fs_devices->open_devices);
@@ -947,93 +1069,32 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
}
- /*
- * Wait for rcu kworkers under __btrfs_close_devices
- * to finish all blkdev_puts so device is really
- * free when umount is done.
- */
- rcu_barrier();
return ret;
}
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
- struct request_queue *q;
- struct block_device *bdev;
struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
- struct buffer_head *bh;
- struct btrfs_super_block *disk_super;
- u64 devid;
- int seeding = 1;
int ret = 0;
flags |= FMODE_EXCL;
list_for_each_entry(device, head, dev_list) {
- if (device->bdev)
- continue;
- if (!device->name)
- continue;
-
/* Just open everything we can; ignore failures here */
- if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &bh))
+ if (btrfs_open_one_device(fs_devices, device, flags, holder))
continue;
- disk_super = (struct btrfs_super_block *)bh->b_data;
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- if (devid != device->devid)
- goto error_brelse;
-
- if (memcmp(device->uuid, disk_super->dev_item.uuid,
- BTRFS_UUID_SIZE))
- goto error_brelse;
-
- device->generation = btrfs_super_generation(disk_super);
if (!latest_dev ||
device->generation > latest_dev->generation)
latest_dev = device;
-
- if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
- device->writeable = 0;
- } else {
- device->writeable = !bdev_read_only(bdev);
- seeding = 0;
- }
-
- q = bdev_get_queue(bdev);
- if (blk_queue_discard(q))
- device->can_discard = 1;
- if (!blk_queue_nonrot(q))
- fs_devices->rotating = 1;
-
- device->bdev = bdev;
- device->in_fs_metadata = 0;
- device->mode = flags;
-
- fs_devices->open_devices++;
- if (device->writeable &&
- device->devid != BTRFS_DEV_REPLACE_DEVID) {
- fs_devices->rw_devices++;
- list_add(&device->dev_alloc_list,
- &fs_devices->alloc_list);
- }
- brelse(bh);
- continue;
-
-error_brelse:
- brelse(bh);
- blkdev_put(bdev, flags);
- continue;
}
if (fs_devices->open_devices == 0) {
ret = -EINVAL;
goto out;
}
- fs_devices->seeding = seeding;
fs_devices->opened = 1;
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
@@ -1117,12 +1178,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_super_block *disk_super;
+ struct btrfs_device *device;
struct block_device *bdev;
struct page *page;
- int ret = -EINVAL;
- u64 devid;
- u64 transid;
- u64 total_devices;
+ int ret = 0;
u64 bytenr;
/*
@@ -1141,26 +1200,16 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error;
}
- if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
+ if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
+ ret = -EINVAL;
goto error_bdev_put;
-
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- transid = btrfs_super_generation(disk_super);
- total_devices = btrfs_super_num_devices(disk_super);
-
- ret = device_list_add(path, disk_super, devid, fs_devices_ret);
- if (ret > 0) {
- if (disk_super->label[0]) {
- pr_info("BTRFS: device label %s ", disk_super->label);
- } else {
- pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
- }
-
- pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
- ret = 0;
}
- if (!ret && fs_devices_ret)
- (*fs_devices_ret)->total_devices = total_devices;
+
+ device = device_list_add(path, disk_super);
+ if (IS_ERR(device))
+ ret = PTR_ERR(device);
+ else
+ *fs_devices_ret = device->fs_devices;
btrfs_release_disk_super(page);
@@ -1186,7 +1235,8 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
*length = 0;
- if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
+ if (start >= device->total_bytes ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
return 0;
path = btrfs_alloc_path();
@@ -1364,7 +1414,8 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
max_hole_size = 0;
again:
- if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
+ if (search_start >= search_end ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -ENOSPC;
goto out;
}
@@ -1571,8 +1622,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
- WARN_ON(!device->in_fs_metadata);
- WARN_ON(device->is_tgtdev_for_dev_replace);
+ WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1662,7 +1713,7 @@ error:
* the device information is stored in the chunk root
* the btrfs_device struct should be fully filled in
*/
-static int btrfs_add_device(struct btrfs_trans_handle *trans,
+static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_device *device)
{
@@ -1818,7 +1869,8 @@ static struct btrfs_device * btrfs_find_next_active_device(
list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
if (next_device != device &&
- !next_device->missing && next_device->bdev)
+ !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
+ && next_device->bdev)
return next_device;
}
@@ -1859,6 +1911,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 num_devices;
int ret = 0;
+ mutex_lock(&fs_info->volume_mutex);
mutex_lock(&uuid_mutex);
num_devices = fs_info->fs_devices->num_devices;
@@ -1878,17 +1931,18 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (ret)
goto out;
- if (device->is_tgtdev_for_dev_replace) {
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto out;
}
- if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto out;
}
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_del_init(&device->dev_alloc_list);
device->fs_devices->rw_devices--;
@@ -1910,7 +1964,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (ret)
goto error_undo;
- device->in_fs_metadata = 0;
+ clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(fs_info, device);
/*
@@ -1930,7 +1984,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
device->fs_devices->num_devices--;
device->fs_devices->total_devices--;
- if (device->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
device->fs_devices->missing_devices--;
btrfs_assign_next_active_device(fs_info, device, NULL);
@@ -1950,11 +2004,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
* the devices list. All that's left is to zero out the old
* supers and free the device.
*/
- if (device->writeable)
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
btrfs_scratch_superblocks(device->bdev, device->name->str);
btrfs_close_bdev(device);
- call_rcu(&device->rcu, free_device);
+ call_rcu(&device->rcu, free_device_rcu);
if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
@@ -1973,10 +2027,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
out:
mutex_unlock(&uuid_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
return ret;
error_undo:
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
&fs_info->fs_devices->alloc_list);
@@ -2004,10 +2059,10 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
list_del_rcu(&srcdev->dev_list);
list_del(&srcdev->dev_alloc_list);
fs_devices->num_devices--;
- if (srcdev->missing)
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
fs_devices->missing_devices--;
- if (srcdev->writeable)
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
fs_devices->rw_devices--;
if (srcdev->bdev)
@@ -2019,13 +2074,13 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
- if (srcdev->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
/* zero out the old super if it is writable */
btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
}
btrfs_close_bdev(srcdev);
- call_rcu(&srcdev->rcu, free_device);
+ call_rcu(&srcdev->rcu, free_device_rcu);
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
@@ -2084,7 +2139,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
btrfs_close_bdev(tgtdev);
- call_rcu(&tgtdev->rcu, free_device);
+ call_rcu(&tgtdev->rcu, free_device_rcu);
}
static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
@@ -2129,7 +2184,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
* is held by the caller.
*/
list_for_each_entry(tmp, devices, dev_list) {
- if (tmp->in_fs_metadata && !tmp->bdev) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &tmp->dev_state) && !tmp->bdev) {
*device = tmp;
break;
}
@@ -2358,26 +2414,19 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
- bio_put(device->flush_bio);
- kfree(device);
ret = -ENOMEM;
- goto error;
+ goto error_free_device;
}
rcu_assign_pointer(device->name, name);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- rcu_string_free(device->name);
- bio_put(device->flush_bio);
- kfree(device);
ret = PTR_ERR(trans);
- goto error;
+ goto error_free_device;
}
q = bdev_get_queue(bdev);
- if (blk_queue_discard(q))
- device->can_discard = 1;
- device->writeable = 1;
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = trans->transid;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
@@ -2388,8 +2437,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->commit_total_bytes = device->total_bytes;
device->fs_info = fs_info;
device->bdev = bdev;
- device->in_fs_metadata = 1;
- device->is_tgtdev_for_dev_replace = 0;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
@@ -2450,7 +2499,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
}
- ret = btrfs_add_device(trans, fs_info, device);
+ ret = btrfs_add_dev_item(trans, fs_info, device);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
@@ -2511,9 +2560,8 @@ error_trans:
sb->s_flags |= SB_RDONLY;
if (trans)
btrfs_end_transaction(trans);
- rcu_string_free(device->name);
- bio_put(device->flush_bio);
- kfree(device);
+error_free_device:
+ free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev && !unlocked) {
@@ -2528,7 +2576,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
- struct request_queue *q;
struct btrfs_device *device;
struct block_device *bdev;
struct list_head *devices;
@@ -2579,18 +2626,14 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
- bio_put(device->flush_bio);
- kfree(device);
+ free_device(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
- q = bdev_get_queue(bdev);
- if (blk_queue_discard(q))
- device->can_discard = 1;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- device->writeable = 1;
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
@@ -2603,8 +2646,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
- device->in_fs_metadata = 1;
- device->is_tgtdev_for_dev_replace = 1;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
@@ -2632,7 +2675,7 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
tgtdev->io_align = sectorsize;
tgtdev->sector_size = sectorsize;
tgtdev->fs_info = fs_info;
- tgtdev->in_fs_metadata = 1;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tgtdev->dev_state);
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2690,7 +2733,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
u64 old_total;
u64 diff;
- if (!device->writeable)
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
new_size = round_down(new_size, fs_info->sectorsize);
@@ -2700,7 +2743,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
if (new_size <= device->total_bytes ||
- device->is_tgtdev_for_dev_replace) {
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
mutex_unlock(&fs_info->chunk_mutex);
return -EINVAL;
}
@@ -3044,6 +3087,48 @@ error:
return ret;
}
+/*
+ * return 1 : a data chunk was allocated successfully,
+ * return <0: an error occurred while allocating a data chunk,
+ * return 0 : there was no need to allocate a data chunk.
+ */
+static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 bytes_used;
+ u64 chunk_type;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ ASSERT(cache);
+ chunk_type = cache->flags;
+ btrfs_put_block_group(cache);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
+ spin_lock(&fs_info->data_sinfo->lock);
+ bytes_used = fs_info->data_sinfo->bytes_used;
+ spin_unlock(&fs_info->data_sinfo->lock);
+
+ if (!bytes_used) {
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_force_chunk_alloc(trans, fs_info,
+ BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans);
+ if (ret < 0)
+ return ret;
+
+ return 1;
+ }
+ }
+ return 0;
+}
+
static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl)
{
@@ -3502,7 +3587,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_meta = 0;
u32 count_sys = 0;
int chunk_reserved = 0;
- u64 bytes_used = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
@@ -3510,10 +3594,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
size_to_free = min_t(u64, size_to_free, SZ_1M);
- if (!device->writeable ||
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
btrfs_device_get_total_bytes(device) -
btrfs_device_get_bytes_used(device) > size_to_free ||
- device->is_tgtdev_for_dev_replace)
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -3661,28 +3745,21 @@ again:
goto loop;
}
- ASSERT(fs_info->data_sinfo);
- spin_lock(&fs_info->data_sinfo->lock);
- bytes_used = fs_info->data_sinfo->bytes_used;
- spin_unlock(&fs_info->data_sinfo->lock);
-
- if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
- !chunk_reserved && !bytes_used) {
- trans = btrfs_start_transaction(chunk_root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- ret = PTR_ERR(trans);
- goto error;
- }
-
- ret = btrfs_force_chunk_alloc(trans, fs_info,
- BTRFS_BLOCK_GROUP_DATA);
- btrfs_end_transaction(trans);
+ if (!chunk_reserved) {
+ /*
+ * We may be relocating the only data chunk we have,
+ * which could end up losing the data RAID profile,
+ * so let's allocate an empty data chunk in
+ * advance.
+ */
+ ret = btrfs_may_alloc_data_chunk(fs_info,
+ found_key.offset);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
+ } else if (ret == 1) {
+ chunk_reserved = 1;
}
- chunk_reserved = 1;
}
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
@@ -4381,7 +4458,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
new_size = round_down(new_size, fs_info->sectorsize);
diff = round_down(old_size - new_size, fs_info->sectorsize);
- if (device->is_tgtdev_for_dev_replace)
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
return -EINVAL;
path = btrfs_alloc_path();
@@ -4393,7 +4470,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, new_size);
- if (device->writeable) {
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes -= diff;
atomic64_sub(diff, &fs_info->free_chunk_space);
}
@@ -4445,6 +4522,18 @@ again:
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
+ /*
+ * We may be relocating the only data chunk we have,
+ * which could end up losing the data RAID profile,
+ * so let's allocate an empty data chunk in
+ * advance.
+ */
+ ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ goto done;
+ }
+
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
@@ -4518,7 +4607,7 @@ done:
if (ret) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, old_size);
- if (device->writeable)
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
device->fs_devices->total_rw_bytes += diff;
atomic64_add(diff, &fs_info->free_chunk_space);
mutex_unlock(&fs_info->chunk_mutex);
@@ -4678,14 +4767,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
u64 max_avail;
u64 dev_offset;
- if (!device->writeable) {
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
WARN(1, KERN_ERR
"BTRFS: read-only device in alloc_list\n");
continue;
}
- if (!device->in_fs_metadata ||
- device->is_tgtdev_for_dev_replace)
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
if (device->total_bytes > device->bytes_used)
@@ -5033,12 +5123,13 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
- if (map->stripes[i].dev->missing) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING,
+ &map->stripes[i].dev->dev_state)) {
miss_ndevs++;
continue;
}
-
- if (!map->stripes[i].dev->writeable) {
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+ &map->stripes[i].dev->dev_state)) {
readonly = 1;
goto end;
}
@@ -5104,7 +5195,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- ret = 3;
+ /*
+ * There could be two corrupted data stripes, we need
+ * to loop retry in order to rebuild the correct data.
+ *
+ * Fail a stripe at a time on every retry except the
+ * stripe under reconstruction.
+ */
+ ret = map->num_stripes;
else
ret = 1;
free_extent_map(em);
@@ -6004,15 +6102,14 @@ static void btrfs_end_bio(struct bio *bio)
dev = bbio->stripes[stripe_index].dev;
if (dev->bdev) {
if (bio_op(bio) == REQ_OP_WRITE)
- btrfs_dev_stat_inc(dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else
- btrfs_dev_stat_inc(dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc(dev,
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
- btrfs_dev_stat_print_on_error(dev);
}
}
}
@@ -6062,16 +6159,15 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
- if (device->missing || !device->bdev) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
+ !device->bdev) {
bio_io_error(bio);
return;
}
/* don't bother with additional async steps for reads, right now */
if (bio_op(bio) == REQ_OP_READ) {
- bio_get(bio);
btrfsic_submit_bio(bio);
- bio_put(bio);
return;
}
@@ -6208,7 +6304,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev ||
- (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
+ (bio_op(first_bio) == REQ_OP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bbio_error(bbio, first_bio, logical);
continue;
}
@@ -6257,7 +6354,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
device->fs_devices = fs_devices;
fs_devices->num_devices++;
- device->missing = 1;
+ set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices++;
return device;
@@ -6273,8 +6370,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
* is generated.
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
- * on error. Returned struct is not linked onto any lists and can be
- * destroyed with kfree() right away.
+ * on error. Returned struct is not linked onto any lists and must be
+ * destroyed with free_device.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
@@ -6297,8 +6394,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
ret = find_next_devid(fs_info, &tmp);
if (ret) {
- bio_put(dev->flush_bio);
- kfree(dev);
+ free_device(dev);
return ERR_PTR(ret);
}
}
@@ -6477,7 +6573,9 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
}
btrfs_report_missing_device(fs_info, devid, uuid, false);
}
- map->stripes[i].dev->in_fs_metadata = 1;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &(map->stripes[i].dev->dev_state));
+
}
write_lock(&map_tree->map_tree.lock);
@@ -6506,7 +6604,7 @@ static void fill_device_from_item(struct extent_buffer *leaf,
device->io_width = btrfs_device_io_width(leaf, dev_item);
device->sector_size = btrfs_device_sector_size(leaf, dev_item);
WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
- device->is_tgtdev_for_dev_replace = 0;
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
ptr = btrfs_device_uuid(dev_item);
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -6618,7 +6716,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
dev_uuid, false);
}
- if(!device->bdev && !device->missing) {
+ if (!device->bdev &&
+ !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
/*
* this happens when a device that was properly setup
* in the device info lists suddenly goes bad.
@@ -6626,12 +6725,13 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
* device->missing to one here
*/
device->fs_devices->missing_devices++;
- device->missing = 1;
+ set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
/* Move the device to its own fs_devices */
if (device->fs_devices != fs_devices) {
- ASSERT(device->missing);
+ ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
+ &device->dev_state));
list_move(&device->dev_list, &fs_devices->devices);
device->fs_devices->num_devices--;
@@ -6645,15 +6745,16 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
}
if (device->fs_devices != fs_info->fs_devices) {
- BUG_ON(device->writeable);
+ BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
if (device->generation !=
btrfs_device_generation(leaf, dev_item))
return -EINVAL;
}
fill_device_from_item(leaf, dev_item, device);
- device->in_fs_metadata = 1;
- if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
device->fs_devices->total_rw_bytes += device->total_bytes;
atomic64_add(device->total_bytes - device->bytes_used,
&fs_info->free_chunk_space);
@@ -6785,10 +6886,13 @@ out_short_read:
/*
* Check if all chunks in the fs are OK for read-write degraded mount
*
+ * If the @failing_dev is specified, it's accounted as missing.
+ *
* Return true if all chunks meet the minimal RW mount requirements.
* Return false if any chunk doesn't meet the minimal RW mount requirements.
*/
-bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *failing_dev)
{
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
@@ -6816,12 +6920,16 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
- if (!dev || !dev->bdev || dev->missing ||
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
dev->last_flush_error)
missing++;
+ else if (failing_dev && failing_dev == dev)
+ missing++;
}
if (missing > max_tolerated) {
- btrfs_warn(fs_info,
+ if (!failing_dev)
+ btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
em->start, missing, max_tolerated);
free_extent_map(em);
@@ -7092,10 +7200,24 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
+ stats_cnt = atomic_read(&device->dev_stats_ccnt);
+ if (!device->dev_stats_valid || stats_cnt == 0)
continue;
- stats_cnt = atomic_read(&device->dev_stats_ccnt);
+
+ /*
+ * There is a LOAD-LOAD control dependency between the value of
+ * dev_stats_ccnt and updating the on-disk values which requires
+ * reading the in-memory counters. Such control dependencies
+ * require explicit read memory barriers.
+ *
+ * This memory barrier pairs with smp_mb__before_atomic in
+ * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
+ * barrier implied by atomic_xchg in
+ * btrfs_dev_stat_read_and_reset
+ */
+ smp_rmb();
+
ret = update_dev_stat_item(trans, fs_info, device);
if (!ret)
atomic_sub(stats_cnt, &device->dev_stats_ccnt);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ff15208344a7..28c28eeadff3 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -47,6 +47,12 @@ struct btrfs_pending_bios {
#define btrfs_device_data_ordered_init(device) do { } while (0)
#endif
+#define BTRFS_DEV_STATE_WRITEABLE (0)
+#define BTRFS_DEV_STATE_IN_FS_METADATA (1)
+#define BTRFS_DEV_STATE_MISSING (2)
+#define BTRFS_DEV_STATE_REPLACE_TGT (3)
+#define BTRFS_DEV_STATE_FLUSH_SENT (4)
+
struct btrfs_device {
struct list_head dev_list;
struct list_head dev_alloc_list;
@@ -69,11 +75,7 @@ struct btrfs_device {
/* the mode sent to blkdev_get */
fmode_t mode;
- int writeable;
- int in_fs_metadata;
- int missing;
- int can_discard;
- int is_tgtdev_for_dev_replace;
+ unsigned long dev_state;
blk_status_t last_flush_error;
int flush_bio_sent;
@@ -129,14 +131,12 @@ struct btrfs_device {
struct completion flush_wait;
/* per-device scrub information */
- struct scrub_ctx *scrub_device;
+ struct scrub_ctx *scrub_ctx;
struct btrfs_work work;
struct rcu_head rcu;
- struct work_struct rcu_work;
/* readahead state */
- spinlock_t reada_lock;
atomic_t reada_in_flight;
u64 reada_next;
struct reada_zone *reada_curr_zone;
@@ -489,15 +489,16 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 chunk_offset);
-static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
-{
- return atomic_read(&dev->dev_stats_ccnt);
-}
-
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
atomic_inc(dev->dev_stat_values + index);
+ /*
+ * This memory barrier orders stores updating statistics before stores
+ * updating dev_stats_ccnt.
+ *
+ * It pairs with smp_rmb() in btrfs_run_dev_stats().
+ */
smp_mb__before_atomic();
atomic_inc(&dev->dev_stats_ccnt);
}
@@ -514,7 +515,13 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
int ret;
ret = atomic_xchg(dev->dev_stat_values + index, 0);
- smp_mb__before_atomic();
+ /*
+ * atomic_xchg implies a full memory barrier as per atomic_t.txt:
+ * - RMW operations that have a return value are fully ordered;
+ *
+ * This implicit memory barrier is paired with the smp_rmb in
+ * btrfs_run_dev_stats
+ */
atomic_inc(&dev->dev_stats_ccnt);
return ret;
}
@@ -523,6 +530,12 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
int index, unsigned long val)
{
atomic_set(dev->dev_stat_values + index, val);
+ /*
+ * This memory barrier orders stores updating statistics before stores
+ * updating dev_stats_ccnt.
+ *
+ * It pairs with smp_rmb() in btrfs_run_dev_stats().
+ */
smp_mb__before_atomic();
atomic_inc(&dev->dev_stats_ccnt);
}
@@ -540,7 +553,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
struct list_head *btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
-
-bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info);
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *failing_dev);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 2c7e53f9ff1b..de7d072c78ef 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -23,6 +23,7 @@
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/posix_acl_xattr.h>
+#include <linux/iversion.h>
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
@@ -267,7 +268,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
int ret = 0;
@@ -336,11 +336,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
u32 this_len = sizeof(*di) + name_len + data_len;
unsigned long name_ptr = (unsigned long)(di + 1);
- if (verify_dir_item(fs_info, leaf, slot, di)) {
- ret = -EIO;
- goto err;
- }
-
total_size += name_len + 1;
/*
* We are just looking for how big our buffer needs to
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 17f2dd8fddb8..01a4eab602a3 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -43,6 +43,8 @@ struct workspace {
size_t size;
char *buf;
struct list_head list;
+ ZSTD_inBuffer in_buf;
+ ZSTD_outBuffer out_buf;
};
static void zstd_free_workspace(struct list_head *ws)
@@ -94,8 +96,6 @@ static int zstd_compress_pages(struct list_head *ws,
int nr_pages = 0;
struct page *in_page = NULL; /* The current page to read */
struct page *out_page = NULL; /* The current page to write to */
- ZSTD_inBuffer in_buf = { NULL, 0, 0 };
- ZSTD_outBuffer out_buf = { NULL, 0, 0 };
unsigned long tot_in = 0;
unsigned long tot_out = 0;
unsigned long len = *total_out;
@@ -118,9 +118,9 @@ static int zstd_compress_pages(struct list_head *ws,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- in_buf.src = kmap(in_page);
- in_buf.pos = 0;
- in_buf.size = min_t(size_t, len, PAGE_SIZE);
+ workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
/* Allocate and map in the output buffer */
@@ -130,14 +130,15 @@ static int zstd_compress_pages(struct list_head *ws,
goto out;
}
pages[nr_pages++] = out_page;
- out_buf.dst = kmap(out_page);
- out_buf.pos = 0;
- out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
while (1) {
size_t ret2;
- ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf);
+ ret2 = ZSTD_compressStream(stream, &workspace->out_buf,
+ &workspace->in_buf);
if (ZSTD_isError(ret2)) {
pr_debug("BTRFS: ZSTD_compressStream returned %d\n",
ZSTD_getErrorCode(ret2));
@@ -146,22 +147,22 @@ static int zstd_compress_pages(struct list_head *ws,
}
/* Check to see if we are making it bigger */
- if (tot_in + in_buf.pos > 8192 &&
- tot_in + in_buf.pos <
- tot_out + out_buf.pos) {
+ if (tot_in + workspace->in_buf.pos > 8192 &&
+ tot_in + workspace->in_buf.pos <
+ tot_out + workspace->out_buf.pos) {
ret = -E2BIG;
goto out;
}
/* We've reached the end of our output range */
- if (out_buf.pos >= max_out) {
- tot_out += out_buf.pos;
+ if (workspace->out_buf.pos >= max_out) {
+ tot_out += workspace->out_buf.pos;
ret = -E2BIG;
goto out;
}
/* Check if we need more output space */
- if (out_buf.pos == out_buf.size) {
+ if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
kunmap(out_page);
@@ -176,19 +177,20 @@ static int zstd_compress_pages(struct list_head *ws,
goto out;
}
pages[nr_pages++] = out_page;
- out_buf.dst = kmap(out_page);
- out_buf.pos = 0;
- out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = min_t(size_t, max_out,
+ PAGE_SIZE);
}
/* We've reached the end of the input */
- if (in_buf.pos >= len) {
- tot_in += in_buf.pos;
+ if (workspace->in_buf.pos >= len) {
+ tot_in += workspace->in_buf.pos;
break;
}
/* Check if we need more input */
- if (in_buf.pos == in_buf.size) {
+ if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
kunmap(in_page);
put_page(in_page);
@@ -196,15 +198,15 @@ static int zstd_compress_pages(struct list_head *ws,
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- in_buf.src = kmap(in_page);
- in_buf.pos = 0;
- in_buf.size = min_t(size_t, len, PAGE_SIZE);
+ workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
}
while (1) {
size_t ret2;
- ret2 = ZSTD_endStream(stream, &out_buf);
+ ret2 = ZSTD_endStream(stream, &workspace->out_buf);
if (ZSTD_isError(ret2)) {
pr_debug("BTRFS: ZSTD_endStream returned %d\n",
ZSTD_getErrorCode(ret2));
@@ -212,11 +214,11 @@ static int zstd_compress_pages(struct list_head *ws,
goto out;
}
if (ret2 == 0) {
- tot_out += out_buf.pos;
+ tot_out += workspace->out_buf.pos;
break;
}
- if (out_buf.pos >= max_out) {
- tot_out += out_buf.pos;
+ if (workspace->out_buf.pos >= max_out) {
+ tot_out += workspace->out_buf.pos;
ret = -E2BIG;
goto out;
}
@@ -235,9 +237,9 @@ static int zstd_compress_pages(struct list_head *ws,
goto out;
}
pages[nr_pages++] = out_page;
- out_buf.dst = kmap(out_page);
- out_buf.pos = 0;
- out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
if (tot_out >= tot_in) {
@@ -273,8 +275,6 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long total_out = 0;
- ZSTD_inBuffer in_buf = { NULL, 0, 0 };
- ZSTD_outBuffer out_buf = { NULL, 0, 0 };
stream = ZSTD_initDStream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
@@ -284,18 +284,19 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- in_buf.src = kmap(pages_in[page_in_index]);
- in_buf.pos = 0;
- in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
- out_buf.dst = workspace->buf;
- out_buf.pos = 0;
- out_buf.size = PAGE_SIZE;
+ workspace->out_buf.dst = workspace->buf;
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = PAGE_SIZE;
while (1) {
size_t ret2;
- ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+ ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
+ &workspace->in_buf);
if (ZSTD_isError(ret2)) {
pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
ZSTD_getErrorCode(ret2));
@@ -303,38 +304,38 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
buf_start = total_out;
- total_out += out_buf.pos;
- out_buf.pos = 0;
+ total_out += workspace->out_buf.pos;
+ workspace->out_buf.pos = 0;
- ret = btrfs_decompress_buf2page(out_buf.dst, buf_start,
- total_out, disk_start, orig_bio);
+ ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
+ buf_start, total_out, disk_start, orig_bio);
if (ret == 0)
break;
- if (in_buf.pos >= srclen)
+ if (workspace->in_buf.pos >= srclen)
break;
/* Check if we've hit the end of a frame */
if (ret2 == 0)
break;
- if (in_buf.pos == in_buf.size) {
+ if (workspace->in_buf.pos == workspace->in_buf.size) {
kunmap(pages_in[page_in_index++]);
if (page_in_index >= total_pages_in) {
- in_buf.src = NULL;
+ workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- in_buf.src = kmap(pages_in[page_in_index]);
- in_buf.pos = 0;
- in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
}
ret = 0;
zero_fill_bio(orig_bio);
done:
- if (in_buf.src)
+ if (workspace->in_buf.src)
kunmap(pages_in[page_in_index]);
return ret;
}
@@ -348,8 +349,6 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
ZSTD_DStream *stream;
int ret = 0;
size_t ret2;
- ZSTD_inBuffer in_buf = { NULL, 0, 0 };
- ZSTD_outBuffer out_buf = { NULL, 0, 0 };
unsigned long total_out = 0;
unsigned long pg_offset = 0;
char *kaddr;
@@ -364,16 +363,17 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
destlen = min_t(size_t, destlen, PAGE_SIZE);
- in_buf.src = data_in;
- in_buf.pos = 0;
- in_buf.size = srclen;
+ workspace->in_buf.src = data_in;
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = srclen;
- out_buf.dst = workspace->buf;
- out_buf.pos = 0;
- out_buf.size = PAGE_SIZE;
+ workspace->out_buf.dst = workspace->buf;
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = PAGE_SIZE;
ret2 = 1;
- while (pg_offset < destlen && in_buf.pos < in_buf.size) {
+ while (pg_offset < destlen
+ && workspace->in_buf.pos < workspace->in_buf.size) {
unsigned long buf_start;
unsigned long buf_offset;
unsigned long bytes;
@@ -384,7 +384,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
ret = -EIO;
goto finish;
}
- ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+ ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
+ &workspace->in_buf);
if (ZSTD_isError(ret2)) {
pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
ZSTD_getErrorCode(ret2));
@@ -393,8 +394,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
}
buf_start = total_out;
- total_out += out_buf.pos;
- out_buf.pos = 0;
+ total_out += workspace->out_buf.pos;
+ workspace->out_buf.pos = 0;
if (total_out <= start_byte)
continue;
@@ -405,10 +406,11 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
buf_offset = 0;
bytes = min_t(unsigned long, destlen - pg_offset,
- out_buf.size - buf_offset);
+ workspace->out_buf.size - buf_offset);
kaddr = kmap_atomic(dest_page);
- memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes);
+ memcpy(kaddr + pg_offset, workspace->out_buf.dst + buf_offset,
+ bytes);
kunmap_atomic(kaddr);
pg_offset += bytes;