summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c1319
1 files changed, 452 insertions, 867 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8b3489f229c7..768c8be4c765 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5,7 +5,6 @@
#include <linux/kernel.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -49,17 +48,18 @@
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "space-info.h"
struct btrfs_iget_args {
- struct btrfs_key *location;
+ u64 ino;
struct btrfs_root *root;
};
struct btrfs_dio_data {
u64 reserve;
- u64 unsubmitted_oe_range_start;
- u64 unsubmitted_oe_range_end;
- int overwrite;
+ loff_t length;
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
};
static const struct inode_operations btrfs_dir_inode_operations;
@@ -1142,7 +1142,7 @@ out_unlock:
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
- start + cur_alloc_size,
+ start + cur_alloc_size - 1,
locked_page,
clear_bits,
page_ops);
@@ -1355,6 +1355,66 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
return 1;
}
+static int fallback_to_cow(struct inode *inode, struct page *locked_page,
+ const u64 start, const u64 end,
+ int *page_started, unsigned long *nr_written)
+{
+ const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
+ const u64 range_bytes = end + 1 - start;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ u64 range_start = start;
+ u64 count;
+
+ /*
+ * If EXTENT_NORESERVE is set it means that when the buffered write was
+ * made we had not enough available data space and therefore we did not
+ * reserve data space for it, since we though we could do NOCOW for the
+ * respective file range (either there is prealloc extent or the inode
+ * has the NOCOW bit set).
+ *
+ * However when we need to fallback to COW mode (because for example the
+ * block group for the corresponding extent was turned to RO mode by a
+ * scrub or relocation) we need to do the following:
+ *
+ * 1) We increment the bytes_may_use counter of the data space info.
+ * If COW succeeds, it allocates a new data extent and after doing
+ * that it decrements the space info's bytes_may_use counter and
+ * increments its bytes_reserved counter by the same amount (we do
+ * this at btrfs_add_reserved_bytes()). So we need to increment the
+ * bytes_may_use counter to compensate (when space is reserved at
+ * buffered write time, the bytes_may_use counter is incremented);
+ *
+ * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
+ * that if the COW path fails for any reason, it decrements (through
+ * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
+ * data space info, which we incremented in the step above.
+ *
+ * If we need to fallback to cow and the inode corresponds to a free
+ * space cache inode, we must also increment bytes_may_use of the data
+ * space_info for the same reason. Space caches always get a prealloc
+ * extent for them, however scrub or balance may have set the block
+ * group that contains that extent to RO mode.
+ */
+ count = count_range_bits(io_tree, &range_start, end, range_bytes,
+ EXTENT_NORESERVE, 0);
+ if (count > 0 || is_space_ino) {
+ const u64 bytes = is_space_ino ? range_bytes : count;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
+ spin_unlock(&sinfo->lock);
+
+ if (count > 0)
+ clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
+ 0, 0, NULL);
+ }
+
+ return cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 1);
+}
+
/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@@ -1602,9 +1662,9 @@ out_check:
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
- ret = cow_file_range(inode, locked_page,
- cow_start, found_key.offset - 1,
- page_started, nr_written, 1);
+ ret = fallback_to_cow(inode, locked_page, cow_start,
+ found_key.offset - 1,
+ page_started, nr_written);
if (ret) {
if (nocow)
btrfs_dec_nocow_writers(fs_info,
@@ -1693,8 +1753,8 @@ out_check:
if (cow_start != (u64)-1) {
cur_offset = end;
- ret = cow_file_range(inode, locked_page, cow_start, end,
- page_started, nr_written, 1);
+ ret = fallback_to_cow(inode, locked_page, cow_start, end,
+ page_started, nr_written);
if (ret)
goto error;
}
@@ -2726,10 +2786,9 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
btrfs_queue_work(wq, &ordered_extent->work);
}
-static int __readpage_endio_check(struct inode *inode,
- struct btrfs_io_bio *io_bio,
- int icsum, struct page *page,
- int pgoff, u64 start, size_t len)
+static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+ int icsum, struct page *page, int pgoff, u64 start,
+ size_t len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@ -2743,9 +2802,7 @@ static int __readpage_endio_check(struct inode *inode,
kaddr = kmap_atomic(page);
shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
- crypto_shash_update(shash, kaddr + pgoff, len);
- crypto_shash_final(shash, csum);
+ crypto_shash_digest(shash, kaddr + pgoff, len, csum);
if (memcmp(csum, csum_expected, csum_size))
goto zeroit;
@@ -2790,8 +2847,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
}
phy_offset >>= inode->i_sb->s_blocksize_bits;
- return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
- start, (size_t)(end - start + 1));
+ return check_data_csum(inode, io_bio, phy_offset, page, offset, start,
+ (size_t)(end - start + 1));
}
/*
@@ -2981,7 +3038,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &found_key, root);
+ inode = btrfs_iget(fs_info->sb, last_objectid, root);
ret = PTR_ERR_OR_ZERO(inode);
if (ret && ret != -ENOENT)
goto out;
@@ -3000,18 +3057,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* orphan must not get deleted.
* find_dead_roots already ran before us, so if this
* is a snapshot deletion, we should find the root
- * in the dead_roots list
+ * in the fs_roots radix tree.
*/
- spin_lock(&fs_info->trans_lock);
- list_for_each_entry(dead_root, &fs_info->dead_roots,
- root_list) {
- if (dead_root->root_key.objectid ==
- found_key.objectid) {
- is_dead_root = 1;
- break;
- }
- }
- spin_unlock(&fs_info->trans_lock);
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)found_key.objectid);
+ if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
+ is_dead_root = 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
if (is_dead_root) {
/* prevent this orphan from being found again */
key.offset = found_key.objectid - 1;
@@ -3357,43 +3412,40 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_init_map_token(&token, leaf);
- btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
- btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
- btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
- &token);
- btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
- btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->atime,
- inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->atime,
- inode->i_atime.tv_nsec, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->mtime,
- inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->mtime,
- inode->i_mtime.tv_nsec, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->ctime,
- inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->ctime,
- inode->i_ctime.tv_nsec, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->otime,
- BTRFS_I(inode)->i_otime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec, &token);
-
- btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
- &token);
- btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
- &token);
- btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
- &token);
- btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
- btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
- btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
- btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+ btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+ btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+ btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+ btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+
+ btrfs_set_token_timespec_sec(&token, &item->atime,
+ inode->i_atime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->atime,
+ inode->i_atime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->mtime,
+ inode->i_mtime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->mtime,
+ inode->i_mtime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->ctime,
+ inode->i_ctime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->ctime,
+ inode->i_ctime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec);
+
+ btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
+ btrfs_set_token_inode_generation(&token, item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+ btrfs_set_token_inode_transid(&token, item, trans->transid);
+ btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ btrfs_set_token_inode_block_group(&token, item, 0);
}
/*
@@ -3618,7 +3670,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the inode ref
* 1 for the inode
*/
- return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
+ return btrfs_start_transaction_fallback_global_rsv(root, 5);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4108,11 +4160,12 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
/*
- * for non-free space inodes and ref cows, we want to back off from
- * time to time
+ * For non-free space inodes and non-shareable roots, we want to back
+ * off from time to time. This means all inodes in subvolume roots,
+ * reloc roots, and data reloc roots.
*/
if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
- test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
be_nice = true;
path = btrfs_alloc_path();
@@ -4120,20 +4173,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = READA_BACK;
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
&cached_state);
- /*
- * We want to drop from the next block forward in case this new size is
- * not block aligned since we will be keeping the last block of the
- * extent just the way it is.
- */
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == fs_info->tree_root)
+ /*
+ * We want to drop from the next block forward in case this
+ * new size is not block aligned since we will be keeping the
+ * last block of the extent just the way it is.
+ */
btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
fs_info->sectorsize),
(u64)-1, 0);
+ }
/*
* This function is also used to drop the items in the log tree before
@@ -4241,7 +4293,7 @@ search_again:
extent_num_bytes);
num_dec = (orig_num_bytes -
extent_num_bytes);
- if (test_bit(BTRFS_ROOT_REF_COWS,
+ if (test_bit(BTRFS_ROOT_SHAREABLE,
&root->state) &&
extent_start != 0)
inode_sub_bytes(inode, num_dec);
@@ -4257,7 +4309,7 @@ search_again:
num_dec = btrfs_file_extent_num_bytes(leaf, fi);
if (extent_start != 0) {
found_extent = 1;
- if (test_bit(BTRFS_ROOT_REF_COWS,
+ if (test_bit(BTRFS_ROOT_SHAREABLE,
&root->state))
inode_sub_bytes(inode, num_dec);
}
@@ -4293,7 +4345,7 @@ search_again:
clear_len = fs_info->sectorsize;
}
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
inode_sub_bytes(inode, item_end + 1 - new_size);
}
delete:
@@ -4334,8 +4386,7 @@ delete:
should_throttle = false;
if (found_extent &&
- (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == fs_info->tree_root)) {
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_ref ref = { 0 };
bytes_deleted += extent_num_bytes;
@@ -4759,10 +4810,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
truncate_setsize(inode, newsize);
- /* Disable nonlocked read DIO to avoid the endless truncate */
- btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
- btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
@@ -5154,7 +5202,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
btrfs_release_path(path);
- new_root = btrfs_get_fs_root(fs_info, location, true);
+ new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
if (IS_ERR(new_root)) {
err = PTR_ERR(new_root);
goto out;
@@ -5232,9 +5280,11 @@ static void inode_tree_del(struct inode *inode)
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
- inode->i_ino = args->location->objectid;
- memcpy(&BTRFS_I(inode)->location, args->location,
- sizeof(*args->location));
+
+ inode->i_ino = args->ino;
+ BTRFS_I(inode)->location.objectid = args->ino;
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.offset = 0;
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
BUG_ON(args->root && !BTRFS_I(inode)->root);
return 0;
@@ -5243,19 +5293,19 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->location->objectid == BTRFS_I(inode)->location.objectid &&
+
+ return args->ino == BTRFS_I(inode)->location.objectid &&
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(struct super_block *s,
- struct btrfs_key *location,
+static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
- unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+ unsigned long hashval = btrfs_inode_hash(ino, root);
- args.location = location;
+ args.ino = ino;
args.root = root;
inode = iget5_locked(s, hashval, btrfs_find_actor,
@@ -5265,17 +5315,17 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
}
/*
- * Get an inode object given its location and corresponding root.
+ * Get an inode object given its inode number and corresponding root.
* Path can be preallocated to prevent recursing back to iget through
* allocator. NULL is also valid but may require an additional allocation
* later.
*/
-struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
struct btrfs_root *root, struct btrfs_path *path)
{
struct inode *inode;
- inode = btrfs_iget_locked(s, location, root);
+ inode = btrfs_iget_locked(s, ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -5302,10 +5352,9 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
return inode;
}
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root)
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
{
- return btrfs_iget_path(s, location, root, NULL);
+ return btrfs_iget_path(s, ino, root, NULL);
}
static struct inode *new_simple_dir(struct super_block *s,
@@ -5374,7 +5423,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, &location, root);
+ inode = btrfs_iget(dir->i_sb, location.objectid, root);
if (IS_ERR(inode))
return inode;
@@ -5398,7 +5447,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
else
inode = new_simple_dir(dir->i_sb, &location, sub_root);
} else {
- inode = btrfs_iget(dir->i_sb, &location, sub_root);
+ inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
}
if (root != sub_root)
btrfs_put_root(sub_root);
@@ -5779,7 +5828,8 @@ int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
static int btrfs_insert_inode_locked(struct inode *inode)
{
struct btrfs_iget_args args;
- args.location = &BTRFS_I(inode)->location;
+
+ args.ino = BTRFS_I(inode)->location.objectid;
args.root = BTRFS_I(inode)->root;
return insert_inode_locked4(inode,
@@ -6991,7 +7041,7 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, int writing)
+ struct extent_state **cached_state, bool writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
@@ -7129,30 +7179,7 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
}
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
- struct buffer_head *bh_result,
- struct inode *inode,
- u64 start, u64 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return -ENOENT;
-
- len = min(len, em->len - (start - em->start));
-
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- return 0;
-}
-
static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct buffer_head *bh_result,
struct inode *inode,
struct btrfs_dio_data *dio_data,
u64 start, u64 len)
@@ -7214,7 +7241,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
}
/* this will cow the extent */
- len = bh_result->b_size;
free_extent_map(em);
*map = em = btrfs_new_extent_direct(inode, start, len);
if (IS_ERR(em)) {
@@ -7225,64 +7251,73 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));
skip_cow:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
-
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
+ if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
out:
return ret;
}
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
- u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
- u64 len = bh_result->b_size;
+ const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
+ u64 len = length;
+ bool unlock_extents = false;
- if (!create)
+ if (!write)
len = min_t(u64, len, fs_info->sectorsize);
lockstart = start;
lockend = start + len - 1;
- if (current->journal_info) {
- /*
- * Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transaction doesn't get
- * confused.
- */
- dio_data = current->journal_info;
- current->journal_info = NULL;
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so we
+ * need to flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+
+ dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+ if (!dio_data)
+ return -ENOMEM;
+
+ dio_data->length = length;
+ if (write) {
+ dio_data->reserve = round_up(length, fs_info->sectorsize);
+ ret = btrfs_delalloc_reserve_space(inode,
+ &dio_data->data_reserved,
+ start, dio_data->reserve);
+ if (ret) {
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ return ret;
+ }
}
+ iomap->private = dio_data;
+
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- create)) {
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
ret = -ENOTBLK;
goto err;
}
@@ -7314,36 +7349,48 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto unlock_err;
}
- if (create) {
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- dio_data, start, len);
+ len = min(len, em->len - (start - em->start));
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len);
if (ret < 0)
goto unlock_err;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
} else {
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- start, len);
- /* Can be negative only if we read from a hole */
- if (ret < 0) {
- ret = 0;
- free_extent_map(em);
- goto unlock_err;
- }
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
- lockstart = start + bh_result->b_size;
- if (lockstart < lockend) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
- } else {
- free_extent_state(cached_state);
- }
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
}
+ if (unlock_extents)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = em->block_start + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->length = len;
+
free_extent_map(em);
return 0;
@@ -7352,370 +7399,152 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data)
- current->journal_info = dio_data;
+ if (dio_data) {
+ btrfs_delalloc_release_space(inode, dio_data->data_reserved,
+ start, dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ }
return ret;
}
-static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
- struct bio *bio,
- int mirror_num)
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned flags, struct iomap *iomap)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- blk_status_t ret;
+ int ret = 0;
+ struct btrfs_dio_data *dio_data = iomap->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
- BUG_ON(bio_op(bio) == REQ_OP_WRITE);
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+ goto out;
+ }
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
- if (ret)
- return ret;
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ __endio_write_update_ordered(inode, pos, length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1);
+ ret = -ENOTBLK;
+ }
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (write) {
+ if (dio_data->reserve)
+ btrfs_delalloc_release_space(inode,
+ dio_data->data_reserved, pos,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+out:
+ kfree(dio_data);
+ iomap->private = NULL;
return ret;
}
-static int btrfs_check_dio_repairable(struct inode *inode,
- struct bio *failed_bio,
- struct io_failure_record *failrec,
- int failed_mirror)
+static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int num_copies;
-
- num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
- if (num_copies == 1) {
- /*
- * we only have a single copy of the data, so don't bother with
- * all the retry and error correction code that follows. no
- * matter what the error is, it is very likely to persist.
- */
- btrfs_debug(fs_info,
- "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return 0;
- }
-
- failrec->failed_mirror = failed_mirror;
- failrec->this_mirror++;
- if (failrec->this_mirror == failed_mirror)
- failrec->this_mirror++;
+ /*
+ * This implies a barrier so that stores to dio_bio->bi_status before
+ * this and loads of dio_bio->bi_status after this are fully ordered.
+ */
+ if (!refcount_dec_and_test(&dip->refs))
+ return;
- if (failrec->this_mirror > num_copies) {
- btrfs_debug(fs_info,
- "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return 0;
+ if (bio_op(dip->dio_bio) == REQ_OP_WRITE) {
+ __endio_write_update_ordered(dip->inode, dip->logical_offset,
+ dip->bytes,
+ !dip->dio_bio->bi_status);
+ } else {
+ unlock_extent(&BTRFS_I(dip->inode)->io_tree,
+ dip->logical_offset,
+ dip->logical_offset + dip->bytes - 1);
}
- return 1;
+ bio_endio(dip->dio_bio);
+ kfree(dip);
}
-static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
- struct page *page, unsigned int pgoff,
- u64 start, u64 end, int failed_mirror,
- bio_end_io_t *repair_endio, void *repair_arg)
+static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ unsigned long bio_flags)
{
- struct io_failure_record *failrec;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct bio *bio;
- int isector;
- unsigned int read_mode = 0;
- int segs;
- int ret;
- blk_status_t status;
- struct bio_vec bvec;
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ blk_status_t ret;
- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+ BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
if (ret)
- return errno_to_blk_status(ret);
-
- ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
- failed_mirror);
- if (!ret) {
- free_io_failure(failure_tree, io_tree, failrec);
- return BLK_STS_IOERR;
- }
-
- segs = bio_segments(failed_bio);
- bio_get_first_bvec(failed_bio, &bvec);
- if (segs > 1 ||
- (bvec.bv_len > btrfs_inode_sectorsize(inode)))
- read_mode |= REQ_FAILFAST_DEV;
-
- isector = start - btrfs_io_bio(failed_bio)->logical;
- isector >>= inode->i_sb->s_blocksize_bits;
- bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
- pgoff, isector, repair_endio, repair_arg);
- bio->bi_opf = REQ_OP_READ | read_mode;
-
- btrfs_debug(BTRFS_I(inode)->root->fs_info,
- "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
- read_mode, failrec->this_mirror, failrec->in_validation);
-
- status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
- if (status) {
- free_io_failure(failure_tree, io_tree, failrec);
- bio_put(bio);
- }
-
- return status;
-}
-
-struct btrfs_retry_complete {
- struct completion done;
- struct inode *inode;
- u64 start;
- int uptodate;
-};
+ return ret;
-static void btrfs_retry_endio_nocsum(struct bio *bio)
-{
- struct btrfs_retry_complete *done = bio->bi_private;
- struct inode *inode = done->inode;
- struct bio_vec *bvec;
- struct extent_io_tree *io_tree, *failure_tree;
- struct bvec_iter_all iter_all;
-
- if (bio->bi_status)
- goto end;
-
- ASSERT(bio->bi_vcnt == 1);
- io_tree = &BTRFS_I(inode)->io_tree;
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
- ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
-
- done->uptodate = 1;
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all)
- clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
- io_tree, done->start, bvec->bv_page,
- btrfs_ino(BTRFS_I(inode)), 0);
-end:
- complete(&done->done);
- bio_put(bio);
+ refcount_inc(&dip->refs);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (ret)
+ refcount_dec(&dip->refs);
+ return ret;
}
-static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
- struct btrfs_io_bio *io_bio)
+static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
+ struct btrfs_io_bio *io_bio,
+ const bool uptodate)
{
- struct btrfs_fs_info *fs_info;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
- struct btrfs_retry_complete done;
- u64 start;
- unsigned int pgoff;
- u32 sectorsize;
- int nr_sectors;
- blk_status_t ret;
+ u64 start = io_bio->logical;
+ int icsum = 0;
blk_status_t err = BLK_STS_OK;
- fs_info = BTRFS_I(inode)->root->fs_info;
- sectorsize = fs_info->sectorsize;
-
- start = io_bio->logical;
- done.inode = inode;
- io_bio->bio.bi_iter = io_bio->iter;
+ __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
+ unsigned int i, nr_sectors, pgoff;
- bio_for_each_segment(bvec, &io_bio->bio, iter) {
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
pgoff = bvec.bv_offset;
-
-next_block_or_try_again:
- done.uptodate = 0;
- done.start = start;
- init_completion(&done.done);
-
- ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
- pgoff, start, start + sectorsize - 1,
- io_bio->mirror_num,
- btrfs_retry_endio_nocsum, &done);
- if (ret) {
- err = ret;
- goto next;
- }
-
- wait_for_completion_io(&done.done);
-
- if (!done.uptodate) {
- /* We might have another mirror, so try again */
- goto next_block_or_try_again;
- }
-
-next:
- start += sectorsize;
-
- nr_sectors--;
- if (nr_sectors) {
- pgoff += sectorsize;
+ for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
- goto next_block_or_try_again;
- }
- }
-
- return err;
-}
-
-static void btrfs_retry_endio(struct bio *bio)
-{
- struct btrfs_retry_complete *done = bio->bi_private;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct extent_io_tree *io_tree, *failure_tree;
- struct inode *inode = done->inode;
- struct bio_vec *bvec;
- int uptodate;
- int ret;
- int i = 0;
- struct bvec_iter_all iter_all;
-
- if (bio->bi_status)
- goto end;
-
- uptodate = 1;
-
- ASSERT(bio->bi_vcnt == 1);
- ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
-
- io_tree = &BTRFS_I(inode)->io_tree;
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
-
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
- bvec->bv_offset, done->start,
- bvec->bv_len);
- if (!ret)
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
- failure_tree, io_tree, done->start,
- bvec->bv_page,
- btrfs_ino(BTRFS_I(inode)),
- bvec->bv_offset);
- else
- uptodate = 0;
- i++;
- }
-
- done->uptodate = uptodate;
-end:
- complete(&done->done);
- bio_put(bio);
-}
-
-static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
- struct btrfs_io_bio *io_bio, blk_status_t err)
-{
- struct btrfs_fs_info *fs_info;
- struct bio_vec bvec;
- struct bvec_iter iter;
- struct btrfs_retry_complete done;
- u64 start;
- u64 offset = 0;
- u32 sectorsize;
- int nr_sectors;
- unsigned int pgoff;
- int csum_pos;
- bool uptodate = (err == 0);
- int ret;
- blk_status_t status;
-
- fs_info = BTRFS_I(inode)->root->fs_info;
- sectorsize = fs_info->sectorsize;
-
- err = BLK_STS_OK;
- start = io_bio->logical;
- done.inode = inode;
- io_bio->bio.bi_iter = io_bio->iter;
-
- bio_for_each_segment(bvec, &io_bio->bio, iter) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
-
- pgoff = bvec.bv_offset;
-next_block:
- if (uptodate) {
- csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
- ret = __readpage_endio_check(inode, io_bio, csum_pos,
- bvec.bv_page, pgoff, start, sectorsize);
- if (likely(!ret))
- goto next;
- }
-try_again:
- done.uptodate = 0;
- done.start = start;
- init_completion(&done.done);
-
- status = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
- pgoff, start, start + sectorsize - 1,
- io_bio->mirror_num, btrfs_retry_endio,
- &done);
- if (status) {
- err = status;
- goto next;
- }
-
- wait_for_completion_io(&done.done);
-
- if (!done.uptodate) {
- /* We might have another mirror, so try again */
- goto try_again;
- }
-next:
- offset += sectorsize;
- start += sectorsize;
-
- ASSERT(nr_sectors);
-
- nr_sectors--;
- if (nr_sectors) {
+ if (uptodate &&
+ (!csum || !check_data_csum(inode, io_bio, icsum,
+ bvec.bv_page, pgoff,
+ start, sectorsize))) {
+ clean_io_failure(fs_info, failure_tree, io_tree,
+ start, bvec.bv_page,
+ btrfs_ino(BTRFS_I(inode)),
+ pgoff);
+ } else {
+ blk_status_t status;
+
+ status = btrfs_submit_read_repair(inode,
+ &io_bio->bio,
+ start - io_bio->logical,
+ bvec.bv_page, pgoff,
+ start,
+ start + sectorsize - 1,
+ io_bio->mirror_num,
+ submit_dio_repair_bio);
+ if (status)
+ err = status;
+ }
+ start += sectorsize;
+ icsum++;
pgoff += sectorsize;
- ASSERT(pgoff < PAGE_SIZE);
- goto next_block;
}
}
-
return err;
}
-static blk_status_t btrfs_subio_endio_read(struct inode *inode,
- struct btrfs_io_bio *io_bio, blk_status_t err)
-{
- bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-
- if (skip_csum) {
- if (unlikely(err))
- return __btrfs_correct_data_nocsum(inode, io_bio);
- else
- return BLK_STS_OK;
- } else {
- return __btrfs_subio_endio_read(inode, io_bio, err);
- }
-}
-
-static void btrfs_endio_direct_read(struct bio *bio)
-{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct inode *inode = dip->inode;
- struct bio *dio_bio;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- blk_status_t err = bio->bi_status;
-
- if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
- err = btrfs_subio_endio_read(inode, io_bio, err);
-
- unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
- dip->logical_offset + dip->bytes - 1);
- dio_bio = dip->dio_bio;
-
- kfree(dip);
-
- dio_bio->bi_status = err;
- dio_end_io(dio_bio);
- btrfs_io_bio_free_csum(io_bio);
- bio_put(bio);
-}
-
static void __endio_write_update_ordered(struct inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate)
@@ -7759,21 +7588,6 @@ static void __endio_write_update_ordered(struct inode *inode,
}
}
-static void btrfs_endio_direct_write(struct bio *bio)
-{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct bio *dio_bio = dip->dio_bio;
-
- __endio_write_update_ordered(dip->inode, dip->logical_offset,
- dip->bytes, !bio->bi_status);
-
- kfree(dip);
-
- dio_bio->bi_status = bio->bi_status;
- dio_end_io(dio_bio);
- bio_put(bio);
-}
-
static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
struct bio *bio, u64 offset)
{
@@ -7797,64 +7611,16 @@ static void btrfs_end_dio_bio(struct bio *bio)
(unsigned long long)bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
- if (dip->subio_endio)
- err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
-
- if (err) {
- /*
- * We want to perceive the errors flag being set before
- * decrementing the reference count. We don't need a barrier
- * since atomic operations with a return value are fully
- * ordered as per atomic_t.txt
- */
- dip->errors = 1;
+ if (bio_op(bio) == REQ_OP_READ) {
+ err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
+ !err);
}
- /* if there are more bios still pending for this dio, just exit */
- if (!atomic_dec_and_test(&dip->pending_bios))
- goto out;
+ if (err)
+ dip->dio_bio->bi_status = err;
- if (dip->errors) {
- bio_io_error(dip->orig_bio);
- } else {
- dip->dio_bio->bi_status = BLK_STS_OK;
- bio_endio(dip->orig_bio);
- }
-out:
bio_put(bio);
-}
-
-static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
- struct btrfs_dio_private *dip,
- struct bio *bio,
- u64 file_offset)
-{
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
- u16 csum_size;
- blk_status_t ret;
-
- /*
- * We load all the csum data we need when we submit
- * the first bio to reduce the csum tree search and
- * contention.
- */
- if (dip->logical_offset == file_offset) {
- ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, file_offset,
- NULL);
- if (ret)
- return ret;
- }
-
- if (bio == dip->orig_bio)
- return 0;
-
- file_offset -= dip->logical_offset;
- file_offset >>= inode->i_sb->s_blocksize_bits;
- csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy);
- io_bio->csum = orig_io_bio->csum + csum_size * file_offset;
-
- return 0;
+ btrfs_dio_private_put(dip);
}
static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
@@ -7892,10 +7658,12 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
if (ret)
goto err;
} else {
- ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
- file_offset);
- if (ret)
- goto err;
+ u64 csum_offset;
+
+ csum_offset = file_offset - dip->logical_offset;
+ csum_offset >>= inode->i_sb->s_blocksize_bits;
+ csum_offset *= btrfs_super_csum_size(fs_info->super_copy);
+ btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
}
map:
ret = btrfs_map_bio(fs_info, bio, 0);
@@ -7903,14 +7671,53 @@ err:
return ret;
}
-static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
+/*
+ * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
+ * or ordered extents whether or not we submit any bios.
+ */
+static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
+ struct inode *inode,
+ loff_t file_offset)
{
- struct inode *inode = dip->inode;
+ const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
+ size_t dip_size;
+ struct btrfs_dio_private *dip;
+
+ dip_size = sizeof(*dip);
+ if (!write && csum) {
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ size_t nblocks;
+
+ nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
+ dip_size += csum_size * nblocks;
+ }
+
+ dip = kzalloc(dip_size, GFP_NOFS);
+ if (!dip)
+ return NULL;
+
+ dip->inode = inode;
+ dip->logical_offset = file_offset;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
+ dip->dio_bio = dio_bio;
+ refcount_set(&dip->refs, 1);
+ return dip;
+}
+
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+ struct bio *dio_bio, loff_t file_offset)
+{
+ const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
+ BTRFS_BLOCK_GROUP_RAID56_MASK);
+ struct btrfs_dio_private *dip;
struct bio *bio;
- struct bio *orig_bio = dip->orig_bio;
- u64 start_sector = orig_bio->bi_iter.bi_sector;
- u64 file_offset = dip->logical_offset;
+ u64 start_sector;
int async_submit = 0;
u64 submit_len;
int clone_offset = 0;
@@ -7918,330 +7725,108 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
+ struct btrfs_dio_data *dio_data = iomap->private;
- submit_len = orig_bio->bi_iter.bi_size;
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
- start_sector << 9, submit_len, &geom);
- if (ret)
- return -EIO;
+ dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
+ if (!dip) {
+ if (!write) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+ file_offset + dio_bio->bi_iter.bi_size - 1);
+ }
+ dio_bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(dio_bio);
+ return BLK_QC_T_NONE;
+ }
- if (geom.len >= submit_len) {
- bio = orig_bio;
- dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
- goto submit;
+ if (!write && csum) {
+ /*
+ * Load the csums up front to reduce csum tree searches and
+ * contention when submitting bios.
+ */
+ status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset,
+ dip->csums);
+ if (status != BLK_STS_OK)
+ goto out_err;
}
- /* async crcs make it difficult to collect full stripe writes. */
- if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
- async_submit = 0;
- else
- async_submit = 1;
+ start_sector = dio_bio->bi_iter.bi_sector;
+ submit_len = dio_bio->bi_iter.bi_size;
- /* bio split */
- ASSERT(geom.len <= INT_MAX);
- atomic_inc(&dip->pending_bios);
do {
+ ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio),
+ start_sector << 9, submit_len,
+ &geom);
+ if (ret) {
+ status = errno_to_blk_status(ret);
+ goto out_err;
+ }
+ ASSERT(geom.len <= INT_MAX);
+
clone_len = min_t(int, submit_len, geom.len);
/*
* This will never fail as it's passing GPF_NOFS and
* the allocation is backed by btrfs_bioset.
*/
- bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
- clone_len);
+ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
ASSERT(submit_len >= clone_len);
submit_len -= clone_len;
- if (submit_len == 0)
- break;
/*
* Increase the count before we submit the bio so we know
* the end IO handler won't happen before we increase the
* count. Otherwise, the dip might get freed before we're
* done setting it up.
+ *
+ * We transfer the initial reference to the last bio, so we
+ * don't need to increment the reference count for the last one.
*/
- atomic_inc(&dip->pending_bios);
+ if (submit_len > 0) {
+ refcount_inc(&dip->refs);
+ /*
+ * If we are submitting more than one bio, submit them
+ * all asynchronously. The exception is RAID 5 or 6, as
+ * asynchronous checksums make it difficult to collect
+ * full stripe writes.
+ */
+ if (!raid56)
+ async_submit = 1;
+ }
status = btrfs_submit_dio_bio(bio, inode, file_offset,
async_submit);
if (status) {
bio_put(bio);
- atomic_dec(&dip->pending_bios);
+ if (submit_len > 0)
+ refcount_dec(&dip->refs);
goto out_err;
}
+ dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
-
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
- start_sector << 9, submit_len, &geom);
- if (ret)
- goto out_err;
} while (submit_len > 0);
+ return BLK_QC_T_NONE;
-submit:
- status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
- if (!status)
- return 0;
-
- bio_put(bio);
out_err:
- dip->errors = 1;
- /*
- * Before atomic variable goto zero, we must make sure dip->errors is
- * perceived to be set. This ordering is ensured by the fact that an
- * atomic operations with a return value are fully ordered as per
- * atomic_t.txt
- */
- if (atomic_dec_and_test(&dip->pending_bios))
- bio_io_error(dip->orig_bio);
-
- /* bio_end_io() will handle error, so we needn't return it */
- return 0;
-}
-
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
- loff_t file_offset)
-{
- struct btrfs_dio_private *dip = NULL;
- struct bio *bio = NULL;
- struct btrfs_io_bio *io_bio;
- bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
- int ret = 0;
-
- bio = btrfs_bio_clone(dio_bio);
-
- dip = kzalloc(sizeof(*dip), GFP_NOFS);
- if (!dip) {
- ret = -ENOMEM;
- goto free_ordered;
- }
-
- dip->private = dio_bio->bi_private;
- dip->inode = inode;
- dip->logical_offset = file_offset;
- dip->bytes = dio_bio->bi_iter.bi_size;
- dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
- bio->bi_private = dip;
- dip->orig_bio = bio;
- dip->dio_bio = dio_bio;
- atomic_set(&dip->pending_bios, 0);
- io_bio = btrfs_io_bio(bio);
- io_bio->logical = file_offset;
-
- if (write) {
- bio->bi_end_io = btrfs_endio_direct_write;
- } else {
- bio->bi_end_io = btrfs_endio_direct_read;
- dip->subio_endio = btrfs_subio_endio_read;
- }
-
- /*
- * Reset the range for unsubmitted ordered extents (to a 0 length range)
- * even if we fail to submit a bio, because in such case we do the
- * corresponding error handling below and it must not be done a second
- * time by btrfs_direct_IO().
- */
- if (write) {
- struct btrfs_dio_data *dio_data = current->journal_info;
-
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
- dip->bytes;
- dio_data->unsubmitted_oe_range_start =
- dio_data->unsubmitted_oe_range_end;
- }
-
- ret = btrfs_submit_direct_hook(dip);
- if (!ret)
- return;
-
- btrfs_io_bio_free_csum(io_bio);
-
-free_ordered:
- /*
- * If we arrived here it means either we failed to submit the dip
- * or we either failed to clone the dio_bio or failed to allocate the
- * dip. If we cloned the dio_bio and allocated the dip, we can just
- * call bio_endio against our io_bio so that we get proper resource
- * cleanup if we fail to submit the dip, otherwise, we must do the
- * same as btrfs_endio_direct_[write|read] because we can't call these
- * callbacks - they require an allocated dip and a clone of dio_bio.
- */
- if (bio && dip) {
- bio_io_error(bio);
- /*
- * The end io callbacks free our dip, do the final put on bio
- * and all the cleanup and final put for dio_bio (through
- * dio_end_io()).
- */
- dip = NULL;
- bio = NULL;
- } else {
- if (write)
- __endio_write_update_ordered(inode,
- file_offset,
- dio_bio->bi_iter.bi_size,
- false);
- else
- unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
- file_offset + dio_bio->bi_iter.bi_size - 1);
-
- dio_bio->bi_status = BLK_STS_IOERR;
- /*
- * Releases and cleans up our dio_bio, no need to bio_put()
- * nor bio_endio()/bio_io_error() against dio_bio.
- */
- dio_end_io(dio_bio);
- }
- if (bio)
- bio_put(bio);
- kfree(dip);
+ dip->dio_bio->bi_status = status;
+ btrfs_dio_private_put(dip);
+ return BLK_QC_T_NONE;
}
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- int seg;
- int i;
- unsigned int blocksize_mask = fs_info->sectorsize - 1;
- ssize_t retval = -EINVAL;
-
- if (offset & blocksize_mask)
- goto out;
-
- if (iov_iter_alignment(iter) & blocksize_mask)
- goto out;
-
- /* If this is a write we don't need to check anymore */
- if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
- return 0;
- /*
- * Check to make sure we don't have duplicate iov_base's in this
- * iovec, if so return EINVAL, otherwise we'll get csum errors
- * when reading back.
- */
- for (seg = 0; seg < iter->nr_segs; seg++) {
- for (i = seg + 1; i < iter->nr_segs; i++) {
- if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
- goto out;
- }
- }
- retval = 0;
-out:
- return retval;
-}
-
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_data dio_data = { 0 };
- struct extent_changeset *data_reserved = NULL;
- loff_t offset = iocb->ki_pos;
- size_t count = 0;
- int flags = 0;
- bool wakeup = true;
- bool relock = false;
- ssize_t ret;
-
- if (check_direct_IO(fs_info, iter, offset))
- return 0;
-
- inode_dio_begin(inode);
-
- /*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so
- * we need to flush the dirty pages again to make absolutely sure
- * that any outstanding dirty pages are on disk.
- */
- count = iov_iter_count(iter);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset,
- offset + count - 1);
-
- if (iov_iter_rw(iter) == WRITE) {
- /*
- * If the write DIO is beyond the EOF, we need update
- * the isize, but it is protected by i_mutex. So we can
- * not unlock the i_mutex at this case.
- */
- if (offset + count <= inode->i_size) {
- dio_data.overwrite = 1;
- inode_unlock(inode);
- relock = true;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
- }
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
- offset, count);
- if (ret)
- goto out;
-
- /*
- * We need to know how many extents we reserved so that we can
- * do the accounting properly if we go over the number we
- * originally calculated. Abuse current->journal_info for this.
- */
- dio_data.reserve = round_up(count,
- fs_info->sectorsize);
- dio_data.unsubmitted_oe_range_start = (u64)offset;
- dio_data.unsubmitted_oe_range_end = (u64)offset;
- current->journal_info = &dio_data;
- down_read(&BTRFS_I(inode)->dio_sem);
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags)) {
- inode_dio_end(inode);
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
- wakeup = false;
- }
-
- ret = __blockdev_direct_IO(iocb, inode,
- fs_info->fs_devices->latest_bdev,
- iter, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
- if (iov_iter_rw(iter) == WRITE) {
- up_read(&BTRFS_I(inode)->dio_sem);
- current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED) {
- if (dio_data.reserve)
- btrfs_delalloc_release_space(inode, data_reserved,
- offset, dio_data.reserve, true);
- /*
- * On error we might have left some ordered extents
- * without submitting corresponding bios for them, so
- * cleanup them up to avoid other tasks getting them
- * and waiting for them to complete forever.
- */
- if (dio_data.unsubmitted_oe_range_start <
- dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(inode,
- dio_data.unsubmitted_oe_range_start,
- dio_data.unsubmitted_oe_range_end -
- dio_data.unsubmitted_oe_range_start,
- false);
- } else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode, data_reserved,
- offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
- }
-out:
- if (wakeup)
- inode_dio_end(inode);
- if (relock)
- inode_lock(inode);
+const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
- extent_changeset_free(data_reserved);
- return ret;
-}
+const struct iomap_dio_ops btrfs_dops = {
+ .submit_io = btrfs_submit_direct,
+};
#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
@@ -10539,7 +10124,7 @@ static const struct address_space_operations btrfs_aops = {
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
- .direct_IO = btrfs_direct_IO,
+ .direct_IO = noop_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION