-rw-r--r--  fs/btrfs/Makefile | 2
-rw-r--r--  fs/btrfs/accessors.h | 15
-rw-r--r--  fs/btrfs/bio.c | 4
-rw-r--r--  fs/btrfs/block-group.c | 53
-rw-r--r--  fs/btrfs/block-group.h | 3
-rw-r--r--  fs/btrfs/btrfs_inode.h | 156
-rw-r--r--  fs/btrfs/compression.c | 25
-rw-r--r--  fs/btrfs/compression.h | 2
-rw-r--r--  fs/btrfs/ctree.c | 108
-rw-r--r--  fs/btrfs/ctree.h | 18
-rw-r--r--  fs/btrfs/defrag.c | 18
-rw-r--r--  fs/btrfs/delalloc-space.c | 2
-rw-r--r--  fs/btrfs/delalloc-space.h | 2
-rw-r--r--  fs/btrfs/delayed-inode.c | 47
-rw-r--r--  fs/btrfs/delayed-inode.h | 10
-rw-r--r--  fs/btrfs/delayed-ref.c | 51
-rw-r--r--  fs/btrfs/delayed-ref.h | 8
-rw-r--r--  fs/btrfs/dev-replace.c | 4
-rw-r--r--  fs/btrfs/dir-item.c | 8
-rw-r--r--  fs/btrfs/dir-item.h | 6
-rw-r--r--  fs/btrfs/direct-io.c | 1052
-rw-r--r--  fs/btrfs/direct-io.h | 14
-rw-r--r--  fs/btrfs/disk-io.c | 128
-rw-r--r--  fs/btrfs/disk-io.h | 18
-rw-r--r--  fs/btrfs/export.c | 6
-rw-r--r--  fs/btrfs/extent-io-tree.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 685
-rw-r--r--  fs/btrfs/extent-tree.h | 8
-rw-r--r--  fs/btrfs/extent_io.c | 1092
-rw-r--r--  fs/btrfs/extent_io.h | 19
-rw-r--r--  fs/btrfs/extent_map.c | 243
-rw-r--r--  fs/btrfs/extent_map.h | 54
-rw-r--r--  fs/btrfs/fiemap.c | 930
-rw-r--r--  fs/btrfs/fiemap.h | 11
-rw-r--r--  fs/btrfs/file-item.c | 52
-rw-r--r--  fs/btrfs/file.c | 355
-rw-r--r--  fs/btrfs/file.h | 4
-rw-r--r--  fs/btrfs/free-space-cache.c | 12
-rw-r--r--  fs/btrfs/free-space-tree.c | 10
-rw-r--r--  fs/btrfs/fs.h | 17
-rw-r--r--  fs/btrfs/inode-item.c | 4
-rw-r--r--  fs/btrfs/inode.c | 1459
-rw-r--r--  fs/btrfs/ioctl.c | 94
-rw-r--r--  fs/btrfs/ioctl.h | 2
-rw-r--r--  fs/btrfs/locking.h | 1
-rw-r--r--  fs/btrfs/lru_cache.h | 1
-rw-r--r--  fs/btrfs/lzo.c | 43
-rw-r--r--  fs/btrfs/messages.c | 3
-rw-r--r--  fs/btrfs/misc.h | 4
-rw-r--r--  fs/btrfs/ordered-data.c | 146
-rw-r--r--  fs/btrfs/ordered-data.h | 27
-rw-r--r--  fs/btrfs/print-tree.c | 10
-rw-r--r--  fs/btrfs/props.c | 20
-rw-r--r--  fs/btrfs/props.h | 4
-rw-r--r--  fs/btrfs/qgroup.c | 221
-rw-r--r--  fs/btrfs/qgroup.h | 25
-rw-r--r--  fs/btrfs/raid-stripe-tree.c | 13
-rw-r--r--  fs/btrfs/raid-stripe-tree.h | 3
-rw-r--r--  fs/btrfs/raid56.c | 118
-rw-r--r--  fs/btrfs/reflink.c | 8
-rw-r--r--  fs/btrfs/relocation.c | 157
-rw-r--r--  fs/btrfs/scrub.c | 13
-rw-r--r--  fs/btrfs/send.c | 49
-rw-r--r--  fs/btrfs/send.h | 4
-rw-r--r--  fs/btrfs/space-info.c | 265
-rw-r--r--  fs/btrfs/space-info.h | 48
-rw-r--r--  fs/btrfs/subpage.c | 162
-rw-r--r--  fs/btrfs/subpage.h | 9
-rw-r--r--  fs/btrfs/super.c | 51
-rw-r--r--  fs/btrfs/super.h | 2
-rw-r--r--  fs/btrfs/sysfs.c | 85
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 5
-rw-r--r--  fs/btrfs/tests/extent-map-tests.c | 120
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 176
-rw-r--r--  fs/btrfs/transaction.c | 31
-rw-r--r--  fs/btrfs/transaction.h | 9
-rw-r--r--  fs/btrfs/tree-checker.c | 37
-rw-r--r--  fs/btrfs/tree-log.c | 74
-rw-r--r--  fs/btrfs/tree-log.h | 6
-rw-r--r--  fs/btrfs/ulist.c | 21
-rw-r--r--  fs/btrfs/ulist.h | 2
-rw-r--r--  fs/btrfs/uuid-tree.c | 10
-rw-r--r--  fs/btrfs/uuid-tree.h | 4
-rw-r--r--  fs/btrfs/volumes.c | 62
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/btrfs/xattr.c | 4
-rw-r--r--  fs/btrfs/xattr.h | 2
-rw-r--r--  fs/btrfs/zlib.c | 56
-rw-r--r--  fs/btrfs/zoned.c | 30
-rw-r--r--  fs/btrfs/zoned.h | 11
-rw-r--r--  fs/btrfs/zstd.c | 70
-rw-r--r--  include/trace/events/btrfs.h | 19
-rw-r--r--  include/uapi/linux/btrfs_tree.h | 22
93 files changed, 5175 insertions, 3905 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 525af975f61c..87617f2968bc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
- lru_cache.o raid-stripe-tree.o
+ lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 6fce3e8d3dac..b2eb9cde2c5d 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -34,7 +34,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
static inline u8 get_unaligned_le8(const void *p)
{
- return *(u8 *)p;
+ return *(const u8 *)p;
}
static inline void put_unaligned_le8(u8 val, void *p)
@@ -48,8 +48,8 @@ static inline void put_unaligned_le8(u8 val, void *p)
offsetof(type, member), \
sizeof_field(type, member)))
-#define write_eb_member(eb, ptr, type, member, result) (\
- write_extent_buffer(eb, (char *)(result), \
+#define write_eb_member(eb, ptr, type, member, source) ( \
+ write_extent_buffer(eb, (const char *)(source), \
((unsigned long)(ptr)) + \
offsetof(type, member), \
sizeof_field(type, member)))
@@ -315,11 +315,8 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
-BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
- struct btrfs_stripe_extent, encoding, 8);
BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
@@ -353,7 +350,7 @@ static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
struct btrfs_tree_block_info *item,
- struct btrfs_disk_key *key)
+ const struct btrfs_disk_key *key)
{
write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
}
@@ -446,7 +443,7 @@ void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr);
static inline void btrfs_set_node_key(const struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
+ const struct btrfs_disk_key *disk_key, int nr)
{
unsigned long ptr;
@@ -512,7 +509,7 @@ static inline void btrfs_item_key(const struct extent_buffer *eb,
}
static inline void btrfs_set_item_key(struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
+ const struct btrfs_disk_key *disk_key, int nr)
{
struct btrfs_item *item = btrfs_item_nr(eb, nr);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index e3a57196b0ee..f04d93109960 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -29,7 +29,7 @@ struct btrfs_failed_bio {
/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
- return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
+ return bbio->inode && is_data_inode(bbio->inode);
}
static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
@@ -732,7 +732,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
* point, so they are handled as part of the no-checksum case.
*/
if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
!btrfs_is_data_reloc_root(inode->root)) {
if (should_async_write(bbio) &&
btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 60066822b532..498442d0c216 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1022,6 +1022,13 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
}
+static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
+ return fs_info->block_group_root;
+ return btrfs_extent_root(fs_info, 0);
+}
+
static int remove_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_block_group *block_group)
@@ -1757,24 +1764,21 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
{
- const struct btrfs_space_info *space_info = bg->space_info;
- const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
+ u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
const u64 new_val = bg->used;
const u64 old_val = new_val + bytes_freed;
- u64 thresh;
- if (reclaim_thresh == 0)
+ if (thresh_bytes == 0)
return false;
- thresh = mult_perc(bg->length, reclaim_thresh);
-
/*
* If we were below the threshold before don't reclaim, we are likely a
* brand new block group and we don't want to relocate new block groups.
*/
- if (old_val < thresh)
+ if (old_val < thresh_bytes)
return false;
- if (new_val >= thresh)
+ if (new_val >= thresh_bytes)
return false;
return true;
}
@@ -1822,6 +1826,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
+ u64 reclaimed;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1835,6 +1840,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
/* Don't race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
+ spin_lock(&space_info->lock);
spin_lock(&bg->lock);
if (bg->reserved || bg->pinned || bg->ro) {
/*
@@ -1844,6 +1850,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
* this block group.
*/
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
@@ -1862,6 +1869,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_mark_bg_unused(bg);
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
@@ -1878,10 +1886,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!should_reclaim_block_group(bg, bg->length)) {
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
/*
* Get out fast, in case we're read-only or unmounting the
@@ -1914,15 +1924,26 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
div64_u64(bg->used * 100, bg->length),
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
+ reclaimed = bg->used;
ret = btrfs_relocate_chunk(fs_info, bg->start);
if (ret) {
btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
+ reclaimed = 0;
+ spin_lock(&space_info->lock);
+ space_info->reclaim_errors++;
+ if (READ_ONCE(space_info->periodic_reclaim))
+ space_info->periodic_reclaim_ready = false;
+ spin_unlock(&space_info->lock);
}
+ spin_lock(&space_info->lock);
+ space_info->reclaim_count++;
+ space_info->reclaim_bytes += reclaimed;
+ spin_unlock(&space_info->lock);
next:
- if (ret) {
+ if (ret && !READ_ONCE(space_info->periodic_reclaim)) {
/* Refcount held by the reclaim_bgs list after splice. */
spin_lock(&fs_info->unused_bgs_lock);
/*
@@ -1964,6 +1985,7 @@ end:
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
+ btrfs_reclaim_sweep(fs_info);
spin_lock(&fs_info->unused_bgs_lock);
if (!list_empty(&fs_info->reclaim_bgs))
queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
@@ -3662,9 +3684,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
old_val += num_bytes;
cache->used = old_val;
cache->reserved -= num_bytes;
+ cache->reclaim_mark = 0;
space_info->bytes_reserved -= num_bytes;
space_info->bytes_used += num_bytes;
space_info->disk_used += num_bytes * factor;
+ if (READ_ONCE(space_info->periodic_reclaim))
+ btrfs_space_info_update_reclaimable(space_info, -num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
} else {
@@ -3674,8 +3699,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
space_info->bytes_used -= num_bytes;
space_info->disk_used -= num_bytes * factor;
-
- reclaim = should_reclaim_block_group(cache, num_bytes);
+ if (READ_ONCE(space_info->periodic_reclaim))
+ btrfs_space_info_update_reclaimable(space_info, num_bytes);
+ else
+ reclaim = should_reclaim_block_group(cache, num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -4329,13 +4356,13 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
spin_lock(&block_group->lock);
if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
&block_group->runtime_flags)) {
- struct inode *inode = block_group->inode;
+ struct btrfs_inode *inode = block_group->inode;
block_group->inode = NULL;
spin_unlock(&block_group->lock);
ASSERT(block_group->io_ctl.inode == NULL);
- iput(inode);
+ iput(&inode->vfs_inode);
} else {
spin_unlock(&block_group->lock);
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 85e2d4cd12dc..915111338fc0 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -115,7 +115,7 @@ struct btrfs_caching_control {
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
- struct inode *inode;
+ struct btrfs_inode *inode;
spinlock_t lock;
u64 start;
u64 length;
@@ -263,6 +263,7 @@ struct btrfs_block_group {
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
enum btrfs_block_group_size_class size_class;
+ u64 reclaim_mark;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ed495ca7a31..3056c8aed8ef 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,7 +19,6 @@
#include <uapi/linux/btrfs_tree.h>
#include <trace/events/btrfs.h>
#include "block-rsv.h"
-#include "btrfs_inode.h"
#include "extent_map.h"
#include "extent_io.h"
#include "extent-io-tree.h"
@@ -99,6 +98,29 @@ enum {
* range).
*/
BTRFS_INODE_COW_WRITE_ERROR,
+ /*
+ * Indicate this is a directory that points to a subvolume for which
+ * there is no root reference item. That's a case like the following:
+ *
+ * $ btrfs subvolume create /mnt/parent
+ * $ btrfs subvolume create /mnt/parent/child
+ * $ btrfs subvolume snapshot /mnt/parent /mnt/snap
+ *
+ * If subvolume "parent" is root 256, subvolume "child" is root 257 and
+ * snapshot "snap" is root 258, then there's no root reference item (key
+ * BTRFS_ROOT_REF_KEY in the root tree) for the subvolume "child"
+ * associated to root 258 (the snapshot) - there's only for the root
+ * of the "parent" subvolume (root 256). In the root tree we have a
+ * (256 BTRFS_ROOT_REF_KEY 257) key but we don't have a
+ * (258 BTRFS_ROOT_REF_KEY 257) key - the same goes for backrefs, we
+ * have a (257 BTRFS_ROOT_BACKREF_KEY 256) but we don't have a
+ * (257 BTRFS_ROOT_BACKREF_KEY 258) key.
+ *
+ * So when opening the "child" dentry from the snapshot's directory,
+ * we don't find a root ref item and we create a stub inode. This is
+ * done at new_simple_dir(), called from btrfs_lookup_dentry().
+ */
+ BTRFS_INODE_ROOT_STUB,
};
/* in memory btrfs inode */
@@ -106,10 +128,14 @@ struct btrfs_inode {
/* which subvolume this inode belongs to */
struct btrfs_root *root;
- /* key used to find this inode on disk. This is used by the code
- * to read in roots of subvolumes
+#if BITS_PER_LONG == 32
+ /*
+ * The objectid of the corresponding BTRFS_INODE_ITEM_KEY.
+ * On 64 bits platforms we can get it from vfs_inode.i_ino, which is an
+ * unsigned long and therefore 64 bits on such platforms.
*/
- struct btrfs_key location;
+ u64 objectid;
+#endif
/* Cached value of inode property 'compression'. */
u8 prop_compress;
@@ -165,9 +191,6 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
- /* node for the red-black tree that links inodes in subvolume root */
- struct rb_node rb_node;
-
unsigned long runtime_flags;
/* full 64 bit generation number, struct vfs_inode doesn't have a big
@@ -228,11 +251,20 @@ struct btrfs_inode {
u64 last_dir_index_offset;
};
- /*
- * Total number of bytes pending defrag, used by stat to check whether
- * it needs COW. Protected by 'lock'.
- */
- u64 defrag_bytes;
+ union {
+ /*
+ * Total number of bytes pending defrag, used by stat to check whether
+ * it needs COW. Protected by 'lock'.
+ * Used by inodes other than the data relocation inode.
+ */
+ u64 defrag_bytes;
+
+ /*
+ * Logical address of the block group being relocated.
+ * Used only by the data relocation inode.
+ */
+ u64 reloc_block_group_start;
+ };
/*
* The size of the file stored in the metadata on disk. data=ordered
@@ -241,12 +273,21 @@ struct btrfs_inode {
*/
u64 disk_i_size;
- /*
- * If this is a directory then index_cnt is the counter for the index
- * number for new files that are created. For an empty directory, this
- * must be initialized to BTRFS_DIR_START_INDEX.
- */
- u64 index_cnt;
+ union {
+ /*
+ * If this is a directory then index_cnt is the counter for the
+ * index number for new files that are created. For an empty
+ * directory, this must be initialized to BTRFS_DIR_START_INDEX.
+ */
+ u64 index_cnt;
+
+ /*
+ * If this is not a directory, this is the number of bytes
+ * outstanding that are going to need csums. This is used in
+ * ENOSPC accounting. Protected by 'lock'.
+ */
+ u64 csum_bytes;
+ };
/* Cache the directory index number to speed the dir/file remove */
u64 dir_index;
@@ -258,22 +299,25 @@ struct btrfs_inode {
*/
u64 last_unlink_trans;
- /*
- * The id/generation of the last transaction where this inode was
- * either the source or the destination of a clone/dedupe operation.
- * Used when logging an inode to know if there are shared extents that
- * need special care when logging checksum items, to avoid duplicate
- * checksum items in a log (which can lead to a corruption where we end
- * up with missing checksum ranges after log replay).
- * Protected by the vfs inode lock.
- */
- u64 last_reflink_trans;
+ union {
+ /*
+ * The id/generation of the last transaction where this inode
+ * was either the source or the destination of a clone/dedupe
+ * operation. Used when logging an inode to know if there are
+ * shared extents that need special care when logging checksum
+ * items, to avoid duplicate checksum items in a log (which can
+ * lead to a corruption where we end up with missing checksum
+ * ranges after log replay). Protected by the VFS inode lock.
+ * Used for regular files only.
+ */
+ u64 last_reflink_trans;
- /*
- * Number of bytes outstanding that are going to need csums. This is
- * used in ENOSPC accounting. Protected by 'lock'.
- */
- u64 csum_bytes;
+ /*
+ * In case this a root stub inode (BTRFS_INODE_ROOT_STUB flag set),
+ * the ID of that root.
+ */
+ u64 ref_root_id;
+ };
/* Backwards incompatible flags, lower half of inode_item::flags */
u32 flags;
@@ -331,10 +375,9 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
*/
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
- u64 ino = inode->location.objectid;
+ u64 ino = inode->objectid;
- /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
- if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
+ if (test_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags))
ino = inode->vfs_inode.i_ino;
return ino;
}
@@ -348,20 +391,36 @@ static inline u64 btrfs_ino(const struct btrfs_inode *inode)
#endif
+static inline void btrfs_get_inode_key(const struct btrfs_inode *inode,
+ struct btrfs_key *key)
+{
+ key->objectid = btrfs_ino(inode);
+ key->type = BTRFS_INODE_ITEM_KEY;
+ key->offset = 0;
+}
+
+static inline void btrfs_set_inode_number(struct btrfs_inode *inode, u64 ino)
+{
+#if BITS_PER_LONG == 32
+ inode->objectid = ino;
+#endif
+ inode->vfs_inode.i_ino = ino;
+}
+
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
i_size_write(&inode->vfs_inode, size);
inode->disk_i_size = size;
}
-static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
+static inline bool btrfs_is_free_space_inode(const struct btrfs_inode *inode)
{
return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
}
-static inline bool is_data_inode(struct inode *inode)
+static inline bool is_data_inode(const struct btrfs_inode *inode)
{
- return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
+ return btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID;
}
static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
@@ -455,8 +514,8 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u32 bio_offset, struct bio_vec *bv);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool nowait, bool strict);
+ struct btrfs_file_extent *file_extent,
+ bool nowait, bool strict);
void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
@@ -515,9 +574,9 @@ void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
- struct btrfs_root *root, struct btrfs_path *path);
-struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
+struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path);
+struct inode *btrfs_iget(u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
@@ -551,10 +610,6 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before);
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before);
struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino);
extern const struct dentry_operations btrfs_dentry_operations;
@@ -571,5 +626,10 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes,
const u64 del_bytes);
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
+u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+ u64 num_bytes);
+struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
+ const struct btrfs_file_extent *file_extent,
+ int type);
#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6441e47d8a5e..a8e2c461aff7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -261,7 +261,7 @@ void btrfs_free_compr_folio(struct folio *folio)
folio_put(folio);
}
-static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
+static void end_bbio_compressed_read(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
blk_status_t status = bbio->bio.bi_status;
@@ -334,7 +334,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_bbio_comprssed_write(struct btrfs_bio *bbio)
+static void end_bbio_compressed_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
@@ -374,7 +374,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
blk_opf_t write_flags,
bool writeback)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct compressed_bio *cb;
@@ -383,7 +383,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb = alloc_compressed_bio(inode, ordered->file_offset,
REQ_OP_WRITE | write_flags,
- end_bbio_comprssed_write);
+ end_bbio_compressed_write);
cb->start = ordered->file_offset;
cb->len = ordered->num_bytes;
cb->compressed_folios = compressed_folios;
@@ -507,13 +507,15 @@ static noinline int add_ra_bio_pages(struct inode *inode,
*/
if (!em || cur < em->start ||
(cur + fs_info->sectorsize > extent_map_end(em)) ||
- (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) {
+ (extent_map_block_start(em) >> SECTOR_SHIFT) !=
+ orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
}
+ add_size = min(em->start + em->len, page_end + 1) - cur;
free_extent_map(em);
if (page->index == end_index) {
@@ -526,7 +528,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
}
- add_size = min(em->start + em->len, page_end + 1) - cur;
ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur));
if (ret != add_size) {
unlock_extent(tree, cur, page_end, NULL);
@@ -585,12 +586,12 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
}
ASSERT(extent_map_is_compressed(em));
- compressed_len = em->block_len;
+ compressed_len = em->disk_num_bytes;
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
- end_bbio_comprssed_read);
+ end_bbio_compressed_read);
- cb->start = em->orig_start;
+ cb->start = em->start - em->offset;
em_len = em->len;
em_start = em->start;
@@ -608,7 +609,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
goto out_free_bio;
}
- ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios, 0);
+ ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
if (ret2) {
ret = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
@@ -1506,7 +1507,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*
* Return non-zero if the compression should be done, 0 otherwise.
*/
-int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
+int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
{
struct list_head *ws_list = get_workspace(0, 0);
struct heuristic_ws *ws;
@@ -1516,7 +1517,7 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
ws = list_entry(ws_list, struct heuristic_ws, list);
- heuristic_collect_sample(inode, start, end, ws);
+ heuristic_collect_sample(&inode->vfs_inode, start, end, ws);
if (sample_repeated_patterns(ws)) {
ret = 1;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index c20c1a1b09d5..cfdc64319186 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -144,7 +144,7 @@ extern const struct btrfs_compress_op btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
bool btrfs_compress_is_valid_type(const char *str, size_t len);
-int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1a49b9232990..451203055bbf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -321,7 +321,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- trans->transid != root->last_trans);
+ trans->transid != btrfs_get_root_last_trans(root));
level = btrfs_header_level(buf);
if (level == 0)
@@ -417,7 +417,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
u64 refs;
u64 owner;
u64 flags;
- u64 new_flags = 0;
int ret;
/*
@@ -462,8 +461,16 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
}
owner = btrfs_header_owner(buf);
- BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
- !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+ if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID &&
+ !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) {
+ btrfs_crit(fs_info,
+"found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set",
+ buf->start, btrfs_header_level(buf),
+ btrfs_root_id(root), refs, flags);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
if (refs > 1) {
if ((owner == btrfs_root_id(root) ||
@@ -481,7 +488,10 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (ret)
return ret;
}
- new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ ret = btrfs_set_disk_extent_flags(trans, buf,
+ BTRFS_BLOCK_FLAG_FULL_BACKREF);
+ if (ret)
+ return ret;
} else {
if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
@@ -491,11 +501,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (ret)
return ret;
}
- if (new_flags != 0) {
- ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
- if (ret)
- return ret;
- }
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
@@ -551,7 +556,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- trans->transid != root->last_trans);
+ trans->transid != btrfs_get_root_last_trans(root));
level = btrfs_header_level(buf);
@@ -588,19 +593,15 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
}
@@ -612,27 +613,27 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
if (ret < 0) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
atomic_inc(&cow->refs);
rcu_assign_pointer(root->node, cow);
- btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
- parent_start, last_ref);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto error_unlock_cow;
+ }
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
BTRFS_MOD_LOG_KEY_REPLACE);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
@@ -642,14 +643,16 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
}
- btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
- parent_start, last_ref);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto error_unlock_cow;
+ }
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -657,6 +660,11 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(trans, cow);
*cow_ret = cow;
return 0;
+
+error_unlock_cow:
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ return ret;
}
static inline int should_cow_block(struct btrfs_trans_handle *trans,
@@ -983,9 +991,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used_bytes(root);
- btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
return 0;
}
if (btrfs_header_nritems(mid) >
@@ -1053,10 +1065,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
root_sub_used_bytes(root);
- btrfs_free_tree_block(trans, btrfs_root_id(root), right,
- 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
+ right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
@@ -1111,9 +1127,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
root_sub_used_bytes(root);
- btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
@@ -1551,12 +1571,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (ret) {
free_extent_buffer(tmp);
btrfs_release_path(p);
- return -EIO;
- }
- if (btrfs_check_eb_owner(tmp, btrfs_root_id(root))) {
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EUCLEAN;
+ return ret;
}
if (unlock_up)
@@ -2883,7 +2898,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
old = root->node;
ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
if (ret < 0) {
- btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
+ int ret2;
+
+ ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
+ if (ret2 < 0)
+ btrfs_abort_transaction(trans, ret2);
btrfs_tree_unlock(c);
free_extent_buffer(c);
return ret;
@@ -4452,9 +4471,12 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used_bytes(root);
atomic_inc(&leaf->refs);
- btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
- return 0;
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
}
/*
* delete the item at the leaf level in path. If that empties
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c03c58246033..c8568b1a61c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -221,9 +221,11 @@ struct btrfs_root {
struct list_head root_list;
- spinlock_t inode_lock;
- /* red-black tree that keeps track of in-memory inodes */
- struct rb_root inode_tree;
+ /*
+ * Xarray that keeps track of in-memory inodes, protected by the lock
+ * @inode_lock.
+ */
+ struct xarray inodes;
/*
* Xarray that keeps track of delayed nodes of every inode, protected
@@ -354,6 +356,16 @@ static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int c
WRITE_ONCE(root->last_log_commit, commit_id);
}
+static inline u64 btrfs_get_root_last_trans(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->last_trans);
+}
+
+static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transid)
+{
+ WRITE_ONCE(root->last_trans, transid);
+}
+
/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 407ccec3e57e..f6dbda37a361 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -139,7 +139,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
if (trans)
transid = trans->transid;
else
- transid = inode->root->last_trans;
+ transid = btrfs_get_root_last_trans(root);
defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
@@ -255,7 +255,7 @@ again:
goto cleanup;
}
- inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+ inode = btrfs_iget(defrag->ino, inode_root);
btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -707,8 +707,10 @@ iterate:
*/
if (key.offset > start) {
em->start = start;
- em->orig_start = start;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
+ em->disk_num_bytes = 0;
+ em->ram_bytes = 0;
+ em->offset = 0;
em->len = key.offset - start;
break;
}
@@ -825,7 +827,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
*/
next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
/* No more em or hole */
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
goto out;
if (next->flags & EXTENT_FLAG_PREALLOC)
goto out;
@@ -992,12 +994,12 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* This is for users who want to convert inline extents to
* regular ones through max_inline= mount option.
*/
- if (em->block_start == EXTENT_MAP_INLINE &&
+ if (em->disk_bytenr == EXTENT_MAP_INLINE &&
em->len <= inode->root->fs_info->max_inline)
goto next;
/* Skip holes and preallocated extents. */
- if (em->block_start == EXTENT_MAP_HOLE ||
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC))
goto next;
@@ -1062,7 +1064,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* So if an inline extent passed all above checks, just add it
* for defrag, and be converted to regular extents.
*/
- if (em->block_start == EXTENT_MAP_INLINE)
+ if (em->disk_bytenr == EXTENT_MAP_INLINE)
goto add;
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index b3527efd0b4b..7aa8a395d838 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -111,7 +111,7 @@
* making error handling and cleanup easier.
*/
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
+int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index ce4f889e4f17..3f32953c0a80 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -9,7 +9,7 @@ struct extent_changeset;
struct btrfs_inode;
struct btrfs_fs_info;
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
+int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len,
bool noflush);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 95a0497fa866..508bdbae29a0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -77,14 +77,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
return node;
}
- spin_lock(&root->inode_lock);
+ xa_lock(&root->delayed_nodes);
node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
refcount_inc(&node->refs); /* can be accessed */
BUG_ON(btrfs_inode->delayed_node != node);
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return node;
}
@@ -111,10 +111,10 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = NULL;
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return node;
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return NULL;
}
@@ -148,21 +148,21 @@ again:
kmem_cache_free(delayed_node_cache, node);
return ERR_PTR(-ENOMEM);
}
- spin_lock(&root->inode_lock);
+ xa_lock(&root->delayed_nodes);
ptr = xa_load(&root->delayed_nodes, ino);
if (ptr) {
/* Somebody inserted it, go back and read it. */
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
kmem_cache_free(delayed_node_cache, node);
node = NULL;
goto again;
}
- ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+ ptr = __xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
ASSERT(xa_err(ptr) != -EINVAL);
ASSERT(xa_err(ptr) != -ENOMEM);
ASSERT(ptr == NULL);
btrfs_inode->delayed_node = node;
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return node;
}
@@ -275,14 +275,12 @@ static void __btrfs_release_delayed_node(
if (refcount_dec_and_test(&delayed_node->refs)) {
struct btrfs_root *root = delayed_node->root;
- spin_lock(&root->inode_lock);
+ xa_erase(&root->delayed_nodes, delayed_node->inode_id);
/*
* Once our refcount goes to zero, nobody is allowed to bump it
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- xa_erase(&root->delayed_nodes, delayed_node->inode_id);
- spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
}
@@ -1471,7 +1469,7 @@ static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
- struct btrfs_disk_key *disk_key, u8 flags,
+ const struct btrfs_disk_key *disk_key, u8 flags,
u64 index)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1684,7 +1682,7 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
return 0;
}
-bool btrfs_readdir_get_delayed_items(struct inode *inode,
+bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
u64 last_index,
struct list_head *ins_list,
struct list_head *del_list)
@@ -1692,7 +1690,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *item;
- delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
+ delayed_node = btrfs_get_delayed_node(inode);
if (!delayed_node)
return false;
@@ -1700,8 +1698,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* We can only do one readdir with delayed items at a time because of
* item->readdir_list.
*/
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
- btrfs_inode_lock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(inode, 0);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@@ -1732,7 +1730,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
return true;
}
-void btrfs_readdir_put_delayed_items(struct inode *inode,
+void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list)
{
@@ -1754,10 +1752,10 @@ void btrfs_readdir_put_delayed_items(struct inode *inode,
* The VFS is going to do up_read(), so we need to downgrade back to a
* read lock.
*/
- downgrade_write(&inode->i_rwsem);
+ downgrade_write(&inode->vfs_inode.i_rwsem);
}
-int btrfs_should_delete_dir_index(struct list_head *del_list,
+int btrfs_should_delete_dir_index(const struct list_head *del_list,
u64 index)
{
struct btrfs_delayed_item *curr;
@@ -1778,7 +1776,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
* Read dir info stored in the delayed tree.
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list)
+ const struct list_head *ins_list)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
@@ -1916,7 +1914,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
inode->i_generation = BTRFS_I(inode)->generation;
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = (u64)-1;
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
@@ -2057,9 +2056,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
struct btrfs_delayed_node *node;
int count;
- spin_lock(&root->inode_lock);
+ xa_lock(&root->delayed_nodes);
if (xa_empty(&root->delayed_nodes)) {
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return;
}
@@ -2076,7 +2075,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
if (count >= ARRAY_SIZE(delayed_nodes))
break;
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
index++;
for (int i = 0; i < count; i++) {
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 64e115d97499..7cfefdfe54ea 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -110,7 +110,7 @@ void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root);
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
- struct btrfs_disk_key *disk_key, u8 flags,
+ const struct btrfs_disk_key *disk_key, u8 flags,
u64 index);
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
@@ -143,17 +143,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info);
/* Used for readdir() */
-bool btrfs_readdir_get_delayed_items(struct inode *inode,
+bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
u64 last_index,
struct list_head *ins_list,
struct list_head *del_list);
-void btrfs_readdir_put_delayed_items(struct inode *inode,
+void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list);
-int btrfs_should_delete_dir_index(struct list_head *del_list,
+int btrfs_should_delete_dir_index(const struct list_head *del_list,
u64 index);
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list);
+ const struct list_head *ins_list);
/* Used during directory logging. */
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6cc80fb10da2..2ac9296edccb 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -195,48 +195,6 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
}
/*
- * Transfer bytes to our delayed refs rsv.
- *
- * @fs_info: the filesystem
- * @num_bytes: number of bytes to transfer
- *
- * This transfers up to the num_bytes amount, previously reserved, to the
- * delayed_refs_rsv. Any extra bytes are returned to the space info.
- */
-void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- u64 num_bytes)
-{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
- u64 to_free = 0;
-
- spin_lock(&delayed_refs_rsv->lock);
- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
- u64 delta = delayed_refs_rsv->size -
- delayed_refs_rsv->reserved;
- if (num_bytes > delta) {
- to_free = num_bytes - delta;
- num_bytes = delta;
- }
- } else {
- to_free = num_bytes;
- num_bytes = 0;
- }
-
- if (num_bytes)
- delayed_refs_rsv->reserved += num_bytes;
- if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
- delayed_refs_rsv->full = true;
- spin_unlock(&delayed_refs_rsv->lock);
-
- if (num_bytes)
- trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
- 0, num_bytes, 1);
- if (to_free)
- btrfs_space_info_free_bytes_may_use(fs_info,
- delayed_refs_rsv->space_info, to_free);
-}
-
-/*
* Refill based on our delayed refs usage.
*
* @fs_info: the filesystem
@@ -861,6 +819,12 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
+ /* If not metadata set an impossible level to help debugging. */
+ if (generic_ref->type == BTRFS_REF_METADATA)
+ head_ref->level = generic_ref->tree_ref.level;
+ else
+ head_ref->level = U8_MAX;
+
if (qrecord) {
if (generic_ref->ref_root && reserved) {
qrecord->data_rsv = reserved;
@@ -1114,7 +1078,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes,
+ u64 bytenr, u64 num_bytes, u8 level,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_ref_head *head_ref;
@@ -1124,6 +1088,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
.action = BTRFS_UPDATE_DELAYED_HEAD,
.bytenr = bytenr,
.num_bytes = num_bytes,
+ .tree_ref.level = level,
};
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 04b180ebe1fe..ef15e998be03 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -108,7 +108,6 @@ struct btrfs_delayed_ref_node {
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
- u8 level;
bool update_key;
bool update_flags;
u64 flags_to_set;
@@ -172,6 +171,9 @@ struct btrfs_delayed_ref_head {
*/
u64 reserved_bytes;
+ /* Tree block level, for metadata only. */
+ u8 level;
+
/*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
@@ -355,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
u64 reserved);
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes,
+ u64 bytenr, u64 num_bytes, u8 level,
struct btrfs_delayed_extent_op *extent_op);
void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
@@ -386,8 +388,6 @@ void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
-void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- u64 num_bytes);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7130040d92ab..f638c458d285 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -684,7 +684,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
/*
* Commit dev_replace state and reserve 1 item for it.
@@ -880,7 +880,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
/*
* We have to use this loop approach because at this point src_device
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 9c07d5c3e5ad..001c0c2f872c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -22,7 +22,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
*trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct btrfs_key *cpu_key,
+ const struct btrfs_key *cpu_key,
u32 data_size,
const char *name,
int name_len)
@@ -108,7 +108,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
- struct btrfs_key *location, u8 type, u64 index)
+ const struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
int ret2 = 0;
@@ -379,7 +379,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* for a specific name.
*/
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
const char *name, int name_len)
{
struct btrfs_dir_item *dir_item;
@@ -417,7 +417,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct btrfs_dir_item *di)
+ const struct btrfs_dir_item *di)
{
struct extent_buffer *leaf;
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 00b3d83d7569..5f6dfafc91f1 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -17,7 +17,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
- struct btrfs_key *location, u8 type, u64 index);
+ const struct btrfs_key *location, u8 type, u64 index);
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
@@ -33,7 +33,7 @@ struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root,
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct btrfs_dir_item *di);
+ const struct btrfs_dir_item *di);
int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -45,7 +45,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
const char *name,
int name_len);
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
new file mode 100644
index 000000000000..f9fb2db6a1e4
--- /dev/null
+++ b/fs/btrfs/direct-io.c
@@ -0,0 +1,1052 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/fsverity.h>
+#include <linux/iomap.h>
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "direct-io.h"
+#include "extent-tree.h"
+#include "file.h"
+#include "fs.h"
+#include "transaction.h"
+#include "volumes.h"
+
+struct btrfs_dio_data {
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ struct btrfs_ordered_extent *ordered;
+ bool data_space_reserved;
+ bool nocow_done;
+};
+
+struct btrfs_dio_private {
+ /* Range of I/O */
+ u64 file_offset;
+ u32 bytes;
+
+ /* This must be last */
+ struct btrfs_bio bbio;
+};
+
+static struct bio_set btrfs_dio_bioset;
+
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+ struct extent_state **cached_state,
+ unsigned int iomap_flags)
+{
+ const bool writing = (iomap_flags & IOMAP_WRITE);
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ int ret = 0;
+
+ while (1) {
+ if (nowait) {
+ if (!try_lock_extent(io_tree, lockstart, lockend,
+ cached_state))
+ return -EAGAIN;
+ } else {
+ lock_extent(io_tree, lockstart, lockend, cached_state);
+ }
+ /*
+ * We're concerned with the entire range that we're going to be
+ * doing DIO to, so we need to make sure there's no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
+ lockend - lockstart + 1);
+
+ /*
+ * We need to make sure there are no buffered pages in this
+ * range either, we could have raced between the invalidate in
+ * generic_file_direct_write and locking the extent. The
+ * invalidate needs to happen so that reads after a write do not
+ * get stale data.
+ */
+ if (!ordered &&
+ (!writing || !filemap_range_has_page(inode->i_mapping,
+ lockstart, lockend)))
+ break;
+
+ unlock_extent(io_tree, lockstart, lockend, cached_state);
+
+ if (ordered) {
+ if (nowait) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ break;
+ }
+ /*
+ * If we are doing a DIO read and the ordered extent we
+ * found is for a buffered write, we can not wait for it
+ * to complete and retry, because if we do so we can
+ * deadlock with concurrent buffered writes on page
+ * locks. This happens only if our DIO read covers more
+ * than one extent map, if at this point it has already
+ * created an ordered extent for a previous extent map
+ * and locked its range in the inode's io tree, and a
+ * concurrent write against that previous extent map's
+ * range and this range started (we unlock the ranges
+ * in the io tree only when the bios complete and
+ * buffered writes always lock pages before attempting
+ * to lock range in the io tree).
+ */
+ if (writing ||
+ test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
+ btrfs_start_ordered_extent(ordered);
+ else
+ ret = nowait ? -EAGAIN : -ENOTBLK;
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ /*
+ * We could trigger writeback for this range (and wait
+ * for it to complete) and then invalidate the pages for
+ * this range (through invalidate_inode_pages2_range()),
+ * but that can lead us to a deadlock with a concurrent
+ * call to readahead (a buffered read or a defrag call
+ * triggered a readahead) on a page lock due to an
+ * ordered dio extent we created before but did not yet
+ * have a corresponding bio submitted (hence it cannot
+ * complete), which makes readahead wait for that
+ * ordered extent to complete while holding a lock on
+ * that page.
+ */
+ ret = nowait ? -EAGAIN : -ENOTBLK;
+ }
+
+ if (ret)
+ break;
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
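+/*
+ * Create the extent map (except for NOCOW writes) and the ordered extent for
+ * a direct I/O write, stashing the ordered extent in dio_data->ordered.
+ */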
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
+ struct btrfs_dio_data *dio_data,
+ const u64 start,
+ const struct btrfs_file_extent *file_extent,
+ const int type)
+{
+ struct extent_map *em = NULL;
+ struct btrfs_ordered_extent *ordered;
+
+ if (type != BTRFS_ORDERED_NOCOW) {
+ em = btrfs_create_io_em(inode, start, file_extent, type);
+ if (IS_ERR(em))
+ goto out;
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
+ (1 << type) |
+ (1 << BTRFS_ORDERED_DIRECT));
+ if (IS_ERR(ordered)) {
+ if (em) {
+ free_extent_map(em);
+ btrfs_drop_extent_map_range(inode, start,
+ start + file_extent->num_bytes - 1, false);
+ }
+ em = ERR_CAST(ordered);
+ } else {
+ ASSERT(!dio_data->ordered);
+ dio_data->ordered = ordered;
+ }
+ out:
+
+ return em;
+}
+
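+/*
+ * Reserve a new data extent for a COW direct I/O write and set up the
+ * corresponding extent map and ordered extent.
+ */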
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
+ struct btrfs_dio_data *dio_data,
+ u64 start, u64 len)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_file_extent file_extent;
+ struct extent_map *em;
+ struct btrfs_key ins;
+ u64 alloc_hint;
+ int ret;
+
+ alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
+again:
+ ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
+ 0, alloc_hint, &ins, 1, 1);
+ if (ret == -EAGAIN) {
+ ASSERT(btrfs_is_zoned(fs_info));
+ wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ goto again;
+ }
+ if (ret)
+ return ERR_PTR(ret);
+
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = ins.offset;
+ file_extent.ram_bytes = ins.offset;
+ file_extent.offset = 0;
+ file_extent.compression = BTRFS_COMPRESS_NONE;
+ em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
+ BTRFS_ORDERED_REGULAR);
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ if (IS_ERR(em))
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
+ 1);
+
+ return em;
+}
+
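+/*
+ * Prepare the extent backing a direct I/O write: reuse an existing extent for
+ * a NOCOW/prealloc write when possible, otherwise allocate a new one, and
+ * reserve the required metadata (and previously reserved data) space.
+ */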
+static int btrfs_get_blocks_direct_write(struct extent_map **map,
+ struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ u64 start, u64 *lenp,
+ unsigned int iomap_flags)
+{
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_file_extent file_extent;
+ struct extent_map *em = *map;
+ int type;
+ u64 block_start;
+ struct btrfs_block_group *bg;
+ bool can_nocow = false;
+ bool space_reserved = false;
+ u64 len = *lenp;
+ u64 prev_len;
+ int ret = 0;
+
+ /*
+ * We don't allocate a new extent in the following cases
+ *
+ * 1) The inode is marked as NODATACOW. In this case we'll just use the
+ * existing extent.
+ * 2) The extent is marked as PREALLOC. We're good to go here and can
+ * just use the extent.
+ *
+ */
+ if ((em->flags & EXTENT_FLAG_PREALLOC) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->disk_bytenr != EXTENT_MAP_HOLE)) {
+ if (em->flags & EXTENT_FLAG_PREALLOC)
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+ type = BTRFS_ORDERED_NOCOW;
+ len = min(len, em->len - (start - em->start));
+ block_start = extent_map_block_start(em) + (start - em->start);
+
+ if (can_nocow_extent(inode, start, &len,
+ &file_extent, false, false) == 1) {
+ bg = btrfs_inc_nocow_writers(fs_info, block_start);
+ if (bg)
+ can_nocow = true;
+ }
+ }
+
+ prev_len = len;
+ if (can_nocow) {
+ struct extent_map *em2;
+
+ /* We can NOCOW, so only need to reserve metadata space. */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ nowait);
+ if (ret < 0) {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+ btrfs_dec_nocow_writers(bg);
+ if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+ ret = -EAGAIN;
+ goto out;
+ }
+ space_reserved = true;
+
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
+ &file_extent, type);
+ btrfs_dec_nocow_writers(bg);
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ *map = em2;
+ em = em2;
+ }
+
+ if (IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
+ goto out;
+ }
+
+ dio_data->nocow_done = true;
+ } else {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+
+ if (nowait) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /*
+ * If we could not allocate data space before locking the file
+ * range and we can't do a NOCOW write, then we have to fail.
+ */
+ if (!dio_data->data_space_reserved) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ /*
+ * We have to COW and we have already reserved data space before,
+ * so now we reserve only metadata.
+ */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ false);
+ if (ret < 0)
+ goto out;
+ space_reserved = true;
+
+ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ *map = em;
+ len = min(len, em->len - (start - em->start));
+ if (len < prev_len)
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ prev_len - len, true);
+ }
+
+ /*
+ * We have created our ordered extent, so we can now release our reservation
+ * for an outstanding extent.
+ */
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
+
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
+out:
+ if (ret && space_reserved) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+ }
+ *lenp = len;
+ return ret;
+}
+
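+/*
+ * iomap_begin callback: lock the file range, resolve the extent map and
+ * translate it into an iomap entry for the direct I/O code.
+ */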
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct extent_map *em;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_dio_data *dio_data = iter->private;
+ u64 lockstart, lockend;
+ const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
+ u64 len = length;
+ const u64 data_alloc_len = length;
+ bool unlock_extents = false;
+
+ /*
+ * We could potentially fault if we have a buffer > PAGE_SIZE, and if
+ * we're NOWAIT we may submit a bio for a partial range and return
+ * EIOCBQUEUED, which would result in an errant short read.
+ *
+ * The best way to handle this would be to allow for partial completions
+ * of iocbs, so we could submit the partial bio, return and fault in
+ * the rest of the pages, and then submit the io for the rest of the
+ * range. However we don't have that currently, so simply return
+ * -EAGAIN at this point so that the normal path is used.
+ */
+ if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
+ return -EAGAIN;
+
+ /*
+ * Cap the size of reads to that usually seen in buffered I/O as we need
+ * to allocate a contiguous array for the checksums.
+ */
+ if (!write)
+ len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
+
+ lockstart = start;
+ lockend = start + len - 1;
+
+ /*
+ * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk - the first flush only starts
+ * compression on the data, while keeping the pages locked, so by the
+ * time the second flush returns we know bios for the compressed pages
+ * were submitted and finished, and the pages are no longer under writeback.
+ *
+ * If we have a NOWAIT request and we have any pages in the range that
+ * are locked, likely due to compression still in progress, we don't want
+ * to block on page locks. We also don't want to block on pages marked as
+ * dirty or under writeback (same as for the non-compression case).
+ * iomap_dio_rw() did the same check, but after that and before we got
+ * here, mmap'ed writes may have happened or buffered reads started
+ * (readpage() and readahead(), which lock pages), as we haven't locked
+ * the file range yet.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (flags & IOMAP_NOWAIT) {
+ if (filemap_range_needs_writeback(inode->i_mapping,
+ lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
+ }
+ }
+
+ memset(dio_data, 0, sizeof(*dio_data));
+
+ /*
+ * We always try to allocate data space and must do it before locking
+ * the file range, to avoid deadlocks with concurrent writes to the same
+ * range if the range has several extents and the writes don't expand the
+ * current i_size (the inode lock is taken in shared mode). If we fail to
+ * allocate data space here we continue and later, after locking the
+ * file range, we fail with ENOSPC only if we figure out we can not do a
+ * NOCOW write.
+ */
+ if (write && !(flags & IOMAP_NOWAIT)) {
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, data_alloc_len, false);
+ if (!ret)
+ dio_data->data_space_reserved = true;
+ else if (ret && !(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ goto err;
+ }
+
+ /*
+ * If this errors out it's because we couldn't invalidate pagecache for
+ * this range and we need to fall back to buffered IO, or we are doing a
+ * NOWAIT read/write and we need to block.
+ */
+ ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
+ if (ret < 0)
+ goto err;
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
+
+ /*
+ * Ok, for INLINE and COMPRESSED extents we need to fall back to buffered
+ * io. INLINE is special, and we could probably kludge it in here, but
+ * it's still buffered so for safety let's just fall back to the generic
+ * buffered path.
+ *
+ * For COMPRESSED we _have_ to read the entire extent in so we can
+ * decompress it, so there will be buffering required no matter what we
+ * do, so go ahead and fallback to buffered.
+ *
+ * We return -ENOTBLK because that's what makes DIO go ahead and go back
+ * to buffered IO. Don't blame me, this is the price we pay for using
+ * the generic code.
+ */
+ if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
+ free_extent_map(em);
+ /*
+ * If we are in a NOWAIT context, return -EAGAIN in order to
+ * fall back to buffered IO. This is not only because we can
+ * block with buffered IO (no support for NOWAIT semantics at
+ * the moment) but also to avoid returning short reads to user
+ * space - this happens if we were able to read some data from
+ * previous non-compressed extents and then when we fall back to
+ * buffered IO, at btrfs_file_read_iter() by calling
+ * filemap_read(), we fail to fault in pages for the read buffer,
+ * in which case filemap_read() returns a short read (the number
+ * of bytes previously read is > 0, so it does not return -EFAULT).
+ */
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
+ goto unlock_err;
+ }
+
+ len = min(len, em->len - (start - em->start));
+
+ /*
+ * If we have a NOWAIT request and the range contains multiple extents
+ * (or a mix of extents and holes), then we return -EAGAIN to make the
+ * caller fallback to a context where it can do a blocking (without
+ * NOWAIT) request. This way we avoid doing partial IO and returning
+ * success to the caller, which is not optimal for writes and for reads
+ * it can result in unexpected behaviour for an application.
+ *
+ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+ * iomap_dio_rw(), we can end up returning less data than what the caller
+ * asked for, resulting in an unexpected, and incorrect, short read.
+ * That is, the caller asked to read N bytes and we return less than that,
+ * which is wrong unless we are crossing EOF. This happens if we get a
+ * page fault error when trying to fault in pages for the buffer that is
+ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+ * have previously submitted bios for other extents in the range, in
+ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+ * those bios have completed by the time we get the page fault error,
+ * which we return back to our caller - we should only return EIOCBQUEUED
+ * after we have submitted bios for all the extents in the range.
+ */
+ if ((flags & IOMAP_NOWAIT) && len < length) {
+ free_extent_map(em);
+ ret = -EAGAIN;
+ goto unlock_err;
+ }
+
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, &len, flags);
+ if (ret < 0)
+ goto unlock_err;
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
+ if (dio_data->data_space_reserved) {
+ u64 release_offset;
+ u64 release_len = 0;
+
+ if (dio_data->nocow_done) {
+ release_offset = start;
+ release_len = data_alloc_len;
+ } else if (len < data_alloc_len) {
+ release_offset = start + len;
+ release_len = data_alloc_len - len;
+ }
+
+ if (release_len > 0)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ release_offset,
+ release_len);
+ }
+ } else {
+ /*
+ * We need to unlock only the end area that we aren't using.
+ * The rest is going to be unlocked by the endio routine.
+ */
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
+ }
+
+ if (unlock_extents)
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
+ ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = extent_map_block_start(em) + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
+ iomap->length = len;
+ free_extent_map(em);
+
+ return 0;
+
+unlock_err:
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
+err:
+ if (dio_data->data_space_reserved) {
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start, data_alloc_len);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+
+ return ret;
+}
+
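+/*
+ * iomap_end callback: clean up after one direct I/O iteration, unlocking (or
+ * finishing the ordered extent for) any part of the range without a submitted bio.
+ */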
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_dio_data *dio_data = iter->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
+ NULL);
+ return 0;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+ pos, length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
+ ret = -ENOTBLK;
+ }
+ if (write) {
+ btrfs_put_ordered_extent(dio_data->ordered);
+ dio_data->ordered = NULL;
+ }
+
+ if (write)
+ extent_changeset_free(dio_data->data_reserved);
+ return ret;
+}
+
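+/*
+ * Bio completion: finish the ordered extent for writes or unlock the extent
+ * range for reads, then hand the bio back to iomap.
+ */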
+static void btrfs_dio_end_io(struct btrfs_bio *bbio)
+{
+ struct btrfs_dio_private *dip =
+ container_of(bbio, struct btrfs_dio_private, bbio);
+ struct btrfs_inode *inode = bbio->inode;
+ struct bio *bio = &bbio->bio;
+
+ if (bio->bi_status) {
+ btrfs_warn(inode->root->fs_info,
+ "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
+ btrfs_ino(inode), bio->bi_opf,
+ dip->file_offset, dip->bytes, bio->bi_status);
+ }
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+ btrfs_finish_ordered_extent(bbio->ordered, NULL,
+ dip->file_offset, dip->bytes,
+ !bio->bi_status);
+ } else {
+ unlock_extent(&inode->io_tree, dip->file_offset,
+ dip->file_offset + dip->bytes - 1, NULL);
+ }
+
+ bbio->bio.bi_private = bbio->private;
+ iomap_dio_bio_end_io(bio);
+}
+
+static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+ struct btrfs_ordered_extent *ordered)
+{
+ u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 len = bbio->bio.bi_iter.bi_size;
+ struct btrfs_ordered_extent *new;
+ int ret;
+
+ /* Must always be called for the beginning of an ordered extent. */
+ if (WARN_ON_ONCE(start != ordered->disk_bytenr))
+ return -EINVAL;
+
+ /* No need to split if the ordered extent covers the entire bio. */
+ if (ordered->disk_num_bytes == len) {
+ refcount_inc(&ordered->refs);
+ bbio->ordered = ordered;
+ return 0;
+ }
+
+ /*
+ * Don't split the extent_map for NOCOW extents, as we're writing into
+ * a pre-existing one.
+ */
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+ ret = split_extent_map(bbio->inode, bbio->file_offset,
+ ordered->num_bytes, len,
+ ordered->disk_bytenr);
+ if (ret)
+ return ret;
+ }
+
+ new = btrfs_split_ordered_extent(ordered, len);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ bbio->ordered = new;
+ return 0;
+}
+
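+/*
+ * iomap submit_io callback: initialize the btrfs_bio, split the ordered
+ * extent for partial writes and submit the bio.
+ */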
+static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
+ loff_t file_offset)
+{
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_dio_private *dip =
+ container_of(bbio, struct btrfs_dio_private, bbio);
+ struct btrfs_dio_data *dio_data = iter->private;
+
+ btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
+ btrfs_dio_end_io, bio->bi_private);
+ bbio->inode = BTRFS_I(iter->inode);
+ bbio->file_offset = file_offset;
+
+ dip->file_offset = file_offset;
+ dip->bytes = bio->bi_iter.bi_size;
+
+ dio_data->submitted += bio->bi_iter.bi_size;
+
+ /*
+ * Check if we are doing a partial write. If we are, we need to split
+ * the ordered extent to match the submitted bio. Hang on to the
+ * remaining unfinishable ordered_extent in dio_data so that it can be
+ * cancelled in iomap_end to avoid a deadlock wherein faulting the
+ * remaining pages is blocked on the outstanding ordered extent.
+ */
+ if (iter->flags & IOMAP_WRITE) {
+ int ret;
+
+ ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
+ if (ret) {
+ btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+ file_offset, dip->bytes,
+ !ret);
+ bio->bi_status = errno_to_blk_status(ret);
+ iomap_dio_bio_end_io(bio);
+ return;
+ }
+ }
+
+ btrfs_submit_bio(bbio, 0);
+}
+
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_dio_submit_io,
+ .bio_set = &btrfs_dio_bioset,
+};
+
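+/*
+ * done_before is the number of bytes already handled by a previous, partial
+ * attempt of this operation, so that iomap reports a cumulative byte count.
+ */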
+static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before)
+{
+ struct btrfs_dio_data data = { 0 };
+
+ return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before)
+{
+ struct btrfs_dio_data data = { 0 };
+
+ return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
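+/* Direct I/O requires the file offset and memory to be sector aligned. */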
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ const u32 blocksize_mask = fs_info->sectorsize - 1;
+
+ if (offset & blocksize_mask)
+ return -EINVAL;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ return -EINVAL;
+
+ return 0;
+}
+
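+/*
+ * Direct write entry point: run the direct I/O with page faults disabled in
+ * the iov_iter, retry after faulting in pages, and fall back to buffered I/O
+ * when the direct path can not make progress.
+ */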
+ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ loff_t pos;
+ ssize_t written = 0;
+ ssize_t written_buffered;
+ size_t prev_left = 0;
+ loff_t endbyte;
+ ssize_t ret;
+ unsigned int ilock_flags = 0;
+ struct iomap_dio *dio;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+
+ /*
+ * If the write DIO is within EOF, use a shared lock, but only if the
+ * security bits will likely not be dropped by file_remove_privs() called
+ * from btrfs_write_check(). Both conditions will need to be rechecked
+ * after the lock is acquired.
+ */
+ if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
+ ilock_flags |= BTRFS_ILOCK_SHARED;
+
+relock:
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
+ if (ret < 0)
+ return ret;
+
+ /* Shared lock cannot be used with security bits set. */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ return ret;
+ }
+
+ ret = btrfs_write_check(iocb, from, ret);
+ if (ret < 0) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ /*
+ * Re-check since file size may have changed just before taking the
+ * lock, or pos may have changed because of O_APPEND in generic_write_checks().
+ */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
+ pos + iov_iter_count(from) > i_size_read(inode)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
+ if (check_direct_IO(fs_info, from, pos)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ goto buffered;
+ }
+
+ /*
+ * The iov_iter can be mapped to the same file range we are writing to.
+ * If that's the case, then we will deadlock in the iomap code, because
+ * it first calls our callback btrfs_dio_iomap_begin(), which will create
+ * an ordered extent, and after that it will fault in the pages that the
+ * iov_iter refers to. During the fault in we end up in the readahead
+ * pages code (starting at btrfs_readahead()), which will lock the range,
+ * find that ordered extent and then wait for it to complete (at
+ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+ * obviously the ordered extent can never complete as we haven't yet
+ * submitted the respective bio(s). This always happens when the buffer is
+ * memory mapped to the same file range, since the iomap DIO code always
+ * invalidates pages in the target file range (after starting and waiting
+ * for any writeback).
+ *
+ * So here we disable page faults in the iov_iter and then retry if we
+ * got -EFAULT, faulting in the pages before the retry.
+ */
+ from->nofault = true;
+ dio = btrfs_dio_write(iocb, from, written);
+ from->nofault = false;
+
+ /*
+ * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
+ * iocb, and that needs to lock the inode. So unlock it before calling
+ * iomap_dio_complete() to avoid a deadlock.
+ */
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+
+ if (IS_ERR_OR_NULL(dio))
+ ret = PTR_ERR_OR_ZERO(dio);
+ else
+ ret = iomap_dio_complete(dio);
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (ret > 0)
+ written = ret;
+
+ if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
+ const size_t left = iov_iter_count(from);
+ /*
+ * We have more data left to write. Try to fault in as many of the
+ * remaining pages as possible and retry. We do this without
+ * releasing and locking again the inode, to prevent races with
+ * truncate.
+ *
+ * Also, in case the iov refers to pages in the file range of the
+ * file we want to write to (due to a mmap), we could enter an
+ * infinite loop if we retry after faulting the pages in, since
+ * iomap will invalidate any pages in the range early on, before
+ * it tries to fault in the pages of the iov. So we keep track of
+ * how much of the iov was left after the previous EFAULT and fall back
+ * to buffered IO in case we haven't made any progress.
+ */
+ if (left == prev_left) {
+ ret = -ENOTBLK;
+ } else {
+ fault_in_iov_iter_readable(from, left);
+ prev_left = left;
+ goto relock;
+ }
+ }
+
+ /*
+ * If 'ret' is -ENOTBLK or we have not written all data, then it means
+ * we must fall back to buffered IO.
+ */
+ if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
+ goto out;
+
+buffered:
+ /*
+ * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
+ * it must retry the operation in a context where blocking is acceptable,
+ * because even if we end up not blocking during the buffered IO attempt
+ * below, we will block when flushing and waiting for the IO.
+ */
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ written_buffered = btrfs_buffered_write(iocb, from);
+ if (written_buffered < 0) {
+ ret = written_buffered;
+ goto out;
+ }
+ /*
+ * Ensure all data is persisted. We want the next direct IO read to be
+ * able to read what was just written.
+ */
+ endbyte = pos + written_buffered - 1;
+ ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
+ if (ret)
+ goto out;
+ ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+ if (ret)
+ goto out;
+ written += written_buffered;
+ iocb->ki_pos = pos + written_buffered;
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+out:
+ return ret < 0 ? ret : written;
+}
+
+static int check_direct_read(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int ret;
+ int i, seg;
+
+ ret = check_direct_IO(fs_info, iter, offset);
+ if (ret < 0)
+ return ret;
+
+ if (!iter_is_iovec(iter))
+ return 0;
+
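+ /* Reject iovecs where two segments point at the same base address. */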
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ const struct iovec *iov1 = iter_iov(iter) + seg;
+ const struct iovec *iov2 = iter_iov(iter) + i;
+
+ if (iov1->iov_base == iov2->iov_base)
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
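+/*
+ * Direct read entry point: like btrfs_direct_write(), but with page faults
+ * also disabled around the whole iomap call, retrying after faulting in pages.
+ */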
+ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ size_t prev_left = 0;
+ ssize_t read = 0;
+ ssize_t ret;
+
+ if (fsverity_active(inode))
+ return 0;
+
+ if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
+ return 0;
+
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+again:
+ /*
+ * This is similar to what we do for direct IO writes, see the comment
+ * at btrfs_direct_write(), but we also disable page faults in addition
+ * to disabling them only at the iov_iter level. This is because when
+ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+ * which can still trigger page faults despite having set ->nofault
+ * to true on our 'to' iov_iter.
+ *
+ * The difference to direct IO writes is that we deadlock when trying
+ * to lock the extent range in the inode's tree during the page reads
+ * triggered by the fault-in (while for writes it is due to waiting for
+ * our own ordered extent). This is because for direct IO reads,
+ * btrfs_dio_iomap_begin() returns with the extent range locked, which
+ * is only unlocked in the endio callback (end_bio_extent_readpage()).
+ */
+ pagefault_disable();
+ to->nofault = true;
+ ret = btrfs_dio_read(iocb, to, read);
+ to->nofault = false;
+ pagefault_enable();
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (ret > 0)
+ read = ret;
+
+ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+ const size_t left = iov_iter_count(to);
+
+ if (left == prev_left) {
+ /*
+ * We didn't make any progress since the last attempt,
+ * fall back to a buffered read for the remainder of the
+ * range. This is just to avoid any possibility of looping
+ * for too long.
+ */
+ ret = read;
+ } else {
+ /*
+ * We made some progress since the last retry or this is
+ * the first time we are retrying. Fault in as many pages
+ * as possible and retry.
+ */
+ fault_in_iov_iter_writeable(to, left);
+ prev_left = left;
+ goto again;
+ }
+ }
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+ return ret < 0 ? ret : read;
+}
+
+int __init btrfs_init_dio(void)
+{
+ if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_dio_private, bbio.bio),
+ BIOSET_NEED_BVECS))
+ return -ENOMEM;
+
+ return 0;
+}
+
+void __cold btrfs_destroy_dio(void)
+{
+ bioset_exit(&btrfs_dio_bioset);
+}
diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h
new file mode 100644
index 000000000000..3dc3ea926afe
--- /dev/null
+++ b/fs/btrfs/direct-io.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DIRECT_IO_H
+#define BTRFS_DIRECT_IO_H
+
+#include <linux/types.h>
+
+int __init btrfs_init_dio(void);
+void __cold btrfs_destroy_dio(void);
+
+ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from);
+ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to);
+
+#endif /* BTRFS_DIRECT_IO_H */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cabb558dbdaa..a6f5441e62d1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -213,7 +213,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
* structure for details.
*/
int btrfs_read_extent_buffer(struct extent_buffer *eb,
- struct btrfs_tree_parent_check *check)
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int failed = 0;
@@ -358,7 +358,7 @@ static bool check_tree_block_fsid(struct extent_buffer *eb)
/* Do basic extent buffer checks at read time */
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
- struct btrfs_tree_parent_check *check)
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
@@ -367,6 +367,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
u8 result[BTRFS_CSUM_SIZE];
const u8 *header_csum;
int ret = 0;
+ const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS);
ASSERT(check);
@@ -399,13 +400,16 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
-"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s",
eb->start, eb->read_mirror,
CSUM_FMT_VALUE(csum_size, header_csum),
CSUM_FMT_VALUE(csum_size, result),
- btrfs_header_level(eb));
- ret = -EUCLEAN;
- goto out;
+ btrfs_header_level(eb),
+ ignore_csum ? ", ignored" : "");
+ if (!ignore_csum) {
+ ret = -EUCLEAN;
+ goto out;
+ }
}
if (found_level != check->level) {
@@ -425,7 +429,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
goto out;
}
if (check->has_first_key) {
- struct btrfs_key *expect_key = &check->first_key;
+ const struct btrfs_key *expect_key = &check->first_key;
struct btrfs_key found_key;
if (found_level)
@@ -635,10 +639,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
- if (btrfs_check_eb_owner(buf, check->owner_root)) {
- free_extent_buffer_stale(buf);
- return ERR_PTR(-EUCLEAN);
- }
return buf;
}
@@ -658,11 +658,11 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->state = 0;
RB_CLEAR_NODE(&root->rb_node);
- root->last_trans = 0;
+ btrfs_set_root_last_trans(root, 0);
root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
- root->inode_tree = RB_ROOT;
+ xa_init(&root->inodes);
xa_init(&root->delayed_nodes);
btrfs_init_root_block_rsv(root);
@@ -674,7 +674,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->reloc_dirty_list);
- spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
@@ -847,13 +846,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
return btrfs_global_root(fs_info, &key);
}
-struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
-{
- if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
- return fs_info->block_group_root;
- return btrfs_extent_root(fs_info, 0);
-}
-
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@@ -1010,7 +1002,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return ret;
}
- log_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(log_root, trans->transid);
log_root->root_key.offset = btrfs_root_id(root);
inode_item = &log_root->root_item.inode;
@@ -1033,7 +1025,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
struct btrfs_path *path,
- struct btrfs_key *key)
+ const struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_tree_parent_check check = { 0 };
@@ -1095,7 +1087,7 @@ fail:
}
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key)
+ const struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_path *path;
@@ -1230,7 +1222,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
return ret;
}
-void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
+void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
struct btrfs_root *root;
@@ -1854,7 +1846,8 @@ void btrfs_put_root(struct btrfs_root *root)
return;
if (refcount_dec_and_test(&root->refs)) {
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ if (WARN_ON(!xa_empty(&root->inodes)))
+ xa_destroy(&root->inodes);
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
@@ -1928,7 +1921,7 @@ static int btrfs_init_btree_inode(struct super_block *sb)
if (!inode)
return -ENOMEM;
- inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID);
set_nlink(inode, 1);
/*
* we set the i_size on the btree inode to the max possible int.
@@ -1939,15 +1932,11 @@ static int btrfs_init_btree_inode(struct super_block *sb)
inode->i_mapping->a_ops = &btree_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
IO_TREE_BTREE_INODE_IO);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
- BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
- BTRFS_I(inode)->location.type = 0;
- BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
__insert_inode_hash(inode, hash);
fs_info->btree_inode = inode;
@@ -2146,7 +2135,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
/* If we have IGNOREDATACSUMS skip loading these roots. */
if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
- set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
return 0;
}
@@ -2199,7 +2188,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
if (!found || ret) {
if (objectid == BTRFS_CSUM_TREE_OBJECTID)
- set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
ret = ret ? ret : -ENOENT;
@@ -2350,21 +2339,29 @@ out:
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
-int btrfs_validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num)
+int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
u64 sectorsize = btrfs_super_sectorsize(sb);
int ret = 0;
+ const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS);
if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
btrfs_err(fs_info, "no valid FS found");
ret = -EINVAL;
}
- if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
- btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
- btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
- ret = -EINVAL;
+ if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) {
+ if (!ignore_flags) {
+ btrfs_err(fs_info,
+ "unrecognized or unsupported super flag 0x%llx",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ ret = -EINVAL;
+ } else {
+ btrfs_info(fs_info,
+ "unrecognized or unsupported super flags: 0x%llx, ignored",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ }
}
if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "tree_root level too big: %d >= %d",
@@ -2467,7 +2464,7 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
(!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
!btrfs_fs_incompat(fs_info, NO_HOLES))) {
btrfs_err(fs_info,
- "block-group-tree feature requires fres-space-tree and no-holes");
+ "block-group-tree feature requires free-space-tree and no-holes");
ret = -EINVAL;
}
@@ -2882,6 +2879,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (sb_rdonly(sb))
set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
+ if (btrfs_test_opt(fs_info, IGNOREMETACSUMS))
+ set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state);
return btrfs_alloc_stripe_hash_table(fs_info);
}
@@ -2927,22 +2926,22 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
u64 root_objectid = 0;
struct btrfs_root *gang[8];
- int i = 0;
- int err = 0;
- unsigned int ret = 0;
+ int ret = 0;
while (1) {
+ unsigned int found;
+
spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ found = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
- if (!ret) {
+ if (!found) {
spin_unlock(&fs_info->fs_roots_radix_lock);
break;
}
- root_objectid = btrfs_root_id(gang[ret - 1]) + 1;
+ root_objectid = btrfs_root_id(gang[found - 1]) + 1;
- for (i = 0; i < ret; i++) {
+ for (int i = 0; i < found; i++) {
/* Avoid to grab roots in dead_roots. */
if (btrfs_root_refs(&gang[i]->root_item) == 0) {
gang[i] = NULL;
@@ -2953,24 +2952,25 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->fs_roots_radix_lock);
- for (i = 0; i < ret; i++) {
+ for (int i = 0; i < found; i++) {
if (!gang[i])
continue;
root_objectid = btrfs_root_id(gang[i]);
- err = btrfs_orphan_cleanup(gang[i]);
- if (err)
- goto out;
+ /*
+ * Continue to release the remaining roots after the first
+ * error, without further cleanup, and preserve the first
+ * error for the return value.
+ */
+ if (!ret)
+ ret = btrfs_orphan_cleanup(gang[i]);
btrfs_put_root(gang[i]);
}
+ if (ret)
+ break;
+
root_objectid++;
}
-out:
- /* Release the uncleaned roots due to error. */
- for (; i < ret; i++) {
- if (gang[i])
- btrfs_put_root(gang[i]);
- }
- return err;
+ return ret;
}
/*
@@ -3204,7 +3204,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
}
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
- char *options)
+ const char *options)
{
u32 sectorsize;
u32 nodesize;
@@ -4157,9 +4157,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_trans_handle *trans;
-
mutex_lock(&fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
@@ -4169,10 +4166,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- return btrfs_commit_transaction(trans);
+ return btrfs_commit_current_transaction(fs_info->tree_root);
}
static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
@@ -4533,7 +4527,7 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
* extents that haven't had their dirty pages IO start writeout yet
* actually get run and error out properly.
*/
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
}
static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 76eb53fe7a11..99af64d3f277 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -41,7 +41,7 @@ static inline u64 btrfs_sb_offset(int mirror)
return BTRFS_SUPER_INFO_OFFSET;
}
-void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
+void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
struct btrfs_tree_parent_check *check);
@@ -52,12 +52,11 @@ struct extent_buffer *btrfs_find_create_tree_block(
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
-int __cold open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options);
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
+ const char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
-int btrfs_validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num);
+int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
@@ -65,7 +64,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key);
+ const struct btrfs_key *key);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
@@ -83,7 +82,6 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *key);
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
-struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
@@ -91,7 +89,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
- struct btrfs_tree_parent_check *check);
+ const struct btrfs_tree_parent_check *check);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -118,7 +116,7 @@ void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
- struct btrfs_tree_parent_check *check);
+ const struct btrfs_tree_parent_check *check);
blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9e81f89e76d8..e2b22bea348a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -40,7 +40,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
if (parent) {
u64 parent_root_id;
- fid->parent_objectid = BTRFS_I(parent)->location.objectid;
+ fid->parent_objectid = btrfs_ino(BTRFS_I(parent));
fid->parent_gen = parent->i_generation;
parent_root_id = btrfs_root_id(BTRFS_I(parent)->root);
@@ -84,7 +84,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
if (IS_ERR(root))
return ERR_CAST(root);
- inode = btrfs_iget(sb, objectid, root);
+ inode = btrfs_iget(objectid, root);
btrfs_put_root(root);
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -210,7 +210,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
found_key.offset, 0);
}
- return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root));
+ return d_obtain_alias(btrfs_iget(key.objectid, root));
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ed2cfc3d5d8a..c54c5d7a5cd5 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -4,6 +4,7 @@
#include <trace/events/btrfs.h>
#include "messages.h"
#include "ctree.h"
+#include "extent_io.h"
#include "extent-io-tree.h"
#include "btrfs_inode.h"
@@ -1084,6 +1085,9 @@ again:
*/
prealloc = alloc_extent_state(mask);
}
+ /* Optimistically preallocate the extent changeset ulist node. */
+ if (changeset)
+ extent_changeset_prealloc(changeset, mask);
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3774c191e36d..d77498e7671c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -104,10 +104,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct extent_buffer *leaf;
struct btrfs_key key;
- u32 item_size;
u64 num_refs;
u64 extent_flags;
u64 owner = 0;
@@ -126,11 +123,6 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- if (!trans) {
- path->skip_locking = 1;
- path->search_commit_root = 1;
- }
-
search_again:
key.objectid = bytenr;
key.offset = offset;
@@ -144,7 +136,7 @@ search_again:
if (ret < 0)
goto out_free;
- if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
if (path->slots[0]) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -157,38 +149,37 @@ search_again:
}
if (ret == 0) {
- leaf = path->nodes[0];
- item_size = btrfs_item_size(leaf, path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- num_refs = btrfs_extent_refs(leaf, ei);
- extent_flags = btrfs_extent_flags(leaf, ei);
- owner = btrfs_get_extent_owner_root(fs_info, leaf,
- path->slots[0]);
- } else {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_extent_item *ei;
+ const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (unlikely(item_size < sizeof(*ei))) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- if (trans)
- btrfs_abort_transaction(trans, ret);
- else
- btrfs_handle_fs_error(fs_info, ret, NULL);
-
+ btrfs_abort_transaction(trans, ret);
goto out_free;
}
- BUG_ON(num_refs == 0);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ if (unlikely(num_refs == 0)) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "unexpected zero reference count for extent item (%llu %u %llu)",
+ key.objectid, key.type, key.offset);
+ btrfs_abort_transaction(trans, ret);
+ goto out_free;
+ }
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
} else {
num_refs = 0;
extent_flags = 0;
ret = 0;
}
- if (!trans)
- goto out;
-
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
@@ -211,15 +202,13 @@ search_again:
spin_lock(&head->lock);
if (head->extent_op && head->extent_op->update_flags)
extent_flags |= head->extent_op->flags_to_set;
- else
- BUG_ON(num_refs == 0);
num_refs += head->ref_mod;
spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
}
spin_unlock(&delayed_refs->lock);
-out:
+
WARN_ON(num_refs == 0);
if (refs)
*refs = num_refs;
@@ -1632,7 +1621,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
if (metadata) {
key.type = BTRFS_METADATA_ITEM_KEY;
- key.offset = extent_op->level;
+ key.offset = head->level;
} else {
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = head->num_bytes;
@@ -1667,7 +1656,7 @@ again:
ret = -EUCLEAN;
btrfs_err(fs_info,
"missing extent item for extent %llu num_bytes %llu level %d",
- head->bytenr, head->num_bytes, extent_op->level);
+ head->bytenr, head->num_bytes, head->level);
goto out;
}
}
@@ -1726,7 +1715,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
.generation = trans->transid,
};
- BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, node, extent_op);
if (!ret)
btrfs_record_squota_delta(fs_info, &delta);
@@ -2233,7 +2221,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags)
{
struct btrfs_delayed_extent_op *extent_op;
- int level = btrfs_header_level(eb);
int ret;
extent_op = btrfs_alloc_delayed_extent_op();
@@ -2243,9 +2230,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_flags = true;
extent_op->update_key = false;
- extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len,
+ btrfs_header_level(eb), extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
@@ -3419,10 +3406,10 @@ out_delayed_unlock:
return 0;
}
-void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- u64 root_id,
- struct extent_buffer *buf,
- u64 parent, int last_ref)
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *bg;
@@ -3449,11 +3436,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret < 0)
+ return ret;
}
if (!last_ref)
- return;
+ return 0;
if (btrfs_header_generation(buf) != trans->transid)
goto out;
@@ -3510,6 +3498,7 @@ out:
* matter anymore.
*/
clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+ return 0;
}
/* Can return -ENOMEM */
@@ -4864,7 +4853,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
u32 size = sizeof(*extent_item) + sizeof(*iref);
- u64 flags = extent_op->flags_to_set;
+ const u64 flags = (extent_op ? extent_op->flags_to_set : 0);
/* The owner of a tree block is the level. */
int level = btrfs_delayed_ref_owner(node);
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -5121,7 +5110,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
- struct btrfs_delayed_extent_op *extent_op;
u64 flags = 0;
int ret;
u32 blocksize = fs_info->nodesize;
@@ -5164,6 +5152,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(parent > 0);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_delayed_extent_op *extent_op;
struct btrfs_ref generic_ref = {
.action = BTRFS_ADD_DELAYED_EXTENT,
.bytenr = ins.objectid,
@@ -5172,30 +5161,34 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
.owning_root = owning_root,
.ref_root = root_objectid,
};
- extent_op = btrfs_alloc_delayed_extent_op();
- if (!extent_op) {
- ret = -ENOMEM;
- goto out_free_buf;
+
+ if (!skinny_metadata || flags != 0) {
+ extent_op = btrfs_alloc_delayed_extent_op();
+ if (!extent_op) {
+ ret = -ENOMEM;
+ goto out_free_buf;
+ }
+ if (key)
+ memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ else
+ memset(&extent_op->key, 0, sizeof(extent_op->key));
+ extent_op->flags_to_set = flags;
+ extent_op->update_key = (skinny_metadata ? false : true);
+ extent_op->update_flags = (flags != 0);
+ } else {
+ extent_op = NULL;
}
- if (key)
- memcpy(&extent_op->key, key, sizeof(extent_op->key));
- else
- memset(&extent_op->key, 0, sizeof(extent_op->key));
- extent_op->flags_to_set = flags;
- extent_op->update_key = skinny_metadata ? false : true;
- extent_op->update_flags = true;
- extent_op->level = level;
btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
- if (ret)
- goto out_free_delayed;
+ if (ret) {
+ btrfs_free_delayed_extent_op(extent_op);
+ goto out_free_buf;
+ }
}
return buf;
-out_free_delayed:
- btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
btrfs_tree_unlock(buf);
free_extent_buffer(buf);
@@ -5220,11 +5213,99 @@ struct walk_control {
int reada_slot;
int reada_count;
int restarted;
+ /* Indicate that extent info needs to be looked up when walking the tree. */
+ int lookup_info;
};
+/*
+ * This is our normal stage. We are traversing blocks the current snapshot
+ * owns, dropping any of our references to children that we are able to, and
+ * then freeing each block once we've processed all of its children.
+ */
#define DROP_REFERENCE 1
+
+/*
+ * We enter this stage when we have to walk into a child block (meaning we can't
+ * simply drop our reference to it from our current parent node) and there are
+ * more than one reference on it. If we are the owner of any of the children
+ * blocks from the current parent node then we have to do the FULL_BACKREF dance
+ * on them in order to drop our normal ref and add the shared ref.
+ */
#define UPDATE_BACKREF 2
+/*
+ * Decide if we need to walk down into this node to adjust the references.
+ *
+ * @root: the root we are currently deleting
+ * @wc: the walk control for this deletion
+ * @eb: the parent eb that we're currently visiting
+ * @refs: the number of refs for wc->level - 1
+ * @flags: the flags for wc->level - 1
+ * @slot: the slot in the eb that we're currently checking
+ *
+ * This is meant to be called when we're evaluating if a node we point to at
+ * wc->level should be read and walked into, or if we can simply delete our
+ * reference to it. We return true if we should walk into the node, false if we
+ * can skip it.
+ *
+ * We have assertions in here to make sure this is called correctly. We assume
+ * that sanity checking on the blocks read to this point has been done, so any
+ * corrupted file systems must have been caught before calling this function.
+ */
+static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
+ struct extent_buffer *eb, u64 refs, u64 flags, int slot)
+{
+ struct btrfs_key key;
+ u64 generation;
+ int level = wc->level;
+
+ ASSERT(level > 0);
+ ASSERT(wc->refs[level - 1] > 0);
+
+ /*
+ * In the update backref stage we only want to skip the node if we
+ * already have FULL_BACKREF set, otherwise we need to read it.
+ */
+ if (wc->stage == UPDATE_BACKREF) {
+ if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ return false;
+ return true;
+ }
+
+ /*
+ * We hold the last ref on this block, so we must walk into it and process
+ * any refs it's pointing at.
+ */
+ if (wc->refs[level - 1] == 1)
+ return true;
+
+ /*
+ * If we're already FULL_BACKREF then we know we can just drop our
+ * current reference.
+ */
+ if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ return false;
+
+ /*
+ * This block is older than our creation generation, so we can drop our
+ * reference to it.
+ */
+ generation = btrfs_node_ptr_generation(eb, slot);
+ if (!wc->update_ref || generation <= root->root_key.offset)
+ return false;
+
+ /*
+ * This block was processed from a previous snapshot deletion run, so we
+ * can skip it.
+ */
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0)
+ return false;
+
+ /* In all other cases we need to walk into the node. */
+ return true;
+}
+
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct walk_control *wc,
@@ -5236,7 +5317,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
u64 refs;
u64 flags;
u32 nritems;
- struct btrfs_key key;
struct extent_buffer *eb;
int ret;
int slot;
@@ -5276,28 +5356,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
- BUG_ON(refs == 0);
- if (wc->stage == DROP_REFERENCE) {
- if (refs == 1)
- goto reada;
+ /*
+ * This could be racy; it's conceivable that we raced and ended
+ * up with a bogus refs count. If that's the case just skip it; if
+ * we are actually corrupt we will notice when we look everything
+ * up again with our locks.
+ */
+ if (refs == 0)
+ continue;
- if (wc->level == 1 &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- continue;
- if (!wc->update_ref ||
- generation <= root->root_key.offset)
- continue;
- btrfs_node_key_to_cpu(eb, &key, slot);
- ret = btrfs_comp_cpu_keys(&key,
- &wc->update_progress);
- if (ret < 0)
- continue;
- } else {
- if (wc->level == 1 &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- continue;
- }
+ /* If we don't need to visit this node don't reada. */
+ if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
+ continue;
reada:
btrfs_readahead_node_child(eb, slot);
nread++;
@@ -5316,7 +5387,7 @@ reada:
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc, int lookup_info)
+ struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int level = wc->level;
@@ -5331,19 +5402,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
* when reference count of tree block is 1, it won't increase
* again. once full backref flag is set, we never clear it.
*/
- if (lookup_info &&
+ if (wc->lookup_info &&
((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
- BUG_ON(!path->locks[level]);
+ ASSERT(path->locks[level]);
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
&wc->flags[level],
NULL);
- BUG_ON(ret == -ENOMEM);
if (ret)
return ret;
- BUG_ON(wc->refs[level] == 0);
+ if (unlikely(wc->refs[level] == 0)) {
+ btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
+ eb->start);
+ return -EUCLEAN;
+ }
}
if (wc->stage == DROP_REFERENCE) {
@@ -5359,13 +5433,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
- BUG_ON(!path->locks[level]);
+ ASSERT(path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = btrfs_set_disk_extent_flags(trans, eb, flag);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
wc->flags[level] |= flag;
}
@@ -5408,6 +5491,132 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
}
/*
+ * We may not have an uptodate block, so if we are going to walk down into this
+ * block we need to drop the lock, read it off the disk, re-lock it and
+ * return to continue dropping the snapshot.
+ */
+static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc,
+ struct extent_buffer *next)
+{
+ struct btrfs_tree_parent_check check = { 0 };
+ u64 generation;
+ int level = wc->level;
+ int ret;
+
+ btrfs_assert_tree_write_locked(next);
+
+ generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
+
+ if (btrfs_buffer_uptodate(next, generation, 0))
+ return 0;
+
+ check.level = level - 1;
+ check.transid = generation;
+ check.owner_root = btrfs_root_id(root);
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
+
+ btrfs_tree_unlock(next);
+ if (level == 1)
+ reada_walk_down(trans, root, wc, path);
+ ret = btrfs_read_extent_buffer(next, &check);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+ btrfs_tree_lock(next);
+ wc->lookup_info = 1;
+ return 0;
+}
+
+/*
+ * If we determine that we don't have to visit wc->level - 1 then we need to
+ * determine if we can drop our reference.
+ *
+ * If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
+ *
+ * If we are DROP_REFERENCE this will figure out if we need to drop our current
+ * reference, skipping it if we dropped it from a previous incomplete drop, or
+ * dropping it if we still have a reference to it.
+ */
+static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, struct walk_control *wc,
+ struct extent_buffer *next, u64 owner_root)
+{
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = next->start,
+ .num_bytes = root->fs_info->nodesize,
+ .owning_root = owner_root,
+ .ref_root = btrfs_root_id(root),
+ };
+ int level = wc->level;
+ int ret;
+
+ /* We are UPDATE_BACKREF, we're not dropping anything. */
+ if (wc->stage == UPDATE_BACKREF)
+ return 0;
+
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ ref.parent = path->nodes[level]->start;
+ } else {
+ ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
+ if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
+ btrfs_err(root->fs_info, "mismatched block owner");
+ return -EIO;
+ }
+ }
+
+ /*
+ * If we had a drop_progress we need to verify the refs are set as
+ * expected. If we find our ref then we know that from here on out
+ * everything should be correct, and we can clear the
+ * ->restarted flag.
+ */
+ if (wc->restarted) {
+ ret = check_ref_exists(trans, root, next->start, ref.parent,
+ level - 1);
+ if (ret <= 0)
+ return ret;
+ ret = 0;
+ wc->restarted = 0;
+ }
+
+ /*
+ * Reloc tree doesn't contribute to qgroup numbers, and we have already
+ * accounted them at merge time (replace_path), thus we could skip
+ * expensive subtree trace here.
+ */
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ wc->refs[level - 1] > 1) {
+ u64 generation = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
+
+ ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1);
+ if (ret) {
+ btrfs_err_rl(root->fs_info,
+"error %d accounting shared subtree, quota is out of sync, rescan required",
+ ret);
+ }
+ }
+
+ /*
+ * We need to update the next key in our walk control so we can update
+ * the drop_progress key accordingly. We don't care if find_next_key
+ * doesn't find a key because that means we're at the end and are going
+ * to clean up now.
+ */
+ wc->drop_level = level;
+ find_next_key(path, level, &wc->drop_progress);
+
+ btrfs_init_tree_ref(&ref, level - 1, 0, false);
+ return btrfs_free_extent(trans, &ref);
+}
+
+/*
* helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
@@ -5423,19 +5632,15 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc, int *lookup_info)
+ struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 generation;
u64 owner_root = 0;
- struct btrfs_tree_parent_check check = { 0 };
- struct btrfs_key key;
struct extent_buffer *next;
int level = wc->level;
- int reada = 0;
int ret = 0;
- bool need_account = false;
generation = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
@@ -5446,27 +5651,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
*/
if (wc->stage == UPDATE_BACKREF &&
generation <= root->root_key.offset) {
- *lookup_info = 1;
+ wc->lookup_info = 1;
return 1;
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- check.level = level - 1;
- check.transid = generation;
- check.owner_root = btrfs_root_id(root);
- check.has_first_key = true;
- btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
- path->slots[level]);
+ next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
+ level - 1);
+ if (IS_ERR(next))
+ return PTR_ERR(next);
- next = find_extent_buffer(fs_info, bytenr);
- if (!next) {
- next = btrfs_find_create_tree_block(fs_info, bytenr,
- btrfs_root_id(root), level - 1);
- if (IS_ERR(next))
- return PTR_ERR(next);
- reada = 1;
- }
btrfs_tree_lock(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
@@ -5477,57 +5672,32 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
goto out_unlock;
if (unlikely(wc->refs[level - 1] == 0)) {
- btrfs_err(fs_info, "Missing references.");
- ret = -EIO;
+ btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
+ bytenr);
+ ret = -EUCLEAN;
goto out_unlock;
}
- *lookup_info = 0;
+ wc->lookup_info = 0;
- if (wc->stage == DROP_REFERENCE) {
- if (wc->refs[level - 1] > 1) {
- need_account = true;
- if (level == 1 &&
- (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- goto skip;
-
- if (!wc->update_ref ||
- generation <= root->root_key.offset)
- goto skip;
-
- btrfs_node_key_to_cpu(path->nodes[level], &key,
- path->slots[level]);
- ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
- if (ret < 0)
- goto skip;
+ /* If we don't have to walk into this node skip it. */
+ if (!visit_node_for_delete(root, wc, path->nodes[level],
+ wc->refs[level - 1], wc->flags[level - 1],
+ path->slots[level]))
+ goto skip;
- wc->stage = UPDATE_BACKREF;
- wc->shared_level = level - 1;
- }
- } else {
- if (level == 1 &&
- (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- goto skip;
+ /*
+ * We have to walk down into this node, and if we're currently at the
+ * DROP_REFERENCE stage and this block is shared then we need to switch
+ * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
+ */
+ if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
+ wc->stage = UPDATE_BACKREF;
+ wc->shared_level = level - 1;
}
- if (!btrfs_buffer_uptodate(next, generation, 0)) {
- btrfs_tree_unlock(next);
- free_extent_buffer(next);
- next = NULL;
- *lookup_info = 1;
- }
-
- if (!next) {
- if (reada && level == 1)
- reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, &check);
- if (IS_ERR(next)) {
- return PTR_ERR(next);
- } else if (!extent_buffer_uptodate(next)) {
- free_extent_buffer(next);
- return -EIO;
- }
- btrfs_tree_lock(next);
- }
+ ret = check_next_block_uptodate(trans, root, path, wc, next);
+ if (ret)
+ return ret;
level--;
ASSERT(level == btrfs_header_level(next));
@@ -5544,78 +5714,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
wc->reada_slot = 0;
return 0;
skip:
+ ret = maybe_drop_reference(trans, root, path, wc, next, owner_root);
+ if (ret)
+ goto out_unlock;
wc->refs[level - 1] = 0;
wc->flags[level - 1] = 0;
- if (wc->stage == DROP_REFERENCE) {
- struct btrfs_ref ref = {
- .action = BTRFS_DROP_DELAYED_REF,
- .bytenr = bytenr,
- .num_bytes = fs_info->nodesize,
- .owning_root = owner_root,
- .ref_root = btrfs_root_id(root),
- };
- if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- ref.parent = path->nodes[level]->start;
- } else {
- ASSERT(btrfs_root_id(root) ==
- btrfs_header_owner(path->nodes[level]));
- if (btrfs_root_id(root) !=
- btrfs_header_owner(path->nodes[level])) {
- btrfs_err(root->fs_info,
- "mismatched block owner");
- ret = -EIO;
- goto out_unlock;
- }
- }
-
- /*
- * If we had a drop_progress we need to verify the refs are set
- * as expected. If we find our ref then we know that from here
- * on out everything should be correct, and we can clear the
- * ->restarted flag.
- */
- if (wc->restarted) {
- ret = check_ref_exists(trans, root, bytenr, ref.parent,
- level - 1);
- if (ret < 0)
- goto out_unlock;
- if (ret == 0)
- goto no_delete;
- ret = 0;
- wc->restarted = 0;
- }
-
- /*
- * Reloc tree doesn't contribute to qgroup numbers, and we have
- * already accounted them at merge time (replace_path),
- * thus we could skip expensive subtree trace here.
- */
- if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && need_account) {
- ret = btrfs_qgroup_trace_subtree(trans, next,
- generation, level - 1);
- if (ret) {
- btrfs_err_rl(fs_info,
- "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
- ret);
- }
- }
-
- /*
- * We need to update the next key in our walk control so we can
- * update the drop_progress key accordingly. We don't care if
- * find_next_key doesn't find a key because that means we're at
- * the end and are going to clean up now.
- */
- wc->drop_level = level;
- find_next_key(path, level, &wc->drop_progress);
-
- btrfs_init_tree_ref(&ref, level - 1, 0, false);
- ret = btrfs_free_extent(trans, &ref);
- if (ret)
- goto out_unlock;
- }
-no_delete:
- *lookup_info = 1;
+ wc->lookup_info = 1;
ret = 1;
out_unlock:
@@ -5643,13 +5747,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ int ret = 0;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
u64 parent = 0;
if (wc->stage == UPDATE_BACKREF) {
- BUG_ON(wc->shared_level < level);
+ ASSERT(wc->shared_level >= level);
if (level < wc->shared_level)
goto out;
@@ -5667,7 +5771,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
* count is one.
*/
if (!path->locks[level]) {
- BUG_ON(level == 0);
+ ASSERT(level > 0);
btrfs_tree_lock(eb);
path->locks[level] = BTRFS_WRITE_LOCK;
@@ -5681,7 +5785,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
return ret;
}
- BUG_ON(wc->refs[level] == 0);
+ if (unlikely(wc->refs[level] == 0)) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
+ eb->start);
+ return -EUCLEAN;
+ }
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
@@ -5691,7 +5800,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
}
/* wc->stage == DROP_REFERENCE */
- BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
+ ASSERT(path->locks[level] || wc->refs[level] == 1);
if (wc->refs[level] == 1) {
if (level == 0) {
@@ -5699,7 +5808,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 1);
else
ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
if (is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
@@ -5730,12 +5842,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
goto owner_mismatch;
}
- btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
- wc->refs[level] == 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
+ wc->refs[level] == 1);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
- return 0;
+ return ret;
owner_mismatch:
btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
@@ -5743,17 +5857,38 @@ owner_mismatch:
return -EUCLEAN;
}
+/*
+ * walk_down_tree() consists of two steps.
+ *
+ * walk_down_proc(). Look up the reference count and flags of our current
+ * wc->level. At this point path->nodes[wc->level] should be populated and
+ * uptodate, and in most cases should already be locked. If we are in
+ * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and
+ * we can walk back up the tree. If we are UPDATE_BACKREF we have to set
+ * FULL_BACKREF on this node if it's not already set, and then do the
+ * FULL_BACKREF conversion dance, which is to drop the root reference and add
+ * the shared reference to all of this node's children.
+ *
+ * do_walk_down(). This is where we actually start iterating on the children of
+ * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping
+ * our reference to any children for which visit_node_for_delete() returns false;
+ * that helper covers the various cases where we know we can just drop our reference
+ * without visiting the node. For UPDATE_BACKREF we will skip any children that
+ * visit_node_for_delete() returns false for, only walking down when necessary.
+ * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of
+ * snapshot deletion.
+ */
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct walk_control *wc)
{
int level = wc->level;
- int lookup_info = 1;
int ret = 0;
+ wc->lookup_info = 1;
while (level >= 0) {
- ret = walk_down_proc(trans, root, path, wc, lookup_info);
+ ret = walk_down_proc(trans, root, path, wc);
if (ret)
break;
@@ -5764,7 +5899,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
btrfs_header_nritems(path->nodes[level]))
break;
- ret = do_walk_down(trans, root, path, wc, &lookup_info);
+ ret = do_walk_down(trans, root, path, wc);
if (ret > 0) {
path->slots[level]++;
continue;
@@ -5775,6 +5910,23 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
return (ret == 1) ? 0 : ret;
}
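
Both walk_down_tree() above and walk_up_tree() below drive a depth-first traversal with an explicit path (one node and one slot per level) instead of recursion. A toy userspace analogue of that shape, processing each node only after all of its slots have been visited, the way walk_up_proc() is invoked (illustrative only, not btrfs code):

    #include <stdio.h>

    #define TOY_MAX_LEVEL 8

    struct toy_node {
            int nr;                         /* number of children, 0 for a leaf */
            struct toy_node *child[4];
    };

    /* Walk @root (at @root_level) with an explicit path, like path->nodes[]
     * and path->slots[]: descend while a slot points at a child, process a
     * node once all of its slots are done, then advance the parent's slot. */
    static void toy_walk(struct toy_node *root, int root_level)
    {
            struct toy_node *nodes[TOY_MAX_LEVEL] = { NULL };
            int slots[TOY_MAX_LEVEL] = { 0 };
            int level = root_level;

            nodes[root_level] = root;
            while (level <= root_level) {
                    /* "walk down" */
                    while (level > 0 && slots[level] < nodes[level]->nr) {
                            nodes[level - 1] = nodes[level]->child[slots[level]];
                            slots[level - 1] = 0;
                            level--;
                    }
                    /* leaf reached, or all children of this node are done */
                    printf("process node at level %d\n", level);
                    /* "walk up": move to the parent's next slot */
                    level++;
                    if (level <= root_level)
                            slots[level]++;
            }
    }

    int main(void)
    {
            struct toy_node leaf0 = { 0, { NULL } }, leaf1 = { 0, { NULL } };
            struct toy_node root = { 2, { &leaf0, &leaf1 } };

            toy_walk(&root, 1);
            return 0;
    }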
+/*
+ * walk_up_tree() is responsible for making sure we visit every slot on our
+ * current node, and if we're at the end of that node then we call
+ * walk_up_proc() on our current node which will do one of a few things based on
+ * our stage.
+ *
+ * UPDATE_BACKREF. If our wc->level is currently less than our wc->shared_level
+ * then we need to walk back up the tree, and then go back down into the
+ * other slots via walk_down_tree to update any other children from our original
+ * wc->shared_level. Once we're at or above our wc->shared_level we can switch
+ * back to DROP_REFERENCE, look up the current node's refs and flags, and carry on.
+ *
+ * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block.
+ * If we're level 0 then we need to call btrfs_dec_ref() on all of the data extents
+ * in our current leaf. After that we call btrfs_free_tree_block() on the
+ * current node and walk up to the next node to walk down the next slot.
+ */
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -5833,8 +5985,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
struct btrfs_root_item *root_item = &root->root_item;
struct walk_control *wc;
struct btrfs_key key;
- int err = 0;
- int ret;
+ const u64 rootid = btrfs_root_id(root);
+ int ret = 0;
int level;
bool root_dropped = false;
bool unfinished_drop = false;
@@ -5843,14 +5995,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
wc = kzalloc(sizeof(*wc), GFP_NOFS);
if (!wc) {
btrfs_free_path(path);
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -5863,12 +6015,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_free;
}
- err = btrfs_run_delayed_items(trans);
- if (err)
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
goto out_end_trans;
/*
@@ -5899,11 +6051,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->lowest_level = 0;
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_end_trans;
- }
+
WARN_ON(ret > 0);
+ ret = 0;
/*
* unlock our path, this is safe because only this
@@ -5916,14 +6068,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_tree_lock(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK;
+ /*
+ * btrfs_lookup_extent_info() returns 0 for success,
+ * or < 0 for error.
+ */
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
&wc->flags[level], NULL);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_end_trans;
- }
+
BUG_ON(wc->refs[level] == 0);
if (level == btrfs_root_drop_level(root_item))
@@ -5949,19 +6104,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
break;
}
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
break;
}
if (ret > 0) {
BUG_ON(wc->stage != DROP_REFERENCE);
+ ret = 0;
break;
}
@@ -5983,7 +6137,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
root_item);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
@@ -5994,7 +6147,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
"drop snapshot early exit");
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out_free;
}
@@ -6008,19 +6161,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_free;
}
}
}
btrfs_release_path(path);
- if (err)
+ if (ret)
goto out_end_trans;
ret = btrfs_del_root(trans, &root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
@@ -6029,10 +6181,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
NULL, NULL);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
} else if (ret > 0) {
- /* if we fail to delete the orphan item this time
+ ret = 0;
+ /*
+ * If we fail to delete the orphan item this time
* around, it'll get picked up the next time.
*
* The most common failure here is just -ENOENT.
@@ -6063,11 +6216,19 @@ out_free:
kfree(wc);
btrfs_free_path(path);
out:
+ if (!ret && root_dropped) {
+ ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid);
+ if (ret < 0)
+ btrfs_warn_rl(fs_info,
+ "failed to cleanup qgroup 0/%llu: %d",
+ rootid, ret);
+ ret = 0;
+ }
/*
* We were an unfinished drop root, check to see if there are any
* pending, and if not clear and wake up any waiters.
*/
- if (!err && unfinished_drop)
+ if (!ret && unfinished_drop)
btrfs_maybe_wake_unfinished_drop(fs_info);
/*
@@ -6079,7 +6240,7 @@ out:
*/
if (!for_reloc && !root_dropped)
btrfs_add_dead_root(root);
- return err;
+ return ret;
}
/*
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index af9f8800d5ac..2ad51130c037 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -127,10 +127,10 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 empty_size,
u64 reloc_src_root,
enum btrfs_lock_nesting nest);
-void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- u64 root_id,
- struct extent_buffer *buf,
- u64 parent, int last_ref);
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 owner,
u64 offset, u64 ram_bytes,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 958155cc43a8..aa7f8148cd0d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -164,23 +164,8 @@ void __cold extent_buffer_free_cachep(void)
kmem_cache_destroy(extent_buffer_cache);
}
-void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
-{
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
-
- while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
- BUG_ON(!page); /* Pages should be in the extent_io_tree */
- clear_page_dirty_for_io(page);
- put_page(page);
- index++;
- }
-}
-
static void process_one_page(struct btrfs_fs_info *fs_info,
- struct page *page, struct page *locked_page,
+ struct page *page, const struct page *locked_page,
unsigned long page_ops, u64 start, u64 end)
{
struct folio *folio = page_folio(page);
@@ -203,7 +188,7 @@ static void process_one_page(struct btrfs_fs_info *fs_info,
}
static void __process_pages_contig(struct address_space *mapping,
- struct page *locked_page, u64 start, u64 end,
+ const struct page *locked_page, u64 start, u64 end,
unsigned long page_ops)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
@@ -230,8 +215,8 @@ static void __process_pages_contig(struct address_space *mapping,
}
}
-static noinline void __unlock_for_delalloc(struct inode *inode,
- struct page *locked_page,
+static noinline void __unlock_for_delalloc(const struct inode *inode,
+ const struct page *locked_page,
u64 start, u64 end)
{
unsigned long index = start >> PAGE_SHIFT;
@@ -246,7 +231,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
}
static noinline int lock_delalloc_pages(struct inode *inode,
- struct page *locked_page,
+ const struct page *locked_page,
u64 start,
u64 end)
{
@@ -411,7 +396,7 @@ out_failed:
}
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
- struct page *locked_page,
+ const struct page *locked_page,
struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
@@ -667,24 +652,22 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
}
/*
- * Populate every free slot in a provided array with folios.
+ * Populate every free slot in a provided array with folios using GFP_NOFS.
*
* @nr_folios: number of folios to allocate
* @folio_array: the array to fill with folios; any existing non-NULL entries in
* the array will be skipped
- * @extra_gfp: the extra GFP flags for the allocation
*
* Return: 0 if all folios were able to be allocated;
* -ENOMEM otherwise, the partially allocated folios would be freed and
* the array slots zeroed
*/
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
- gfp_t extra_gfp)
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array)
{
for (int i = 0; i < nr_folios; i++) {
if (folio_array[i])
continue;
- folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0);
+ folio_array[i] = folio_alloc(GFP_NOFS, 0);
if (!folio_array[i])
goto error;
}
@@ -698,21 +681,21 @@ error:
}
/*
- * Populate every free slot in a provided array with pages.
+ * Populate every free slot in a provided array with pages, using GFP_NOFS.
*
* @nr_pages: number of pages to allocate
* @page_array: the array to fill with pages; any existing non-null entries in
- * the array will be skipped
- * @extra_gfp: the extra GFP flags for the allocation.
+ * the array will be skipped
+ * @nofail: whether to use the __GFP_NOFAIL flag for the allocation
*
* Return: 0 if all pages were able to be allocated;
* -ENOMEM otherwise, the partially allocated pages would be freed and
* the array slots zeroed
*/
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
- gfp_t extra_gfp)
+ bool nofail)
{
- const gfp_t gfp = GFP_NOFS | extra_gfp;
+ const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS;
unsigned int allocated;
for (allocated = 0; allocated < nr_pages;) {
@@ -736,13 +719,13 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
*
* For now, the folios populated are always in order 0 (aka, single page).
*/
-static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp)
+static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
{
struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
int num_pages = num_extent_pages(eb);
int ret;
- ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp);
+ ret = btrfs_alloc_page_array(num_pages, page_array, nofail);
if (ret < 0)
return ret;
@@ -860,7 +843,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
/* Cap to the current ordered extent boundary if there is one. */
if (len > bio_ctrl->len_to_oe_boundary) {
ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
- ASSERT(is_data_inode(&inode->vfs_inode));
+ ASSERT(is_data_inode(inode));
len = bio_ctrl->len_to_oe_boundary;
}
@@ -1083,10 +1066,10 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
iosize = min(extent_map_end(em) - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
if (compress_type != BTRFS_COMPRESS_NONE)
- disk_bytenr = em->block_start;
+ disk_bytenr = em->disk_bytenr;
else
- disk_bytenr = em->block_start + extent_offset;
- block_start = em->block_start;
+ disk_bytenr = extent_map_block_start(em) + extent_offset;
+ block_start = extent_map_block_start(em);
if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
@@ -1226,13 +1209,23 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc)
{
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
+ struct folio *folio = page_folio(page);
+ const bool is_subpage = btrfs_is_subpage(fs_info, page->mapping);
const u64 page_start = page_offset(page);
const u64 page_end = page_start + PAGE_SIZE - 1;
+ /*
+ * Save the last found delalloc end. As the delalloc end can go beyond
+ * the page boundary, we cannot rely on the subpage bitmap to locate
+ * the last delalloc end.
+ */
+ u64 last_delalloc_end = 0;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
int ret = 0;
+ /* Lock all (subpage) delalloc ranges inside the page first. */
while (delalloc_start < page_end) {
delalloc_end = page_end;
if (!find_lock_delalloc_range(&inode->vfs_inode, page,
@@ -1240,15 +1233,95 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = delalloc_end + 1;
continue;
}
-
- ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
- delalloc_end, wbc);
- if (ret < 0)
- return ret;
-
+ btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start,
+ min(delalloc_end, page_end) + 1 -
+ delalloc_start);
+ last_delalloc_end = delalloc_end;
delalloc_start = delalloc_end + 1;
}
+ delalloc_start = page_start;
+
+ if (!last_delalloc_end)
+ goto out;
+
+ /* Run the delalloc ranges for the above locked ranges. */
+ while (delalloc_start < page_end) {
+ u64 found_start;
+ u32 found_len;
+ bool found;
+
+ if (!is_subpage) {
+ /*
+ * For non-subpage case, the found delalloc range must
+ * cover this page and there must be only one locked
+ * delalloc range.
+ */
+ found_start = page_start;
+ found_len = last_delalloc_end + 1 - found_start;
+ found = true;
+ } else {
+ found = btrfs_subpage_find_writer_locked(fs_info, folio,
+ delalloc_start, &found_start, &found_len);
+ }
+ if (!found)
+ break;
+ /*
+ * If the subpage range covers the last sector, the delalloc range
+ * may end beyond the page boundary; use the saved delalloc_end
+ * instead.
+ */
+ if (found_start + found_len >= page_end)
+ found_len = last_delalloc_end + 1 - found_start;
+
+ if (ret >= 0) {
+ /* No errors hit so far, run the current delalloc range. */
+ ret = btrfs_run_delalloc_range(inode, page, found_start,
+ found_start + found_len - 1,
+ wbc);
+ } else {
+ /*
+ * We've hit an error during a previous delalloc range,
+ * so we have to clean up the remaining locked ranges.
+ */
+ unlock_extent(&inode->io_tree, found_start,
+ found_start + found_len - 1, NULL);
+ __unlock_for_delalloc(&inode->vfs_inode, page, found_start,
+ found_start + found_len - 1);
+ }
+
+ /*
+ * We can hit btrfs_run_delalloc_range() with >0 return value.
+ *
+ * This happens when either the IO is already done and page
+ * unlocked (inline) or the IO submission and page unlock would
+ * be handled as async (compression).
+ *
+ * Inline is only possible for regular sectorsize for now.
+ *
+ * Compression is possible for both subpage and regular cases,
+ * but even for subpage, compression only happens for page-aligned
+ * ranges, thus the found delalloc range must go beyond the current
+ * page.
+ */
+ if (ret > 0)
+ ASSERT(!is_subpage || found_start + found_len >= page_end);
+
+ /*
+ * Above btrfs_run_delalloc_range() may have unlocked the page,
+ * thus for the last range, we cannot touch the page anymore.
+ */
+ if (found_start + found_len >= last_delalloc_end + 1)
+ break;
+ delalloc_start = found_start + found_len;
+ }
+ if (ret < 0)
+ return ret;
+out:
+ if (last_delalloc_end)
+ delalloc_end = last_delalloc_end;
+ else
+ delalloc_end = page_end;
/*
* delalloc_end is already one less than the total length, so
* we don't subtract one from PAGE_SIZE
@@ -1292,7 +1365,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
* Return the next dirty range in [@start, @end).
* If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
*/
-static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
+static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
struct folio *folio = page_folio(page);
@@ -1339,20 +1412,23 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
* < 0 if there were errors (page still locked)
*/
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
- struct page *page,
+ struct page *page, u64 start, u32 len,
struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size,
int *nr_ret)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 cur = page_offset(page);
- u64 end = cur + PAGE_SIZE - 1;
+ u64 cur = start;
+ u64 end = start + len - 1;
u64 extent_offset;
u64 block_start;
struct extent_map *em;
int ret = 0;
int nr = 0;
+ ASSERT(start >= page_offset(page) &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+
ret = btrfs_writepage_cow_fixup(page);
if (ret) {
/* Fixup worker will requeue */
@@ -1405,8 +1481,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
- block_start = em->block_start;
- disk_bytenr = em->block_start + extent_offset;
+ block_start = extent_map_block_start(em);
+ disk_bytenr = extent_map_block_start(em) + extent_offset;
ASSERT(!extent_map_is_compressed(em));
ASSERT(block_start != EXTENT_MAP_HOLE);
@@ -1441,7 +1517,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
nr++;
}
- btrfs_folio_assert_not_dirty(fs_info, page_folio(page));
+ btrfs_folio_assert_not_dirty(fs_info, page_folio(page), start, len);
*nr_ret = nr;
return 0;
@@ -1499,7 +1575,8 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl
if (ret)
goto done;
- ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, page_offset(page),
+ PAGE_SIZE, bio_ctrl, i_size, &nr);
if (ret == 1)
return 0;
@@ -1516,7 +1593,8 @@ done:
PAGE_SIZE, !ret);
mapping_set_error(page->mapping, ret);
}
- unlock_page(page);
+
+ btrfs_folio_end_all_writers(inode_to_fs_info(inode), folio);
ASSERT(ret <= 0);
return ret;
}
@@ -1648,7 +1726,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
* context.
*/
static struct extent_buffer *find_extent_buffer_nolock(
- struct btrfs_fs_info *fs_info, u64 start)
+ const struct btrfs_fs_info *fs_info, u64 start)
{
struct extent_buffer *eb;
@@ -2217,7 +2295,7 @@ retry:
* already been ran (aka, ordered extent inserted) and all pages are still
* locked.
*/
-void extent_write_locked_range(struct inode *inode, struct page *locked_page,
+void extent_write_locked_range(struct inode *inode, const struct page *locked_page,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty)
{
@@ -2246,20 +2324,21 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
page = find_get_page(mapping, cur >> PAGE_SHIFT);
ASSERT(PageLocked(page));
- if (pages_dirty && page != locked_page) {
+ if (pages_dirty && page != locked_page)
ASSERT(PageDirty(page));
- clear_page_dirty_for_io(page);
- }
- ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl,
- i_size, &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, cur, cur_len,
+ &bio_ctrl, i_size, &nr);
if (ret == 1)
goto next_page;
/* Make sure the mapping tag for page dirty gets cleared. */
if (nr == 0) {
- set_page_writeback(page);
- end_page_writeback(page);
+ struct folio *folio;
+
+ folio = page_folio(page);
+ btrfs_folio_set_writeback(fs_info, folio, cur, cur_len);
+ btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len);
}
if (ret) {
btrfs_mark_ordered_io_finished(BTRFS_I(inode), page,
@@ -2470,877 +2549,6 @@ next:
return try_release_extent_state(io_tree, page, mask);
}
-struct btrfs_fiemap_entry {
- u64 offset;
- u64 phys;
- u64 len;
- u32 flags;
-};
-
-/*
- * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file
- * range from the inode's io tree, unlock the subvolume tree search path, flush
- * the fiemap cache and relock the file range and research the subvolume tree.
- * The value here is something negative that can't be confused with a valid
- * errno value and different from 1 because that's also a return value from
- * fiemap_fill_next_extent() and also it's often used to mean some btree search
- * did not find a key, so make it some distinct negative value.
- */
-#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
-
-/*
- * Used to:
- *
- * - Cache the next entry to be emitted to the fiemap buffer, so that we can
- * merge extents that are contiguous and can be grouped as a single one;
- *
- * - Store extents ready to be written to the fiemap buffer in an intermediary
- * buffer. This intermediary buffer is to ensure that in case the fiemap
- * buffer is memory mapped to the fiemap target file, we don't deadlock
- * during btrfs_page_mkwrite(). This is because during fiemap we are locking
- * an extent range in order to prevent races with delalloc flushing and
- * ordered extent completion, which is needed in order to reliably detect
- * delalloc in holes and prealloc extents. And this can lead to a deadlock
- * if the fiemap buffer is memory mapped to the file we are running fiemap
- * against (a silly, useless in practice scenario, but possible) because
- * btrfs_page_mkwrite() will try to lock the same extent range.
- */
-struct fiemap_cache {
- /* An array of ready fiemap entries. */
- struct btrfs_fiemap_entry *entries;
- /* Number of entries in the entries array. */
- int entries_size;
- /* Index of the next entry in the entries array to write to. */
- int entries_pos;
- /*
- * Once the entries array is full, this indicates what's the offset for
- * the next file extent item we must search for in the inode's subvolume
- * tree after unlocking the extent range in the inode's io tree and
- * releasing the search path.
- */
- u64 next_search_offset;
- /*
- * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
- * to count ourselves emitted extents and stop instead of relying on
- * fiemap_fill_next_extent() because we buffer ready fiemap entries at
- * the @entries array, and we want to stop as soon as we hit the max
- * amount of extents to map, not just to save time but also to make the
- * logic at extent_fiemap() simpler.
- */
- unsigned int extents_mapped;
- /* Fields for the cached extent (unsubmitted, not ready, extent). */
- u64 offset;
- u64 phys;
- u64 len;
- u32 flags;
- bool cached;
-};
-
-static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache)
-{
- for (int i = 0; i < cache->entries_pos; i++) {
- struct btrfs_fiemap_entry *entry = &cache->entries[i];
- int ret;
-
- ret = fiemap_fill_next_extent(fieinfo, entry->offset,
- entry->phys, entry->len,
- entry->flags);
- /*
- * Ignore 1 (reached max entries) because we keep track of that
- * ourselves in emit_fiemap_extent().
- */
- if (ret < 0)
- return ret;
- }
- cache->entries_pos = 0;
-
- return 0;
-}
-
-/*
- * Helper to submit fiemap extent.
- *
- * Will try to merge current fiemap extent specified by @offset, @phys,
- * @len and @flags with cached one.
- * And only when we fails to merge, cached one will be submitted as
- * fiemap extent.
- *
- * Return value is the same as fiemap_fill_next_extent().
- */
-static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache,
- u64 offset, u64 phys, u64 len, u32 flags)
-{
- struct btrfs_fiemap_entry *entry;
- u64 cache_end;
-
- /* Set at the end of extent_fiemap(). */
- ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
-
- if (!cache->cached)
- goto assign;
-
- /*
- * When iterating the extents of the inode, at extent_fiemap(), we may
- * find an extent that starts at an offset behind the end offset of the
- * previous extent we processed. This happens if fiemap is called
- * without FIEMAP_FLAG_SYNC and there are ordered extents completing
- * after we had to unlock the file range, release the search path, emit
- * the fiemap extents stored in the buffer (cache->entries array) and
- * the lock the remainder of the range and re-search the btree.
- *
- * For example we are in leaf X processing its last item, which is the
- * file extent item for file range [512K, 1M[, and after
- * btrfs_next_leaf() releases the path, there's an ordered extent that
- * completes for the file range [768K, 2M[, and that results in trimming
- * the file extent item so that it now corresponds to the file range
- * [512K, 768K[ and a new file extent item is inserted for the file
- * range [768K, 2M[, which may end up as the last item of leaf X or as
- * the first item of the next leaf - in either case btrfs_next_leaf()
- * will leave us with a path pointing to the new extent item, for the
- * file range [768K, 2M[, since that's the first key that follows the
- * last one we processed. So in order not to report overlapping extents
- * to user space, we trim the length of the previously cached extent and
- * emit it.
- *
- * Upon calling btrfs_next_leaf() we may also find an extent with an
- * offset smaller than or equals to cache->offset, and this happens
- * when we had a hole or prealloc extent with several delalloc ranges in
- * it, but after btrfs_next_leaf() released the path, delalloc was
- * flushed and the resulting ordered extents were completed, so we can
- * now have found a file extent item for an offset that is smaller than
- * or equals to what we have in cache->offset. We deal with this as
- * described below.
- */
- cache_end = cache->offset + cache->len;
- if (cache_end > offset) {
- if (offset == cache->offset) {
- /*
- * We cached a dealloc range (found in the io tree) for
- * a hole or prealloc extent and we have now found a
- * file extent item for the same offset. What we have
- * now is more recent and up to date, so discard what
- * we had in the cache and use what we have just found.
- */
- goto assign;
- } else if (offset > cache->offset) {
- /*
- * The extent range we previously found ends after the
- * offset of the file extent item we found and that
- * offset falls somewhere in the middle of that previous
- * extent range. So adjust the range we previously found
- * to end at the offset of the file extent item we have
- * just found, since this extent is more up to date.
- * Emit that adjusted range and cache the file extent
- * item we have just found. This corresponds to the case
- * where a previously found file extent item was split
- * due to an ordered extent completing.
- */
- cache->len = offset - cache->offset;
- goto emit;
- } else {
- const u64 range_end = offset + len;
-
- /*
- * The offset of the file extent item we have just found
- * is behind the cached offset. This means we were
- * processing a hole or prealloc extent for which we
- * have found delalloc ranges (in the io tree), so what
- * we have in the cache is the last delalloc range we
- * found while the file extent item we found can be
- * either for a whole delalloc range we previously
- * emmitted or only a part of that range.
- *
- * We have two cases here:
- *
- * 1) The file extent item's range ends at or behind the
- * cached extent's end. In this case just ignore the
- * current file extent item because we don't want to
- * overlap with previous ranges that may have been
- * emmitted already;
- *
- * 2) The file extent item starts behind the currently
- * cached extent but its end offset goes beyond the
- * end offset of the cached extent. We don't want to
- * overlap with a previous range that may have been
- * emmitted already, so we emit the currently cached
- * extent and then partially store the current file
- * extent item's range in the cache, for the subrange
- * going the cached extent's end to the end of the
- * file extent item.
- */
- if (range_end <= cache_end)
- return 0;
-
- if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
- phys += cache_end - offset;
-
- offset = cache_end;
- len = range_end - cache_end;
- goto emit;
- }
- }
-
- /*
- * Only merges fiemap extents if
- * 1) Their logical addresses are continuous
- *
- * 2) Their physical addresses are continuous
- * So truly compressed (physical size smaller than logical size)
- * extents won't get merged with each other
- *
- * 3) Share same flags
- */
- if (cache->offset + cache->len == offset &&
- cache->phys + cache->len == phys &&
- cache->flags == flags) {
- cache->len += len;
- return 0;
- }
-
-emit:
- /* Not mergeable, need to submit cached one */
-
- if (cache->entries_pos == cache->entries_size) {
- /*
- * We will need to research for the end offset of the last
- * stored extent and not from the current offset, because after
- * unlocking the range and releasing the path, if there's a hole
- * between that end offset and this current offset, a new extent
- * may have been inserted due to a new write, so we don't want
- * to miss it.
- */
- entry = &cache->entries[cache->entries_size - 1];
- cache->next_search_offset = entry->offset + entry->len;
- cache->cached = false;
-
- return BTRFS_FIEMAP_FLUSH_CACHE;
- }
-
- entry = &cache->entries[cache->entries_pos];
- entry->offset = cache->offset;
- entry->phys = cache->phys;
- entry->len = cache->len;
- entry->flags = cache->flags;
- cache->entries_pos++;
- cache->extents_mapped++;
-
- if (cache->extents_mapped == fieinfo->fi_extents_max) {
- cache->cached = false;
- return 1;
- }
-assign:
- cache->cached = true;
- cache->offset = offset;
- cache->phys = phys;
- cache->len = len;
- cache->flags = flags;
-
- return 0;
-}
-
-/*
- * Emit last fiemap cache
- *
- * The last fiemap cache may still be cached in the following case:
- * 0 4k 8k
- * |<- Fiemap range ->|
- * |<------------ First extent ----------->|
- *
- * In this case, the first extent range will be cached but not emitted.
- * So we must emit it before ending extent_fiemap().
- */
-static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache)
-{
- int ret;
-
- if (!cache->cached)
- return 0;
-
- ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
- cache->len, cache->flags);
- cache->cached = false;
- if (ret > 0)
- ret = 0;
- return ret;
-}
-
-static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
-{
- struct extent_buffer *clone = path->nodes[0];
- struct btrfs_key key;
- int slot;
- int ret;
-
- path->slots[0]++;
- if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
- return 0;
-
- /*
- * Add a temporary extra ref to an already cloned extent buffer to
- * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
- * the cost of allocating a new one.
- */
- ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
- atomic_inc(&clone->refs);
-
- ret = btrfs_next_leaf(inode->root, path);
- if (ret != 0)
- goto out;
-
- /*
- * Don't bother with cloning if there are no more file extent items for
- * our inode.
- */
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
- ret = 1;
- goto out;
- }
-
- /*
- * Important to preserve the start field, for the optimizations when
- * checking if extents are shared (see extent_fiemap()).
- *
- * We must set ->start before calling copy_extent_buffer_full(). If we
- * are on sub-pagesize blocksize, we use ->start to determine the offset
- * into the folio where our eb exists, and if we update ->start after
- * the fact then any subsequent reads of the eb may read from a
- * different offset in the folio than where we originally copied into.
- */
- clone->start = path->nodes[0]->start;
- /* See the comment at fiemap_search_slot() about why we clone. */
- copy_extent_buffer_full(clone, path->nodes[0]);
-
- slot = path->slots[0];
- btrfs_release_path(path);
- path->nodes[0] = clone;
- path->slots[0] = slot;
-out:
- if (ret)
- free_extent_buffer(clone);
-
- return ret;
-}
-
-/*
- * Search for the first file extent item that starts at a given file offset or
- * the one that starts immediately before that offset.
- * Returns: 0 on success, < 0 on error, 1 if not found.
- */
-static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
- u64 file_offset)
-{
- const u64 ino = btrfs_ino(inode);
- struct btrfs_root *root = inode->root;
- struct extent_buffer *clone;
- struct btrfs_key key;
- int slot;
- int ret;
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = file_offset;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- return ret;
-
- if (ret > 0 && path->slots[0] > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
- path->slots[0]--;
- }
-
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret != 0)
- return ret;
-
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
- return 1;
- }
-
- /*
- * We clone the leaf and use it during fiemap. This is because while
- * using the leaf we do expensive things like checking if an extent is
- * shared, which can take a long time. In order to prevent blocking
- * other tasks for too long, we use a clone of the leaf. We have locked
- * the file range in the inode's io tree, so we know none of our file
- * extent items can change. This way we avoid blocking other tasks that
- * want to insert items for other inodes in the same leaf or b+tree
- * rebalance operations (triggered for example when someone is trying
- * to push items into this leaf when trying to insert an item in a
- * neighbour leaf).
- * We also need the private clone because holding a read lock on an
- * extent buffer of the subvolume's b+tree will make lockdep unhappy
- * when we check if extents are shared, as backref walking may need to
- * lock the same leaf we are processing.
- */
- clone = btrfs_clone_extent_buffer(path->nodes[0]);
- if (!clone)
- return -ENOMEM;
-
- slot = path->slots[0];
- btrfs_release_path(path);
- path->nodes[0] = clone;
- path->slots[0] = slot;
-
- return 0;
-}
-
-/*
- * Process a range which is a hole or a prealloc extent in the inode's subvolume
- * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
- * extent. The end offset (@end) is inclusive.
- */
-static int fiemap_process_hole(struct btrfs_inode *inode,
- struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache,
- struct extent_state **delalloc_cached_state,
- struct btrfs_backref_share_check_ctx *backref_ctx,
- u64 disk_bytenr, u64 extent_offset,
- u64 extent_gen,
- u64 start, u64 end)
-{
- const u64 i_size = i_size_read(&inode->vfs_inode);
- u64 cur_offset = start;
- u64 last_delalloc_end = 0;
- u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
- bool checked_extent_shared = false;
- int ret;
-
- /*
- * There can be no delalloc past i_size, so don't waste time looking for
- * it beyond i_size.
- */
- while (cur_offset < end && cur_offset < i_size) {
- u64 delalloc_start;
- u64 delalloc_end;
- u64 prealloc_start;
- u64 prealloc_len = 0;
- bool delalloc;
-
- delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
- delalloc_cached_state,
- &delalloc_start,
- &delalloc_end);
- if (!delalloc)
- break;
-
- /*
- * If this is a prealloc extent we have to report every section
- * of it that has no delalloc.
- */
- if (disk_bytenr != 0) {
- if (last_delalloc_end == 0) {
- prealloc_start = start;
- prealloc_len = delalloc_start - start;
- } else {
- prealloc_start = last_delalloc_end + 1;
- prealloc_len = delalloc_start - prealloc_start;
- }
- }
-
- if (prealloc_len > 0) {
- if (!checked_extent_shared && fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode,
- disk_bytenr,
- extent_gen,
- backref_ctx);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- prealloc_flags |= FIEMAP_EXTENT_SHARED;
-
- checked_extent_shared = true;
- }
- ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
- disk_bytenr + extent_offset,
- prealloc_len, prealloc_flags);
- if (ret)
- return ret;
- extent_offset += prealloc_len;
- }
-
- ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
- delalloc_end + 1 - delalloc_start,
- FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- if (ret)
- return ret;
-
- last_delalloc_end = delalloc_end;
- cur_offset = delalloc_end + 1;
- extent_offset += cur_offset - delalloc_start;
- cond_resched();
- }
-
- /*
- * Either we found no delalloc for the whole prealloc extent or we have
- * a prealloc extent that spans i_size or starts at or after i_size.
- */
- if (disk_bytenr != 0 && last_delalloc_end < end) {
- u64 prealloc_start;
- u64 prealloc_len;
-
- if (last_delalloc_end == 0) {
- prealloc_start = start;
- prealloc_len = end + 1 - start;
- } else {
- prealloc_start = last_delalloc_end + 1;
- prealloc_len = end + 1 - prealloc_start;
- }
-
- if (!checked_extent_shared && fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode,
- disk_bytenr,
- extent_gen,
- backref_ctx);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- prealloc_flags |= FIEMAP_EXTENT_SHARED;
- }
- ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
- disk_bytenr + extent_offset,
- prealloc_len, prealloc_flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
- struct btrfs_path *path,
- u64 *last_extent_end_ret)
-{
- const u64 ino = btrfs_ino(inode);
- struct btrfs_root *root = inode->root;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *ei;
- struct btrfs_key key;
- u64 disk_bytenr;
- int ret;
-
- /*
- * Lookup the last file extent. We're not using i_size here because
- * there might be preallocation past i_size.
- */
- ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
- /* There can't be a file extent item at offset (u64)-1 */
- ASSERT(ret != 0);
- if (ret < 0)
- return ret;
-
- /*
- * For a non-existing key, btrfs_search_slot() always leaves us at a
- * slot > 0, except if the btree is empty, which is impossible because
- * at least it has the inode item for this inode and all the items for
- * the root inode 256.
- */
- ASSERT(path->slots[0] > 0);
- path->slots[0]--;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
- /* No file extent items in the subvolume tree. */
- *last_extent_end_ret = 0;
- return 0;
- }
-
- /*
- * For an inline extent, the disk_bytenr is where inline data starts at,
- * so first check if we have an inline extent item before checking if we
- * have an implicit hole (disk_bytenr == 0).
- */
- ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
- *last_extent_end_ret = btrfs_file_extent_end(path);
- return 0;
- }
-
- /*
- * Find the last file extent item that is not a hole (when NO_HOLES is
- * not enabled). This should take at most 2 iterations in the worst
- * case: we have one hole file extent item at slot 0 of a leaf and
- * another hole file extent item as the last item in the previous leaf.
- * This is because we merge file extent items that represent holes.
- */
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
- while (disk_bytenr == 0) {
- ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- /* No file extent items that are not holes. */
- *last_extent_end_ret = 0;
- return 0;
- }
- leaf = path->nodes[0];
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
- }
-
- *last_extent_end_ret = btrfs_file_extent_end(path);
- return 0;
-}
-
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
-{
- const u64 ino = btrfs_ino(inode);
- struct extent_state *cached_state = NULL;
- struct extent_state *delalloc_cached_state = NULL;
- struct btrfs_path *path;
- struct fiemap_cache cache = { 0 };
- struct btrfs_backref_share_check_ctx *backref_ctx;
- u64 last_extent_end;
- u64 prev_extent_end;
- u64 range_start;
- u64 range_end;
- const u64 sectorsize = inode->root->fs_info->sectorsize;
- bool stopped = false;
- int ret;
-
- cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
- cache.entries = kmalloc_array(cache.entries_size,
- sizeof(struct btrfs_fiemap_entry),
- GFP_KERNEL);
- backref_ctx = btrfs_alloc_backref_share_check_ctx();
- path = btrfs_alloc_path();
- if (!cache.entries || !backref_ctx || !path) {
- ret = -ENOMEM;
- goto out;
- }
-
-restart:
- range_start = round_down(start, sectorsize);
- range_end = round_up(start + len, sectorsize);
- prev_extent_end = range_start;
-
- lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
-
- ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
- if (ret < 0)
- goto out_unlock;
- btrfs_release_path(path);
-
- path->reada = READA_FORWARD;
- ret = fiemap_search_slot(inode, path, range_start);
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /*
- * No file extent item found, but we may have delalloc between
- * the current offset and i_size. So check for that.
- */
- ret = 0;
- goto check_eof_delalloc;
- }
-
- while (prev_extent_end < range_end) {
- struct extent_buffer *leaf = path->nodes[0];
- struct btrfs_file_extent_item *ei;
- struct btrfs_key key;
- u64 extent_end;
- u64 extent_len;
- u64 extent_offset = 0;
- u64 extent_gen;
- u64 disk_bytenr = 0;
- u64 flags = 0;
- int extent_type;
- u8 compression;
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
- break;
-
- extent_end = btrfs_file_extent_end(path);
-
- /*
- * The first iteration can leave us at an extent item that ends
- * before our range's start. Move to the next item.
- */
- if (extent_end <= range_start)
- goto next_item;
-
- backref_ctx->curr_leaf_bytenr = leaf->start;
-
- /* We have in implicit hole (NO_HOLES feature enabled). */
- if (prev_extent_end < key.offset) {
- const u64 hole_end = min(key.offset, range_end) - 1;
-
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state,
- backref_ctx, 0, 0, 0,
- prev_extent_end, hole_end);
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /* fiemap_fill_next_extent() told us to stop. */
- stopped = true;
- break;
- }
-
- /* We've reached the end of the fiemap range, stop. */
- if (key.offset >= range_end) {
- stopped = true;
- break;
- }
- }
-
- extent_len = extent_end - key.offset;
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- compression = btrfs_file_extent_compression(leaf, ei);
- extent_type = btrfs_file_extent_type(leaf, ei);
- extent_gen = btrfs_file_extent_generation(leaf, ei);
-
- if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
- if (compression == BTRFS_COMPRESS_NONE)
- extent_offset = btrfs_file_extent_offset(leaf, ei);
- }
-
- if (compression != BTRFS_COMPRESS_NONE)
- flags |= FIEMAP_EXTENT_ENCODED;
-
- if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- flags |= FIEMAP_EXTENT_DATA_INLINE;
- flags |= FIEMAP_EXTENT_NOT_ALIGNED;
- ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
- extent_len, flags);
- } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state,
- backref_ctx,
- disk_bytenr, extent_offset,
- extent_gen, key.offset,
- extent_end - 1);
- } else if (disk_bytenr == 0) {
- /* We have an explicit hole. */
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state,
- backref_ctx, 0, 0, 0,
- key.offset, extent_end - 1);
- } else {
- /* We have a regular extent. */
- if (fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode,
- disk_bytenr,
- extent_gen,
- backref_ctx);
- if (ret < 0)
- goto out_unlock;
- else if (ret > 0)
- flags |= FIEMAP_EXTENT_SHARED;
- }
-
- ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
- disk_bytenr + extent_offset,
- extent_len, flags);
- }
-
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /* emit_fiemap_extent() told us to stop. */
- stopped = true;
- break;
- }
-
- prev_extent_end = extent_end;
-next_item:
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- goto out_unlock;
- }
-
- ret = fiemap_next_leaf_item(inode, path);
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /* No more file extent items for this inode. */
- break;
- }
- cond_resched();
- }
-
-check_eof_delalloc:
- if (!stopped && prev_extent_end < range_end) {
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state, backref_ctx,
- 0, 0, 0, prev_extent_end, range_end - 1);
- if (ret < 0)
- goto out_unlock;
- prev_extent_end = range_end;
- }
-
- if (cache.cached && cache.offset + cache.len >= last_extent_end) {
- const u64 i_size = i_size_read(&inode->vfs_inode);
-
- if (prev_extent_end < i_size) {
- u64 delalloc_start;
- u64 delalloc_end;
- bool delalloc;
-
- delalloc = btrfs_find_delalloc_in_range(inode,
- prev_extent_end,
- i_size - 1,
- &delalloc_cached_state,
- &delalloc_start,
- &delalloc_end);
- if (!delalloc)
- cache.flags |= FIEMAP_EXTENT_LAST;
- } else {
- cache.flags |= FIEMAP_EXTENT_LAST;
- }
- }
-
-out_unlock:
- unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
-
- if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
- btrfs_release_path(path);
- ret = flush_fiemap_cache(fieinfo, &cache);
- if (ret)
- goto out;
- len -= cache.next_search_offset - start;
- start = cache.next_search_offset;
- goto restart;
- } else if (ret < 0) {
- goto out;
- }
-
- /*
- * Must free the path before emitting to the fiemap buffer because we
- * may have a non-cloned leaf and if the fiemap buffer is memory mapped
- * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
- * waiting for an ordered extent that in order to complete needs to
- * modify that leaf, therefore leading to a deadlock.
- */
- btrfs_free_path(path);
- path = NULL;
-
- ret = flush_fiemap_cache(fieinfo, &cache);
- if (ret)
- goto out;
-
- ret = emit_last_fiemap_cache(fieinfo, &cache);
-out:
- free_extent_state(delalloc_cached_state);
- kfree(cache.entries);
- btrfs_free_backref_share_ctx(backref_ctx);
- btrfs_free_path(path);
- return ret;
-}
-
static void __free_extent_buffer(struct extent_buffer *eb)
{
kmem_cache_free(extent_buffer_cache, eb);
@@ -3372,7 +2580,7 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli
return false;
}
-static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio)
+static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -3433,7 +2641,7 @@ static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *f
}
/* Release all pages attached to the extent buffer */
-static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
{
ASSERT(!extent_buffer_under_io(eb));
@@ -3499,7 +2707,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
- ret = alloc_eb_folio_array(new, 0);
+ ret = alloc_eb_folio_array(new, false);
if (ret) {
btrfs_release_extent_buffer(new);
return NULL;
@@ -3507,7 +2715,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
for (int i = 0; i < num_folios; i++) {
struct folio *folio = new->folios[i];
- int ret;
ret = attach_extent_buffer_folio(new, folio, NULL);
if (ret < 0) {
@@ -3533,7 +2740,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
if (!eb)
return NULL;
- ret = alloc_eb_folio_array(eb, 0);
+ ret = alloc_eb_folio_array(eb, false);
if (ret)
goto err;
@@ -3899,7 +3106,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
reallocate:
/* Allocate all pages first. */
- ret = alloc_eb_folio_array(eb, __GFP_NOFAIL);
+ ret = alloc_eb_folio_array(eb, true);
if (ret < 0) {
btrfs_free_subpage(prealloc);
goto out;
@@ -4351,7 +3558,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
}
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
- struct btrfs_tree_parent_check *check)
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_bio *bbio;
bool ret;
@@ -4589,8 +3796,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
return;
if (fs_info->nodesize < PAGE_SIZE) {
- struct folio *folio = eb->folios[0];
-
+ folio = eb->folios[0];
ASSERT(i == 0);
if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
eb->start, eb->len)))
@@ -4608,7 +3814,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
size_t cur;
size_t offset;
char *kaddr;
- char *src = (char *)srcv;
+ const char *src = (const char *)srcv;
unsigned long i = get_eb_folio_index(eb, start);
/* For unmapped (dummy) ebs, no need to check their uptodate status. */
const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -4965,7 +4171,7 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
- struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+ const struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
struct extent_buffer *gang[GANG_LOOKUP_SIZE];
struct extent_buffer *found = NULL;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index dca6b12769ec..dceebd76c7d1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -215,6 +215,11 @@ static inline struct extent_changeset *extent_changeset_alloc(void)
return ret;
}
+static inline void extent_changeset_prealloc(struct extent_changeset *changeset, gfp_t gfp_mask)
+{
+ ulist_prealloc(&changeset->range_changed, gfp_mask);
+}
+
static inline void extent_changeset_release(struct extent_changeset *changeset)
{
if (!changeset)
@@ -235,15 +240,13 @@ bool try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int btrfs_read_folio(struct file *file, struct folio *folio);
-void extent_write_locked_range(struct inode *inode, struct page *locked_page,
+void extent_write_locked_range(struct inode *inode, const struct page *locked_page,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty);
int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void btrfs_readahead(struct readahead_control *rac);
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len);
int set_folio_extent_mapped(struct folio *folio);
int set_page_extent_mapped(struct page *page);
void clear_page_extent_mapped(struct page *page);
@@ -263,7 +266,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
- struct btrfs_tree_parent_check *parent_check);
+ const struct btrfs_tree_parent_check *parent_check);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
@@ -350,9 +353,8 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
void set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
-void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
- struct page *locked_page,
+ const struct page *locked_page,
struct extent_state **cached,
u32 bits_to_clear, unsigned long page_ops);
int extent_invalidate_folio(struct extent_io_tree *tree,
@@ -361,9 +363,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
- gfp_t extra_gfp);
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
- gfp_t extra_gfp);
+ bool nofail);
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b4c9a6aa118c..81558f90ee80 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -33,7 +33,7 @@ void __cold extent_map_exit(void)
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
- tree->map = RB_ROOT_CACHED;
+ tree->root = RB_ROOT;
INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
@@ -85,27 +85,24 @@ static void dec_evictable_extent_maps(struct btrfs_inode *inode)
percpu_counter_dec(&fs_info->evictable_extent_maps);
}
-static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
+static int tree_insert(struct rb_root *root, struct extent_map *em)
{
- struct rb_node **p = &root->rb_root.rb_node;
+ struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct extent_map *entry = NULL;
struct rb_node *orig_parent = NULL;
u64 end = range_end(em->start, em->len);
- bool leftmost = true;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- if (em->start < entry->start) {
+ if (em->start < entry->start)
p = &(*p)->rb_left;
- } else if (em->start >= extent_map_end(entry)) {
+ else if (em->start >= extent_map_end(entry))
p = &(*p)->rb_right;
- leftmost = false;
- } else {
+ else
return -EEXIST;
- }
}
orig_parent = parent;
@@ -128,7 +125,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
- rb_insert_color_cached(&em->rb_node, root, leftmost);
+ rb_insert_color(&em->rb_node, root);
return 0;
}
@@ -186,11 +183,19 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return NULL;
}
+static inline u64 extent_map_block_len(const struct extent_map *em)
+{
+ if (extent_map_is_compressed(em))
+ return em->disk_num_bytes;
+ return em->len;
+}
+
static inline u64 extent_map_block_end(const struct extent_map *em)
{
- if (em->block_start + em->block_len < em->block_start)
+ if (extent_map_block_start(em) + extent_map_block_len(em) <
+ extent_map_block_start(em))
return (u64)-1;
- return em->block_start + em->block_len;
+ return extent_map_block_start(em) + extent_map_block_len(em);
}
static bool can_merge_extent_map(const struct extent_map *em)
@@ -225,15 +230,106 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
if (prev->flags != next->flags)
return false;
- if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
- return next->block_start == extent_map_block_end(prev);
+ if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
+ return extent_map_block_start(next) == extent_map_block_end(prev);
/* HOLES and INLINE extents. */
- return next->block_start == prev->block_start;
+ return next->disk_bytenr == prev->disk_bytenr;
+}
+
+/*
+ * Handle the on-disk data extents merge for @prev and @next.
+ *
+ * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes.
+ * For now only uncompressed regular extent can be merged.
+ *
+ * @prev and @next will be both updated to point to the new merged range.
+ * Thus one of them should be removed by the caller.
+ */
+static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next)
+{
+ u64 new_disk_bytenr;
+ u64 new_disk_num_bytes;
+ u64 new_offset;
+
+ /* @prev and @next should not be compressed. */
+ ASSERT(!extent_map_is_compressed(prev));
+ ASSERT(!extent_map_is_compressed(next));
+
+ /*
+ * There are two different cases where @prev and @next can be merged.
+ *
+ * 1) They are referring to the same data extent:
+ *
+ * |<----- data extent A ----->|
+ * |<- prev ->|<- next ->|
+ *
+ * 2) They are referring to different data extents but still adjacent:
+ *
+ * |<-- data extent A -->|<-- data extent B -->|
+ * |<- prev ->|<- next ->|
+ *
+ * The calculation here always merges the data extents first, then updates
+ * @offset using the new data extents.
+ *
+ * For case 1), the merged data extent would be the same.
+ * For case 2), we just merge the two data extents into one.
+ */
+ new_disk_bytenr = min(prev->disk_bytenr, next->disk_bytenr);
+ new_disk_num_bytes = max(prev->disk_bytenr + prev->disk_num_bytes,
+ next->disk_bytenr + next->disk_num_bytes) -
+ new_disk_bytenr;
+ new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr;
+
+ prev->disk_bytenr = new_disk_bytenr;
+ prev->disk_num_bytes = new_disk_num_bytes;
+ prev->ram_bytes = new_disk_num_bytes;
+ prev->offset = new_offset;
+
+ next->disk_bytenr = new_disk_bytenr;
+ next->disk_num_bytes = new_disk_num_bytes;
+ next->ram_bytes = new_disk_num_bytes;
+ next->offset = new_offset;
+}
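Below is a minimal, standalone userspace sketch of the merge arithmetic documented above, using hypothetical extent values; it is an illustration only, not kernel code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* prev maps part of a data extent at disk [1M, 1M + 64K) with offset 0. */
	uint64_t prev_disk_bytenr = 1048576, prev_disk_num_bytes = 65536, prev_offset = 0;
	/* next refers to the adjacent data extent at disk [1M + 64K, 1M + 128K). */
	uint64_t next_disk_bytenr = 1048576 + 65536, next_disk_num_bytes = 65536;

	/* Merge the on-disk ranges first (case 2 above), then recompute @offset. */
	uint64_t new_disk_bytenr = prev_disk_bytenr < next_disk_bytenr ?
				   prev_disk_bytenr : next_disk_bytenr;
	uint64_t prev_end = prev_disk_bytenr + prev_disk_num_bytes;
	uint64_t next_end = next_disk_bytenr + next_disk_num_bytes;
	uint64_t new_disk_num_bytes = (prev_end > next_end ? prev_end : next_end) -
				      new_disk_bytenr;
	uint64_t new_offset = prev_disk_bytenr + prev_offset - new_disk_bytenr;

	/* Prints: merged: bytenr=1048576 num_bytes=131072 offset=0 */
	printf("merged: bytenr=%llu num_bytes=%llu offset=%llu\n",
	       (unsigned long long)new_disk_bytenr,
	       (unsigned long long)new_disk_num_bytes,
	       (unsigned long long)new_offset);
	return 0;
}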
+
+static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
+ struct extent_map *em)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ return;
+ btrfs_crit(fs_info,
+"%s, start=%llu len=%llu disk_bytenr=%llu disk_num_bytes=%llu ram_bytes=%llu offset=%llu flags=0x%x",
+ prefix, em->start, em->len, em->disk_bytenr, em->disk_num_bytes,
+ em->ram_bytes, em->offset, em->flags);
+ ASSERT(0);
+}
+
+/* Internal sanity checks for btrfs debug builds. */
+static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ return;
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ if (em->disk_num_bytes == 0)
+ dump_extent_map(fs_info, "zero disk_num_bytes", em);
+ if (em->offset + em->len > em->ram_bytes)
+ dump_extent_map(fs_info, "ram_bytes too small", em);
+ if (em->offset + em->len > em->disk_num_bytes &&
+ !extent_map_is_compressed(em))
+ dump_extent_map(fs_info, "disk_num_bytes too small", em);
+ if (!extent_map_is_compressed(em) &&
+ em->ram_bytes != em->disk_num_bytes)
+ dump_extent_map(fs_info,
+ "ram_bytes mismatch with disk_num_bytes for non-compressed em",
+ em);
+ } else if (em->offset) {
+ dump_extent_map(fs_info, "non-zero offset for hole/inline", em);
+ }
}
static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
struct extent_map *merge = NULL;
struct rb_node *rb;
@@ -258,14 +354,15 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
em->start = merge->start;
- em->orig_start = merge->orig_start;
em->len += merge->len;
- em->block_len += merge->block_len;
- em->block_start = merge->block_start;
em->generation = max(em->generation, merge->generation);
+
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ merge_ondisk_extents(merge, em);
em->flags |= EXTENT_FLAG_MERGED;
- rb_erase_cached(&merge->rb_node, &tree->map);
+ validate_extent_map(fs_info, em);
+ rb_erase(&merge->rb_node, &tree->root);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
dec_evictable_extent_maps(inode);
@@ -277,8 +374,10 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
- em->block_len += merge->block_len;
- rb_erase_cached(&merge->rb_node, &tree->map);
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ merge_ondisk_extents(em, merge);
+ validate_extent_map(fs_info, em);
+ rb_erase(&merge->rb_node, &tree->root);
RB_CLEAR_NODE(&merge->rb_node);
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
@@ -389,7 +488,8 @@ static int add_extent_mapping(struct btrfs_inode *inode,
lockdep_assert_held_write(&tree->lock);
- ret = tree_insert(&tree->map, em);
+ validate_extent_map(fs_info, em);
+ ret = tree_insert(&tree->root, em);
if (ret)
return ret;
@@ -410,7 +510,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *prev_or_next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
+ rb_node = __tree_search(&tree->root, start, &prev_or_next);
if (!rb_node) {
if (prev_or_next)
rb_node = prev_or_next;
@@ -479,7 +579,7 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
lockdep_assert_held_write(&tree->lock);
WARN_ON(em->flags & EXTENT_FLAG_PINNED);
- rb_erase_cached(&em->rb_node, &tree->map);
+ rb_erase(&em->rb_node, &tree->root);
if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
RB_CLEAR_NODE(&em->rb_node);
@@ -492,15 +592,18 @@ static void replace_extent_mapping(struct btrfs_inode *inode,
struct extent_map *new,
int modified)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
lockdep_assert_held_write(&tree->lock);
+ validate_extent_map(fs_info, new);
+
WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
ASSERT(extent_map_in_tree(cur));
if (!(cur->flags & EXTENT_FLAG_LOGGING))
list_del_init(&cur->list);
- rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
+ rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root);
RB_CLEAR_NODE(&cur->rb_node);
setup_extent_mapping(inode, new, modified);
@@ -561,11 +664,8 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
start_diff = start - em->start;
em->start = start;
em->len = end - start;
- if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- !extent_map_is_compressed(em)) {
- em->block_start += start_diff;
- em->block_len = em->len;
- }
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE && !extent_map_is_compressed(em))
+ em->offset += start_diff;
return add_extent_mapping(inode, em, 0);
}
@@ -600,7 +700,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
* Tree-checker should have rejected any inline extent with non-zero
* file offset. Here just do a sanity check.
*/
- if (em->block_start == EXTENT_MAP_INLINE)
+ if (em->disk_bytenr == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
ret = add_extent_mapping(inode, em, 0);
@@ -657,18 +757,23 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
{
struct extent_map_tree *tree = &inode->extent_tree;
+ struct rb_node *node;
write_lock(&tree->lock);
- while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
+ node = rb_first(&tree->root);
+ while (node) {
struct extent_map *em;
- struct rb_node *node;
+ struct rb_node *next = rb_next(node);
- node = rb_first_cached(&tree->map);
em = rb_entry(node, struct extent_map, rb_node);
em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
remove_extent_mapping(inode, em);
free_extent_map(em);
- cond_resched_rwlock_write(&tree->lock);
+
+ if (cond_resched_rwlock_write(&tree->lock))
+ node = rb_first(&tree->root);
+ else
+ node = next;
}
write_unlock(&tree->lock);
}
@@ -729,7 +834,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
u64 gen;
unsigned long flags;
bool modified;
- bool compressed;
if (em_end < end) {
next_em = next_extent_map(em);
@@ -763,7 +867,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
goto remove_em;
gen = em->generation;
- compressed = extent_map_is_compressed(em);
if (em->start < start) {
if (!split) {
@@ -775,22 +878,15 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->start = em->start;
split->len = start - em->start;
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_start = em->orig_start;
- split->block_start = em->block_start;
-
- if (compressed)
- split->block_len = em->block_len;
- else
- split->block_len = split->len;
- split->orig_block_len = max(split->block_len,
- em->orig_block_len);
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ split->disk_bytenr = em->disk_bytenr;
+ split->disk_num_bytes = em->disk_num_bytes;
+ split->offset = em->offset;
split->ram_bytes = em->ram_bytes;
} else {
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
+ split->disk_bytenr = em->disk_bytenr;
+ split->disk_num_bytes = 0;
+ split->offset = 0;
split->ram_bytes = split->len;
}
@@ -810,30 +906,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
split->start = end;
split->len = em_end - end;
- split->block_start = em->block_start;
+ split->disk_bytenr = em->disk_bytenr;
split->flags = flags;
split->generation = gen;
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_block_len = max(em->block_len,
- em->orig_block_len);
-
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ split->disk_num_bytes = em->disk_num_bytes;
+ split->offset = em->offset + end - em->start;
split->ram_bytes = em->ram_bytes;
- if (compressed) {
- split->block_len = em->block_len;
- split->orig_start = em->orig_start;
- } else {
- const u64 diff = end - em->start;
-
- split->block_len = split->len;
- split->block_start += diff;
- split->orig_start = em->orig_start;
- }
} else {
+ split->disk_num_bytes = 0;
+ split->offset = 0;
split->ram_bytes = split->len;
- split->orig_start = split->start;
- split->block_len = 0;
- split->orig_block_len = 0;
}
if (extent_map_in_tree(em)) {
@@ -976,7 +1060,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
ASSERT(em->len == len);
ASSERT(!extent_map_is_compressed(em));
- ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE);
ASSERT(em->flags & EXTENT_FLAG_PINNED);
ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
ASSERT(!list_empty(&em->list));
@@ -987,10 +1071,9 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
split_pre->len = pre;
- split_pre->orig_start = split_pre->start;
- split_pre->block_start = new_logical;
- split_pre->block_len = split_pre->len;
- split_pre->orig_block_len = split_pre->block_len;
+ split_pre->disk_bytenr = new_logical;
+ split_pre->disk_num_bytes = split_pre->len;
+ split_pre->offset = 0;
split_pre->ram_bytes = split_pre->len;
split_pre->flags = flags;
split_pre->generation = em->generation;
@@ -1005,10 +1088,9 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
/* Insert the middle extent_map. */
split_mid->start = em->start + pre;
split_mid->len = em->len - pre;
- split_mid->orig_start = split_mid->start;
- split_mid->block_start = em->block_start + pre;
- split_mid->block_len = split_mid->len;
- split_mid->orig_block_len = split_mid->block_len;
+ split_mid->disk_bytenr = extent_map_block_start(em) + pre;
+ split_mid->disk_num_bytes = split_mid->len;
+ split_mid->offset = 0;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->generation = em->generation;
@@ -1076,12 +1158,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
return 0;
}
- node = rb_first_cached(&tree->map);
+ node = rb_first(&tree->root);
while (node) {
+ struct rb_node *next = rb_next(node);
struct extent_map *em;
em = rb_entry(node, struct extent_map, rb_node);
- node = rb_next(node);
ctx->scanned++;
if (em->flags & EXTENT_FLAG_PINNED)
@@ -1115,6 +1197,7 @@ next:
*/
if (need_resched() || rwlock_needbreak(&tree->lock))
break;
+ node = next;
}
write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 6d587111f73a..5154a8f1d26c 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -4,12 +4,11 @@
#define BTRFS_EXTENT_MAP_H
#include <linux/compiler_types.h>
-#include <linux/rwlock_types.h>
+#include <linux/spinlock_types.h>
#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include "misc.h"
-#include "extent_map.h"
#include "compression.h"
struct btrfs_inode;
@@ -62,46 +61,33 @@ struct extent_map {
u64 len;
/*
- * The file offset of the original file extent before splitting.
+ * The bytenr of the full on-disk extent.
*
- * This is an in-memory only member, matching
- * extent_map::start - btrfs_file_extent_item::offset for
- * regular/preallocated extents. EXTENT_MAP_HOLE otherwise.
+ * For regular extents it's btrfs_file_extent_item::disk_bytenr.
+ * For holes it's EXTENT_MAP_HOLE and for inline extents it's
+ * EXTENT_MAP_INLINE.
*/
- u64 orig_start;
+ u64 disk_bytenr;
/*
* The full on-disk extent length, matching
* btrfs_file_extent_item::disk_num_bytes.
*/
- u64 orig_block_len;
-
- /*
- * The decompressed size of the whole on-disk extent, matching
- * btrfs_file_extent_item::ram_bytes.
- */
- u64 ram_bytes;
+ u64 disk_num_bytes;
/*
- * The on-disk logical bytenr for the file extent.
- *
- * For compressed extents it matches btrfs_file_extent_item::disk_bytenr.
- * For uncompressed extents it matches
- * btrfs_file_extent_item::disk_bytenr + btrfs_file_extent_item::offset
+ * Offset inside the decompressed extent.
*
- * For holes it is EXTENT_MAP_HOLE and for inline extents it is
- * EXTENT_MAP_INLINE.
+ * For regular extents it's btrfs_file_extent_item::offset.
+ * For holes and inline extents it's 0.
*/
- u64 block_start;
+ u64 offset;
/*
- * The on-disk length for the file extent.
- *
- * For compressed extents it matches btrfs_file_extent_item::disk_num_bytes.
- * For uncompressed extents it matches extent_map::len.
- * For holes and inline extents it's -1 and shouldn't be used.
+ * The decompressed size of the whole on-disk extent, matching
+ * btrfs_file_extent_item::ram_bytes.
*/
- u64 block_len;
+ u64 ram_bytes;
/*
* Generation of the extent map, for merged em it's the highest
@@ -115,7 +101,7 @@ struct extent_map {
};
struct extent_map_tree {
- struct rb_root_cached map;
+ struct rb_root root;
struct list_head modified_extents;
rwlock_t lock;
};
@@ -163,6 +149,16 @@ static inline int extent_map_in_tree(const struct extent_map *em)
return !RB_EMPTY_NODE(&em->rb_node);
}
+static inline u64 extent_map_block_start(const struct extent_map *em)
+{
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ if (extent_map_is_compressed(em))
+ return em->disk_bytenr;
+ return em->disk_bytenr + em->offset;
+ }
+ return em->disk_bytenr;
+}
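As a rough illustration of how the new disk_bytenr/disk_num_bytes/offset fields translate into the block start and length seen by readers, here is a standalone userspace sketch with a hypothetical struct; the EXTENT_MAP_HOLE/EXTENT_MAP_INLINE sentinel cases are omitted for brevity.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical mirror of the new extent_map fields (not the kernel struct). */
struct em_sketch {
	uint64_t disk_bytenr;		/* file_extent_item::disk_bytenr */
	uint64_t disk_num_bytes;	/* full on-disk extent length */
	uint64_t offset;		/* offset into the decompressed extent */
	uint64_t len;			/* length of the file mapping */
	bool compressed;
};

static uint64_t block_start(const struct em_sketch *em)
{
	/* Compressed reads always start at the beginning of the disk extent. */
	return em->compressed ? em->disk_bytenr : em->disk_bytenr + em->offset;
}

static uint64_t block_len(const struct em_sketch *em)
{
	return em->compressed ? em->disk_num_bytes : em->len;
}

int main(void)
{
	struct em_sketch plain = { 1048576, 131072, 4096, 65536, false };
	struct em_sketch comp  = { 1048576, 16384, 4096, 65536, true };

	printf("plain:      start=%llu len=%llu\n",
	       (unsigned long long)block_start(&plain),
	       (unsigned long long)block_len(&plain));
	printf("compressed: start=%llu len=%llu\n",
	       (unsigned long long)block_start(&comp),
	       (unsigned long long)block_len(&comp));
	return 0;
}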
+
static inline u64 extent_map_end(const struct extent_map *em)
{
if (em->start + em->len < em->start)
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
new file mode 100644
index 000000000000..8f95f3e44e99
--- /dev/null
+++ b/fs/btrfs/fiemap.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "backref.h"
+#include "btrfs_inode.h"
+#include "fiemap.h"
+#include "file.h"
+#include "file-item.h"
+
+struct btrfs_fiemap_entry {
+ u64 offset;
+ u64 phys;
+ u64 len;
+ u32 flags;
+};
+
+/*
+ * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the
+ * file range from the inode's io tree, release the subvolume tree search path,
+ * flush the fiemap cache, and then relock the file range and re-search the
+ * subvolume tree. The value here is something negative that can't be confused
+ * with a valid errno value and different from 1, because 1 is also a return
+ * value from fiemap_fill_next_extent() and is often used to mean a btree
+ * search did not find a key, so make it some distinct negative value.
+ */
+#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
+
+/*
+ * Used to:
+ *
+ * - Cache the next entry to be emitted to the fiemap buffer, so that we can
+ * merge extents that are contiguous and can be grouped as a single one;
+ *
+ * - Store extents ready to be written to the fiemap buffer in an intermediary
+ * buffer. This intermediary buffer is to ensure that in case the fiemap
+ * buffer is memory mapped to the fiemap target file, we don't deadlock
+ * during btrfs_page_mkwrite(). This is because during fiemap we are locking
+ * an extent range in order to prevent races with delalloc flushing and
+ * ordered extent completion, which is needed in order to reliably detect
+ * delalloc in holes and prealloc extents. And this can lead to a deadlock
+ * if the fiemap buffer is memory mapped to the file we are running fiemap
+ * against (a silly, useless in practice scenario, but possible) because
+ * btrfs_page_mkwrite() will try to lock the same extent range.
+ */
+struct fiemap_cache {
+ /* An array of ready fiemap entries. */
+ struct btrfs_fiemap_entry *entries;
+ /* Number of entries in the entries array. */
+ int entries_size;
+ /* Index of the next entry in the entries array to write to. */
+ int entries_pos;
+ /*
+ * Once the entries array is full, this indicates what's the offset for
+ * the next file extent item we must search for in the inode's subvolume
+ * tree after unlocking the extent range in the inode's io tree and
+ * releasing the search path.
+ */
+ u64 next_search_offset;
+ /*
+ * This matches struct fiemap_extent_info::fi_mapped_extents; we use it
+ * to count the emitted extents ourselves and stop, instead of relying on
+ * fiemap_fill_next_extent(), because we buffer ready fiemap entries in
+ * the @entries array and we want to stop as soon as we hit the max
+ * amount of extents to map, not just to save time but also to make the
+ * logic at extent_fiemap() simpler.
+ */
+ unsigned int extents_mapped;
+ /* Fields for the cached extent (unsubmitted, not ready, extent). */
+ u64 offset;
+ u64 phys;
+ u64 len;
+ u32 flags;
+ bool cached;
+};
+
+static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
+{
+ for (int i = 0; i < cache->entries_pos; i++) {
+ struct btrfs_fiemap_entry *entry = &cache->entries[i];
+ int ret;
+
+ ret = fiemap_fill_next_extent(fieinfo, entry->offset,
+ entry->phys, entry->len,
+ entry->flags);
+ /*
+ * Ignore 1 (reached max entries) because we keep track of that
+ * ourselves in emit_fiemap_extent().
+ */
+ if (ret < 0)
+ return ret;
+ }
+ cache->entries_pos = 0;
+
+ return 0;
+}
+
+/*
+ * Helper to submit fiemap extent.
+ *
+ * Will try to merge the current fiemap extent, specified by @offset, @phys,
+ * @len and @flags, with the cached one.
+ * Only when we fail to merge is the cached one submitted as a fiemap
+ * extent.
+ *
+ * Return value is the same as fiemap_fill_next_extent().
+ */
+static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ u64 offset, u64 phys, u64 len, u32 flags)
+{
+ struct btrfs_fiemap_entry *entry;
+ u64 cache_end;
+
+ /* Set at the end of extent_fiemap(). */
+ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+
+ if (!cache->cached)
+ goto assign;
+
+ /*
+ * When iterating the extents of the inode, at extent_fiemap(), we may
+ * find an extent that starts at an offset behind the end offset of the
+ * previous extent we processed. This happens if fiemap is called
+ * without FIEMAP_FLAG_SYNC and there are ordered extents completing
+ * after we had to unlock the file range, release the search path, emit
+ * the fiemap extents stored in the buffer (cache->entries array) and
+ * then lock the remainder of the range and re-search the btree.
+ *
+ * For example we are in leaf X processing its last item, which is the
+ * file extent item for file range [512K, 1M[, and after
+ * btrfs_next_leaf() releases the path, there's an ordered extent that
+ * completes for the file range [768K, 2M[, and that results in trimming
+ * the file extent item so that it now corresponds to the file range
+ * [512K, 768K[ and a new file extent item is inserted for the file
+ * range [768K, 2M[, which may end up as the last item of leaf X or as
+ * the first item of the next leaf - in either case btrfs_next_leaf()
+ * will leave us with a path pointing to the new extent item, for the
+ * file range [768K, 2M[, since that's the first key that follows the
+ * last one we processed. So in order not to report overlapping extents
+ * to user space, we trim the length of the previously cached extent and
+ * emit it.
+ *
+ * Upon calling btrfs_next_leaf() we may also find an extent with an
+ * offset smaller than or equal to cache->offset, and this happens
+ * when we had a hole or prealloc extent with several delalloc ranges in
+ * it, but after btrfs_next_leaf() released the path, delalloc was
+ * flushed and the resulting ordered extents were completed, so we can
+ * now have found a file extent item for an offset that is smaller than
+ * or equal to what we have in cache->offset. We deal with this as
+ * described below.
+ */
+ cache_end = cache->offset + cache->len;
+ if (cache_end > offset) {
+ if (offset == cache->offset) {
+ /*
+ * We cached a delalloc range (found in the io tree) for
+ * a hole or prealloc extent and we have now found a
+ * file extent item for the same offset. What we have
+ * now is more recent and up to date, so discard what
+ * we had in the cache and use what we have just found.
+ */
+ goto assign;
+ } else if (offset > cache->offset) {
+ /*
+ * The extent range we previously found ends after the
+ * offset of the file extent item we found and that
+ * offset falls somewhere in the middle of that previous
+ * extent range. So adjust the range we previously found
+ * to end at the offset of the file extent item we have
+ * just found, since this extent is more up to date.
+ * Emit that adjusted range and cache the file extent
+ * item we have just found. This corresponds to the case
+ * where a previously found file extent item was split
+ * due to an ordered extent completing.
+ */
+ cache->len = offset - cache->offset;
+ goto emit;
+ } else {
+ const u64 range_end = offset + len;
+
+ /*
+ * The offset of the file extent item we have just found
+ * is behind the cached offset. This means we were
+ * processing a hole or prealloc extent for which we
+ * have found delalloc ranges (in the io tree), so what
+ * we have in the cache is the last delalloc range we
+ * found while the file extent item we found can be
+ * either for a whole delalloc range we previously
+ * emitted or only a part of that range.
+ *
+ * We have two cases here:
+ *
+ * 1) The file extent item's range ends at or behind the
+ * cached extent's end. In this case just ignore the
+ * current file extent item because we don't want to
+ * overlap with previous ranges that may have been
+ * emitted already;
+ *
+ * 2) The file extent item starts behind the currently
+ * cached extent but its end offset goes beyond the
+ * end offset of the cached extent. We don't want to
+ * overlap with a previous range that may have been
+ * emitted already, so we emit the currently cached
+ * extent and then partially store the current file
+ * extent item's range in the cache, for the subrange
+ * going from the cached extent's end to the end of the
+ * file extent item.
+ */
+ if (range_end <= cache_end)
+ return 0;
+
+ if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
+ phys += cache_end - offset;
+
+ offset = cache_end;
+ len = range_end - cache_end;
+ goto emit;
+ }
+ }
+
+ /*
+ * Only merge fiemap extents if:
+ * 1) Their logical addresses are contiguous;
+ *
+ * 2) Their physical addresses are contiguous, so truly compressed
+ *    (physical size smaller than logical size) extents won't get merged
+ *    with each other;
+ *
+ * 3) They share the same flags.
+ */
+ if (cache->offset + cache->len == offset &&
+ cache->phys + cache->len == phys &&
+ cache->flags == flags) {
+ cache->len += len;
+ return 0;
+ }
+
+emit:
+ /* Not mergeable, need to submit cached one */
+
+ if (cache->entries_pos == cache->entries_size) {
+ /*
+ * We will need to re-search from the end offset of the last
+ * stored extent and not from the current offset, because after
+ * unlocking the range and releasing the path, if there's a hole
+ * between that end offset and this current offset, a new extent
+ * may have been inserted due to a new write, so we don't want
+ * to miss it.
+ */
+ entry = &cache->entries[cache->entries_size - 1];
+ cache->next_search_offset = entry->offset + entry->len;
+ cache->cached = false;
+
+ return BTRFS_FIEMAP_FLUSH_CACHE;
+ }
+
+ entry = &cache->entries[cache->entries_pos];
+ entry->offset = cache->offset;
+ entry->phys = cache->phys;
+ entry->len = cache->len;
+ entry->flags = cache->flags;
+ cache->entries_pos++;
+ cache->extents_mapped++;
+
+ if (cache->extents_mapped == fieinfo->fi_extents_max) {
+ cache->cached = false;
+ return 1;
+ }
+assign:
+ cache->cached = true;
+ cache->offset = offset;
+ cache->phys = phys;
+ cache->len = len;
+ cache->flags = flags;
+
+ return 0;
+}
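The merge test used by the cache can be reduced to the following standalone userspace sketch (hypothetical values, not kernel code): two entries are merged only when they are logically contiguous, physically contiguous and carry identical flags.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cached { uint64_t offset, phys, len; uint32_t flags; };

/* Same merge condition as described above. */
static bool can_merge(const struct cached *c, uint64_t offset, uint64_t phys,
		      uint32_t flags)
{
	return c->offset + c->len == offset &&	/* logically contiguous */
	       c->phys + c->len == phys &&	/* physically contiguous */
	       c->flags == flags;		/* identical flags */
}

int main(void)
{
	struct cached c = { .offset = 0, .phys = 1048576, .len = 65536, .flags = 0 };

	/* Contiguous on both axes with equal flags: merged into one entry. */
	printf("merge contiguous: %d\n", can_merge(&c, 65536, 1048576 + 65536, 0));
	/* Physically discontiguous (e.g. a separately compressed extent): kept apart. */
	printf("merge gap:        %d\n", can_merge(&c, 65536, 2097152, 0));
	return 0;
}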
+
+/*
+ * Emit last fiemap cache
+ *
+ * The last fiemap cache may still be cached in the following case:
+ * 0 4k 8k
+ * |<- Fiemap range ->|
+ * |<------------ First extent ----------->|
+ *
+ * In this case, the first extent range will be cached but not emitted.
+ * So we must emit it before ending extent_fiemap().
+ */
+static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
+{
+ int ret;
+
+ if (!cache->cached)
+ return 0;
+
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+ cache->len, cache->flags);
+ cache->cached = false;
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
+{
+ struct extent_buffer *clone = path->nodes[0];
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
+ return 0;
+
+ /*
+ * Add a temporary extra ref to an already cloned extent buffer to
+ * prevent btrfs_next_leaf() from freeing it, as we want to reuse it to avoid
+ * the cost of allocating a new one.
+ */
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
+ atomic_inc(&clone->refs);
+
+ ret = btrfs_next_leaf(inode->root, path);
+ if (ret != 0)
+ goto out;
+
+ /*
+ * Don't bother with cloning if there are no more file extent items for
+ * our inode.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Important to preserve the start field, for the optimizations when
+ * checking if extents are shared (see extent_fiemap()).
+ *
+ * We must set ->start before calling copy_extent_buffer_full(). If we
+ * are on sub-pagesize blocksize, we use ->start to determine the offset
+ * into the folio where our eb exists, and if we update ->start after
+ * the fact then any subsequent reads of the eb may read from a
+ * different offset in the folio than where we originally copied into.
+ */
+ clone->start = path->nodes[0]->start;
+ /* See the comment at fiemap_search_slot() about why we clone. */
+ copy_extent_buffer_full(clone, path->nodes[0]);
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+out:
+ if (ret)
+ free_extent_buffer(clone);
+
+ return ret;
+}
+
+/*
+ * Search for the first file extent item that starts at a given file offset or
+ * the one that starts immediately before that offset.
+ * Returns: 0 on success, < 0 on error, 1 if not found.
+ */
+static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
+ u64 file_offset)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = file_offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret != 0)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+ }
+
+ /*
+ * We clone the leaf and use it during fiemap. This is because while
+ * using the leaf we do expensive things like checking if an extent is
+ * shared, which can take a long time. In order to prevent blocking
+ * other tasks for too long, we use a clone of the leaf. We have locked
+ * the file range in the inode's io tree, so we know none of our file
+ * extent items can change. This way we avoid blocking other tasks that
+ * want to insert items for other inodes in the same leaf or b+tree
+ * rebalance operations (triggered for example when someone is trying
+ * to push items into this leaf when trying to insert an item in a
+ * neighbour leaf).
+ * We also need the private clone because holding a read lock on an
+ * extent buffer of the subvolume's b+tree will make lockdep unhappy
+ * when we check if extents are shared, as backref walking may need to
+ * lock the same leaf we are processing.
+ */
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Process a range which is a hole or a prealloc extent in the inode's subvolume
+ * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
+ * extent. The end offset (@end) is inclusive.
+ */
+static int fiemap_process_hole(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ struct extent_state **delalloc_cached_state,
+ struct btrfs_backref_share_check_ctx *backref_ctx,
+ u64 disk_bytenr, u64 extent_offset,
+ u64 extent_gen,
+ u64 start, u64 end)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ u64 cur_offset = start;
+ u64 last_delalloc_end = 0;
+ u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
+ bool checked_extent_shared = false;
+ int ret;
+
+ /*
+ * There can be no delalloc past i_size, so don't waste time looking for
+ * it beyond i_size.
+ */
+ while (cur_offset < end && cur_offset < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 prealloc_start;
+ u64 prealloc_len = 0;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+ delalloc_cached_state,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
+
+ /*
+ * If this is a prealloc extent we have to report every section
+ * of it that has no delalloc.
+ */
+ if (disk_bytenr != 0) {
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = delalloc_start - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = delalloc_start - prealloc_start;
+ }
+ }
+
+ if (prealloc_len > 0) {
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+
+ checked_extent_shared = true;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ extent_offset += prealloc_len;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
+ delalloc_end + 1 - delalloc_start,
+ FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ret)
+ return ret;
+
+ last_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ extent_offset += cur_offset - delalloc_start;
+ cond_resched();
+ }
+
+ /*
+ * Either we found no delalloc for the whole prealloc extent or we have
+ * a prealloc extent that spans i_size or starts at or after i_size.
+ */
+ if (disk_bytenr != 0 && last_delalloc_end < end) {
+ u64 prealloc_start;
+ u64 prealloc_len;
+
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = end + 1 - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = end + 1 - prealloc_start;
+ }
+
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
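As a toy illustration of the reporting pattern above, assume a prealloc extent covering [0, 1M) with a single delalloc range at [256K, 512K); the kernel code uses inclusive end offsets and loops over any number of delalloc ranges, but the resulting segmentation is the same.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical offsets in bytes, end offsets exclusive for readability. */
	uint64_t start = 0, end = 1048576;		/* prealloc extent  */
	uint64_t d_start = 262144, d_end = 524288;	/* delalloc range   */

	/* Prealloc section before the delalloc range, if any. */
	if (d_start > start)
		printf("prealloc/unwritten: [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)d_start);
	/* The delalloc range itself, reported as DELALLOC | UNKNOWN. */
	printf("delalloc/unknown:   [%llu, %llu)\n",
	       (unsigned long long)d_start, (unsigned long long)d_end);
	/* Trailing prealloc section after the last delalloc range, if any. */
	if (d_end < end)
		printf("prealloc/unwritten: [%llu, %llu)\n",
		       (unsigned long long)d_end, (unsigned long long)end);
	return 0;
}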
+
+static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ u64 *last_extent_end_ret)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ int ret;
+
+ /*
+ * Lookup the last file extent. We're not using i_size here because
+ * there might be preallocation past i_size.
+ */
+ ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
+ /* There can't be a file extent item at offset (u64)-1 */
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * For a non-existing key, btrfs_search_slot() always leaves us at a
+ * slot > 0, except if the btree is empty, which is impossible because
+ * it has at least the inode item for this inode and all the items for
+ * the root inode 256.
+ */
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* No file extent items in the subvolume tree. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+
+ /*
+ * For an inline extent, the disk_bytenr is where the inline data starts,
+ * so first check if we have an inline extent item before checking if we
+ * have an implicit hole (disk_bytenr == 0).
+ */
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+ }
+
+ /*
+ * Find the last file extent item that is not a hole (when NO_HOLES is
+ * not enabled). This should take at most 2 iterations in the worst
+ * case: we have one hole file extent item at slot 0 of a leaf and
+ * another hole file extent item as the last item in the previous leaf.
+ * This is because we merge file extent items that represent holes.
+ */
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ while (disk_bytenr == 0) {
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /* No file extent items that are not holes. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ }
+
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+}
+
+static int extent_fiemap(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct extent_state *cached_state = NULL;
+ struct extent_state *delalloc_cached_state = NULL;
+ struct btrfs_path *path;
+ struct fiemap_cache cache = { 0 };
+ struct btrfs_backref_share_check_ctx *backref_ctx;
+ u64 last_extent_end;
+ u64 prev_extent_end;
+ u64 range_start;
+ u64 range_end;
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
+ bool stopped = false;
+ int ret;
+
+ cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
+ cache.entries = kmalloc_array(cache.entries_size,
+ sizeof(struct btrfs_fiemap_entry),
+ GFP_KERNEL);
+ backref_ctx = btrfs_alloc_backref_share_check_ctx();
+ path = btrfs_alloc_path();
+ if (!cache.entries || !backref_ctx || !path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+restart:
+ range_start = round_down(start, sectorsize);
+ range_end = round_up(start + len, sectorsize);
+ prev_extent_end = range_start;
+
+ lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+
+ ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+ if (ret < 0)
+ goto out_unlock;
+ btrfs_release_path(path);
+
+ path->reada = READA_FORWARD;
+ ret = fiemap_search_slot(inode, path, range_start);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /*
+ * No file extent item found, but we may have delalloc between
+ * the current offset and i_size. So check for that.
+ */
+ ret = 0;
+ goto check_eof_delalloc;
+ }
+
+ while (prev_extent_end < range_end) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 extent_end;
+ u64 extent_len;
+ u64 extent_offset = 0;
+ u64 extent_gen;
+ u64 disk_bytenr = 0;
+ u64 flags = 0;
+ int extent_type;
+ u8 compression;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * The first iteration can leave us at an extent item that ends
+ * before our range's start. Move to the next item.
+ */
+ if (extent_end <= range_start)
+ goto next_item;
+
+ backref_ctx->curr_leaf_bytenr = leaf->start;
+
+ /* We have an implicit hole (NO_HOLES feature enabled). */
+ if (prev_extent_end < key.offset) {
+ const u64 hole_end = min(key.offset, range_end) - 1;
+
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx, 0, 0, 0,
+ prev_extent_end, hole_end);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ /* We've reached the end of the fiemap range, stop. */
+ if (key.offset >= range_end) {
+ stopped = true;
+ break;
+ }
+ }
+
+ extent_len = extent_end - key.offset;
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ compression = btrfs_file_extent_compression(leaf, ei);
+ extent_type = btrfs_file_extent_type(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (compression == BTRFS_COMPRESS_NONE)
+ extent_offset = btrfs_file_extent_offset(leaf, ei);
+ }
+
+ if (compression != BTRFS_COMPRESS_NONE)
+ flags |= FIEMAP_EXTENT_ENCODED;
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
+ extent_len, flags);
+ } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx,
+ disk_bytenr, extent_offset,
+ extent_gen, key.offset,
+ extent_end - 1);
+ } else if (disk_bytenr == 0) {
+ /* We have an explicit hole. */
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx, 0, 0, 0,
+ key.offset, extent_end - 1);
+ } else {
+ /* We have a regular extent. */
+ if (fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ goto out_unlock;
+ else if (ret > 0)
+ flags |= FIEMAP_EXTENT_SHARED;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
+ disk_bytenr + extent_offset,
+ extent_len, flags);
+ }
+
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* emit_fiemap_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ prev_extent_end = extent_end;
+next_item:
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out_unlock;
+ }
+
+ ret = fiemap_next_leaf_item(inode, path);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* No more file extent items for this inode. */
+ break;
+ }
+ cond_resched();
+ }
+
+check_eof_delalloc:
+ if (!stopped && prev_extent_end < range_end) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state, backref_ctx,
+ 0, 0, 0, prev_extent_end, range_end - 1);
+ if (ret < 0)
+ goto out_unlock;
+ prev_extent_end = range_end;
+ }
+
+ if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+
+ if (prev_extent_end < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode,
+ prev_extent_end,
+ i_size - 1,
+ &delalloc_cached_state,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ } else {
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ }
+ }
+
+out_unlock:
+ unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+
+ if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
+ btrfs_release_path(path);
+ ret = flush_fiemap_cache(fieinfo, &cache);
+ if (ret)
+ goto out;
+ len -= cache.next_search_offset - start;
+ start = cache.next_search_offset;
+ goto restart;
+ } else if (ret < 0) {
+ goto out;
+ }
+
+ /*
+ * Must free the path before emitting to the fiemap buffer because we
+ * may have a non-cloned leaf and if the fiemap buffer is memory mapped
+ * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
+ * waiting for an ordered extent that, in order to complete, needs to
+ * modify that leaf, therefore leading to a deadlock.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
+ ret = flush_fiemap_cache(fieinfo, &cache);
+ if (ret)
+ goto out;
+
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
+out:
+ free_extent_state(delalloc_cached_state);
+ kfree(cache.entries);
+ btrfs_free_backref_share_ctx(backref_ctx);
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+ int ret;
+
+ ret = fiemap_prep(inode, fieinfo, start, &len, 0);
+ if (ret)
+ return ret;
+
+ /*
+ * fiemap_prep() called filemap_write_and_wait() for the whole possible
+ * file range (0 to LLONG_MAX), but that is not enough if we have
+ * compression enabled. The first filemap_fdatawrite_range() only kicks
+ * off the compression of data (in an async thread) and will return
+ * before the compression is done and writeback is started. A second
+ * filemap_fdatawrite_range() is needed to wait for the compression to
+ * complete and writeback to start. We also need to wait for ordered
+ * extents to complete, because our fiemap implementation uses mainly
+ * file extent items to list the extents, searching for extent maps
+ * only for file ranges with holes or prealloc extents to figure out
+ * if we have delalloc in those ranges.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
+ if (ret)
+ return ret;
+ }
+
+ btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ /*
+ * We did an initial flush to avoid holding the inode's lock while
+ * triggering writeback and waiting for the completion of IO and ordered
+ * extents. Now after we locked the inode we do it again, because it's
+ * possible a new write may have happened in between those two steps.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
+ if (ret) {
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+ }
+ }
+
+ ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ return ret;
+}
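[Editor's note: for context on the interface the new btrfs_fiemap() serves, here is a minimal user-space sketch, not part of the patch, showing how a caller exercises the FIEMAP_FLAG_SYNC path that triggers the two btrfs_wait_ordered_range() calls above. The file path is a placeholder.]

/* Minimal user-space FIEMAP caller (illustrative only). */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(void)
{
	const unsigned int max_extents = 32;
	struct fiemap *fm;
	int fd, i;

	fm = calloc(1, sizeof(*fm) + max_extents * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;

	fd = open("/mnt/btrfs/file", O_RDONLY);	/* placeholder path */
	if (fd < 0)
		return 1;

	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush delalloc before mapping */
	fm->fm_extent_count = max_extents;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		return 1;

	for (i = 0; i < (int)fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu len %llu flags 0x%x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);

	close(fd);
	free(fm);
	return 0;
}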
diff --git a/fs/btrfs/fiemap.h b/fs/btrfs/fiemap.h
new file mode 100644
index 000000000000..cfd74b35988f
--- /dev/null
+++ b/fs/btrfs/fiemap.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_FIEMAP_H
+#define BTRFS_FIEMAP_H
+
+#include <linux/fiemap.h>
+
+int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+
+#endif /* BTRFS_FIEMAP_H */
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index bce95f871750..5c342fe1af61 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -45,13 +45,12 @@
*/
void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start, end, i_size;
int ret;
spin_lock(&inode->lock);
i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
- if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ if (!inode->file_extent_tree) {
inode->disk_i_size = i_size;
goto out_unlock;
}
@@ -84,13 +83,14 @@ out_unlock:
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len)
{
+ if (!inode->file_extent_tree)
+ return 0;
+
if (len == 0)
return 0;
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
- if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
- return 0;
return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
EXTENT_DIRTY, NULL);
}
@@ -112,14 +112,15 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len)
{
+ if (!inode->file_extent_tree)
+ return 0;
+
if (len == 0)
return 0;
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
len == (u64)-1);
- if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
- return 0;
return clear_extent_bit(inode->file_extent_tree, start,
start + len - 1, EXTENT_DIRTY, NULL);
}
@@ -352,7 +353,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
u32 bio_offset = 0;
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
+ test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state))
return BLK_STS_OK;
/*
@@ -1280,7 +1281,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const int slot = path->slots[0];
struct btrfs_key key;
u64 extent_start;
- u64 bytenr;
u8 type = btrfs_file_extent_type(leaf, fi);
int compress_type = btrfs_file_extent_compression(leaf, fi);
@@ -1290,24 +1290,29 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->generation = btrfs_file_extent_generation(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
+ const u64 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
em->start = extent_start;
em->len = btrfs_file_extent_end(path) - extent_start;
- em->orig_start = extent_start -
- btrfs_file_extent_offset(leaf, fi);
- em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
- bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (bytenr == 0) {
- em->block_start = EXTENT_MAP_HOLE;
+ if (disk_bytenr == 0) {
+ em->disk_bytenr = EXTENT_MAP_HOLE;
+ em->disk_num_bytes = 0;
+ em->offset = 0;
return;
}
+ em->disk_bytenr = disk_bytenr;
+ em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ em->offset = btrfs_file_extent_offset(leaf, fi);
if (compress_type != BTRFS_COMPRESS_NONE) {
extent_map_set_compression(em, compress_type);
- em->block_start = bytenr;
- em->block_len = em->orig_block_len;
} else {
- bytenr += btrfs_file_extent_offset(leaf, fi);
- em->block_start = bytenr;
- em->block_len = em->len;
+ /*
+ * Older kernels can create regular non-hole data
+ * extents with ram_bytes smaller than disk_num_bytes.
+ * Not a big deal, just always use disk_num_bytes
+ * for ram_bytes.
+ */
+ em->ram_bytes = em->disk_num_bytes;
if (type == BTRFS_FILE_EXTENT_PREALLOC)
em->flags |= EXTENT_FLAG_PREALLOC;
}
@@ -1315,15 +1320,10 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
/* Tree-checker has ensured this. */
ASSERT(extent_start == 0);
- em->block_start = EXTENT_MAP_INLINE;
+ em->disk_bytenr = EXTENT_MAP_INLINE;
em->start = 0;
em->len = fs_info->sectorsize;
- /*
- * Initialize orig_start and block_len with the same values
- * as in inode.c:btrfs_get_extent().
- */
- em->orig_start = EXTENT_MAP_HOLE;
- em->block_len = (u64)-1;
+ em->offset = 0;
extent_map_set_compression(em, compress_type);
} else {
btrfs_err(fs_info,
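[Editor's note: the hunks above replace the old extent_map fields (orig_start/block_start/block_len/orig_block_len) with disk_bytenr, disk_num_bytes and offset. The sketch below is a plain user-space model, not the kernel helper, of the arithmetic implied by the conversion: for an uncompressed mapping the on-disk block start is presumably disk_bytenr + offset, while a compressed extent is addressed from disk_bytenr itself; em_model and em_block_start are made-up names.]

/* Illustrative model of the new extent_map geometry (not kernel code). */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct em_model {
	uint64_t start;          /* file offset of the mapping */
	uint64_t len;            /* length of the mapping in the file */
	uint64_t disk_bytenr;    /* start of the on-disk extent */
	uint64_t disk_num_bytes; /* full on-disk extent size */
	uint64_t offset;         /* where this mapping starts inside the extent */
	bool compressed;
};

/* Presumed equivalent of extent_map_block_start() for this model. */
static uint64_t em_block_start(const struct em_model *em)
{
	if (em->compressed)
		return em->disk_bytenr;	/* the whole compressed extent is read */
	return em->disk_bytenr + em->offset;
}

int main(void)
{
	/* e.g. a 128K extent at 1M on disk, mapped starting 16K into it */
	struct em_model em = {
		.start = 0, .len = 112 * 1024,
		.disk_bytenr = 1024 * 1024, .disk_num_bytes = 128 * 1024,
		.offset = 16 * 1024, .compressed = false,
	};

	printf("block start: %llu\n", (unsigned long long)em_block_start(&em));
	return 0;
}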
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d90138683a0a..21381de906f6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -17,8 +17,8 @@
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
-#include <linux/iomap.h>
#include "ctree.h"
+#include "direct-io.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
@@ -1104,7 +1104,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
&cached_state);
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL, nowait, false);
+ NULL, nowait, false);
if (ret <= 0)
btrfs_drew_write_unlock(&root->snapshot_lock);
else
@@ -1140,8 +1140,7 @@ static void update_time_for_write(struct inode *inode)
inode_inc_iversion(inode);
}
-static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
- size_t count)
+int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -1187,8 +1186,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
return 0;
}
-static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
- struct iov_iter *i)
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
{
struct file *file = iocb->ki_filp;
loff_t pos;
@@ -1451,194 +1449,6 @@ out:
return num_written ? num_written : ret;
}
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- const u32 blocksize_mask = fs_info->sectorsize - 1;
-
- if (offset & blocksize_mask)
- return -EINVAL;
-
- if (iov_iter_alignment(iter) & blocksize_mask)
- return -EINVAL;
-
- return 0;
-}
-
-static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- loff_t pos;
- ssize_t written = 0;
- ssize_t written_buffered;
- size_t prev_left = 0;
- loff_t endbyte;
- ssize_t ret;
- unsigned int ilock_flags = 0;
- struct iomap_dio *dio;
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- ilock_flags |= BTRFS_ILOCK_TRY;
-
- /*
- * If the write DIO is within EOF, use a shared lock and also only if
- * security bits will likely not be dropped by file_remove_privs() called
- * from btrfs_write_check(). Either will need to be rechecked after the
- * lock was acquired.
- */
- if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
- ilock_flags |= BTRFS_ILOCK_SHARED;
-
-relock:
- ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
- if (ret < 0)
- return ret;
-
- /* Shared lock cannot be used with security bits set. */
- if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- ilock_flags &= ~BTRFS_ILOCK_SHARED;
- goto relock;
- }
-
- ret = generic_write_checks(iocb, from);
- if (ret <= 0) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- return ret;
- }
-
- ret = btrfs_write_check(iocb, from, ret);
- if (ret < 0) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- goto out;
- }
-
- pos = iocb->ki_pos;
- /*
- * Re-check since file size may have changed just before taking the
- * lock or pos may have changed because of O_APPEND in generic_write_checks()
- */
- if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
- pos + iov_iter_count(from) > i_size_read(inode)) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- ilock_flags &= ~BTRFS_ILOCK_SHARED;
- goto relock;
- }
-
- if (check_direct_IO(fs_info, from, pos)) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- goto buffered;
- }
-
- /*
- * The iov_iter can be mapped to the same file range we are writing to.
- * If that's the case, then we will deadlock in the iomap code, because
- * it first calls our callback btrfs_dio_iomap_begin(), which will create
- * an ordered extent, and after that it will fault in the pages that the
- * iov_iter refers to. During the fault in we end up in the readahead
- * pages code (starting at btrfs_readahead()), which will lock the range,
- * find that ordered extent and then wait for it to complete (at
- * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
- * obviously the ordered extent can never complete as we didn't submit
- * yet the respective bio(s). This always happens when the buffer is
- * memory mapped to the same file range, since the iomap DIO code always
- * invalidates pages in the target file range (after starting and waiting
- * for any writeback).
- *
- * So here we disable page faults in the iov_iter and then retry if we
- * got -EFAULT, faulting in the pages before the retry.
- */
- from->nofault = true;
- dio = btrfs_dio_write(iocb, from, written);
- from->nofault = false;
-
- /*
- * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
- * iocb, and that needs to lock the inode. So unlock it before calling
- * iomap_dio_complete() to avoid a deadlock.
- */
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-
- if (IS_ERR_OR_NULL(dio))
- ret = PTR_ERR_OR_ZERO(dio);
- else
- ret = iomap_dio_complete(dio);
-
- /* No increment (+=) because iomap returns a cumulative value. */
- if (ret > 0)
- written = ret;
-
- if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
- const size_t left = iov_iter_count(from);
- /*
- * We have more data left to write. Try to fault in as many as
- * possible of the remainder pages and retry. We do this without
- * releasing and locking again the inode, to prevent races with
- * truncate.
- *
- * Also, in case the iov refers to pages in the file range of the
- * file we want to write to (due to a mmap), we could enter an
- * infinite loop if we retry after faulting the pages in, since
- * iomap will invalidate any pages in the range early on, before
- * it tries to fault in the pages of the iov. So we keep track of
- * how much was left of iov in the previous EFAULT and fallback
- * to buffered IO in case we haven't made any progress.
- */
- if (left == prev_left) {
- ret = -ENOTBLK;
- } else {
- fault_in_iov_iter_readable(from, left);
- prev_left = left;
- goto relock;
- }
- }
-
- /*
- * If 'ret' is -ENOTBLK or we have not written all data, then it means
- * we must fallback to buffered IO.
- */
- if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
- goto out;
-
-buffered:
- /*
- * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
- * it must retry the operation in a context where blocking is acceptable,
- * because even if we end up not blocking during the buffered IO attempt
- * below, we will block when flushing and waiting for the IO.
- */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
- }
-
- pos = iocb->ki_pos;
- written_buffered = btrfs_buffered_write(iocb, from);
- if (written_buffered < 0) {
- ret = written_buffered;
- goto out;
- }
- /*
- * Ensure all data is persisted. We want the next direct IO read to be
- * able to read what was just written.
- */
- endbyte = pos + written_buffered - 1;
- ret = btrfs_fdatawrite_range(inode, pos, endbyte);
- if (ret)
- goto out;
- ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
- if (ret)
- goto out;
- written += written_buffered;
- iocb->ki_pos = pos + written_buffered;
- invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
- endbyte >> PAGE_SHIFT);
-out:
- return ret < 0 ? ret : written;
-}
-
static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded)
{
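[Editor's note: the removed btrfs_direct_write() (moved to direct-io.c per the diffstat) is built around a retry loop: disable page faults on the iov_iter, issue the DIO, and on -EFAULT fault the remaining pages in and retry, falling back to buffered IO when no progress is made. Below is a schematic sketch of that control flow; do_dio_once() and fault_pages_in() are hypothetical stand-ins for the iomap submission and fault-in helpers, not real kernel APIs.]

/* Schematic of the "retry only while making progress" direct IO write loop. */
#include <stddef.h>

extern long do_dio_once(size_t *remaining);	/* cumulative bytes written, or -14 (EFAULT) */
extern void fault_pages_in(size_t remaining);

long direct_write_sketch(size_t remaining)
{
	size_t prev_left = 0;
	long written = 0;
	long ret;

	for (;;) {
		ret = do_dio_once(&remaining);
		if (ret > 0)
			written = ret;	/* iomap reports a cumulative total, not a delta */

		if (remaining > 0 && (ret == -14 /* EFAULT */ || ret > 0)) {
			if (remaining == prev_left)
				return written;	/* no progress: fall back to buffered IO */
			fault_pages_in(remaining);	/* fault the pages in and retry */
			prev_left = remaining;
			continue;
		}
		break;	/* all data written, or a hard error */
	}
	return ret < 0 ? ret : written;
}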
@@ -1738,7 +1548,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
{
int ret;
struct blk_plug plug;
@@ -1758,7 +1568,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
- struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_inode *inode = ctx->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
@@ -1794,9 +1604,9 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file_dentry(file);
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
@@ -1829,7 +1639,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@@ -1853,7 +1663,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@@ -1865,8 +1675,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* running delalloc the full sync flag may be set if we need to drop
* extra extent map ranges due to temporary memory allocation failures.
*/
- full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* We have to do this here to avoid the priority inversion of waiting on
@@ -1885,16 +1694,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
- clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
} else {
/*
* Get our ordered extents as soon as possible to avoid doing
* checksum lookups in the csum tree, and use instead the
* checksums attached to the ordered extents.
*/
- btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
- &ctx.ordered_extents);
- ret = filemap_fdatawait_range(inode->i_mapping, start, end);
+ btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
if (ret)
goto out_release_extents;
@@ -1907,8 +1715,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* extents to complete so that any extent maps that point to
* unwritten locations are dropped and we don't log them.
*/
- if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
- &BTRFS_I(inode)->runtime_flags))
+ if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
ret = btrfs_wait_ordered_range(inode, start, len);
}
@@ -1923,8 +1730,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* modified so clear this flag in case it was set for whatever
* reason, it's no longer relevant.
*/
- clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* An ordered extent might have started before and completed
* already with io errors, in which case the inode was not
@@ -1932,7 +1738,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* for any errors that might have happened since we last
* called fsync.
*/
- ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+ ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
goto out_release_extents;
}
@@ -1982,7 +1788,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
@@ -2051,7 +1857,7 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2350,11 +2156,9 @@ out:
hole_em->start = offset;
hole_em->len = end - offset;
hole_em->ram_bytes = hole_em->len;
- hole_em->orig_start = offset;
- hole_em->block_start = EXTENT_MAP_HOLE;
- hole_em->block_len = 0;
- hole_em->orig_block_len = 0;
+ hole_em->disk_bytenr = EXTENT_MAP_HOLE;
+ hole_em->disk_num_bytes = 0;
hole_em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -2385,7 +2189,7 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
return PTR_ERR(em);
/* Hole or vacuum extent (only exists in no-hole mode) */
- if (em->block_start == EXTENT_MAP_HOLE) {
+ if (em->disk_bytenr == EXTENT_MAP_HOLE) {
ret = 1;
*len = em->start + em->len > *start + *len ?
0 : *start + *len - em->start - em->len;
@@ -2814,7 +2618,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
- ret = btrfs_wait_ordered_range(inode, offset, len);
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
if (ret)
goto out_only_mutex;
@@ -3042,7 +2846,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
if (IS_ERR(em))
return PTR_ERR(em);
- if (em->block_start == EXTENT_MAP_HOLE)
+ if (em->disk_bytenr == EXTENT_MAP_HOLE)
ret = RANGE_BOUNDARY_HOLE;
else if (em->flags & EXTENT_FLAG_PREALLOC)
ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
@@ -3106,7 +2910,7 @@ static int btrfs_zero_range(struct inode *inode,
ASSERT(IS_ALIGNED(alloc_start, sectorsize));
len = offset + len - alloc_start;
offset = alloc_start;
- alloc_hint = em->block_start + em->len;
+ alloc_hint = extent_map_block_start(em) + em->len;
}
free_extent_map(em);
@@ -3124,7 +2928,7 @@ static int btrfs_zero_range(struct inode *inode,
mode);
goto out;
}
- if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+ if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
free_extent_map(em);
ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
0);
@@ -3309,7 +3113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* the file range and, due to the previous locking we did, we know there
* can't be more delalloc or ordered extents in the range.
*/
- ret = btrfs_wait_ordered_range(inode, alloc_start,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
alloc_end - alloc_start);
if (ret)
goto out;
@@ -3337,7 +3141,7 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
- if (em->block_start == EXTENT_MAP_HOLE ||
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!(em->flags & EXTENT_FLAG_PREALLOC))) {
const u64 range_len = last_byte - cur_offset;
@@ -3920,97 +3724,6 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
-static int check_direct_read(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- int ret;
- int i, seg;
-
- ret = check_direct_IO(fs_info, iter, offset);
- if (ret < 0)
- return ret;
-
- if (!iter_is_iovec(iter))
- return 0;
-
- for (seg = 0; seg < iter->nr_segs; seg++) {
- for (i = seg + 1; i < iter->nr_segs; i++) {
- const struct iovec *iov1 = iter_iov(iter) + seg;
- const struct iovec *iov2 = iter_iov(iter) + i;
-
- if (iov1->iov_base == iov2->iov_base)
- return -EINVAL;
- }
- }
- return 0;
-}
-
-static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- size_t prev_left = 0;
- ssize_t read = 0;
- ssize_t ret;
-
- if (fsverity_active(inode))
- return 0;
-
- if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
- return 0;
-
- btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
-again:
- /*
- * This is similar to what we do for direct IO writes, see the comment
- * at btrfs_direct_write(), but we also disable page faults in addition
- * to disabling them only at the iov_iter level. This is because when
- * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
- * which can still trigger page fault ins despite having set ->nofault
- * to true of our 'to' iov_iter.
- *
- * The difference to direct IO writes is that we deadlock when trying
- * to lock the extent range in the inode's tree during the page reads
- * triggered by the fault in (while for writes it is due to waiting for
- * our own ordered extent). This is because for direct IO reads,
- * btrfs_dio_iomap_begin() returns with the extent range locked, which
- * is only unlocked in the endio callback (end_bio_extent_readpage()).
- */
- pagefault_disable();
- to->nofault = true;
- ret = btrfs_dio_read(iocb, to, read);
- to->nofault = false;
- pagefault_enable();
-
- /* No increment (+=) because iomap returns a cumulative value. */
- if (ret > 0)
- read = ret;
-
- if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
- const size_t left = iov_iter_count(to);
-
- if (left == prev_left) {
- /*
- * We didn't make any progress since the last attempt,
- * fallback to a buffered read for the remainder of the
- * range. This is just to avoid any possibility of looping
- * for too long.
- */
- ret = read;
- } else {
- /*
- * We made some progress since the last retry or this is
- * the first time we are retrying. Fault in as many pages
- * as possible and retry.
- */
- fault_in_iov_iter_writeable(to, left);
- prev_left = left;
- goto again;
- }
- }
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
- return ret < 0 ? ret : read;
-}
-
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret = 0;
@@ -4045,8 +3758,9 @@ const struct file_operations btrfs_file_operations = {
.fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
-int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
{
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
/*
@@ -4063,10 +3777,9 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
* know better and pull this out at some point in the future, it is
* right and you are wrong.
*/
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ ret = filemap_fdatawrite_range(mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
+ ret = filemap_fdatawrite_range(mapping, start, end);
return ret;
}
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index 77aaca208c7b..912254e653cf 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -37,12 +37,14 @@ int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached, bool noreserve);
-int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait);
void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret);
+int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count);
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i);
#endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dabc3d0793cf..3f9b7507543a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -82,7 +82,6 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
u64 offset)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_key location;
struct btrfs_disk_key disk_key;
@@ -116,7 +115,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
* sure NOFS is set to keep us from deadlocking.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
+ inode = btrfs_iget_path(location.objectid, root, path);
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
@@ -138,7 +137,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
spin_lock(&block_group->lock);
if (block_group->inode)
- inode = igrab(block_group->inode);
+ inode = igrab(&block_group->inode->vfs_inode);
spin_unlock(&block_group->lock);
if (inode)
return inode;
@@ -157,7 +156,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
}
if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
- block_group->inode = igrab(inode);
+ block_group->inode = BTRFS_I(igrab(inode));
spin_unlock(&block_group->lock);
return inode;
@@ -858,6 +857,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
spin_unlock(&ctl->tree_lock);
btrfs_err(fs_info,
"Duplicate entries in free space cache, dumping");
+ kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap);
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
@@ -1268,7 +1268,7 @@ static int flush_dirty_cache(struct inode *inode)
{
int ret;
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DELALLOC, NULL);
@@ -1483,7 +1483,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
io_ctl->entries = entries;
io_ctl->bitmaps = bitmaps;
- ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
+ ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
goto out;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 90f2938bd743..7ba50e133921 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1300,10 +1300,14 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_tree_lock(free_space_root->node);
btrfs_clear_buffer_dirty(trans, free_space_root->node);
btrfs_tree_unlock(free_space_root->node);
- btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
- free_space_root->node, 0, 1);
-
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
+ free_space_root->node, 0, 1);
btrfs_put_root(free_space_root);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
return btrfs_commit_transaction(trans);
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 833dc3fe0a38..1b2a7aa0af36 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -29,7 +29,6 @@
#include "extent-io-tree.h"
#include "async-thread.h"
#include "block-rsv.h"
-#include "fs.h"
struct inode;
struct super_block;
@@ -99,7 +98,9 @@ enum {
/* The btrfs_fs_info created for self-tests */
BTRFS_FS_STATE_DUMMY_FS_INFO,
- BTRFS_FS_STATE_NO_CSUMS,
+ /* Checksum errors are ignored. */
+ BTRFS_FS_STATE_NO_DATA_CSUMS,
+ BTRFS_FS_STATE_SKIP_META_CSUMS,
/* Indicates there was an error cleaning up a log tree. */
BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
@@ -225,6 +226,8 @@ enum {
BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28),
BTRFS_MOUNT_NODISCARD = (1UL << 29),
BTRFS_MOUNT_NOSPACECACHE = (1UL << 30),
+ BTRFS_MOUNT_IGNOREMETACSUMS = (1UL << 31),
+ BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32),
};
/*
@@ -958,7 +961,7 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
/*
* Count how many fs_info->max_extent_size cover the @size
*/
-static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 size)
{
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (!fs_info)
@@ -1019,7 +1022,7 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
-static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+static inline int btrfs_fs_closing(const struct btrfs_fs_info *fs_info)
{
/* Do it this way so we only ever do one test_bit in the normal case. */
if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
@@ -1038,7 +1041,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
* since setting and checking for SB_RDONLY in the superblock's flags is not
* atomic.
*/
-static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
+static inline int btrfs_need_cleaner_sleep(const struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
btrfs_fs_closing(fs_info);
@@ -1059,7 +1062,7 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
#define EXPORT_FOR_TESTS
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
}
@@ -1070,7 +1073,7 @@ void btrfs_test_destroy_inode(struct inode *inode);
#define EXPORT_FOR_TESTS static
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
return 0;
}
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 84a94d19b22c..316756ff08ac 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -141,8 +141,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
ref_objectid, name);
if (!extref) {
- btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
- ret = -EROFS;
+ btrfs_abort_transaction(trans, -ENOENT);
+ ret = -ENOENT;
goto out;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d62c96f00ff8..01eab6955647 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,31 +70,13 @@
#include "orphan.h"
#include "backref.h"
#include "raid-stripe-tree.h"
+#include "fiemap.h"
struct btrfs_iget_args {
u64 ino;
struct btrfs_root *root;
};
-struct btrfs_dio_data {
- ssize_t submitted;
- struct extent_changeset *data_reserved;
- struct btrfs_ordered_extent *ordered;
- bool data_space_reserved;
- bool nocow_done;
-};
-
-struct btrfs_dio_private {
- /* Range of I/O */
- u64 file_offset;
- u32 bytes;
-
- /* This must be last */
- struct btrfs_bio bbio;
-};
-
-static struct bio_set btrfs_dio_bioset;
-
struct btrfs_rename_ctx {
/* Output field. Stores the index number of the old directory entry. */
u64 index;
@@ -137,11 +119,6 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
struct page *locked_page, u64 start,
u64 end, struct writeback_control *wbc,
bool pages_dirty);
-static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
- u64 len, u64 orig_start, u64 block_start,
- u64 block_len, u64 orig_block_len,
- u64 ram_bytes, int compress_type,
- int type);
static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
u64 root, void *warn_ctx)
@@ -877,7 +854,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
if (btrfs_test_opt(fs_info, COMPRESS) ||
inode->flags & BTRFS_INODE_COMPRESS ||
inode->prop_compress)
- return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
+ return btrfs_compress_heuristic(inode, start, end);
return 0;
}
@@ -890,6 +867,26 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
btrfs_add_inode_defrag(NULL, inode, small_write);
}
+static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+ unsigned long end_index = end >> PAGE_SHIFT;
+ struct page *page;
+ int ret = 0;
+
+ for (unsigned long index = start >> PAGE_SHIFT;
+ index <= end_index; index++) {
+ page = find_get_page(inode->i_mapping, index);
+ if (unlikely(!page)) {
+ if (!ret)
+ ret = -ENOENT;
+ continue;
+ }
+ clear_page_dirty_for_io(page);
+ put_page(page);
+ }
+ return ret;
+}
+
/*
* Work queue call back to started compression on a file and pages.
*
@@ -931,7 +928,16 @@ static void compress_file_range(struct btrfs_work *work)
* Otherwise applications with the file mmap'd can wander in and change
* the page contents while we are compressing them.
*/
- extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+ ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+
+ /*
+ * All the folios should have been locked, thus there should be no failure.
+ *
+ * Even if some folios are missing, btrfs_compress_folios() would handle
+ * them correctly, so here we just do an ASSERT() check to catch early
+ * logic errors.
+ */
+ ASSERT(ret == 0);
/*
* We need to save i_size before now because it could change in between
@@ -1152,6 +1158,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
struct btrfs_key ins;
struct page *locked_page = NULL;
struct extent_state *cached = NULL;
@@ -1198,29 +1205,22 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
lock_extent(io_tree, start, end, &cached);
/* Here we're doing allocation and writeback of the compressed pages */
- em = create_io_em(inode, start,
- async_extent->ram_size, /* len */
- start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- async_extent->ram_size, /* ram_bytes */
- async_extent->compress_type,
- BTRFS_ORDERED_COMPRESSED);
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.ram_bytes = async_extent->ram_size;
+ file_extent.num_bytes = async_extent->ram_size;
+ file_extent.offset = 0;
+ file_extent.compression = async_extent->compress_type;
+
+ em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_free_reserve;
}
free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */
- async_extent->ram_size, /* num_bytes */
- async_extent->ram_size, /* ram_bytes */
- ins.objectid, /* disk_bytenr */
- ins.offset, /* disk_num_bytes */
- 0, /* offset */
- 1 << BTRFS_ORDERED_COMPRESSED,
- async_extent->compress_type);
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
+ 1 << BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -1264,8 +1264,8 @@ out_free_reserve:
kfree(async_extent);
}
-static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
- u64 num_bytes)
+u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+ u64 num_bytes)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
@@ -1279,15 +1279,15 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* first block in this inode and use that as a hint. If that
* block is also bogus then just don't worry about it.
*/
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
free_extent_map(em);
em = search_extent_mapping(em_tree, 0, 0);
- if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
- alloc_hint = em->block_start;
+ if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = extent_map_block_start(em);
if (em)
free_extent_map(em);
} else {
- alloc_hint = em->block_start;
+ alloc_hint = extent_map_block_start(em);
free_extent_map(em);
}
}
@@ -1375,7 +1375,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
}
- alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
+ alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
/*
* Relocation relies on the relocated extents to have exactly the same
@@ -1395,6 +1395,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
while (num_bytes > 0) {
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
@@ -1431,18 +1432,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
extent_reserved = true;
ram_size = ins.offset;
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = ins.offset;
+ file_extent.ram_bytes = ins.offset;
+ file_extent.offset = 0;
+ file_extent.compression = BTRFS_COMPRESS_NONE;
lock_extent(&inode->io_tree, start, start + ram_size - 1,
&cached);
- em = create_io_em(inode, start, ins.offset, /* len */
- start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- ram_size, /* ram_bytes */
- BTRFS_COMPRESS_NONE, /* compress_type */
- BTRFS_ORDERED_REGULAR /* type */);
+ em = btrfs_create_io_em(inode, start, &file_extent,
+ BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) {
unlock_extent(&inode->io_tree, start,
start + ram_size - 1, &cached);
@@ -1451,10 +1452,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
- ram_size, ins.objectid, cur_alloc_size,
- 0, 1 << BTRFS_ORDERED_REGULAR,
- BTRFS_COMPRESS_NONE);
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
+ 1 << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
unlock_extent(&inode->io_tree, start,
start + ram_size - 1, &cached);
@@ -1617,10 +1616,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_
u64 alloc_hint = 0;
if (do_free) {
- struct async_chunk *async_chunk;
struct async_cow *async_cow;
- async_chunk = container_of(work, struct async_chunk, work);
btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
@@ -1850,13 +1847,11 @@ struct can_nocow_file_extent_args {
*/
bool free_path;
- /* Output fields. Only set when can_nocow_file_extent() returns 1. */
-
- u64 disk_bytenr;
- u64 disk_num_bytes;
- u64 extent_offset;
- /* Number of bytes that can be written to in NOCOW mode. */
- u64 num_bytes;
+ /*
+ * Output fields. Only set when can_nocow_file_extent() returns 1.
+ * The expected file extent for the NOCOW write.
+ */
+ struct btrfs_file_extent file_extent;
};
/*
@@ -1878,6 +1873,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *fi;
struct btrfs_root *csum_root;
+ u64 io_start;
u64 extent_end;
u8 extent_type;
int can_nocow = 0;
@@ -1890,11 +1886,6 @@ static int can_nocow_file_extent(struct btrfs_path *path,
if (extent_type == BTRFS_FILE_EXTENT_INLINE)
goto out;
- /* Can't access these fields unless we know it's not an inline extent. */
- args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
- args->extent_offset = btrfs_file_extent_offset(leaf, fi);
-
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
extent_type == BTRFS_FILE_EXTENT_REG)
goto out;
@@ -1910,7 +1901,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
goto out;
/* An explicit hole, must COW. */
- if (args->disk_bytenr == 0)
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
goto out;
/* Compressed/encrypted/encoded extents must be COWed. */
@@ -1921,6 +1912,12 @@ static int can_nocow_file_extent(struct btrfs_path *path,
extent_end = btrfs_file_extent_end(path);
+ args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
+ args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);
+
/*
* The following checks can be expensive, as they need to take other
* locks and do btree or rbtree searches, so release the path to avoid
@@ -1929,8 +1926,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
btrfs_release_path(path);
ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
- key->offset - args->extent_offset,
- args->disk_bytenr, args->strict, path);
+ key->offset - args->file_extent.offset,
+ args->file_extent.disk_bytenr, args->strict, path);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -1951,18 +1948,18 @@ static int can_nocow_file_extent(struct btrfs_path *path,
atomic_read(&root->snapshot_force_cow))
goto out;
- args->disk_bytenr += args->extent_offset;
- args->disk_bytenr += args->start - key->offset;
- args->num_bytes = min(args->end + 1, extent_end) - args->start;
+ args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
+ args->file_extent.offset += args->start - key->offset;
+ io_start = args->file_extent.disk_bytenr + args->file_extent.offset;
/*
* Force COW if csums exist in the range. This ensures that csums for a
* given extent are either valid or do not exist.
*/
- csum_root = btrfs_csum_root(root->fs_info, args->disk_bytenr);
- ret = btrfs_lookup_csums_list(csum_root, args->disk_bytenr,
- args->disk_bytenr + args->num_bytes - 1,
+ csum_root = btrfs_csum_root(root->fs_info, io_start);
+ ret = btrfs_lookup_csums_list(csum_root, io_start,
+ io_start + args->file_extent.num_bytes - 1,
NULL, nowait);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
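[Editor's note: after this change the physical start of a NOCOW write is no longer accumulated into disk_bytenr; it is derived as disk_bytenr + offset, with offset first advanced by how far into the found file extent the write begins. A tiny worked example of that arithmetic with made-up numbers follows; the program is illustrative only.]

/* Worked example of the io_start arithmetic (illustrative values). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t disk_bytenr = 1048576;	/* extent item's disk_bytenr */
	uint64_t offset = 4096;		/* file extent's offset into that extent */
	uint64_t key_offset = 65536;	/* file offset where the extent item starts */
	uint64_t write_start = 73728;	/* where the NOCOW write begins */

	/* args->file_extent.offset += args->start - key->offset; */
	offset += write_start - key_offset;

	/* io_start = args->file_extent.disk_bytenr + args->file_extent.offset; */
	uint64_t io_start = disk_bytenr + offset;

	printf("io_start = %llu\n", (unsigned long long)io_start);	/* 1060864 */
	return 0;
}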
@@ -2021,7 +2018,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct extent_buffer *leaf;
struct extent_state *cached_state = NULL;
u64 extent_end;
- u64 ram_bytes;
u64 nocow_end;
int extent_type;
bool is_prealloc;
@@ -2100,7 +2096,6 @@ next_slot:
ret = -EUCLEAN;
goto error;
}
- ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
extent_end = btrfs_file_extent_end(path);
/*
@@ -2120,7 +2115,9 @@ next_slot:
goto must_cow;
ret = 0;
- nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
+ nocow_bg = btrfs_inc_nocow_writers(fs_info,
+ nocow_args.file_extent.disk_bytenr +
+ nocow_args.file_extent.offset);
if (!nocow_bg) {
must_cow:
/*
@@ -2156,21 +2153,16 @@ must_cow:
}
}
- nocow_end = cur_offset + nocow_args.num_bytes - 1;
+ nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
if (is_prealloc) {
- u64 orig_start = found_key.offset - nocow_args.extent_offset;
struct extent_map *em;
- em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
- orig_start,
- nocow_args.disk_bytenr, /* block_start */
- nocow_args.num_bytes, /* block_len */
- nocow_args.disk_num_bytes, /* orig_block_len */
- ram_bytes, BTRFS_COMPRESS_NONE,
- BTRFS_ORDERED_PREALLOC);
+ em = btrfs_create_io_em(inode, cur_offset,
+ &nocow_args.file_extent,
+ BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
unlock_extent(&inode->io_tree, cur_offset,
nocow_end, &cached_state);
@@ -2182,12 +2174,10 @@ must_cow:
}
ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
- nocow_args.num_bytes, nocow_args.num_bytes,
- nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
+ &nocow_args.file_extent,
is_prealloc
? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW),
- BTRFS_COMPRESS_NONE);
+ : (1 << BTRFS_ORDERED_NOCOW));
btrfs_dec_nocow_writers(nocow_bg);
if (IS_ERR(ordered)) {
if (is_prealloc) {
@@ -2601,44 +2591,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
}
}
-static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
- struct btrfs_ordered_extent *ordered)
-{
- u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
- u64 len = bbio->bio.bi_iter.bi_size;
- struct btrfs_ordered_extent *new;
- int ret;
-
- /* Must always be called for the beginning of an ordered extent. */
- if (WARN_ON_ONCE(start != ordered->disk_bytenr))
- return -EINVAL;
-
- /* No need to split if the ordered extent covers the entire bio. */
- if (ordered->disk_num_bytes == len) {
- refcount_inc(&ordered->refs);
- bbio->ordered = ordered;
- return 0;
- }
-
- /*
- * Don't split the extent_map for NOCOW extents, as we're writing into
- * a pre-existing one.
- */
- if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
- ret = split_extent_map(bbio->inode, bbio->file_offset,
- ordered->num_bytes, len,
- ordered->disk_bytenr);
- if (ret)
- return ret;
- }
-
- new = btrfs_split_ordered_extent(ordered, len);
- if (IS_ERR(new))
- return PTR_ERR(new);
- bbio->ordered = new;
- return 0;
-}
-
/*
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
@@ -2681,7 +2633,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
if (IS_ERR(em))
return PTR_ERR(em);
- if (em->block_start != EXTENT_MAP_HOLE)
+ if (em->disk_bytenr != EXTENT_MAP_HOLE)
goto next;
em_len = em->len;
@@ -3037,10 +2989,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
- if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
num_bytes = oe->truncated_len;
- ram_bytes = num_bytes;
- }
btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
@@ -3056,7 +3006,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
- return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ return insert_reserved_file_extent(trans, oe->inode,
oe->file_offset, &stack_fi,
update_inode_bytes, oe->qgroup_rsv);
}
@@ -3068,7 +3018,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
*/
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
{
- struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
+ struct btrfs_inode *inode = ordered_extent->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans = NULL;
@@ -3302,7 +3252,7 @@ out:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
- if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) &&
+ if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
!test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
list_empty(&ordered->bioc_list))
btrfs_finish_ordered_zoned(ordered);
@@ -3596,7 +3546,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, last_objectid, root);
+ inode = btrfs_iget(last_objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
inode = NULL;
@@ -3781,6 +3731,30 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
return 1;
}
+static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (WARN_ON_ONCE(inode->file_extent_tree))
+ return 0;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
+ return 0;
+ if (!S_ISREG(inode->vfs_inode.i_mode))
+ return 0;
+ if (btrfs_is_free_space_inode(inode))
+ return 0;
+
+ inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+ if (!inode->file_extent_tree)
+ return -ENOMEM;
+
+ extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
+ /* Lockdep class is set only for the file extent tree. */
+ lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
+
+ return 0;
+}
+
/*
* read an inode from the btree into the in-memory inode
*/
@@ -3800,6 +3774,10 @@ static int btrfs_read_locked_inode(struct inode *inode,
bool filled = false;
int first_xattr_slot;
+ ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
+ if (ret)
+ return ret;
+
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
@@ -3810,7 +3788,7 @@ static int btrfs_read_locked_inode(struct inode *inode,
return -ENOMEM;
}
- memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+ btrfs_get_inode_key(BTRFS_I(inode), &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
@@ -3856,7 +3834,9 @@ static int btrfs_read_locked_inode(struct inode *inode,
inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
@@ -4038,13 +4018,15 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
+ struct btrfs_key key;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1);
+ btrfs_get_inode_key(inode, &key);
+ ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -4308,7 +4290,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
objectid = btrfs_root_id(inode->root);
} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
- objectid = inode->location.objectid;
+ objectid = inode->ref_root_id;
} else {
WARN_ON(1);
fscrypt_free_filename(&fname);
@@ -4539,11 +4521,6 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = PTR_ERR(trans);
goto out_release;
}
- ret = btrfs_record_root_in_trans(trans, root);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_end_trans;
- }
btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
@@ -4967,11 +4944,9 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
}
hole_em->start = cur_offset;
hole_em->len = hole_size;
- hole_em->orig_start = cur_offset;
- hole_em->block_start = EXTENT_MAP_HOLE;
- hole_em->block_len = 0;
- hole_em->orig_block_len = 0;
+ hole_em->disk_bytenr = EXTENT_MAP_HOLE;
+ hole_em->disk_num_bytes = 0;
hole_em->ram_bytes = hole_size;
hole_em->generation = btrfs_get_fs_generation(fs_info);
@@ -5049,7 +5024,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
if (btrfs_is_zoned(fs_info)) {
- ret = btrfs_wait_ordered_range(inode,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode),
ALIGN(newsize, fs_info->sectorsize),
(u64)-1);
if (ret)
@@ -5079,7 +5054,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* wait for disk_i_size to be stable and then update the
* in-memory size to match.
*/
- err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (err)
return err;
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
@@ -5493,59 +5468,52 @@ out:
return err;
}
-static void inode_tree_add(struct btrfs_inode *inode)
+static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
{
struct btrfs_root *root = inode->root;
- struct btrfs_inode *entry;
- struct rb_node **p;
- struct rb_node *parent;
- struct rb_node *new = &inode->rb_node;
- u64 ino = btrfs_ino(inode);
+ struct btrfs_inode *existing;
+ const u64 ino = btrfs_ino(inode);
+ int ret;
if (inode_unhashed(&inode->vfs_inode))
- return;
- parent = NULL;
- spin_lock(&root->inode_lock);
- p = &root->inode_tree.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct btrfs_inode, rb_node);
+ return 0;
- if (ino < btrfs_ino(entry))
- p = &parent->rb_left;
- else if (ino > btrfs_ino(entry))
- p = &parent->rb_right;
- else {
- WARN_ON(!(entry->vfs_inode.i_state &
- (I_WILL_FREE | I_FREEING)));
- rb_replace_node(parent, new, &root->inode_tree);
- RB_CLEAR_NODE(parent);
- spin_unlock(&root->inode_lock);
- return;
- }
+ if (prealloc) {
+ ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
+ if (ret)
+ return ret;
}
- rb_link_node(new, parent, p);
- rb_insert_color(new, &root->inode_tree);
- spin_unlock(&root->inode_lock);
+
+ existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
+
+ if (xa_is_err(existing)) {
+ ret = xa_err(existing);
+ ASSERT(ret != -EINVAL);
+ ASSERT(ret != -ENOMEM);
+ return ret;
+ } else if (existing) {
+ WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
+ }
+
+ return 0;
}
-static void inode_tree_del(struct btrfs_inode *inode)
+static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
- int empty = 0;
+ struct btrfs_inode *entry;
+ bool empty = false;
- spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&inode->rb_node)) {
- rb_erase(&inode->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&inode->rb_node);
- empty = RB_EMPTY_ROOT(&root->inode_tree);
- }
- spin_unlock(&root->inode_lock);
+ xa_lock(&root->inodes);
+ entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ if (entry == inode)
+ empty = xa_empty(&root->inodes);
+ xa_unlock(&root->inodes);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
- spin_lock(&root->inode_lock);
- empty = RB_EMPTY_ROOT(&root->inode_tree);
- spin_unlock(&root->inode_lock);
+ xa_lock(&root->inodes);
+ empty = xa_empty(&root->inodes);
+ xa_unlock(&root->inodes);
if (empty)
btrfs_add_dead_root(root);
}
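
The two helpers above replace the old red-black tree with an xarray keyed by inode number. The essential pattern is reserve-then-store: the slot is reserved with GFP_NOFS while sleeping allocations are still allowed, so the later store under GFP_ATOMIC cannot fail with -ENOMEM. A minimal standalone sketch of that pattern, with the function name and surrounding context assumed rather than taken from the patch, might look like this:

#include <linux/xarray.h>

/*
 * Illustrative sketch of the reserve-then-store pattern used by
 * btrfs_add_inode_to_root() above (prealloc case). The function name and
 * the standalone context are assumptions; the xarray calls and error
 * handling mirror the patch.
 */
static int track_object(struct xarray *xa, unsigned long index, void *object)
{
	void *existing;
	int ret;

	/* Reserve the slot while a sleeping allocation is still allowed. */
	ret = xa_reserve(xa, index, GFP_NOFS);
	if (ret)
		return ret;

	/* Fill the reserved slot; no allocation should be needed here. */
	existing = xa_store(xa, index, object, GFP_ATOMIC);
	if (xa_is_err(existing))
		return xa_err(existing);

	/* A previous entry means an object with this index already existed. */
	return existing ? -EEXIST : 0;
}
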
@@ -5556,10 +5524,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
- inode->i_ino = args->ino;
- BTRFS_I(inode)->location.objectid = args->ino;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.offset = 0;
+ btrfs_set_inode_number(BTRFS_I(inode), args->ino);
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
if (args->root && args->root == args->root->fs_info->tree_root &&
@@ -5573,12 +5538,11 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->ino == BTRFS_I(inode)->location.objectid &&
+ return args->ino == btrfs_ino(BTRFS_I(inode)) &&
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
- struct btrfs_root *root)
+static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
@@ -5587,7 +5551,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
args.ino = ino;
args.root = root;
- inode = iget5_locked_rcu(s, hashval, btrfs_find_actor,
+ inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
@@ -5599,41 +5563,44 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
* allocator. NULL is also valid but may require an additional allocation
* later.
*/
-struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
- struct btrfs_root *root, struct btrfs_path *path)
+struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path)
{
struct inode *inode;
+ int ret;
- inode = btrfs_iget_locked(s, ino, root);
+ inode = btrfs_iget_locked(ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
- int ret;
+ if (!(inode->i_state & I_NEW))
+ return inode;
- ret = btrfs_read_locked_inode(inode, path);
- if (!ret) {
- inode_tree_add(BTRFS_I(inode));
- unlock_new_inode(inode);
- } else {
- iget_failed(inode);
- /*
- * ret > 0 can come from btrfs_search_slot called by
- * btrfs_read_locked_inode, this means the inode item
- * was not found.
- */
- if (ret > 0)
- ret = -ENOENT;
- inode = ERR_PTR(ret);
- }
- }
+ ret = btrfs_read_locked_inode(inode, path);
+ /*
+ * ret > 0 can come from btrfs_search_slot called by
+ * btrfs_read_locked_inode(), which means the inode item was not found.
+ */
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto error;
+
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
+ if (ret < 0)
+ goto error;
+
+ unlock_new_inode(inode);
return inode;
+error:
+ iget_failed(inode);
+ return ERR_PTR(ret);
}
-struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
+struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- return btrfs_iget_path(s, ino, root, NULL);
+ return btrfs_iget_path(ino, root, NULL);
}
static struct inode *new_simple_dir(struct inode *dir,
@@ -5647,10 +5614,11 @@ static struct inode *new_simple_dir(struct inode *dir,
return ERR_PTR(-ENOMEM);
BTRFS_I(inode)->root = btrfs_grab_root(root);
- memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+ BTRFS_I(inode)->ref_root_id = key->objectid;
+ set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
- inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
/*
* We only need lookup, the rest is read-only and there's no inode
* associated with the dentry
@@ -5704,7 +5672,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, location.objectid, root);
+ inode = btrfs_iget(location.objectid, root);
if (IS_ERR(inode))
return inode;
@@ -5728,7 +5696,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
else
inode = new_simple_dir(dir, &location, root);
} else {
- inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
+ inode = btrfs_iget(location.objectid, sub_root);
btrfs_put_root(sub_root);
if (IS_ERR(inode))
@@ -5948,7 +5916,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
addr = private->filldir_buf;
path->reada = READA_FORWARD;
- put = btrfs_readdir_get_delayed_items(inode, private->last_index,
+ put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
&ins_list, &del_list);
again:
@@ -6038,7 +6006,7 @@ nopos:
ret = 0;
err:
if (put)
- btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
+ btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
btrfs_free_path(path);
return ret;
}
@@ -6123,7 +6091,7 @@ static int btrfs_insert_inode_locked(struct inode *inode)
{
struct btrfs_iget_args args;
- args.ino = BTRFS_I(inode)->location.objectid;
+ args.ino = btrfs_ino(BTRFS_I(inode));
args.root = BTRFS_I(inode)->root;
return insert_inode_locked4(inode,
@@ -6230,7 +6198,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
- struct btrfs_key *location;
struct btrfs_path *path;
u64 objectid;
struct btrfs_inode_ref *ref;
@@ -6239,6 +6206,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_item_batch batch;
unsigned long ptr;
int ret;
+ bool xa_reserved = false;
path = btrfs_alloc_path();
if (!path)
@@ -6248,10 +6216,19 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
root = BTRFS_I(inode)->root;
+ ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
+ if (ret)
+ goto out;
+
ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
goto out;
- inode->i_ino = objectid;
+ btrfs_set_inode_number(BTRFS_I(inode), objectid);
+
+ ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
+ if (ret)
+ goto out;
+ xa_reserved = true;
if (args->orphan) {
/*
@@ -6266,8 +6243,10 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
- /* index_cnt is ignored for everything but a dir. */
- BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
+
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
+
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
@@ -6294,11 +6273,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
BTRFS_INODE_NODATASUM;
}
- location = &BTRFS_I(inode)->location;
- location->objectid = objectid;
- location->offset = 0;
- location->type = BTRFS_INODE_ITEM_KEY;
-
ret = btrfs_insert_inode_locked(inode);
if (ret < 0) {
if (!args->orphan)
@@ -6397,8 +6371,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
* Subvolumes inherit properties from their parent subvolume,
* not the directory they were created in.
*/
- parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
- BTRFS_I(dir)->root);
+ parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
} else {
@@ -6426,7 +6399,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- inode_tree_add(BTRFS_I(inode));
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
+ if (WARN_ON(ret)) {
+ /* Shouldn't happen, we used xa_reserve() before. */
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
trace_btrfs_inode_new(inode);
btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
@@ -6454,6 +6432,9 @@ discard:
ihold(inode);
discard_new_inode(inode);
out:
+ if (xa_reserved)
+ xa_release(&root->inodes, objectid);
+
btrfs_free_path(path);
return ret;
}
@@ -6818,7 +6799,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (em) {
if (em->start > start || em->start + em->len <= start)
free_extent_map(em);
- else if (em->block_start == EXTENT_MAP_INLINE && page)
+ else if (em->disk_bytenr == EXTENT_MAP_INLINE && page)
free_extent_map(em);
else
goto out;
@@ -6829,9 +6810,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
goto out;
}
em->start = EXTENT_MAP_HOLE;
- em->orig_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
em->len = (u64)-1;
- em->block_len = (u64)-1;
path = btrfs_alloc_path();
if (!path) {
@@ -6921,9 +6901,8 @@ next:
/* New extent overlaps with existing one */
em->start = start;
- em->orig_start = start;
em->len = found_key.offset - start;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
goto insert;
}
@@ -6947,7 +6926,7 @@ next:
*
* Other members are not utilized for inline extents.
*/
- ASSERT(em->block_start == EXTENT_MAP_INLINE);
+ ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
ret = read_inline_extent(inode, path, page);
@@ -6957,9 +6936,8 @@ next:
}
not_found:
em->start = start;
- em->orig_start = start;
em->len = len;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
insert:
ret = 0;
btrfs_release_path(path);
@@ -6986,84 +6964,6 @@ out:
return em;
}
-static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
- struct btrfs_dio_data *dio_data,
- const u64 start,
- const u64 len,
- const u64 orig_start,
- const u64 block_start,
- const u64 block_len,
- const u64 orig_block_len,
- const u64 ram_bytes,
- const int type)
-{
- struct extent_map *em = NULL;
- struct btrfs_ordered_extent *ordered;
-
- if (type != BTRFS_ORDERED_NOCOW) {
- em = create_io_em(inode, start, len, orig_start, block_start,
- block_len, orig_block_len, ram_bytes,
- BTRFS_COMPRESS_NONE, /* compress_type */
- type);
- if (IS_ERR(em))
- goto out;
- }
- ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
- block_start, block_len, 0,
- (1 << type) |
- (1 << BTRFS_ORDERED_DIRECT),
- BTRFS_COMPRESS_NONE);
- if (IS_ERR(ordered)) {
- if (em) {
- free_extent_map(em);
- btrfs_drop_extent_map_range(inode, start,
- start + len - 1, false);
- }
- em = ERR_CAST(ordered);
- } else {
- ASSERT(!dio_data->ordered);
- dio_data->ordered = ordered;
- }
- out:
-
- return em;
-}
-
-static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
- struct btrfs_dio_data *dio_data,
- u64 start, u64 len)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_map *em;
- struct btrfs_key ins;
- u64 alloc_hint;
- int ret;
-
- alloc_hint = get_extent_allocation_hint(inode, start, len);
-again:
- ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
- 0, alloc_hint, &ins, 1, 1);
- if (ret == -EAGAIN) {
- ASSERT(btrfs_is_zoned(fs_info));
- wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
- TASK_UNINTERRUPTIBLE);
- goto again;
- }
- if (ret)
- return ERR_PTR(ret);
-
- em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
- ins.objectid, ins.offset, ins.offset,
- ins.offset, BTRFS_ORDERED_REGULAR);
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
- 1);
-
- return em;
-}
-
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_block_group *block_group;
@@ -7098,8 +6998,8 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* any ordered extents.
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool nowait, bool strict)
+ struct btrfs_file_extent *file_extent,
+ bool nowait, bool strict)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct can_nocow_file_extent_args nocow_args = { 0 };
@@ -7149,8 +7049,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, fi);
- if (ram_bytes)
- *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
nocow_args.start = offset;
nocow_args.end = offset + *len - 1;
@@ -7168,14 +7066,16 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
}
ret = 0;
- if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
+ if (btrfs_extent_readonly(fs_info,
+ nocow_args.file_extent.disk_bytenr +
+ nocow_args.file_extent.offset))
goto out;
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
- range_end = round_up(offset + nocow_args.num_bytes,
+ range_end = round_up(offset + nocow_args.file_extent.num_bytes,
root->fs_info->sectorsize) - 1;
ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
if (ret) {
@@ -7184,117 +7084,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
}
}
- if (orig_start)
- *orig_start = key.offset - nocow_args.extent_offset;
- if (orig_block_len)
- *orig_block_len = nocow_args.disk_num_bytes;
+ if (file_extent)
+ memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
- *len = nocow_args.num_bytes;
+ *len = nocow_args.file_extent.num_bytes;
ret = 1;
out:
btrfs_free_path(path);
return ret;
}
-static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state,
- unsigned int iomap_flags)
-{
- const bool writing = (iomap_flags & IOMAP_WRITE);
- const bool nowait = (iomap_flags & IOMAP_NOWAIT);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_ordered_extent *ordered;
- int ret = 0;
-
- while (1) {
- if (nowait) {
- if (!try_lock_extent(io_tree, lockstart, lockend,
- cached_state))
- return -EAGAIN;
- } else {
- lock_extent(io_tree, lockstart, lockend, cached_state);
- }
- /*
- * We're concerned with the entire range that we're going to be
- * doing DIO to, so we need to make sure there's no ordered
- * extents in this range.
- */
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
- lockend - lockstart + 1);
-
- /*
- * We need to make sure there are no buffered pages in this
- * range either, we could have raced between the invalidate in
- * generic_file_direct_write and locking the extent. The
- * invalidate needs to happen so that reads after a write do not
- * get stale data.
- */
- if (!ordered &&
- (!writing || !filemap_range_has_page(inode->i_mapping,
- lockstart, lockend)))
- break;
-
- unlock_extent(io_tree, lockstart, lockend, cached_state);
-
- if (ordered) {
- if (nowait) {
- btrfs_put_ordered_extent(ordered);
- ret = -EAGAIN;
- break;
- }
- /*
- * If we are doing a DIO read and the ordered extent we
- * found is for a buffered write, we can not wait for it
- * to complete and retry, because if we do so we can
- * deadlock with concurrent buffered writes on page
- * locks. This happens only if our DIO read covers more
- * than one extent map, if at this point has already
- * created an ordered extent for a previous extent map
- * and locked its range in the inode's io tree, and a
- * concurrent write against that previous extent map's
- * range and this range started (we unlock the ranges
- * in the io tree only when the bios complete and
- * buffered writes always lock pages before attempting
- * to lock range in the io tree).
- */
- if (writing ||
- test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(ordered);
- else
- ret = nowait ? -EAGAIN : -ENOTBLK;
- btrfs_put_ordered_extent(ordered);
- } else {
- /*
- * We could trigger writeback for this range (and wait
- * for it to complete) and then invalidate the pages for
- * this range (through invalidate_inode_pages2_range()),
- * but that can lead us to a deadlock with a concurrent
- * call to readahead (a buffered read or a defrag call
- * triggered a readahead) on a page lock due to an
- * ordered dio extent we created before but did not have
- * yet a corresponding bio submitted (whence it can not
- * complete), which makes readahead wait for that
- * ordered extent to complete while holding a lock on
- * that page.
- */
- ret = nowait ? -EAGAIN : -ENOTBLK;
- }
-
- if (ret)
- break;
-
- cond_resched();
- }
-
- return ret;
-}
-
/* The callers of this must take lock_extent() */
-static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
- u64 len, u64 orig_start, u64 block_start,
- u64 block_len, u64 orig_block_len,
- u64 ram_bytes, int compress_type,
- int type)
+struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
+ const struct btrfs_file_extent *file_extent,
+ int type)
{
struct extent_map *em;
int ret;
@@ -7313,32 +7116,26 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
switch (type) {
case BTRFS_ORDERED_PREALLOC:
- /* Uncompressed extents. */
- ASSERT(block_len == len);
-
/* We're only referring part of a larger preallocated extent. */
- ASSERT(block_len <= ram_bytes);
+ ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
break;
case BTRFS_ORDERED_REGULAR:
- /* Uncompressed extents. */
- ASSERT(block_len == len);
-
/* COW results in a new extent matching our file extent size. */
- ASSERT(orig_block_len == len);
- ASSERT(ram_bytes == len);
+ ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
+ ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
/* Since it's a new extent, we should not have any offset. */
- ASSERT(orig_start == start);
+ ASSERT(file_extent->offset == 0);
break;
case BTRFS_ORDERED_COMPRESSED:
/* Must be compressed. */
- ASSERT(compress_type != BTRFS_COMPRESS_NONE);
+ ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
/*
* Encoded write can make us refer to part of the
* uncompressed extent.
*/
- ASSERT(len <= ram_bytes);
+ ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
break;
}
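
As a worked illustration of the BTRFS_ORDERED_REGULAR assertions above, a plain COW extent described through the consolidated btrfs_file_extent structure would carry values along these lines. The field names come from this patch; the concrete sizes and the standalone declaration are assumptions for the example and presume the btrfs_file_extent definition is in scope:

#include <linux/sizes.h>

/*
 * Illustrative values only: a freshly COW'd 64K extent allocated at disk
 * byte 1M. For BTRFS_ORDERED_REGULAR the code above asserts that
 * disk_num_bytes == num_bytes, ram_bytes == num_bytes and offset == 0,
 * which these values satisfy.
 */
static const struct btrfs_file_extent example_cow_extent = {
	.disk_bytenr	= SZ_1M,		/* start of the allocated extent */
	.disk_num_bytes	= SZ_64K,		/* on-disk size of the extent */
	.num_bytes	= SZ_64K,		/* bytes covered by the file extent item */
	.ram_bytes	= SZ_64K,		/* uncompressed length */
	.offset		= 0,			/* new extent, nothing skipped at the front */
	.compression	= BTRFS_COMPRESS_NONE,	/* plain COW write, not compressed */
};
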
@@ -7347,16 +7144,15 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
return ERR_PTR(-ENOMEM);
em->start = start;
- em->orig_start = orig_start;
- em->len = len;
- em->block_len = block_len;
- em->block_start = block_start;
- em->orig_block_len = orig_block_len;
- em->ram_bytes = ram_bytes;
+ em->len = file_extent->num_bytes;
+ em->disk_bytenr = file_extent->disk_bytenr;
+ em->disk_num_bytes = file_extent->disk_num_bytes;
+ em->ram_bytes = file_extent->ram_bytes;
em->generation = -1;
+ em->offset = file_extent->offset;
em->flags |= EXTENT_FLAG_PINNED;
if (type == BTRFS_ORDERED_COMPRESSED)
- extent_map_set_compression(em, compress_type);
+ extent_map_set_compression(em, file_extent->compression);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
@@ -7368,580 +7164,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
return em;
}
-
-static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct inode *inode,
- struct btrfs_dio_data *dio_data,
- u64 start, u64 *lenp,
- unsigned int iomap_flags)
-{
- const bool nowait = (iomap_flags & IOMAP_NOWAIT);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_map *em = *map;
- int type;
- u64 block_start, orig_start, orig_block_len, ram_bytes;
- struct btrfs_block_group *bg;
- bool can_nocow = false;
- bool space_reserved = false;
- u64 len = *lenp;
- u64 prev_len;
- int ret = 0;
-
- /*
- * We don't allocate a new extent in the following cases
- *
- * 1) The inode is marked as NODATACOW. In this case we'll just use the
- * existing extent.
- * 2) The extent is marked as PREALLOC. We're good to go here and can
- * just use the extent.
- *
- */
- if ((em->flags & EXTENT_FLAG_PREALLOC) ||
- ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
- em->block_start != EXTENT_MAP_HOLE)) {
- if (em->flags & EXTENT_FLAG_PREALLOC)
- type = BTRFS_ORDERED_PREALLOC;
- else
- type = BTRFS_ORDERED_NOCOW;
- len = min(len, em->len - (start - em->start));
- block_start = em->block_start + (start - em->start);
-
- if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes, false, false) == 1) {
- bg = btrfs_inc_nocow_writers(fs_info, block_start);
- if (bg)
- can_nocow = true;
- }
- }
-
- prev_len = len;
- if (can_nocow) {
- struct extent_map *em2;
-
- /* We can NOCOW, so only need to reserve metadata space. */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
- nowait);
- if (ret < 0) {
- /* Our caller expects us to free the input extent map. */
- free_extent_map(em);
- *map = NULL;
- btrfs_dec_nocow_writers(bg);
- if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
- ret = -EAGAIN;
- goto out;
- }
- space_reserved = true;
-
- em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
- orig_start, block_start,
- len, orig_block_len,
- ram_bytes, type);
- btrfs_dec_nocow_writers(bg);
- if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
- *map = em2;
- em = em2;
- }
-
- if (IS_ERR(em2)) {
- ret = PTR_ERR(em2);
- goto out;
- }
-
- dio_data->nocow_done = true;
- } else {
- /* Our caller expects us to free the input extent map. */
- free_extent_map(em);
- *map = NULL;
-
- if (nowait) {
- ret = -EAGAIN;
- goto out;
- }
-
- /*
- * If we could not allocate data space before locking the file
- * range and we can't do a NOCOW write, then we have to fail.
- */
- if (!dio_data->data_space_reserved) {
- ret = -ENOSPC;
- goto out;
- }
-
- /*
- * We have to COW and we have already reserved data space before,
- * so now we reserve only metadata.
- */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
- false);
- if (ret < 0)
- goto out;
- space_reserved = true;
-
- em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
- }
- *map = em;
- len = min(len, em->len - (start - em->start));
- if (len < prev_len)
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- prev_len - len, true);
- }
-
- /*
- * We have created our ordered extent, so we can now release our reservation
- * for an outstanding extent.
- */
- btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
-
- /*
- * Need to update the i_size under the extent lock so buffered
- * readers will get the updated i_size when we unlock.
- */
- if (start + len > i_size_read(inode))
- i_size_write(inode, start + len);
-out:
- if (ret && space_reserved) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), len);
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
- }
- *lenp = len;
- return ret;
-}
-
-static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
- loff_t length, unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
-{
- struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_map *em;
- struct extent_state *cached_state = NULL;
- struct btrfs_dio_data *dio_data = iter->private;
- u64 lockstart, lockend;
- const bool write = !!(flags & IOMAP_WRITE);
- int ret = 0;
- u64 len = length;
- const u64 data_alloc_len = length;
- bool unlock_extents = false;
-
- /*
- * We could potentially fault if we have a buffer > PAGE_SIZE, and if
- * we're NOWAIT we may submit a bio for a partial range and return
- * EIOCBQUEUED, which would result in an errant short read.
- *
- * The best way to handle this would be to allow for partial completions
- * of iocb's, so we could submit the partial bio, return and fault in
- * the rest of the pages, and then submit the io for the rest of the
- * range. However we don't have that currently, so simply return
- * -EAGAIN at this point so that the normal path is used.
- */
- if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
- return -EAGAIN;
-
- /*
- * Cap the size of reads to that usually seen in buffered I/O as we need
- * to allocate a contiguous array for the checksums.
- */
- if (!write)
- len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
-
- lockstart = start;
- lockend = start + len - 1;
-
- /*
- * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
- * enough if we've written compressed pages to this area, so we need to
- * flush the dirty pages again to make absolutely sure that any
- * outstanding dirty pages are on disk - the first flush only starts
- * compression on the data, while keeping the pages locked, so by the
- * time the second flush returns we know bios for the compressed pages
- * were submitted and finished, and the pages no longer under writeback.
- *
- * If we have a NOWAIT request and we have any pages in the range that
- * are locked, likely due to compression still in progress, we don't want
- * to block on page locks. We also don't want to block on pages marked as
- * dirty or under writeback (same as for the non-compression case).
- * iomap_dio_rw() did the same check, but after that and before we got
- * here, mmap'ed writes may have happened or buffered reads started
- * (readpage() and readahead(), which lock pages), as we haven't locked
- * the file range yet.
- */
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags)) {
- if (flags & IOMAP_NOWAIT) {
- if (filemap_range_needs_writeback(inode->i_mapping,
- lockstart, lockend))
- return -EAGAIN;
- } else {
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- start + length - 1);
- if (ret)
- return ret;
- }
- }
-
- memset(dio_data, 0, sizeof(*dio_data));
-
- /*
- * We always try to allocate data space and must do it before locking
- * the file range, to avoid deadlocks with concurrent writes to the same
- * range if the range has several extents and the writes don't expand the
- * current i_size (the inode lock is taken in shared mode). If we fail to
- * allocate data space here we continue and later, after locking the
- * file range, we fail with ENOSPC only if we figure out we can not do a
- * NOCOW write.
- */
- if (write && !(flags & IOMAP_NOWAIT)) {
- ret = btrfs_check_data_free_space(BTRFS_I(inode),
- &dio_data->data_reserved,
- start, data_alloc_len, false);
- if (!ret)
- dio_data->data_space_reserved = true;
- else if (ret && !(BTRFS_I(inode)->flags &
- (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
- goto err;
- }
-
- /*
- * If this errors out it's because we couldn't invalidate pagecache for
- * this range and we need to fallback to buffered IO, or we are doing a
- * NOWAIT read/write and we need to block.
- */
- ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
- if (ret < 0)
- goto err;
-
- em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto unlock_err;
- }
-
- /*
- * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
- * io. INLINE is special, and we could probably kludge it in here, but
- * it's still buffered so for safety lets just fall back to the generic
- * buffered path.
- *
- * For COMPRESSED we _have_ to read the entire extent in so we can
- * decompress it, so there will be buffering required no matter what we
- * do, so go ahead and fallback to buffered.
- *
- * We return -ENOTBLK because that's what makes DIO go ahead and go back
- * to buffered IO. Don't blame me, this is the price we pay for using
- * the generic code.
- */
- if (extent_map_is_compressed(em) ||
- em->block_start == EXTENT_MAP_INLINE) {
- free_extent_map(em);
- /*
- * If we are in a NOWAIT context, return -EAGAIN in order to
- * fallback to buffered IO. This is not only because we can
- * block with buffered IO (no support for NOWAIT semantics at
- * the moment) but also to avoid returning short reads to user
- * space - this happens if we were able to read some data from
- * previous non-compressed extents and then when we fallback to
- * buffered IO, at btrfs_file_read_iter() by calling
- * filemap_read(), we fail to fault in pages for the read buffer,
- * in which case filemap_read() returns a short read (the number
- * of bytes previously read is > 0, so it does not return -EFAULT).
- */
- ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
- goto unlock_err;
- }
-
- len = min(len, em->len - (start - em->start));
-
- /*
- * If we have a NOWAIT request and the range contains multiple extents
- * (or a mix of extents and holes), then we return -EAGAIN to make the
- * caller fallback to a context where it can do a blocking (without
- * NOWAIT) request. This way we avoid doing partial IO and returning
- * success to the caller, which is not optimal for writes and for reads
- * it can result in unexpected behaviour for an application.
- *
- * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
- * iomap_dio_rw(), we can end up returning less data then what the caller
- * asked for, resulting in an unexpected, and incorrect, short read.
- * That is, the caller asked to read N bytes and we return less than that,
- * which is wrong unless we are crossing EOF. This happens if we get a
- * page fault error when trying to fault in pages for the buffer that is
- * associated to the struct iov_iter passed to iomap_dio_rw(), and we
- * have previously submitted bios for other extents in the range, in
- * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
- * those bios have completed by the time we get the page fault error,
- * which we return back to our caller - we should only return EIOCBQUEUED
- * after we have submitted bios for all the extents in the range.
- */
- if ((flags & IOMAP_NOWAIT) && len < length) {
- free_extent_map(em);
- ret = -EAGAIN;
- goto unlock_err;
- }
-
- if (write) {
- ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
- start, &len, flags);
- if (ret < 0)
- goto unlock_err;
- unlock_extents = true;
- /* Recalc len in case the new em is smaller than requested */
- len = min(len, em->len - (start - em->start));
- if (dio_data->data_space_reserved) {
- u64 release_offset;
- u64 release_len = 0;
-
- if (dio_data->nocow_done) {
- release_offset = start;
- release_len = data_alloc_len;
- } else if (len < data_alloc_len) {
- release_offset = start + len;
- release_len = data_alloc_len - len;
- }
-
- if (release_len > 0)
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- dio_data->data_reserved,
- release_offset,
- release_len);
- }
- } else {
- /*
- * We need to unlock only the end area that we aren't using.
- * The rest is going to be unlocked by the endio routine.
- */
- lockstart = start + len;
- if (lockstart < lockend)
- unlock_extents = true;
- }
-
- if (unlock_extents)
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
- else
- free_extent_state(cached_state);
-
- /*
- * Translate extent map information to iomap.
- * We trim the extents (and move the addr) even though iomap code does
- * that, since we have locked only the parts we are performing I/O in.
- */
- if ((em->block_start == EXTENT_MAP_HOLE) ||
- ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
- iomap->addr = IOMAP_NULL_ADDR;
- iomap->type = IOMAP_HOLE;
- } else {
- iomap->addr = em->block_start + (start - em->start);
- iomap->type = IOMAP_MAPPED;
- }
- iomap->offset = start;
- iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
- iomap->length = len;
- free_extent_map(em);
-
- return 0;
-
-unlock_err:
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
-err:
- if (dio_data->data_space_reserved) {
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- dio_data->data_reserved,
- start, data_alloc_len);
- extent_changeset_free(dio_data->data_reserved);
- }
-
- return ret;
-}
-
-static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
- ssize_t written, unsigned int flags, struct iomap *iomap)
-{
- struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
- struct btrfs_dio_data *dio_data = iter->private;
- size_t submitted = dio_data->submitted;
- const bool write = !!(flags & IOMAP_WRITE);
- int ret = 0;
-
- if (!write && (iomap->type == IOMAP_HOLE)) {
- /* If reading from a hole, unlock and return */
- unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
- NULL);
- return 0;
- }
-
- if (submitted < length) {
- pos += submitted;
- length -= submitted;
- if (write)
- btrfs_finish_ordered_extent(dio_data->ordered, NULL,
- pos, length, false);
- else
- unlock_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
- ret = -ENOTBLK;
- }
- if (write) {
- btrfs_put_ordered_extent(dio_data->ordered);
- dio_data->ordered = NULL;
- }
-
- if (write)
- extent_changeset_free(dio_data->data_reserved);
- return ret;
-}
-
-static void btrfs_dio_end_io(struct btrfs_bio *bbio)
-{
- struct btrfs_dio_private *dip =
- container_of(bbio, struct btrfs_dio_private, bbio);
- struct btrfs_inode *inode = bbio->inode;
- struct bio *bio = &bbio->bio;
-
- if (bio->bi_status) {
- btrfs_warn(inode->root->fs_info,
- "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
- btrfs_ino(inode), bio->bi_opf,
- dip->file_offset, dip->bytes, bio->bi_status);
- }
-
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- btrfs_finish_ordered_extent(bbio->ordered, NULL,
- dip->file_offset, dip->bytes,
- !bio->bi_status);
- } else {
- unlock_extent(&inode->io_tree, dip->file_offset,
- dip->file_offset + dip->bytes - 1, NULL);
- }
-
- bbio->bio.bi_private = bbio->private;
- iomap_dio_bio_end_io(bio);
-}
-
-static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
- loff_t file_offset)
-{
- struct btrfs_bio *bbio = btrfs_bio(bio);
- struct btrfs_dio_private *dip =
- container_of(bbio, struct btrfs_dio_private, bbio);
- struct btrfs_dio_data *dio_data = iter->private;
-
- btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
- btrfs_dio_end_io, bio->bi_private);
- bbio->inode = BTRFS_I(iter->inode);
- bbio->file_offset = file_offset;
-
- dip->file_offset = file_offset;
- dip->bytes = bio->bi_iter.bi_size;
-
- dio_data->submitted += bio->bi_iter.bi_size;
-
- /*
- * Check if we are doing a partial write. If we are, we need to split
- * the ordered extent to match the submitted bio. Hang on to the
- * remaining unfinishable ordered_extent in dio_data so that it can be
- * cancelled in iomap_end to avoid a deadlock wherein faulting the
- * remaining pages is blocked on the outstanding ordered extent.
- */
- if (iter->flags & IOMAP_WRITE) {
- int ret;
-
- ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
- if (ret) {
- btrfs_finish_ordered_extent(dio_data->ordered, NULL,
- file_offset, dip->bytes,
- !ret);
- bio->bi_status = errno_to_blk_status(ret);
- iomap_dio_bio_end_io(bio);
- return;
- }
- }
-
- btrfs_submit_bio(bbio, 0);
-}
-
-static const struct iomap_ops btrfs_dio_iomap_ops = {
- .iomap_begin = btrfs_dio_iomap_begin,
- .iomap_end = btrfs_dio_iomap_end,
-};
-
-static const struct iomap_dio_ops btrfs_dio_ops = {
- .submit_io = btrfs_dio_submit_io,
- .bio_set = &btrfs_dio_bioset,
-};
-
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
-{
- struct btrfs_dio_data data = { 0 };
-
- return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before)
-{
- struct btrfs_dio_data data = { 0 };
-
- return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
-{
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
- int ret;
-
- ret = fiemap_prep(inode, fieinfo, start, &len, 0);
- if (ret)
- return ret;
-
- /*
- * fiemap_prep() called filemap_write_and_wait() for the whole possible
- * file range (0 to LLONG_MAX), but that is not enough if we have
- * compression enabled. The first filemap_fdatawrite_range() only kicks
- * in the compression of data (in an async thread) and will return
- * before the compression is done and writeback is started. A second
- * filemap_fdatawrite_range() is needed to wait for the compression to
- * complete and writeback to start. We also need to wait for ordered
- * extents to complete, because our fiemap implementation uses mainly
- * file extent items to list the extents, searching for extent maps
- * only for file ranges with holes or prealloc extents to figure out
- * if we have delalloc in those ranges.
- */
- if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
- ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
- if (ret)
- return ret;
- }
-
- btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
- /*
- * We did an initial flush to avoid holding the inode's lock while
- * triggering writeback and waiting for the completion of IO and ordered
- * extents. Now after we locked the inode we do it again, because it's
- * possible a new write may have happened in between those two steps.
- */
- if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
- ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
- if (ret) {
- btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
- return ret;
- }
- }
-
- ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
- btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
- return ret;
-}
-
/*
* For release_folio() and invalidate_folio() we have a race window where
* folio_end_writeback() is called but the subpage spinlock is not yet released.
@@ -8198,7 +7420,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
if (!skip_writeback) {
- ret = btrfs_wait_ordered_range(&inode->vfs_inode,
+ ret = btrfs_wait_ordered_range(inode,
inode->vfs_inode.i_size & (~mask),
(u64)-1);
if (ret)
@@ -8399,20 +7621,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
- struct extent_io_tree *file_extent_tree = NULL;
-
- /* Self tests may pass a NULL fs_info. */
- if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
- file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
- if (!file_extent_tree)
- return NULL;
- }
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
- if (!ei) {
- kfree(file_extent_tree);
+ if (!ei)
return NULL;
- }
ei->root = NULL;
ei->generation = 0;
@@ -8425,8 +7637,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->disk_i_size = 0;
ei->flags = 0;
ei->ro_flags = 0;
+ /*
+ * ->index_cnt will be properly initialized later when creating a new
+ * inode (btrfs_create_new_inode()) or when reading an existing inode
+ * from disk (btrfs_read_locked_inode()).
+ */
ei->csum_bytes = 0;
- ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_reflink_trans = 0;
@@ -8453,20 +7669,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
- ei->file_extent_tree = file_extent_tree;
- if (file_extent_tree) {
- extent_io_tree_init(fs_info, ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT);
- /* Lockdep class is set only for the file extent tree. */
- lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
- }
+ ei->file_extent_tree = NULL;
+
mutex_init(&ei->log_mutex);
spin_lock_init(&ei->ordered_tree_lock);
ei->ordered_tree = RB_ROOT;
ei->ordered_tree_last = NULL;
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
- RB_CLEAR_NODE(&ei->rb_node);
init_rwsem(&ei->i_mmap_lock);
return inode;
@@ -8502,9 +7712,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
if (!S_ISDIR(vfs_inode->i_mode)) {
WARN_ON(inode->delalloc_bytes);
WARN_ON(inode->new_delalloc_bytes);
+ WARN_ON(inode->csum_bytes);
}
- WARN_ON(inode->csum_bytes);
- WARN_ON(inode->defrag_bytes);
+ if (!root || !btrfs_is_data_reloc_root(root))
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8538,7 +7749,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
}
}
btrfs_qgroup_check_reserved_leak(inode);
- inode_tree_del(inode);
+ btrfs_del_inode_from_root(inode);
btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
btrfs_put_root(inode->root);
@@ -8572,7 +7783,6 @@ void __cold btrfs_destroy_cachep(void)
* destroy cache.
*/
rcu_barrier();
- bioset_exit(&btrfs_dio_bioset);
kmem_cache_destroy(btrfs_inode_cachep);
}
@@ -8583,17 +7793,9 @@ int __init btrfs_init_cachep(void)
SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
init_once);
if (!btrfs_inode_cachep)
- goto fail;
-
- if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_dio_private, bbio.bio),
- BIOSET_NEED_BVECS))
- goto fail;
+ return -ENOMEM;
return 0;
-fail:
- btrfs_destroy_cachep();
- return -ENOMEM;
}
static int btrfs_getattr(struct mnt_idmap *idmap,
@@ -9586,11 +8788,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
em->start = cur_offset;
- em->orig_start = cur_offset;
em->len = ins.offset;
- em->block_start = ins.objectid;
- em->block_len = ins.offset;
- em->orig_block_len = ins.offset;
+ em->disk_bytenr = ins.objectid;
+ em->offset = 0;
+ em->disk_num_bytes = ins.offset;
em->ram_bytes = ins.offset;
em->flags |= EXTENT_FLAG_PREALLOC;
em->generation = trans->transid;
@@ -9956,7 +9157,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+ ret = btrfs_alloc_page_array(nr_pages, pages, false);
if (ret) {
ret = -ENOMEM;
goto out;
@@ -10033,7 +9234,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+ ret = btrfs_wait_ordered_range(inode, start,
lockend - start + 1);
if (ret)
goto out_unlock_inode;
@@ -10053,7 +9254,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_unlock_extent;
}
- if (em->block_start == EXTENT_MAP_INLINE) {
+ if (em->disk_bytenr == EXTENT_MAP_INLINE) {
u64 extent_start = em->start;
/*
@@ -10074,33 +9275,33 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
*/
encoded->len = min_t(u64, extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
- if (em->block_start == EXTENT_MAP_HOLE ||
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
} else if (extent_map_is_compressed(em)) {
- disk_bytenr = em->block_start;
+ disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
* compressed extent.
*/
- if (em->block_len > count) {
+ if (em->disk_num_bytes > count) {
ret = -ENOBUFS;
goto out_em;
}
- disk_io_size = em->block_len;
- count = em->block_len;
+ disk_io_size = em->disk_num_bytes;
+ count = em->disk_num_bytes;
encoded->unencoded_len = em->ram_bytes;
- encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+ encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
ret = btrfs_encoded_io_compression_from_extent(fs_info,
extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
} else {
- disk_bytenr = em->block_start + (start - em->start);
+ disk_bytenr = extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
@@ -10155,6 +9356,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
struct extent_changeset *data_reserved = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
int compression;
size_t orig_count;
u64 start, end;
@@ -10276,7 +9478,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
for (;;) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+ ret = btrfs_wait_ordered_range(inode, start, num_bytes);
if (ret)
goto out_folios;
ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
@@ -10330,22 +9532,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
goto out_delalloc_release;
extent_reserved = true;
- em = create_io_em(inode, start, num_bytes,
- start - encoded->unencoded_offset, ins.objectid,
- ins.offset, ins.offset, ram_bytes, compression,
- BTRFS_ORDERED_COMPRESSED);
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = num_bytes;
+ file_extent.ram_bytes = ram_bytes;
+ file_extent.offset = encoded->unencoded_offset;
+ file_extent.compression = compression;
+ em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_free_reserved;
}
free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
- ins.objectid, ins.offset,
- encoded->unencoded_offset,
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
(1 << BTRFS_ORDERED_ENCODED) |
- (1 << BTRFS_ORDERED_COMPRESSED),
- compression);
+ (1 << BTRFS_ORDERED_COMPRESSED));
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -10549,7 +9751,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
* file changes again after this, the user is doing something stupid and
* we don't really care.
*/
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
return ret;
@@ -10635,12 +9837,12 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
- if (em->block_start == EXTENT_MAP_HOLE) {
+ if (em->disk_bytenr == EXTENT_MAP_HOLE) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL;
goto out;
}
- if (em->block_start == EXTENT_MAP_INLINE) {
+ if (em->disk_bytenr == EXTENT_MAP_INLINE) {
/*
* It's unlikely we'll ever actually find ourselves
* here, as a file small enough to fit inline won't be
@@ -10658,12 +9860,12 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
- logical_block_start = em->block_start + (start - em->start);
+ logical_block_start = extent_map_block_start(em) + (start - em->start);
len = min(len, em->len - (start - em->start));
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
+ ret = can_nocow_extent(inode, start, &len, NULL, false, true);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -10860,52 +10062,23 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
*/
struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
{
- struct rb_node *node;
- struct rb_node *prev;
struct btrfs_inode *inode;
+ unsigned long from = min_ino;
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- inode = rb_entry(node, struct btrfs_inode, rb_node);
- if (min_ino < btrfs_ino(inode))
- node = node->rb_left;
- else if (min_ino > btrfs_ino(inode))
- node = node->rb_right;
- else
+ xa_lock(&root->inodes);
+ while (true) {
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+ if (igrab(&inode->vfs_inode))
break;
- }
-
- if (!node) {
- while (prev) {
- inode = rb_entry(prev, struct btrfs_inode, rb_node);
- if (min_ino <= btrfs_ino(inode)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
-
- while (node) {
- inode = rb_entry(prev, struct btrfs_inode, rb_node);
- if (igrab(&inode->vfs_inode)) {
- spin_unlock(&root->inode_lock);
- return inode;
- }
-
- min_ino = btrfs_ino(inode) + 1;
- if (cond_resched_lock(&root->inode_lock))
- goto again;
- node = rb_next(node);
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inodes.xa_lock);
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->inodes);
- return NULL;
+ return inode;
}
static const struct inode_operations btrfs_dir_inode_operations = {
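
The rewritten btrfs_find_first_inode() above is a generic xarray walk: find the next present entry at or after a starting index, try to pin it under the lock, and otherwise advance past it and continue. Stripped of the btrfs specifics, a sketch of that loop could look as follows; the reference-taking helper is an assumption standing in for igrab():

#include <linux/xarray.h>

/* Assumed stand-in for igrab(): returns true if a reference was taken. */
bool take_ref(void *entry);

static void *find_first_from(struct xarray *xa, unsigned long min_index)
{
	unsigned long from = min_index;
	void *entry;

	xa_lock(xa);
	while ((entry = xa_find(xa, &from, ULONG_MAX, XA_PRESENT))) {
		/* Entry found at 'from'; keep it only if we can pin it. */
		if (take_ref(entry))
			break;
		/* Entry is going away, resume the search after it. */
		from++;
	}
	xa_unlock(xa);

	return entry;
}
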
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6ad524b894fc..e0a664b8a46a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -375,15 +375,15 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
return PTR_ERR(trans);
if (comp) {
- ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
- strlen(comp), 0);
+ ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
+ comp, strlen(comp), 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
- ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
- 0, 0);
+ ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
+ NULL, 0, 0);
if (ret && ret != -ENODATA) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -552,7 +552,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
return 0;
}
-int __pure btrfs_is_empty_uuid(u8 *uuid)
+int __pure btrfs_is_empty_uuid(const u8 *uuid)
{
int i;
@@ -658,15 +658,10 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
ret = PTR_ERR(trans);
goto out_release_rsv;
}
- ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
- if (ret)
- goto out;
btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
- /* Tree log can't currently deal with an inode which is a new root. */
- btrfs_set_log_full_commit(trans);
ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit);
if (ret)
@@ -719,6 +714,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
if (ret) {
+ int ret2;
+
/*
* Since we don't abort the transaction in this case, free the
* tree block so that we don't leak space and leave the
@@ -729,7 +726,9 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_tree_lock(leaf);
btrfs_clear_buffer_dirty(trans, leaf);
btrfs_tree_unlock(leaf);
- btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
+ ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
+ if (ret2 < 0)
+ btrfs_abort_transaction(trans, ret2);
free_extent_buffer(leaf);
goto out;
}
@@ -767,6 +766,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
goto out;
}
+ btrfs_record_new_subvolume(trans, BTRFS_I(dir));
+
d_instantiate_new(dentry, new_inode_args.inode);
new_inode_args.inode = NULL;
@@ -854,7 +855,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
- pending_snapshot->dir = dir;
+ pending_snapshot->dir = BTRFS_I(dir);
pending_snapshot->inherit = inherit;
trans = btrfs_start_transaction(root, 0);
@@ -1070,7 +1071,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
atomic_inc(&root->snapshot_force_cow);
snapshot_force_cow = true;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
ret = btrfs_mksubvol(parent, idmap, name, namelen,
root, readonly, inherit);
@@ -1917,8 +1918,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
struct btrfs_ioctl_ino_lookup_user_args *args)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- struct super_block *sb = inode->i_sb;
- struct btrfs_key upper_limit = BTRFS_I(inode)->location;
+ u64 upper_limit = btrfs_ino(BTRFS_I(inode));
u64 treeid = btrfs_root_id(BTRFS_I(inode)->root);
u64 dirid = args->dirid;
unsigned long item_off;
@@ -1944,7 +1944,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
* If the bottom subvolume does not exist directly under upper_limit,
* construct the path in from the bottom up.
*/
- if (dirid != upper_limit.objectid) {
+ if (dirid != upper_limit) {
ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
root = btrfs_get_fs_root(fs_info, treeid, true);
@@ -2006,7 +2006,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
* btree and lock the same leaf.
*/
btrfs_release_path(path);
- temp_inode = btrfs_iget(sb, key2.objectid, root);
+ temp_inode = btrfs_iget(key2.objectid, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
goto out_put;
@@ -2019,7 +2019,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
goto out_put;
}
- if (key.offset == upper_limit.objectid)
+ if (key.offset == upper_limit)
break;
if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
ret = -EACCES;
@@ -2140,7 +2140,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
inode = file_inode(file);
if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
- BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
/*
* The subvolume does not exist under fd with which this is
* called
@@ -3807,12 +3807,29 @@ drop_write:
return ret;
}
+/*
+ * Quick check for ioctl handlers if quotas are enabled. Proper locking must be
+ * done before any operations.
+ */
+static bool qgroup_enabled(struct btrfs_fs_info *fs_info)
+{
+ bool ret = true;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root)
+ ret = false;
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ return ret;
+}
+
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_qgroup_assign_args *sa;
+ struct btrfs_qgroup_list *prealloc = NULL;
struct btrfs_trans_handle *trans;
int ret;
int err;
@@ -3820,6 +3837,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(root->fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -3830,14 +3850,27 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
goto drop_write;
}
+ if (sa->assign) {
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ goto drop_write;
+ }
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
+ /*
+ * Prealloc ownership is moved to the relation handler, where it's used
+ * or freed on error.
+ */
if (sa->assign) {
- ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst);
+ ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst, prealloc);
+ prealloc = NULL;
} else {
ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst);
}
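
The qgroup assign path above uses a common preallocation shape: allocate the relation node with GFP_KERNEL before joining the transaction, hand ownership to btrfs_add_qgroup_relation(), and rely on a single kfree() at the exit label that only matters when the node was never consumed. A minimal sketch of that shape, with assumed setup and consumer functions, might be:

#include <linux/slab.h>

/* Assumed step that may fail before the consumer is reached. */
int do_setup(void);
/* Assumed consumer: takes ownership of @node and uses or frees it itself. */
int consume(void *node);

static int assign_with_prealloc(size_t node_size)
{
	void *prealloc;
	int ret;

	/* Allocate up front, while blocking allocations are still safe. */
	prealloc = kzalloc(node_size, GFP_KERNEL);
	if (!prealloc)
		return -ENOMEM;

	ret = do_setup();
	if (ret)
		goto out;

	/* Ownership moves to the consumer; forget the local pointer. */
	ret = consume(prealloc);
	prealloc = NULL;
out:
	/* Frees the node only if it was never handed over; kfree(NULL) is a no-op. */
	kfree(prealloc);
	return ret;
}
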
@@ -3847,13 +3880,15 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
err = btrfs_run_qgroups(trans);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (err < 0)
- btrfs_handle_fs_error(fs_info, err,
- "failed to update qgroup status and info");
+ btrfs_warn(fs_info,
+ "qgroup status update failed after %s relation, marked as inconsistent",
+ sa->assign ? "adding" : "deleting");
err = btrfs_end_transaction(trans);
if (err && !ret)
ret = err;
out:
+ kfree(prealloc);
kfree(sa);
drop_write:
mnt_drop_write_file(file);
@@ -3872,6 +3907,9 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(root->fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -3928,6 +3966,9 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(root->fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -3973,6 +4014,9 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -4429,7 +4473,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -4751,10 +4795,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(inode, argp, false);
+ return _btrfs_ioctl_send(BTRFS_I(inode), argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(inode, argp, true);
+ return _btrfs_ioctl_send(BTRFS_I(inode), argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
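To make the new ioctl-side pattern concrete, here is a minimal sketch of a caller that combines the qgroup_enabled() quick check with the preallocation handed over to btrfs_add_qgroup_relation(). The function name and placement are hypothetical (it assumes the same file scope as qgroup_enabled() above); it is an illustration, not the actual ioctl code.

/* Sketch: assign a qgroup relation using the caller-side preallocation pattern. */
static int example_assign_relation(struct btrfs_root *root, u64 src, u64 dst)
{
	struct btrfs_qgroup_list *prealloc;
	struct btrfs_trans_handle *trans;
	int ret;
	int err;

	/* Cheap early check; the authoritative check happens later under locks. */
	if (!qgroup_enabled(root->fs_info))
		return -ENOTCONN;

	/* Allocate outside the transaction so we never sleep for memory inside it. */
	prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
	if (!prealloc)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		kfree(prealloc);
		return PTR_ERR(trans);
	}

	/* Ownership of @prealloc moves into the call: it is linked or freed there. */
	ret = btrfs_add_qgroup_relation(trans, src, dst, prealloc);
	prealloc = NULL;

	err = btrfs_end_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}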
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 2c5dc25ec670..19cd26b0244a 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -19,7 +19,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
-int __pure btrfs_is_empty_uuid(u8 *uuid);
+int __pure btrfs_is_empty_uuid(const u8 *uuid);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 1bc8e6738879..3c15c75e0582 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -11,7 +11,6 @@
#include <linux/lockdep.h>
#include <linux/percpu_counter.h>
#include "extent_io.h"
-#include "locking.h"
struct extent_buffer;
struct btrfs_path;
diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h
index e32906ab6faa..07f1bb1c6aa3 100644
--- a/fs/btrfs/lru_cache.h
+++ b/fs/btrfs/lru_cache.h
@@ -6,7 +6,6 @@
#include <linux/types.h>
#include <linux/maple_tree.h>
#include <linux/list.h>
-#include "lru_cache.h"
/*
* A cache entry. This is meant to be embedded in a structure of a user of
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 1c396ac167aa..1e2a68b8f62d 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -258,8 +258,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->cbuf, &out_len,
workspace->mem);
kunmap_local(data_in);
- if (ret < 0) {
- pr_debug("BTRFS: lzo in loop returned %d\n", ret);
+ if (unlikely(ret < 0)) {
+ /* lzo1x_1_compress never fails. */
ret = -EIO;
goto out;
}
@@ -354,11 +354,14 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
* and all sectors should be used.
* If this happens, it means the compressed extent is corrupted.
*/
- if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
- round_up(len_in, sectorsize) < cb->compressed_len) {
+ if (unlikely(len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
+ round_up(len_in, sectorsize) < cb->compressed_len)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
btrfs_err(fs_info,
- "invalid lzo header, lzo len %u compressed len %u",
- len_in, cb->compressed_len);
+"lzo header invalid, root %llu inode %llu offset %llu lzo len %u compressed len %u",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ cb->start, len_in, cb->compressed_len);
return -EUCLEAN;
}
@@ -383,13 +386,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
kunmap_local(kaddr);
cur_in += LZO_LEN;
- if (seg_len > WORKSPACE_CBUF_LENGTH) {
+ if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
/*
* seg_len shouldn't be larger than we have allocated
* for workspace->cbuf
*/
- btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
- seg_len);
+ btrfs_err(fs_info,
+ "lzo segment too big, root %llu inode %llu offset %llu len %u",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ cb->start, seg_len);
return -EIO;
}
@@ -399,8 +406,13 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Decompress the data */
ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
workspace->buf, &out_len);
- if (ret != LZO_E_OK) {
- btrfs_err(fs_info, "failed to decompress");
+ if (unlikely(ret != LZO_E_OK)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
+ btrfs_err(fs_info,
+ "lzo decompression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ cb->start);
return -EIO;
}
@@ -454,8 +466,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
- if (ret != LZO_E_OK) {
- pr_warn("BTRFS: decompress failed!\n");
+ if (unlikely(ret != LZO_E_OK)) {
+ struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+
+ btrfs_err(fs_info,
+ "lzo decompression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ page_offset(dest_page));
ret = -EIO;
goto out;
}
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 210d9c82e2ae..77752eec125d 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -20,7 +20,8 @@ static const char fs_state_chars[] = {
[BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
[BTRFS_FS_STATE_DEV_REPLACING] = 'R',
[BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
- [BTRFS_FS_STATE_NO_CSUMS] = 'C',
+ [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C',
+ [BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S',
[BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
};
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index dde4904aead9..0d599fd847c9 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -66,7 +66,7 @@ struct rb_simple_node {
u64 bytenr;
};
-static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
+static inline struct rb_node *rb_simple_search(const struct rb_root *root, u64 bytenr)
{
struct rb_node *node = root->rb_node;
struct rb_simple_node *entry;
@@ -93,7 +93,7 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
* Return the rb_node that starts at or after @bytenr. If there is no entry at
* or after @bytenr return NULL.
*/
-static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
+static inline struct rb_node *rb_simple_search_first(const struct rb_root *root,
u64 bytenr)
{
struct rb_node *node = root->rb_node, *ret = NULL;
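As a small usage note for the lookup helpers constified above, the sketch below shows the intended semantics of rb_simple_search_first(): return the entry that starts at or after a given bytenr. The wrapper name is made up, and it assumes the embedded rb_node member of struct rb_simple_node is called rb_node.

/* Sketch: return the rb_simple_node at or after @bytenr, or NULL if none. */
static struct rb_simple_node *first_simple_node_from(const struct rb_root *root,
						     u64 bytenr)
{
	struct rb_node *node = rb_simple_search_first(root, bytenr);

	return node ? rb_entry(node, struct rb_simple_node, rb_node) : NULL;
}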
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 35a413ce935d..82a68394a89c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -19,6 +19,7 @@
#include "qgroup.h"
#include "subpage.h"
#include "file.h"
+#include "block-group.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -179,7 +180,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
entry->disk_num_bytes = disk_num_bytes;
entry->offset = offset;
entry->bytes_left = num_bytes;
- entry->inode = igrab(&inode->vfs_inode);
+ entry->inode = BTRFS_I(igrab(&inode->vfs_inode));
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = qgroup_rsv;
@@ -207,7 +208,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
- struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ struct btrfs_inode *inode = entry->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
@@ -223,7 +224,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
spin_lock_irq(&inode->ordered_tree_lock);
node = tree_insert(&inode->ordered_tree, entry->file_offset,
&entry->rb_node);
- if (node)
+ if (unlikely(node))
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
entry->file_offset);
@@ -263,17 +264,39 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
*/
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned long flags,
- int compress_type)
+ const struct btrfs_file_extent *file_extent, unsigned long flags)
{
struct btrfs_ordered_extent *entry;
ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
- entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes,
- disk_bytenr, disk_num_bytes, offset, flags,
- compress_type);
+ /*
+ * For regular writes, we just use the members in @file_extent.
+ *
+ * For NOCOW, we don't really care about the numbers except @start and
+ * file_extent->num_bytes, as we won't insert a file extent item at all.
+ *
+ * For PREALLOC, we do not use ordered extent members, but
+ * btrfs_mark_extent_written() handles everything.
+ *
+ * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents,
+ * or btrfs_split_ordered_extent() cannot handle it correctly.
+ */
+ if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)))
+ entry = alloc_ordered_extent(inode, file_offset,
+ file_extent->num_bytes,
+ file_extent->num_bytes,
+ file_extent->disk_bytenr + file_extent->offset,
+ file_extent->num_bytes, 0, flags,
+ file_extent->compression);
+ else
+ entry = alloc_ordered_extent(inode, file_offset,
+ file_extent->num_bytes,
+ file_extent->ram_bytes,
+ file_extent->disk_bytenr,
+ file_extent->disk_num_bytes,
+ file_extent->offset, flags,
+ file_extent->compression);
if (!IS_ERR(entry))
insert_ordered_extent(entry);
return entry;
@@ -287,7 +310,7 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum)
{
- struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ struct btrfs_inode *inode = entry->inode;
spin_lock_irq(&inode->ordered_tree_lock);
list_add_tail(&sum->list, &entry->list);
@@ -297,7 +320,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
{
if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
- mapping_set_error(ordered->inode->i_mapping, -EIO);
+ mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO);
}
static void finish_ordered_fn(struct btrfs_work *work)
@@ -312,7 +335,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct page *page, u64 file_offset,
u64 len, bool uptodate)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
lockdep_assert_held(&inode->ordered_tree_lock);
@@ -365,7 +388,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
fs_info->endio_freespace_worker : fs_info->endio_write_workers;
@@ -374,11 +397,11 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
btrfs_queue_work(wq, &ordered->work);
}
-bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct page *page, u64 file_offset, u64 len,
bool uptodate)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
unsigned long flags;
bool ret;
@@ -421,7 +444,6 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
if (ret)
btrfs_queue_ordered_fn(ordered);
- return ret;
}
/*
@@ -588,14 +610,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
struct list_head *cur;
struct btrfs_ordered_sum *sum;
- trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
+ trace_btrfs_ordered_extent_put(entry->inode, entry);
if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
- btrfs_add_delayed_iput(BTRFS_I(entry->inode));
+ btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -626,7 +648,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
- /* This is paired with btrfs_alloc_ordered_extent. */
+ /* This is paired with alloc_ordered_extent(). */
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
@@ -712,11 +734,11 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
}
/*
- * wait for all the ordered extents in a root. This is done when balancing
- * space between drives.
+ * Wait for all the ordered extents in a root. Use @bg as the range to wait
+ * for, or the whole range if @bg is NULL.
*/
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
- const u64 range_start, const u64 range_len)
+ const struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(splice);
@@ -724,7 +746,17 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
LIST_HEAD(works);
struct btrfs_ordered_extent *ordered, *next;
u64 count = 0;
- const u64 range_end = range_start + range_len;
+ u64 range_start, range_len;
+ u64 range_end;
+
+ if (bg) {
+ range_start = bg->start;
+ range_len = bg->length;
+ } else {
+ range_start = 0;
+ range_len = U64_MAX;
+ }
+ range_end = range_start + range_len;
mutex_lock(&root->ordered_extent_mutex);
spin_lock(&root->ordered_extent_lock);
@@ -751,10 +783,10 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
cond_resched();
- spin_lock(&root->ordered_extent_lock);
if (nr != U64_MAX)
nr--;
count++;
+ spin_lock(&root->ordered_extent_lock);
}
list_splice_tail(&skipped, &root->ordered_extents);
list_splice_tail(&splice, &root->ordered_extents);
@@ -771,8 +803,12 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
return count;
}
+/*
+ * Wait for @nr ordered extents that intersect @bg, or the whole range of
+ * the filesystem if @bg is NULL.
+ */
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
- const u64 range_start, const u64 range_len)
+ const struct btrfs_block_group *bg)
{
struct btrfs_root *root;
LIST_HEAD(splice);
@@ -790,14 +826,13 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
&fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
- done = btrfs_wait_ordered_extents(root, nr,
- range_start, range_len);
+ done = btrfs_wait_ordered_extents(root, nr, bg);
btrfs_put_root(root);
- spin_lock(&fs_info->ordered_root_lock);
- if (nr != U64_MAX) {
+ if (nr != U64_MAX)
nr -= done;
- }
+
+ spin_lock(&fs_info->ordered_root_lock);
}
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
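A short, hedged illustration of the two ways the reworked waiters are meant to be used now that they take a block group instead of a raw range; the wrapper function is hypothetical.

/* Sketch: wait for ordered extents, either within one block group or globally. */
static void example_wait_ordered(struct btrfs_fs_info *fs_info,
				 const struct btrfs_block_group *bg)
{
	/* Only ordered extents intersecting @bg (its start/length define the range). */
	btrfs_wait_ordered_roots(fs_info, U64_MAX, bg);

	/* Passing NULL means the whole range, i.e. all ordered extents of all roots. */
	btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
}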
@@ -814,7 +849,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
- struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ struct btrfs_inode *inode = entry->inode;
bool freespace_inode;
trace_btrfs_ordered_extent_start(inode, entry);
@@ -841,7 +876,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
/*
* Used to wait on ordered extents across a large range of bytes.
*/
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len)
{
int ret = 0;
int ret_wb = 0;
@@ -871,11 +906,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
* before the ordered extents complete - to avoid failures (-EEXIST)
* when adding the new ordered extents to the ordered tree.
*/
- ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end);
end = orig_end;
while (1) {
- ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
+ ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
@@ -1173,7 +1208,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
struct btrfs_ordered_extent *ordered, u64 len)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 file_offset = ordered->file_offset;
@@ -1212,15 +1247,32 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
/* One ref for the tree. */
refcount_inc(&new->refs);
+ /*
+ * Take the root's ordered_extent_lock to avoid a race with
+ * btrfs_wait_ordered_extents() when updating the disk_bytenr and
+ * disk_num_bytes fields of the ordered extent below. And we disable
+ * IRQs because the inode's ordered_tree_lock is used in IRQ context
+ * elsewhere.
+ *
+ * There's no concern about a previous caller of
+ * btrfs_wait_ordered_extents() getting the trimmed ordered extent
+ * before we insert the new one, because even if it gets the ordered
+ * extent before it's trimmed and the new one inserted, right before it
+ * uses it or during its use, the ordered extent might have been
+ * trimmed in the meanwhile, and it missed the new ordered extent.
+ * There's no way around this and it's harmless for current use cases,
+ * so we take the root's ordered_extent_lock to fix that race during
+ * trimming and silence tools like KCSAN.
+ */
spin_lock_irq(&root->ordered_extent_lock);
spin_lock(&inode->ordered_tree_lock);
- /* Remove from tree once */
- node = &ordered->rb_node;
- rb_erase(node, &inode->ordered_tree);
- RB_CLEAR_NODE(node);
- if (inode->ordered_tree_last == node)
- inode->ordered_tree_last = NULL;
+ /*
+ * We don't have overlapping ordered extents (that would imply double
+ * allocation of extents) and we checked above that the split length
+ * does not cross the ordered extent's num_bytes field, so there's
+ * no need to remove it and re-insert it in the tree.
+ */
ordered->file_offset += len;
ordered->disk_bytenr += len;
ordered->num_bytes -= len;
@@ -1250,18 +1302,10 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
offset += sum->len;
}
- /* Re-insert the node */
- node = tree_insert(&inode->ordered_tree, ordered->file_offset,
- &ordered->rb_node);
- if (node)
- btrfs_panic(fs_info, -EEXIST,
- "zoned: inconsistency in ordered tree at offset %llu",
- ordered->file_offset);
-
node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
- if (node)
+ if (unlikely(node))
btrfs_panic(fs_info, -EEXIST,
- "zoned: inconsistency in ordered tree at offset %llu",
+ "inconsistency in ordered tree at offset %llu after split",
new->file_offset);
spin_unlock(&inode->ordered_tree_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index b6f6c6b91732..51b9e81726e2 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -130,7 +130,7 @@ struct btrfs_ordered_extent {
refcount_t refs;
/* the inode we belong to */
- struct inode *inode;
+ struct btrfs_inode *inode;
/* list of checksums for insertion when the extent io is done */
struct list_head list;
@@ -162,7 +162,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct page *page, u64 file_offset, u64 len,
bool uptodate);
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
@@ -171,17 +171,28 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
+
+/*
+ * This represents details about the target file extent item of a write operation.
+ */
+struct btrfs_file_extent {
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 num_bytes;
+ u64 ram_bytes;
+ u64 offset;
+ u8 compression;
+};
+
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned long flags,
- int compress_type);
+ const struct btrfs_file_extent *file_extent, unsigned long flags);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry);
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
@@ -193,9 +204,9 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct list_head *list);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
- const u64 range_start, const u64 range_len);
+ const struct btrfs_block_group *bg);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
- const u64 range_start, const u64 range_len);
+ const struct btrfs_block_group *bg);
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
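To show how the collapsed parameter list of btrfs_alloc_ordered_extent() is intended to be used, here is a minimal sketch that fills a struct btrfs_file_extent for a plain COW write. The helper name and values are illustrative only; NOCOW/PREALLOC callers would pass the corresponding BTRFS_ORDERED_* flag instead, and the offset is then forced to 0 internally as the comment in ordered-data.c explains.

/* Sketch: allocate an ordered extent for a regular (COW, uncompressed) write. */
static struct btrfs_ordered_extent *example_alloc_ordered(struct btrfs_inode *inode,
							  u64 file_offset,
							  u64 disk_bytenr,
							  u64 num_bytes)
{
	struct btrfs_file_extent file_extent = {
		.disk_bytenr	= disk_bytenr,
		.disk_num_bytes	= num_bytes,
		.num_bytes	= num_bytes,
		.ram_bytes	= num_bytes,
		.offset		= 0,
		.compression	= BTRFS_COMPRESS_NONE,
	};

	return btrfs_alloc_ordered_extent(inode, file_offset, &file_extent,
					  1U << BTRFS_ORDERED_REGULAR);
}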
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 7e46aa8a0444..32dcea662da3 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -109,7 +109,7 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
btrfs_err(eb->fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL);
+ return;
}
ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
@@ -208,11 +208,6 @@ static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size,
struct btrfs_stripe_extent *stripe)
{
const int num_stripes = btrfs_num_raid_stripes(item_size);
- const u8 encoding = btrfs_stripe_extent_encoding(eb, stripe);
-
- pr_info("\t\t\tencoding: %s\n",
- (encoding && encoding < BTRFS_NR_RAID_TYPES) ?
- btrfs_raid_array[encoding].raid_name : "unknown");
for (int i = 0; i < num_stripes; i++)
pr_info("\t\t\tstride %d devid %llu physical %llu\n",
@@ -310,6 +305,9 @@ void btrfs_print_leaf(const struct extent_buffer *l)
case BTRFS_EXTENT_DATA_KEY:
fi = btrfs_item_ptr(l, i,
struct btrfs_file_extent_item);
+ pr_info("\t\tgeneration %llu type %hhu\n",
+ btrfs_file_extent_generation(l, fi),
+ btrfs_file_extent_type(l, fi));
if (btrfs_file_extent_type(l, fi) ==
BTRFS_FILE_EXTENT_INLINE) {
pr_info("\t\tinline extent data size %llu\n",
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 155570e20f45..b8fa34e16abb 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -27,7 +27,7 @@ struct prop_handler {
int (*validate)(const struct btrfs_inode *inode, const char *value,
size_t len);
int (*apply)(struct inode *inode, const char *value, size_t len);
- const char *(*extract)(struct inode *inode);
+ const char *(*extract)(const struct inode *inode);
bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -104,7 +104,7 @@ bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name)
return handler->ignore(inode);
}
-int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
const char *name, const char *value, size_t value_len,
int flags)
{
@@ -116,29 +116,29 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
return -EINVAL;
if (value_len == 0) {
- ret = btrfs_setxattr(trans, inode, handler->xattr_name,
+ ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name,
NULL, 0, flags);
if (ret)
return ret;
- ret = handler->apply(inode, NULL, 0);
+ ret = handler->apply(&inode->vfs_inode, NULL, 0);
ASSERT(ret == 0);
return ret;
}
- ret = btrfs_setxattr(trans, inode, handler->xattr_name, value,
+ ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, value,
value_len, flags);
if (ret)
return ret;
- ret = handler->apply(inode, value, value_len);
+ ret = handler->apply(&inode->vfs_inode, value, value_len);
if (ret) {
- btrfs_setxattr(trans, inode, handler->xattr_name, NULL,
+ btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL,
0, flags);
return ret;
}
- set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags);
return 0;
}
@@ -359,7 +359,7 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode)
return false;
}
-static const char *prop_compression_extract(struct inode *inode)
+static const char *prop_compression_extract(const struct inode *inode)
{
switch (BTRFS_I(inode)->prop_compress) {
case BTRFS_COMPRESS_ZLIB:
@@ -385,7 +385,7 @@ static struct prop_handler prop_handlers[] = {
};
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *parent)
+ struct inode *inode, const struct inode *parent)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index f60cd89feb29..63546d0a9444 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -15,7 +15,7 @@ struct btrfs_trans_handle;
int __init btrfs_props_init(void);
-int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
const char *name, const char *value, size_t value_len,
int flags);
int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
@@ -26,6 +26,6 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
- struct inode *dir);
+ const struct inode *dir);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 39a15cca58ca..5d57a285d59b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -30,7 +30,7 @@
#include "root-tree.h"
#include "tree-checker.h"
-enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info)
+enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info)
{
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return BTRFS_QGROUP_MODE_DISABLED;
@@ -39,12 +39,12 @@ enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info)
return BTRFS_QGROUP_MODE_FULL;
}
-bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info)
+bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info)
{
return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
}
-bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info)
+bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info)
{
return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
}
@@ -107,7 +107,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *dest,
- struct btrfs_qgroup *src)
+ const struct btrfs_qgroup *src)
{
int i;
@@ -117,7 +117,7 @@ static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *dest,
- struct btrfs_qgroup *src)
+ const struct btrfs_qgroup *src)
{
int i;
@@ -141,37 +141,27 @@ static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
qg->new_refcnt += mod;
}
-static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+static inline u64 btrfs_qgroup_get_old_refcnt(const struct btrfs_qgroup *qg, u64 seq)
{
if (qg->old_refcnt < seq)
return 0;
return qg->old_refcnt - seq;
}
-static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+static inline u64 btrfs_qgroup_get_new_refcnt(const struct btrfs_qgroup *qg, u64 seq)
{
if (qg->new_refcnt < seq)
return 0;
return qg->new_refcnt - seq;
}
-/*
- * glue structure to represent the relations between qgroups.
- */
-struct btrfs_qgroup_list {
- struct list_head next_group;
- struct list_head next_member;
- struct btrfs_qgroup *group;
- struct btrfs_qgroup *member;
-};
-
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
/* must be called with qgroup_ioctl_lock held */
-static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info,
u64 qgroupid)
{
struct rb_node *n = fs_info->qgroup_tree.rb_node;
@@ -346,7 +336,7 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info,
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl)
{
struct btrfs_qgroup *qgroup;
@@ -608,7 +598,7 @@ out:
* Return false if no reserved space is left.
* Return true if some reserved space is leaked.
*/
-bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
bool ret = false;
@@ -1334,19 +1324,14 @@ out:
*/
static int flush_reservations(struct btrfs_fs_info *fs_info)
{
- struct btrfs_trans_handle *trans;
int ret;
ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
if (ret)
return ret;
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- trans = btrfs_join_transaction(fs_info->tree_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
- return ret;
+ return btrfs_commit_current_transaction(fs_info->tree_root);
}
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
@@ -1446,9 +1431,11 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_tree_lock(quota_root->node);
btrfs_clear_buffer_dirty(trans, quota_root->node);
btrfs_tree_unlock(quota_root->node);
- btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
- quota_root->node, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
+ quota_root->node, 0, 1);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
btrfs_put_root(quota_root);
@@ -1572,15 +1559,21 @@ out:
return ret;
}
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst)
+/*
+ * Add a relation between the @src and @dst qgroups. @prealloc is allocated by
+ * the caller and its ownership is transferred here (it is either used or freed
+ * on error).
+ */
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst,
+ struct btrfs_qgroup_list *prealloc)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
- struct btrfs_qgroup_list *prealloc = NULL;
int ret = 0;
+ ASSERT(prealloc);
+
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
@@ -1605,11 +1598,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst
}
}
- prealloc = kzalloc(sizeof(*list), GFP_NOFS);
- if (!prealloc) {
- ret = -ENOMEM;
- goto out;
- }
ret = add_qgroup_relation_item(trans, src, dst);
if (ret)
goto out;
@@ -1748,13 +1736,55 @@ out:
return ret;
}
-static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+/*
+ * Return 0 if we can not delete the qgroup (not empty or has children etc).
+ * Return >0 if we can delete the qgroup.
+ * Return <0 for other errors during tree search.
+ */
+static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup)
{
- return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
- qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret;
+
+ /*
+ * Squota would never be inconsistent, but there can still be a case where a
+ * dropped subvolume still has qgroup numbers, and squota relies on such a
+ * qgroup for future accounting.
+ *
+ * So for squota, do not allow dropping any non-zero qgroup.
+ */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
+ (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr))
+ return 0;
+
+ /* For a higher level qgroup, we can only delete it if it has no child. */
+ if (btrfs_qgroup_level(qgroup->qgroupid)) {
+ if (!list_empty(&qgroup->members))
+ return 0;
+ return 1;
+ }
+
+ /*
+ * For a level-0 qgroup, we can only delete it if there is no subvolume
+ * associated with it.
+ * This means that even if a subvolume is unlinked but not yet fully dropped,
+ * we can not delete the qgroup.
+ */
+ key.objectid = qgroup->qgroupid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = -1ULL;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
+ btrfs_free_path(path);
+ /*
+ * The @ret from btrfs_find_root() exactly matches our definition for
+ * the return value, thus can be returned directly.
+ */
+ return ret;
}
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
@@ -1776,7 +1806,10 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
- if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+ ret = can_delete_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+ if (ret == 0) {
ret = -EBUSY;
goto out;
}
@@ -1801,6 +1834,34 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
}
spin_lock(&fs_info->qgroup_lock);
+ /*
+ * Warn on reserved space. At this point the qgroup should have no child
+ * nor a corresponding subvolume.
+ * Thus its reserved space should all be zero, no matter whether the qgroup
+ * is consistent or which mode is in use.
+ */
+ WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
+ /*
+ * The same for rfer/excl numbers, but that's only if our qgroup is
+ * consistent and if it's in regular qgroup mode.
+ * For simple mode it's not as accurate thus we can hit non-zero values
+ * very frequently.
+ */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
+ !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
+ if (WARN_ON(qgroup->rfer || qgroup->excl ||
+ qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
+ btrfs_warn_rl(fs_info,
+"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ qgroup->rfer, qgroup->rfer_cmpr,
+ qgroup->excl, qgroup->excl_cmpr);
+ qgroup_mark_inconsistent(fs_info);
+ }
+ }
del_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
@@ -1816,6 +1877,41 @@ out:
return ret;
}
+int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root)
+ return 0;
+
+ /*
+ * Commit current transaction to make sure all the rfer/excl numbers
+ * get updated.
+ */
+ trans = btrfs_start_transaction(fs_info->quota_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ return ret;
+
+ /* Start new trans to delete the qgroup info and limit items. */
+ trans = btrfs_start_transaction(fs_info->quota_root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_remove_qgroup(trans, subvolid);
+ btrfs_end_transaction(trans);
+ /*
+ * If it's squota and the subvolume still has numbers needed for future
+ * accounting, we can not delete it. Just skip it.
+ */
+ if (ret == -EBUSY)
+ ret = 0;
+ return ret;
+}
+
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit)
{
@@ -3222,7 +3318,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_qgroup_inherit *inherit)
{
int ret = 0;
- int i;
u64 *i_qgroups;
bool committing = false;
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -3279,7 +3374,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
i_qgroups = (u64 *)(inherit + 1);
nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2 * inherit->num_excl_copies;
- for (i = 0; i < nums; ++i) {
+ for (int i = 0; i < nums; i++) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
/*
@@ -3306,7 +3401,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
*/
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
- for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
+ for (int i = 0; i < inherit->num_qgroups; i++, i_qgroups++) {
if (*i_qgroups == 0)
continue;
ret = add_qgroup_relation_item(trans, objectid,
@@ -3392,7 +3487,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto unlock;
i_qgroups = (u64 *)(inherit + 1);
- for (i = 0; i < inherit->num_qgroups; ++i) {
+ for (int i = 0; i < inherit->num_qgroups; i++) {
if (*i_qgroups) {
ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
*i_qgroups);
@@ -3412,7 +3507,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
++i_qgroups;
}
- for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
+ for (int i = 0; i < inherit->num_ref_copies; i++, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
@@ -3433,7 +3528,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
/* Manually tweaking numbers certainly needs a rescan */
need_rescan = true;
}
- for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
+ for (int i = 0; i < inherit->num_excl_copies; i++, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
@@ -3918,7 +4013,6 @@ int
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
{
int ret = 0;
- struct btrfs_trans_handle *trans;
ret = qgroup_rescan_init(fs_info, 0, 1);
if (ret)
@@ -3935,16 +4029,10 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
* going to clear all tracking information for a clean start.
*/
- trans = btrfs_attach_transaction_barrier(fs_info->fs_root);
- if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) {
+ ret = btrfs_commit_current_transaction(fs_info->fs_root);
+ if (ret) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
- return PTR_ERR(trans);
- } else if (trans != ERR_PTR(-ENOENT)) {
- ret = btrfs_commit_transaction(trans);
- if (ret) {
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
- return ret;
- }
+ return ret;
}
qgroup_rescan_zero_tracking(fs_info);
@@ -4080,7 +4168,6 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
*/
static int try_flush_qgroup(struct btrfs_root *root)
{
- struct btrfs_trans_handle *trans;
int ret;
/* Can't hold an open transaction or we run the risk of deadlocking. */
@@ -4101,17 +4188,9 @@ static int try_flush_qgroup(struct btrfs_root *root)
ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0)
goto out;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
- trans = btrfs_attach_transaction_barrier(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- if (ret == -ENOENT)
- ret = 0;
- goto out;
- }
-
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(root);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
wake_up(&root->qgroup_flush_wait);
@@ -4817,7 +4896,7 @@ void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_byte
}
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
- struct btrfs_squota_delta *delta)
+ const struct btrfs_squota_delta *delta)
{
int ret;
struct btrfs_qgroup *qgroup;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 706640be0ec2..deb479d176a9 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -123,7 +123,6 @@ struct btrfs_inode;
/*
* Record a dirty extent, and inform the qgroup to update quota on it
- * TODO: Use kmem cache to alloc it.
*/
struct btrfs_qgroup_extent_record {
struct rb_node node;
@@ -279,6 +278,14 @@ struct btrfs_qgroup {
struct kobject kobj;
};
+/* Glue structure to represent the relations between qgroups. */
+struct btrfs_qgroup_list {
+ struct list_head next_group;
+ struct list_head next_member;
+ struct btrfs_qgroup *group;
+ struct btrfs_qgroup *member;
+};
+
struct btrfs_squota_delta {
/* The fstree root this delta counts against. */
u64 root;
@@ -312,9 +319,9 @@ enum btrfs_qgroup_mode {
BTRFS_QGROUP_MODE_SIMPLE
};
-enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info);
-bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info);
-bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info);
+enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info);
int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_quota_ctl_args *quota_ctl_args);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
@@ -322,11 +329,13 @@ int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
bool interruptible);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst,
+ struct btrfs_qgroup_list *prealloc);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
+int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid);
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit);
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
@@ -361,7 +370,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
enum btrfs_qgroup_rsv_type type);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl);
#endif
@@ -431,9 +440,9 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
-bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
+bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info);
void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes);
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
- struct btrfs_squota_delta *delta);
+ const struct btrfs_squota_delta *delta);
#endif
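A hedged sketch of where the new btrfs_qgroup_cleanup_dropped_subvolume() helper is expected to be called from: after a subvolume has been fully dropped, so that its now-unused level-0 qgroup can be removed. The caller shown here is hypothetical.

/* Sketch: after fully dropping subvolume @subvolid, try to remove its qgroup. */
static void example_cleanup_after_drop(struct btrfs_fs_info *fs_info, u64 subvolid)
{
	int ret;

	ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, subvolid);
	if (ret < 0)
		btrfs_warn(fs_info,
			   "failed to clean up qgroup of dropped subvolume %llu: %d",
			   subvolid, ret);
	/* -EBUSY (squota still needs the numbers) is swallowed inside the helper. */
}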
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 6af6b4b9a32e..e6f7a234b8f6 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -80,7 +80,6 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_key stripe_key;
struct btrfs_root *stripe_root = fs_info->stripe_root;
const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
- u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type);
struct btrfs_stripe_extent *stripe_extent;
const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
int ret;
@@ -94,7 +93,6 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size,
num_stripes);
- btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding);
for (int i = 0; i < num_stripes; i++) {
u64 devid = bioc->stripes[i].dev->devid;
u64 physical = bioc->stripes[i].physical;
@@ -159,7 +157,6 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf;
const u64 end = logical + *length;
int num_stripes;
- u8 encoding;
u64 offset;
u64 found_logical;
u64 found_length;
@@ -222,16 +219,6 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
- encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent);
-
- if (encoding != btrfs_bg_flags_to_raid_index(map_type)) {
- ret = -EUCLEAN;
- btrfs_handle_fs_error(fs_info, ret,
- "on-disk stripe encoding %d doesn't match RAID index %d",
- encoding,
- btrfs_bg_flags_to_raid_index(map_type));
- goto out;
- }
for (int i = 0; i < num_stripes; i++) {
struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index c9c258f84903..1ac1c21aac2f 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -48,8 +48,7 @@ static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
static inline int btrfs_num_raid_stripes(u32 item_size)
{
- return (item_size - offsetof(struct btrfs_stripe_extent, strides)) /
- sizeof(struct btrfs_raid_stride);
+ return item_size / sizeof(struct btrfs_raid_stride);
}
#endif
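With the per-item encoding byte gone, a stripe extent item is now nothing but an array of struct btrfs_raid_stride entries, so the count is a plain division. A small illustrative check (the helper name is made up):

/* Sketch: item size must be an exact multiple of the raid stride size. */
static int example_count_strides(u32 item_size)
{
	if (item_size == 0 || item_size % sizeof(struct btrfs_raid_stride) != 0)
		return -EUCLEAN;

	return btrfs_num_raid_stripes(item_size);
}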
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 831fac45e70f..39bec672df0c 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -40,6 +40,85 @@
#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
+{
+ if (unlikely(!bioc)) {
+ btrfs_crit(fs_info, "bioc=NULL");
+ return;
+ }
+ btrfs_crit(fs_info,
+"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
+ bioc->logical, bioc->full_stripe_logical, bioc->size,
+ bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
+ bioc->replace_stripe_src, bioc->num_stripes);
+ for (int i = 0; i < bioc->num_stripes; i++) {
+ btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu",
+ i, bioc->stripes[i].dev->devid,
+ bioc->stripes[i].physical);
+ }
+}
+
+static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_raid_bio *rbio)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ dump_bioc(fs_info, rbio->bioc);
+ btrfs_crit(fs_info,
+"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
+ rbio->flags, rbio->nr_sectors, rbio->nr_data,
+ rbio->real_stripes, rbio->stripe_nsectors,
+ rbio->scrubp, rbio->dbitmap);
+}
+
+#define ASSERT_RBIO(expr, rbio) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ } \
+ ASSERT((expr)); \
+})
+
+#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \
+ } \
+ ASSERT((expr)); \
+})
+
+#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \
+ } \
+ ASSERT((expr)); \
+})
+
+#define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ btrfs_crit(__fs_info, "logical=%llu", (logical)); \
+ } \
+ ASSERT((expr)); \
+})
+
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
struct list_head hash_list;
@@ -592,8 +671,8 @@ static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
unsigned int stripe_nr,
unsigned int sector_nr)
{
- ASSERT(stripe_nr < rbio->real_stripes);
- ASSERT(sector_nr < rbio->stripe_nsectors);
+ ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
+ ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
return stripe_nr * rbio->stripe_nsectors + sector_nr;
}
@@ -873,8 +952,10 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
struct sector_ptr *sector;
int index;
- ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
- ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
+ rbio, stripe_nr);
+ ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
+ rbio, sector_nr);
index = stripe_nr * rbio->stripe_nsectors + sector_nr;
ASSERT(index >= 0 && index < rbio->nr_sectors);
@@ -970,7 +1051,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
int ret;
- ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
if (ret < 0)
return ret;
/* Mapping all sectors */
@@ -985,7 +1066,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
int ret;
ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
- rbio->stripe_pages + data_pages, 0);
+ rbio->stripe_pages + data_pages, false);
if (ret < 0)
return ret;
@@ -1057,8 +1138,10 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
* thus it can be larger than rbio->real_stripe.
* So here we check against bioc->num_stripes, not rbio->real_stripes.
*/
- ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
- ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
+ rbio, stripe_nr);
+ ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
+ rbio, sector_nr);
ASSERT(sector->page);
stripe = &rbio->bioc->stripes[stripe_nr];
@@ -1197,14 +1280,14 @@ static void assert_rbio(struct btrfs_raid_bio *rbio)
* At least two stripes (2 disks RAID5), and since real_stripes is U8,
* we won't go beyond 256 disks anyway.
*/
- ASSERT(rbio->real_stripes >= 2);
- ASSERT(rbio->nr_data > 0);
+ ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
+ ASSERT_RBIO(rbio->nr_data > 0, rbio);
/*
* This is another check to make sure nr data stripes is smaller
* than total stripes.
*/
- ASSERT(rbio->nr_data < rbio->real_stripes);
+ ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
}
/* Generate PQ for one vertical stripe. */
@@ -1557,7 +1640,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
const int data_pages = rbio->nr_data * rbio->stripe_npages;
int ret;
- ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0);
+ ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
if (ret < 0)
return ret;
@@ -1641,9 +1724,10 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
const u32 sectorsize = fs_info->sectorsize;
u64 cur_logical;
- ASSERT(orig_logical >= full_stripe_start &&
- orig_logical + orig_len <= full_stripe_start +
- rbio->nr_data * BTRFS_STRIPE_LEN);
+ ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * BTRFS_STRIPE_LEN,
+ rbio, orig_logical);
bio_list_add(&rbio->bio_list, orig_bio);
rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
@@ -2389,7 +2473,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
break;
}
}
- ASSERT(i < rbio->real_stripes);
+ ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
return rbio;
@@ -2555,7 +2639,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
* Replace is running and our parity stripe needs to be duplicated to
* the target device. Check we have a valid source stripe number.
*/
- ASSERT(rbio->bioc->replace_stripe_src >= 0);
+ ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;
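The new ASSERT_RBIO* macros keep the assertion itself unchanged and only add a state dump right before it would fire. A hedged example of how an existing check is converted (the surrounding function is hypothetical):

/* Sketch: plain ASSERT() vs. the rbio-dumping variant used above. */
static void example_check_stripe(struct btrfs_raid_bio *rbio, int stripe_nr)
{
	/* Before: no context at all if this ever triggers. */
	/* ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); */

	/* After: dump_bioc()/btrfs_dump_rbio() run first, then the same ASSERT(). */
	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
			   rbio, stripe_nr);
}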
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index d0a3fcecc46a..df6b93b927cd 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -733,7 +733,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* we found the previous extent covering eof and before we
* attempted to increment its reference count).
*/
- ret = btrfs_wait_ordered_range(inode, wb_start,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start,
destoff - wb_start);
if (ret)
return ret;
@@ -755,7 +755,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* range, so wait for writeback to complete before truncating pages
* from the page cache. This is a rare case.
*/
- wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+ wb_ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len);
ret = ret ? ret : wb_ret;
/*
* Truncate page cache pages so that future reads will see the cloned
@@ -835,11 +835,11 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs),
wb_len);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs),
wb_len);
if (ret < 0)
return ret;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8b24bb5a0aa1..b592fc8cf368 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -817,7 +817,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
goto abort;
}
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
return reloc_root;
fail:
kfree(root_item);
@@ -864,7 +864,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
*/
if (root->reloc_root) {
reloc_root = root->reloc_root;
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
return 0;
}
@@ -962,7 +962,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
if (!path)
return -ENOMEM;
- bytenr -= BTRFS_I(reloc_inode)->index_cnt;
+ bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
if (ret < 0)
@@ -1739,7 +1739,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* btrfs_update_reloc_root() and update our root item
* appropriately.
*/
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
trans->block_rsv = rc->block_rsv;
replaced = 0;
@@ -2082,7 +2082,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root;
int ret;
- if (reloc_root->last_trans == trans->transid)
+ if (btrfs_get_root_last_trans(reloc_root) == trans->transid)
return 0;
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
@@ -2790,14 +2790,14 @@ out_free_blocks:
return ret;
}
-static noinline_for_stack int prealloc_file_extent_cluster(
- struct btrfs_inode *inode,
- const struct file_extent_cluster *cluster)
+static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc)
{
+ const struct file_extent_cluster *cluster = &rc->cluster;
+ struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
u64 alloc_hint = 0;
u64 start;
u64 end;
- u64 offset = inode->index_cnt;
+ u64 offset = inode->reloc_block_group_start;
u64 num_bytes;
int nr;
int ret = 0;
@@ -2899,11 +2899,14 @@ static noinline_for_stack int prealloc_file_extent_cluster(
return ret;
}
-static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
- u64 start, u64 end, u64 block_start)
+static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc)
{
+ struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
struct extent_map *em;
struct extent_state *cached_state = NULL;
+ u64 offset = inode->reloc_block_group_start;
+ u64 start = rc->cluster.start - offset;
+ u64 end = rc->cluster.end - offset;
int ret = 0;
em = alloc_extent_map();
@@ -2912,13 +2915,14 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->start = start;
em->len = end + 1 - start;
- em->block_len = em->len;
- em->block_start = block_start;
+ em->disk_bytenr = rc->cluster.start;
+ em->disk_num_bytes = em->len;
+ em->ram_bytes = em->len;
em->flags |= EXTENT_FLAG_PINNED;
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
- ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+ lock_extent(&inode->io_tree, start, end, &cached_state);
+ ret = btrfs_replace_extent_map_range(inode, em, false);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
free_extent_map(em);
return ret;
@@ -2946,12 +2950,14 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
return cluster->boundary[cluster_nr + 1] - 1;
}
-static int relocate_one_folio(struct inode *inode, struct file_ra_state *ra,
- const struct file_extent_cluster *cluster,
+static int relocate_one_folio(struct reloc_control *rc,
+ struct file_ra_state *ra,
int *cluster_nr, unsigned long index)
{
+ const struct file_extent_cluster *cluster = &rc->cluster;
+ struct inode *inode = rc->data_inode;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- u64 offset = BTRFS_I(inode)->index_cnt;
+ u64 offset = BTRFS_I(inode)->reloc_block_group_start;
const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
struct folio *folio;
@@ -3083,10 +3089,11 @@ release_folio:
return ret;
}
-static int relocate_file_extent_cluster(struct inode *inode,
- const struct file_extent_cluster *cluster)
+static int relocate_file_extent_cluster(struct reloc_control *rc)
{
- u64 offset = BTRFS_I(inode)->index_cnt;
+ struct inode *inode = rc->data_inode;
+ const struct file_extent_cluster *cluster = &rc->cluster;
+ u64 offset = BTRFS_I(inode)->reloc_block_group_start;
unsigned long index;
unsigned long last_index;
struct file_ra_state *ra;
@@ -3100,21 +3107,20 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!ra)
return -ENOMEM;
- ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
+ ret = prealloc_file_extent_cluster(rc);
if (ret)
goto out;
file_ra_state_init(ra, inode->i_mapping);
- ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
- cluster->end - offset, cluster->start);
+ ret = setup_relocation_extent_mapping(rc);
if (ret)
goto out;
last_index = (cluster->end - offset) >> PAGE_SHIFT;
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
- ret = relocate_one_folio(inode, ra, cluster, &cluster_nr, index);
+ ret = relocate_one_folio(rc, ra, &cluster_nr, index);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
@@ -3122,15 +3128,16 @@ out:
return ret;
}
-static noinline_for_stack int relocate_data_extent(struct inode *inode,
- const struct btrfs_key *extent_key,
- struct file_extent_cluster *cluster)
+static noinline_for_stack int relocate_data_extent(struct reloc_control *rc,
+ const struct btrfs_key *extent_key)
{
+ struct inode *inode = rc->data_inode;
+ struct file_extent_cluster *cluster = &rc->cluster;
int ret;
struct btrfs_root *root = BTRFS_I(inode)->root;
if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
- ret = relocate_file_extent_cluster(inode, cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret)
return ret;
cluster->nr = 0;
@@ -3156,7 +3163,7 @@ static noinline_for_stack int relocate_data_extent(struct inode *inode,
* the cluster we need to relocate.
*/
root->relocation_src_root = cluster->owning_root;
- ret = relocate_file_extent_cluster(inode, cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret)
return ret;
cluster->nr = 0;
@@ -3175,7 +3182,7 @@ static noinline_for_stack int relocate_data_extent(struct inode *inode,
cluster->nr++;
if (cluster->nr >= MAX_EXTENTS) {
- ret = relocate_file_extent_cluster(inode, cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret)
return ret;
cluster->nr = 0;
@@ -3369,7 +3376,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
if (inode)
goto truncate;
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget(ino, root);
if (IS_ERR(inode))
return -ENOENT;
@@ -3744,8 +3751,7 @@ restart:
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
rc->found_file_extent = true;
- ret = relocate_data_extent(rc->data_inode,
- &key, &rc->cluster);
+ ret = relocate_data_extent(rc, &key);
if (ret < 0) {
err = ret;
break;
@@ -3774,8 +3780,7 @@ restart:
}
if (!err) {
- ret = relocate_file_extent_cluster(rc->data_inode,
- &rc->cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret < 0)
err = ret;
}
@@ -3908,14 +3913,14 @@ static noinline_for_stack struct inode *create_reloc_inode(
if (ret)
goto out;
- inode = btrfs_iget(fs_info->sb, objectid, root);
+ inode = btrfs_iget(objectid, root);
if (IS_ERR(inode)) {
delete_orphan_inode(trans, root, objectid);
ret = PTR_ERR(inode);
inode = NULL;
goto out;
}
- BTRFS_I(inode)->index_cnt = group->start;
+ BTRFS_I(inode)->reloc_block_group_start = group->start;
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
@@ -4002,15 +4007,13 @@ static void free_reloc_control(struct reloc_control *rc)
/*
* Print the block group being relocated
*/
-static void describe_relocation(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group *block_group)
+static void describe_relocation(struct btrfs_block_group *block_group)
{
char buf[128] = {'\0'};
btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
- btrfs_info(fs_info,
- "relocating block group %llu flags %s",
+ btrfs_info(block_group->fs_info, "relocating block group %llu flags %s",
block_group->start, buf);
}
@@ -4118,13 +4121,11 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
goto out;
}
- describe_relocation(fs_info, rc->block_group);
+ describe_relocation(rc->block_group);
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
- btrfs_wait_ordered_roots(fs_info, U64_MAX,
- rc->block_group->start,
- rc->block_group->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group);
ret = btrfs_zone_finish(rc->block_group);
WARN_ON(ret && ret != -EAGAIN);
@@ -4149,7 +4150,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
* out of the loop if we hit an error.
*/
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
- ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+ ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
(u64)-1);
if (ret)
err = ret;
@@ -4221,8 +4222,8 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
struct extent_buffer *leaf;
struct reloc_control *rc = NULL;
struct btrfs_trans_handle *trans;
- int ret;
- int err = 0;
+ int ret2;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -4236,15 +4237,14 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
while (1) {
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
path, 0, 0);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (ret > 0) {
if (path->slots[0] == 0)
break;
path->slots[0]--;
}
+ ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
@@ -4255,7 +4255,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
if (IS_ERR(reloc_root)) {
- err = PTR_ERR(reloc_root);
+ ret = PTR_ERR(reloc_root);
goto out;
}
@@ -4267,15 +4267,12 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
reloc_root->root_key.offset, false);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
- if (ret != -ENOENT) {
- err = ret;
+ if (ret != -ENOENT)
goto out;
- }
ret = mark_garbage_root(reloc_root);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
+ ret = 0;
} else {
btrfs_put_root(fs_root);
}
@@ -4293,15 +4290,13 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
rc = alloc_reloc_control(fs_info);
if (!rc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
ret = reloc_chunk_start(fs_info);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_end;
- }
rc->extent_root = btrfs_extent_root(fs_info, 0);
@@ -4309,7 +4304,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_unset;
}
@@ -4329,15 +4324,15 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (IS_ERR(fs_root)) {
- err = PTR_ERR(fs_root);
+ ret = PTR_ERR(fs_root);
list_add_tail(&reloc_root->root_list, &reloc_roots);
btrfs_end_transaction(trans);
goto out_unset;
}
- err = __add_reloc_root(reloc_root);
- ASSERT(err != -EEXIST);
- if (err) {
+ ret = __add_reloc_root(reloc_root);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
list_add_tail(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(fs_root);
btrfs_end_transaction(trans);
@@ -4347,8 +4342,8 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_root);
}
- err = btrfs_commit_transaction(trans);
- if (err)
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
goto out_unset;
merge_reloc_roots(rc);
@@ -4357,14 +4352,14 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_clean;
}
- err = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
out_clean:
- ret = clean_dirty_subvols(rc);
- if (ret < 0 && !err)
- err = ret;
+ ret2 = clean_dirty_subvols(rc);
+ if (ret2 < 0 && !ret)
+ ret = ret2;
out_unset:
unset_reloc_control(rc);
out_end:
@@ -4375,14 +4370,14 @@ out:
btrfs_free_path(path);
- if (err == 0) {
+ if (ret == 0) {
/* cleanup orphan inode in data relocation tree */
fs_root = btrfs_grab_root(fs_info->data_reloc_root);
ASSERT(fs_root);
- err = btrfs_orphan_cleanup(fs_root);
+ ret = btrfs_orphan_cleanup(fs_root);
btrfs_put_root(fs_root);
}
- return err;
+ return ret;
}
/*
@@ -4393,9 +4388,9 @@ out:
*/
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 disk_bytenr = ordered->file_offset + inode->index_cnt;
+ u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
LIST_HEAD(list);
int ret;
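
The relocation changes above stop writing root->last_trans directly and go through get/set helpers. Those helpers are presumably thin READ_ONCE()/WRITE_ONCE() wrappers along these lines (a sketch only; the exact definitions live in ctree.h):

	/* Sketch of the accessors used above, assumed to be annotated loads/stores. */
	static inline u64 btrfs_get_root_last_trans(const struct btrfs_root *root)
	{
		return READ_ONCE(root->last_trans);
	}

	static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transid)
	{
		WRITE_ONCE(root->last_trans, transid);
	}
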
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d7caa3732f07..14a8d7100018 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -261,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0);
+ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false);
if (ret < 0)
goto error;
@@ -2441,19 +2441,15 @@ static int finish_extent_writes_for_zoned(struct btrfs_root *root,
struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_trans_handle *trans;
if (!btrfs_is_zoned(fs_info))
return 0;
btrfs_wait_block_group_reservations(cache);
btrfs_wait_nocow_writers(cache);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- return btrfs_commit_transaction(trans);
+ return btrfs_commit_current_transaction(root);
}
static noinline_for_stack
@@ -2684,8 +2680,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
if (sctx->is_dev_replace) {
btrfs_wait_nocow_writers(cache);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
- cache->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
}
scrub_pause_off(fs_info);
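
finish_extent_writes_for_zoned() above is one of several call sites in this series that drop the open-coded attach/commit sequence in favour of btrfs_commit_current_transaction(). Based on the code it replaces, the helper is assumed to look roughly like this (sketch, not the exact transaction.c implementation):

	/* Sketch: commit the currently running transaction, if there is one. */
	int btrfs_commit_current_transaction(struct btrfs_root *root)
	{
		struct btrfs_trans_handle *trans;

		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			int ret = PTR_ERR(trans);

			/* -ENOENT means no transaction is running, nothing to commit. */
			return ret == -ENOENT ? 0 : ret;
		}
		return btrfs_commit_transaction(trans);
	}
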
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3dd4a48479a9..fb3675f5bf50 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5188,11 +5188,10 @@ out:
static int process_verity(struct send_ctx *sctx)
{
int ret = 0;
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
struct inode *inode;
struct fs_path *p;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
+ inode = btrfs_iget(sctx->cur_ino, sctx->send_root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5550,7 +5549,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
size_t inline_size;
int ret;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ inode = btrfs_iget(sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5617,7 +5616,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
u32 crc;
int ret;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ inode = btrfs_iget(sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5746,7 +5745,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
if (sctx->cur_inode == NULL) {
struct btrfs_root *root = sctx->send_root;
- sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
+ sctx->cur_inode = btrfs_iget(sctx->cur_ino, root);
if (IS_ERR(sctx->cur_inode)) {
int err = PTR_ERR(sctx->cur_inode);
@@ -7998,34 +7997,18 @@ out:
*/
static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
{
- int i;
- struct btrfs_trans_handle *trans = NULL;
-
-again:
- if (sctx->parent_root &&
- sctx->parent_root->node != sctx->parent_root->commit_root)
- goto commit_trans;
-
- for (i = 0; i < sctx->clone_roots_cnt; i++)
- if (sctx->clone_roots[i].root->node !=
- sctx->clone_roots[i].root->commit_root)
- goto commit_trans;
-
- if (trans)
- return btrfs_end_transaction(trans);
+ struct btrfs_root *root = sctx->parent_root;
- return 0;
+ if (root && root->node != root->commit_root)
+ return btrfs_commit_current_transaction(root);
-commit_trans:
- /* Use any root, all fs roots will get their commit roots updated. */
- if (!trans) {
- trans = btrfs_join_transaction(sctx->send_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- goto again;
+ for (int i = 0; i < sctx->clone_roots_cnt; i++) {
+ root = sctx->clone_roots[i].root;
+ if (root->node != root->commit_root)
+ return btrfs_commit_current_transaction(root);
}
- return btrfs_commit_transaction(trans);
+ return 0;
}
/*
@@ -8046,7 +8029,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
}
for (i = 0; i < sctx->clone_roots_cnt; i++) {
@@ -8054,7 +8037,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
}
return 0;
@@ -8082,10 +8065,10 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
btrfs_root_id(root), root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = BTRFS_I(inode)->root;
+ struct btrfs_root *send_root = inode->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
struct send_ctx *sctx = NULL;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index dd1c9f02b011..b07f4aa66878 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -11,7 +11,7 @@
#include <linux/sizes.h>
#include <linux/align.h>
-struct inode;
+struct btrfs_inode;
struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
@@ -182,6 +182,6 @@ enum {
__BTRFS_SEND_A_MAX = 35,
};
-long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
+long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg);
#endif
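
flush_delalloc_roots() above now passes NULL to btrfs_wait_ordered_extents(), while the scrub and relocation hunks pass a block group to btrfs_wait_ordered_roots(). The new convention appears to be: a non-NULL block group limits the wait to that group's byte range, and NULL keeps the old whole-range behaviour. A small hypothetical helper makes the mapping explicit (the names here are illustrative, not kernel API):

	/* Hypothetical illustration of the bg-or-NULL convention used above. */
	struct wait_range {
		u64 start;
		u64 len;
	};

	static inline struct wait_range wait_range_from_bg(const struct btrfs_block_group *bg)
	{
		struct wait_range range = { .start = 0, .len = U64_MAX };

		if (bg) {
			range.start = bg->start;
			range.len = bg->length;
		}
		return range;
	}
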
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index ae8c56442549..9ac94d3119e8 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include "linux/spinlock.h"
+#include <linux/minmax.h>
#include "misc.h"
#include "ctree.h"
#include "space-info.h"
@@ -190,6 +192,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
*/
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL)
+
/*
* Calculate chunk size depending on volume type (regular or zoned).
*/
@@ -232,6 +236,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (!space_info)
return -ENOMEM;
+ space_info->fs_info = info;
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
@@ -340,11 +345,32 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
return NULL;
}
+static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *data_sinfo;
+ u64 data_chunk_size;
+
+ /*
+ * Calculate the data_chunk_size, space_info->chunk_size is the
+ * "optimal" chunk size based on the fs size. However when we actually
+ * allocate the chunk we will strip this down further, making it no
+ * more than 10% of the disk or 1G, whichever is smaller.
+ *
+ * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size)
+ * as it is.
+ */
+ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ if (btrfs_is_zoned(fs_info))
+ return data_sinfo->chunk_size;
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ return min_t(u64, data_chunk_size, SZ_1G);
+}
+
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_space_info *data_sinfo;
u64 profile;
u64 avail;
u64 data_chunk_size;
@@ -368,23 +394,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
if (avail == 0)
return 0;
- /*
- * Calculate the data_chunk_size, space_info->chunk_size is the
- * "optimal" chunk size based on the fs size. However when we actually
- * allocate the chunk we will strip this down further, making it no more
- * than 10% of the disk or 1G, whichever is smaller.
- *
- * On the zoned mode, we need to use zone_size (=
- * data_sinfo->chunk_size) as it is.
- */
- data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
- if (!btrfs_is_zoned(fs_info)) {
- data_chunk_size = min(data_sinfo->chunk_size,
- mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
- data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
- } else {
- data_chunk_size = data_sinfo->chunk_size;
- }
+ data_chunk_size = calc_effective_data_chunk_size(fs_info);
/*
* Since data allocations immediately use block groups as part of the
@@ -605,8 +615,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
return nr;
}
-#define EXTENT_SIZE_PER_ITEM SZ_256K
-
/*
* shrink metadata reservation for delalloc
*/
@@ -706,7 +714,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
skip_async:
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, items, NULL);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
@@ -825,14 +833,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
* because that does not wait for a transaction to fully commit
* (only for it to be unblocked, state TRANS_STATE_UNBLOCKED).
*/
- trans = btrfs_attach_transaction_barrier(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- if (ret == -ENOENT)
- ret = 0;
- break;
- }
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(root);
break;
default:
ret = -ENOSPC;
@@ -1886,3 +1887,209 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
+
+static u64 calc_pct_ratio(u64 x, u64 y)
+{
+ int err;
+
+ if (!y)
+ return 0;
+again:
+ err = check_mul_overflow(100, x, &x);
+ if (err)
+ goto lose_precision;
+ return div64_u64(x, y);
+lose_precision:
+ x >>= 10;
+ y >>= 10;
+ if (!y)
+ y = 1;
+ goto again;
+}
+
+/*
+ * A reasonable buffer for unallocated space is 10 data block_groups.
+ * If we claw this back repeatedly, we can still achieve efficient
+ * utilization when near full, and not do too much reclaim while
+ * always maintaining a solid buffer for workloads that quickly
+ * allocate and pressure the unallocated space.
+ */
+static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info)
+{
+ u64 chunk_sz = calc_effective_data_chunk_size(fs_info);
+
+ return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz;
+}
+
+/*
+ * The fundamental goal of automatic reclaim is to protect the filesystem's
+ * unallocated space and thus minimize the probability of the filesystem going
+ * read only when a metadata allocation failure causes a transaction abort.
+ *
+ * However, relocations happen into the space_info's unused space, therefore
+ * automatic reclaim must also back off as that space runs low. There is no
+ * value in doing trivial "relocations" of re-writing the same block group
+ * into a fresh one.
+ *
+ * Furthermore, we want to avoid doing too much reclaim even if there are good
+ * candidates. This is because the allocator is pretty good at filling up the
+ * holes with writes. So we want to do just enough reclaim to try and stay
+ * safe from running out of unallocated space but not be wasteful about it.
+ *
+ * Therefore, the dynamic reclaim threshold is calculated as follows:
+ * - calculate a target unallocated amount of 10 block group sized chunks
+ * - ratchet up the intensity of reclaim depending on how far we are from
+ * that target by using a formula of unalloc / target to set the threshold.
+ *
+ * Typically with 10 block groups as the target, the discrete values this comes
+ * out to are 0, 10, 20, ... , 80, 90, and 99.
+ */
+static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
+ u64 target = calc_unalloc_target(fs_info);
+ u64 alloc = space_info->total_bytes;
+ u64 used = btrfs_space_info_used(space_info, false);
+ u64 unused = alloc - used;
+ u64 want = target > unalloc ? target - unalloc : 0;
+ u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
+
+ /* If we have no unused space, don't bother, it won't work anyway. */
+ if (unused < data_chunk_size)
+ return 0;
+
+ /* Cast to int is OK because want <= target. */
+ return calc_pct_ratio(want, target);
+}
+
+int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info)
+{
+ lockdep_assert_held(&space_info->lock);
+
+ if (READ_ONCE(space_info->dynamic_reclaim))
+ return calc_dynamic_reclaim_threshold(space_info);
+ return READ_ONCE(space_info->bg_reclaim_threshold);
+}
+
+/*
+ * Under "urgent" reclaim, we will reclaim even fresh block groups that have
+ * recently seen successful allocations, as we are desperate to reclaim
+ * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs.
+ */
+static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
+ u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
+
+ return unalloc < data_chunk_size;
+}
+
+static int do_reclaim_sweep(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, int raid)
+{
+ struct btrfs_block_group *bg;
+ int thresh_pct;
+ bool try_again = true;
+ bool urgent;
+
+ spin_lock(&space_info->lock);
+ urgent = is_reclaim_urgent(space_info);
+ thresh_pct = btrfs_calc_reclaim_threshold(space_info);
+ spin_unlock(&space_info->lock);
+
+ down_read(&space_info->groups_sem);
+again:
+ list_for_each_entry(bg, &space_info->block_groups[raid], list) {
+ u64 thresh;
+ bool reclaim = false;
+
+ btrfs_get_block_group(bg);
+ spin_lock(&bg->lock);
+ thresh = mult_perc(bg->length, thresh_pct);
+ if (bg->used < thresh && bg->reclaim_mark) {
+ try_again = false;
+ reclaim = true;
+ }
+ bg->reclaim_mark++;
+ spin_unlock(&bg->lock);
+ if (reclaim)
+ btrfs_mark_bg_to_reclaim(bg);
+ btrfs_put_block_group(bg);
+ }
+
+ /*
+ * In situations where we are very motivated to reclaim (low unalloc)
+ * use two passes to make the reclaim mark check best effort.
+ *
+ * If we have any staler groups, we don't touch the fresher ones, but if we
+ * really need a block group, do take a fresh one.
+ */
+ if (try_again && urgent) {
+ try_again = false;
+ goto again;
+ }
+
+ up_read(&space_info->groups_sem);
+ return 0;
+}
+
+void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes)
+{
+ u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info);
+
+ lockdep_assert_held(&space_info->lock);
+ space_info->reclaimable_bytes += bytes;
+
+ if (space_info->reclaimable_bytes >= chunk_sz)
+ btrfs_set_periodic_reclaim_ready(space_info, true);
+}
+
+void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready)
+{
+ lockdep_assert_held(&space_info->lock);
+ if (!READ_ONCE(space_info->periodic_reclaim))
+ return;
+ if (ready != space_info->periodic_reclaim_ready) {
+ space_info->periodic_reclaim_ready = ready;
+ if (!ready)
+ space_info->reclaimable_bytes = 0;
+ }
+}
+
+bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
+{
+ bool ret;
+
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return false;
+ if (!READ_ONCE(space_info->periodic_reclaim))
+ return false;
+
+ spin_lock(&space_info->lock);
+ ret = space_info->periodic_reclaim_ready;
+ btrfs_set_periodic_reclaim_ready(space_info, false);
+ spin_unlock(&space_info->lock);
+
+ return ret;
+}
+
+int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+ int raid;
+ struct btrfs_space_info *space_info;
+
+ list_for_each_entry(space_info, &fs_info->space_info, list) {
+ if (!btrfs_should_periodic_reclaim(space_info))
+ continue;
+ for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) {
+ ret = do_reclaim_sweep(fs_info, space_info, raid);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return ret;
+}
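
To make the dynamic threshold above concrete: with 1G effective data chunks the unallocated target is 10G; if only 3G is unallocated, want = 7G and the threshold is calc_pct_ratio(7G, 10G) = 70, so block groups less than 70% used become reclaim candidates. A minimal user-space sketch of that arithmetic (it omits the overflow handling in calc_pct_ratio() and the "unused < data_chunk_size" early return of the real code):

	#include <stdio.h>
	#include <stdint.h>

	static unsigned int reclaim_threshold(uint64_t unalloc, uint64_t chunk_size)
	{
		uint64_t target = 10 * chunk_size;	/* BTRFS_UNALLOC_BLOCK_GROUP_TARGET */
		uint64_t want = target > unalloc ? target - unalloc : 0;

		return (unsigned int)(want * 100 / target);	/* calc_pct_ratio(want, target) */
	}

	int main(void)
	{
		/* 1G data chunks, 3G unallocated -> threshold of 70%. */
		printf("%u\n", reclaim_threshold(3ULL << 30, 1ULL << 30));
		return 0;
	}
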
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index a733458fd13b..4db8a0267c16 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -94,6 +94,7 @@ enum btrfs_flush_state {
};
struct btrfs_space_info {
+ struct btrfs_fs_info *fs_info;
spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
@@ -165,6 +166,47 @@ struct btrfs_space_info {
struct kobject kobj;
struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
+
+ /*
+ * Monotonically increasing counter of block group reclaim attempts
+ * Exposed in /sys/fs/btrfs/<uuid>/allocation/<type>/reclaim_count
+ */
+ u64 reclaim_count;
+
+ /*
+ * Monotonically increasing counter of reclaimed bytes
+ * Exposed in /sys/fs/btrfs/<uuid>/allocation/<type>/reclaim_bytes
+ */
+ u64 reclaim_bytes;
+
+ /*
+ * Monotonically increasing counter of reclaim errors
+ * Exposed in /sys/fs/btrfs/<uuid>/allocation/<type>/reclaim_errors
+ */
+ u64 reclaim_errors;
+
+ /*
+ * If true, use the dynamic reclaim threshold instead of the
+ * fixed bg_reclaim_threshold.
+ */
+ bool dynamic_reclaim;
+
+ /*
+ * Periodically check all block groups against the reclaim
+ * threshold in the cleaner thread.
+ */
+ bool periodic_reclaim;
+
+ /*
+ * Periodic reclaim should be a no-op if a space_info hasn't
+ * freed any space since the last time we tried.
+ */
+ bool periodic_reclaim_ready;
+
+ /*
+ * Net bytes freed or allocated since the last reclaim pass.
+ */
+ s64 reclaimable_bytes;
};
struct reserve_ticket {
@@ -247,4 +289,10 @@ void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes);
+void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready);
+bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
+int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info);
+int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info);
+
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 54736f6238e6..8ddd5fcbeb93 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -74,7 +74,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space
* mapping. And if page->mapping->host is data inode, it's subpage.
* As we have ruled our sectorsize >= PAGE_SIZE case already.
*/
- if (!mapping || !mapping->host || is_data_inode(mapping->host))
+ if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host)))
return true;
/*
@@ -242,12 +242,12 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
({ \
- unsigned int start_bit; \
+ unsigned int __start_bit; \
\
btrfs_subpage_assert(fs_info, folio, start, len); \
- start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- start_bit += fs_info->subpage_info->name##_offset; \
- start_bit; \
+ __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ __start_bit += fs_info->subpage_info->name##_offset; \
+ __start_bit; \
})
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
@@ -283,7 +283,7 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
bool last;
btrfs_subpage_assert(fs_info, folio, start, len);
- is_data = is_data_inode(folio->mapping->host);
+ is_data = is_data_inode(BTRFS_I(folio->mapping->host));
spin_lock_irqsave(&subpage->lock, flags);
@@ -703,19 +703,29 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
* is cleared.
*/
-void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio)
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_subpage *subpage;
+ unsigned int start_bit;
+ unsigned int nbits;
+ unsigned long flags;
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- ASSERT(!folio_test_dirty(folio));
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ ASSERT(!folio_test_dirty(folio));
return;
+ }
- ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+ start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len);
+ nbits = len >> fs_info->sectorsize_bits;
+ subpage = folio_get_private(folio);
+ ASSERT(subpage);
+ spin_lock_irqsave(&subpage->lock, flags);
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
/*
@@ -765,6 +775,130 @@ void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
+/*
+ * This is for folio already locked by plain lock_page()/folio_lock(), which
+ * doesn't have any subpage awareness.
+ *
+ * This populates the involved subpage ranges so that subpage helpers can
+ * properly unlock them.
+ */
+void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage;
+ unsigned long flags;
+ unsigned int start_bit;
+ unsigned int nbits;
+ int ret;
+
+ ASSERT(folio_test_locked(folio));
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping))
+ return;
+
+ subpage = folio_get_private(folio);
+ start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
+ nbits = len >> fs_info->sectorsize_bits;
+ spin_lock_irqsave(&subpage->lock, flags);
+ /* Target range should not yet be locked. */
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ bitmap_set(subpage->bitmaps, start_bit, nbits);
+ ret = atomic_add_return(nbits, &subpage->writers);
+ ASSERT(ret <= fs_info->subpage_info->bitmap_nr_bits);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+/*
+ * Find any subpage writer locked range inside @folio, starting at file offset
+ * @search_start. The caller should ensure the folio is locked.
+ *
+ * Return true and update @found_start_ret and @found_len_ret to the first
+ * writer locked range.
+ * Return false if there is no writer locked range.
+ */
+bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 search_start,
+ u64 *found_start_ret, u32 *found_len_ret)
+{
+ struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ const unsigned int len = PAGE_SIZE - offset_in_page(search_start);
+ const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
+ locked, search_start, len);
+ const unsigned int locked_bitmap_start = subpage_info->locked_offset;
+ const unsigned int locked_bitmap_end = locked_bitmap_start +
+ subpage_info->bitmap_nr_bits;
+ unsigned long flags;
+ int first_zero;
+ int first_set;
+ bool found = false;
+
+ ASSERT(folio_test_locked(folio));
+ spin_lock_irqsave(&subpage->lock, flags);
+ first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit);
+ if (first_set >= locked_bitmap_end)
+ goto out;
+
+ found = true;
+
+ *found_start_ret = folio_pos(folio) +
+ ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits);
+ /*
+ * Since @first_set is ensured to be smaller than locked_bitmap_end
+ * here, @found_start_ret should be inside the folio.
+ */
+ ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE);
+
+ first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set);
+ *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits;
+out:
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ return found;
+}
+
+/*
+ * Unlike btrfs_folio_end_writer_lock() which unlocks a specified subpage range,
+ * this ends all writer locked ranges of a page.
+ *
+ * This is for the locked page of __extent_writepage(), as the locked page
+ * can contain several locked subpage ranges.
+ */
+void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio)
+{
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ u64 folio_start = folio_pos(folio);
+ u64 cur = folio_start;
+
+ ASSERT(folio_test_locked(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
+
+ /* The page has no new delalloc range locked on it. Just plain unlock. */
+ if (atomic_read(&subpage->writers) == 0) {
+ folio_unlock(folio);
+ return;
+ }
+ while (cur < folio_start + PAGE_SIZE) {
+ u64 found_start;
+ u32 found_len;
+ bool found;
+ bool last;
+
+ found = btrfs_subpage_find_writer_locked(fs_info, folio, cur,
+ &found_start, &found_len);
+ if (!found)
+ break;
+ last = btrfs_subpage_end_and_test_writer(fs_info, folio,
+ found_start, found_len);
+ if (last) {
+ folio_unlock(folio);
+ break;
+ }
+ cur = found_start + found_len;
+ }
+}
+
#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \
bitmap_cut(dst, subpage->bitmaps, 0, \
subpage_info->name##_offset, subpage_info->bitmap_nr_bits)
@@ -775,7 +909,6 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
struct btrfs_subpage *subpage;
unsigned long uptodate_bitmap;
- unsigned long error_bitmap;
unsigned long dirty_bitmap;
unsigned long writeback_bitmap;
unsigned long ordered_bitmap;
@@ -797,10 +930,9 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
-"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
start, len, folio_pos(folio),
subpage_info->bitmap_nr_bits, &uptodate_bitmap,
- subpage_info->bitmap_nr_bits, &error_bitmap,
subpage_info->bitmap_nr_bits, &dirty_bitmap,
subpage_info->bitmap_nr_bits, &writeback_bitmap,
subpage_info->bitmap_nr_bits, &ordered_bitmap,
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index b6dc013b0fdc..249396e118d0 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -112,6 +112,12 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
+void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 search_start,
+ u64 *found_start_ret, u32 *found_len_ret);
+void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio);
/*
* Template for subpage related operations.
@@ -156,7 +162,8 @@ DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
-void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f05cce7c8b8d..0eda8c21d861 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -34,6 +34,7 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
+#include "direct-io.h"
#include "props.h"
#include "xattr.h"
#include "bio.h"
@@ -125,9 +126,6 @@ enum {
Opt_rescue,
Opt_usebackuproot,
Opt_nologreplay,
- Opt_ignorebadroots,
- Opt_ignoredatacsums,
- Opt_rescue_all,
/* Debugging options */
Opt_enospc_debug,
@@ -178,6 +176,8 @@ enum {
Opt_rescue_nologreplay,
Opt_rescue_ignorebadroots,
Opt_rescue_ignoredatacsums,
+ Opt_rescue_ignoremetacsums,
+ Opt_rescue_ignoresuperflags,
Opt_rescue_parameter_all,
};
@@ -187,7 +187,11 @@ static const struct constant_table btrfs_parameter_rescue[] = {
{ "ignorebadroots", Opt_rescue_ignorebadroots },
{ "ibadroots", Opt_rescue_ignorebadroots },
{ "ignoredatacsums", Opt_rescue_ignoredatacsums },
+ { "ignoremetacsums", Opt_rescue_ignoremetacsums},
+ { "ignoresuperflags", Opt_rescue_ignoresuperflags},
{ "idatacsums", Opt_rescue_ignoredatacsums },
+ { "imetacsums", Opt_rescue_ignoremetacsums},
+ { "isuperflags", Opt_rescue_ignoresuperflags},
{ "all", Opt_rescue_parameter_all },
{}
};
@@ -573,8 +577,16 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_rescue_ignoredatacsums:
btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
break;
+ case Opt_rescue_ignoremetacsums:
+ btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS);
+ break;
+ case Opt_rescue_ignoresuperflags:
+ btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS);
+ break;
case Opt_rescue_parameter_all:
btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
+ btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS);
+ btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS);
btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
@@ -629,7 +641,7 @@ static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE);
}
-static bool check_ro_option(struct btrfs_fs_info *fs_info,
+static bool check_ro_option(const struct btrfs_fs_info *fs_info,
unsigned long mount_opt, unsigned long opt,
const char *opt_name)
{
@@ -641,7 +653,7 @@ static bool check_ro_option(struct btrfs_fs_info *fs_info,
return false;
}
-bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+bool btrfs_check_options(const struct btrfs_fs_info *info, unsigned long *mount_opt,
unsigned long flags)
{
bool ret = true;
@@ -649,7 +661,9 @@ bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
if (!(flags & SB_RDONLY) &&
(check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
- check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")))
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREMETACSUMS, "ignoremetacsums") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNORESUPERFLAGS, "ignoresuperflags")))
ret = false;
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
@@ -949,7 +963,7 @@ static int btrfs_fill_super(struct super_block *sb,
return err;
}
- inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
+ inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
btrfs_handle_fs_error(fs_info, err, NULL);
@@ -983,7 +997,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1065,6 +1079,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
print_rescue_option(seq, "ignorebadroots", &printed);
if (btrfs_test_opt(info, IGNOREDATACSUMS))
print_rescue_option(seq, "ignoredatacsums", &printed);
+ if (btrfs_test_opt(info, IGNOREMETACSUMS))
+ print_rescue_option(seq, "ignoremetacsums", &printed);
+ if (btrfs_test_opt(info, IGNORESUPERFLAGS))
+ print_rescue_option(seq, "ignoresuperflags", &printed);
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD_SYNC))
@@ -1422,6 +1440,8 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time");
btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots");
btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums");
+ btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
+ btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
@@ -2257,9 +2277,7 @@ out:
static int btrfs_freeze(struct super_block *sb)
{
- struct btrfs_trans_handle *trans;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_root *root = fs_info->tree_root;
set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
/*
@@ -2268,14 +2286,7 @@ static int btrfs_freeze(struct super_block *sb)
* we want to avoid on a frozen filesystem), or do the commit
* ourselves.
*/
- trans = btrfs_attach_transaction_barrier(root);
- if (IS_ERR(trans)) {
- /* no transaction, don't bother */
- if (PTR_ERR(trans) == -ENOENT)
- return 0;
- return PTR_ERR(trans);
- }
- return btrfs_commit_transaction(trans);
+ return btrfs_commit_current_transaction(fs_info->tree_root);
}
static int check_dev_super(struct btrfs_device *dev)
@@ -2499,6 +2510,9 @@ static const struct init_sequence mod_init_seq[] = {
.init_func = btrfs_init_cachep,
.exit_func = btrfs_destroy_cachep,
}, {
+ .init_func = btrfs_init_dio,
+ .exit_func = btrfs_destroy_dio,
+ }, {
.init_func = btrfs_transaction_init,
.exit_func = btrfs_transaction_exit,
}, {
@@ -2590,6 +2604,7 @@ static int __init init_btrfs_fs(void)
late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)
+MODULE_DESCRIPTION("B-Tree File System (BTRFS)");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
MODULE_SOFTDEP("pre: xxhash64");
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
index cbcab434b5ec..d2b8ebb46bc6 100644
--- a/fs/btrfs/super.h
+++ b/fs/btrfs/super.h
@@ -10,7 +10,7 @@
struct super_block;
struct btrfs_fs_info;
-bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+bool btrfs_check_options(const struct btrfs_fs_info *info, unsigned long *mount_opt,
unsigned long flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index af545b6b1190..03926ad467c9 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -385,6 +385,8 @@ static const char *rescue_opts[] = {
"nologreplay",
"ignorebadroots",
"ignoredatacsums",
+ "ignoremetacsums",
+ "ignoresuperflags",
"all",
};
@@ -894,6 +896,9 @@ SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+SPACE_INFO_ATTR(reclaim_count);
+SPACE_INFO_ATTR(reclaim_bytes);
+SPACE_INFO_ATTR(reclaim_errors);
BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
@@ -902,8 +907,12 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
char *buf)
{
struct btrfs_space_info *space_info = to_space_info(kobj);
+ ssize_t ret;
- return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
+ spin_lock(&space_info->lock);
+ ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info));
+ spin_unlock(&space_info->lock);
+ return ret;
}
static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -914,6 +923,9 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
int thresh;
int ret;
+ if (READ_ONCE(space_info->dynamic_reclaim))
+ return -EINVAL;
+
ret = kstrtoint(buf, 10, &thresh);
if (ret)
return ret;
@@ -930,6 +942,72 @@ BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
btrfs_sinfo_bg_reclaim_threshold_show,
btrfs_sinfo_bg_reclaim_threshold_store);
+static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim));
+}
+
+static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int dynamic_reclaim;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &dynamic_reclaim);
+ if (ret)
+ return ret;
+
+ if (dynamic_reclaim < 0)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, dynamic_reclaim,
+ btrfs_sinfo_dynamic_reclaim_show,
+ btrfs_sinfo_dynamic_reclaim_store);
+
+static ssize_t btrfs_sinfo_periodic_reclaim_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->periodic_reclaim));
+}
+
+static ssize_t btrfs_sinfo_periodic_reclaim_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int periodic_reclaim;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &periodic_reclaim);
+ if (ret)
+ return ret;
+
+ if (periodic_reclaim < 0)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->periodic_reclaim, periodic_reclaim != 0);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, periodic_reclaim,
+ btrfs_sinfo_periodic_reclaim_show,
+ btrfs_sinfo_periodic_reclaim_store);
+
/*
* Allocation information about block group types.
*
@@ -947,8 +1025,13 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(space_info, dynamic_reclaim),
BTRFS_ATTR_PTR(space_info, chunk_size),
BTRFS_ATTR_PTR(space_info, size_classes),
+ BTRFS_ATTR_PTR(space_info, reclaim_count),
+ BTRFS_ATTR_PTR(space_info, reclaim_bytes),
+ BTRFS_ATTR_PTR(space_info, reclaim_errors),
+ BTRFS_ATTR_PTR(space_info, periodic_reclaim),
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
#endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index dce0387ef155..ce50847e1e01 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -61,10 +61,7 @@ struct inode *btrfs_new_test_inode(void)
return NULL;
inode->i_mode = S_IFREG;
- inode->i_ino = BTRFS_FIRST_FREE_OBJECTID;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
- BTRFS_I(inode)->location.offset = 0;
+ btrfs_set_inode_number(BTRFS_I(inode), BTRFS_FIRST_FREE_OBJECTID);
inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG);
return inode;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index ba36794ba2d5..ebec4ab361b8 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -19,8 +19,8 @@ static int free_extent_map_tree(struct btrfs_inode *inode)
int ret = 0;
write_lock(&em_tree->lock);
- while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
- node = rb_first_cached(&em_tree->map);
+ while (!RB_EMPTY_ROOT(&em_tree->root)) {
+ node = rb_first(&em_tree->root);
em = rb_entry(node, struct extent_map, rb_node);
remove_extent_mapping(inode, em);
@@ -28,9 +28,10 @@ static int free_extent_map_tree(struct btrfs_inode *inode)
if (refcount_read(&em->refs) != 1) {
ret = -EINVAL;
test_err(
-"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d",
- em->start, em->len, em->block_start,
- em->block_len, refcount_read(&em->refs));
+"em leak: em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu offset %llu) refs %d",
+ em->start, em->len, em->disk_bytenr,
+ em->disk_num_bytes, em->offset,
+ refcount_read(&em->refs));
refcount_set(&em->refs, 1);
}
@@ -76,8 +77,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Add [0, 16K) */
em->start = 0;
em->len = SZ_16K;
- em->block_start = 0;
- em->block_len = SZ_16K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -97,8 +99,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
em->start = SZ_16K;
em->len = SZ_4K;
- em->block_start = SZ_32K; /* avoid merging */
- em->block_len = SZ_4K;
+ em->disk_bytenr = SZ_32K; /* avoid merging */
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_4K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -118,8 +121,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Add [0, 8K), should return [0, 16K) instead. */
em->start = start;
em->len = len;
- em->block_start = start;
- em->block_len = len;
+ em->disk_bytenr = start;
+ em->disk_num_bytes = len;
+ em->ram_bytes = len;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -134,11 +138,11 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
if (em->start != 0 || extent_map_end(em) != SZ_16K ||
- em->block_start != 0 || em->block_len != SZ_16K) {
+ em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) {
test_err(
-"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
+"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu",
start, start + len, ret, em->start, em->len,
- em->block_start, em->block_len);
+ em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
free_extent_map(em);
@@ -172,8 +176,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Add [0, 1K) */
em->start = 0;
em->len = SZ_1K;
- em->block_start = EXTENT_MAP_INLINE;
- em->block_len = (u64)-1;
+ em->disk_bytenr = EXTENT_MAP_INLINE;
+ em->disk_num_bytes = 0;
+ em->ram_bytes = SZ_1K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -193,8 +198,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
em->start = SZ_4K;
em->len = SZ_4K;
- em->block_start = SZ_4K;
- em->block_len = SZ_4K;
+ em->disk_bytenr = SZ_4K;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_4K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -214,8 +220,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Add [0, 1K) */
em->start = 0;
em->len = SZ_1K;
- em->block_start = EXTENT_MAP_INLINE;
- em->block_len = (u64)-1;
+ em->disk_bytenr = EXTENT_MAP_INLINE;
+ em->disk_num_bytes = 0;
+ em->ram_bytes = SZ_1K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -229,11 +236,10 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
if (em->start != 0 || extent_map_end(em) != SZ_1K ||
- em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) {
+ em->disk_bytenr != EXTENT_MAP_INLINE) {
test_err(
-"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
- ret, em->start, em->len, em->block_start,
- em->block_len);
+"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu",
+ ret, em->start, em->len, em->disk_bytenr);
ret = -EINVAL;
}
free_extent_map(em);
@@ -263,8 +269,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
/* Add [4K, 8K) */
em->start = SZ_4K;
em->len = SZ_4K;
- em->block_start = SZ_4K;
- em->block_len = SZ_4K;
+ em->disk_bytenr = SZ_4K;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_4K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -284,8 +291,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
/* Add [0, 16K) */
em->start = 0;
em->len = SZ_16K;
- em->block_start = 0;
- em->block_len = SZ_16K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
@@ -305,11 +313,11 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
* em->start.
*/
if (start < em->start || start + len > extent_map_end(em) ||
- em->start != em->block_start || em->len != em->block_len) {
+ em->start != extent_map_block_start(em)) {
test_err(
-"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)",
+"case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)",
start, start + len, ret, em->start, em->len,
- em->block_start, em->block_len);
+ em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
free_extent_map(em);
@@ -370,8 +378,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
/* Add [0K, 8K) */
em->start = 0;
em->len = SZ_8K;
- em->block_start = 0;
- em->block_len = SZ_8K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_8K;
+ em->ram_bytes = SZ_8K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -391,8 +400,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
/* Add [8K, 32K) */
em->start = SZ_8K;
em->len = 24 * SZ_1K;
- em->block_start = SZ_16K; /* avoid merging */
- em->block_len = 24 * SZ_1K;
+ em->disk_bytenr = SZ_16K; /* avoid merging */
+ em->disk_num_bytes = 24 * SZ_1K;
+ em->ram_bytes = 24 * SZ_1K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -411,8 +421,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
/* Add [0K, 32K) */
em->start = 0;
em->len = SZ_32K;
- em->block_start = 0;
- em->block_len = SZ_32K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_32K;
+ em->ram_bytes = SZ_32K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
@@ -429,9 +440,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
}
if (start < em->start || start + len > extent_map_end(em)) {
test_err(
-"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)",
- start, start + len, ret, em->start, em->len, em->block_start,
- em->block_len);
+"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)",
+ start, start + len, ret, em->start, em->len,
+ em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
free_extent_map(em);
@@ -495,8 +506,9 @@ static int add_compressed_extent(struct btrfs_inode *inode,
em->start = start;
em->len = len;
- em->block_start = block_start;
- em->block_len = SZ_4K;
+ em->disk_bytenr = block_start;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = len;
em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
@@ -551,7 +563,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index)
struct rb_node *n;
int i;
- for (i = 0, n = rb_first_cached(&em_tree->map);
+ for (i = 0, n = rb_first(&em_tree->root);
valid_ranges[index][i].len && n;
i++, n = rb_next(n)) {
struct extent_map *entry = rb_entry(n, struct extent_map, rb_node);
@@ -716,8 +728,9 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
em->start = SZ_4K;
em->len = SZ_4K;
- em->block_start = SZ_16K;
- em->block_len = SZ_16K;
+ em->disk_bytenr = SZ_16K;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, 0, SZ_8K);
write_unlock(&em_tree->lock);
@@ -769,9 +782,10 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* [0, 16K), pinned */
em->start = 0;
em->len = SZ_16K;
- em->block_start = 0;
- em->block_len = SZ_4K;
- em->flags |= EXTENT_FLAG_PINNED;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_16K;
+ em->flags |= (EXTENT_FLAG_PINNED | EXTENT_FLAG_COMPRESS_ZLIB);
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -791,8 +805,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* [32K, 48K), not pinned */
em->start = SZ_32K;
em->len = SZ_16K;
- em->block_start = SZ_32K;
- em->block_len = SZ_16K;
+ em->disk_bytenr = SZ_32K;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -855,8 +870,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
- if (em->block_start != SZ_32K + SZ_4K) {
- test_err("em->block_start is %llu, expected 36K", em->block_start);
+ if (extent_map_block_start(em) != SZ_32K + SZ_4K) {
+ test_err("em->block_start is %llu, expected 36K",
+ extent_map_block_start(em));
goto out;
}
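
Note on the fields exercised above: disk_bytenr and disk_num_bytes now describe the on-disk extent itself, ram_bytes its uncompressed size, and offset the position of the mapping inside that extent, replacing the old block_start/block_len/orig_start trio. Below is a minimal sketch of how the block start appears to be derived from them, inferred only from the assertions in these tests; the helper body is an assumption, not copied from the patch.

/* Sketch, not from this patch: presumed behaviour of extent_map_block_start(). */
static inline u64 sketch_extent_map_block_start(const struct extent_map *em)
{
	/* Holes and inline extents keep their sentinel value in disk_bytenr. */
	if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
		return em->disk_bytenr;
	/* Compressed extents always point at the start of the on-disk extent. */
	if (extent_map_is_compressed(em))
		return em->disk_bytenr;
	/* Regular extents add the offset into the on-disk extent. */
	return em->disk_bytenr + em->offset;
}
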
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 99da9d34b77a..3ea3bc2225fe 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -117,7 +117,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
/* Now for a regular extent */
insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0,
- disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
+ disk_bytenr, sectorsize - 1, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
disk_bytenr += sectorsize;
offset += sectorsize - 1;
@@ -264,8 +264,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
free_extent_map(em);
@@ -283,8 +283,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_INLINE) {
- test_err("expected an inline, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_INLINE) {
+ test_err("expected an inline, got %llu", em->disk_bytenr);
goto out;
}
@@ -321,8 +321,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 4) {
@@ -344,8 +344,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize - 1) {
@@ -358,9 +358,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
@@ -372,8 +371,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -386,12 +385,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = em->block_start;
+ disk_bytenr = extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
free_extent_map(em);
@@ -401,8 +399,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -423,8 +421,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -437,15 +435,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("wrong orig offset, want %llu, have %llu",
- orig_start, em->orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu",
+ em->start, em->offset, orig_start);
goto out;
}
disk_bytenr += (em->start - orig_start);
- if (em->block_start != disk_bytenr) {
+ if (extent_map_block_start(em) != disk_bytenr) {
test_err("wrong block start, want %llu, have %llu",
- disk_bytenr, em->block_start);
+ disk_bytenr, extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
@@ -457,8 +455,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -472,9 +470,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
prealloc_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
@@ -486,8 +483,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -501,12 +498,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
prealloc_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = em->block_start;
+ disk_bytenr = extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
free_extent_map(em);
@@ -516,8 +512,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_HOLE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_HOLE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -530,15 +526,14 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("unexpected orig offset, wanted %llu, have %llu",
- orig_start, em->orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("unexpected offset, wanted %llu, have %llu",
+ em->start - orig_start, em->offset);
goto out;
}
- if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ if (extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + (em->start - em->orig_start),
- em->block_start);
+ disk_bytenr + em->offset, extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
@@ -549,8 +544,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -564,15 +559,14 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
prealloc_only, em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("wrong orig offset, want %llu, have %llu", orig_start,
- em->orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu",
+ em->start, em->offset, orig_start);
goto out;
}
- if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ if (extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + (em->start - em->orig_start),
- em->block_start);
+ disk_bytenr + em->offset, extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
@@ -584,8 +578,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -599,9 +593,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
compressed_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu",
- em->start, em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
@@ -618,8 +611,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -633,9 +626,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
compressed_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu",
- em->start, em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
@@ -643,7 +635,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
- disk_bytenr = em->block_start;
+ disk_bytenr = extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
free_extent_map(em);
@@ -653,8 +645,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -667,9 +659,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
@@ -680,9 +671,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != disk_bytenr) {
+ if (extent_map_block_start(em) != disk_bytenr) {
test_err("block start does not match, want %llu got %llu",
- disk_bytenr, em->block_start);
+ disk_bytenr, extent_map_block_start(em));
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -696,9 +687,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
compressed_only, em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("wrong orig offset, want %llu, have %llu",
- em->start, orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu",
+ em->start, em->offset, orig_start);
goto out;
}
if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
@@ -715,8 +706,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -729,9 +720,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
@@ -742,8 +732,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole extent, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole extent, got %llu", em->disk_bytenr);
goto out;
}
/*
@@ -762,9 +752,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
vacancy_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
@@ -775,8 +764,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -789,9 +778,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong orig offset, want 0, have %llu", em->offset);
goto out;
}
ret = 0;
@@ -855,8 +843,8 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != 0 || em->len != sectorsize) {
@@ -877,8 +865,8 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != sectorsize) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (extent_map_block_start(em) != sectorsize) {
+ test_err("expected a real extent, got %llu", extent_map_block_start(em));
goto out;
}
if (em->start != sectorsize || em->len != sectorsize) {
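
The checks above rely on em->offset replacing orig_start: the original extent start is now recovered as em->start - em->offset, and (for a non-compressed extent) the block start as disk_bytenr + em->offset. As a worked example with hypothetical numbers: if a 64K on-disk extent at bytenr 1M has its last 16K mapped at file offset 112K, then start = 112K, len = 16K, disk_bytenr = 1M, disk_num_bytes = ram_bytes = 64K and offset = 48K, so the old orig_start is 112K - 48K = 64K and the block start is 1M + 48K.
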
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3388c836b9a5..5e6fff8e1003 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -405,7 +405,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
int ret = 0;
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- root->last_trans < trans->transid) || force) {
+ btrfs_get_root_last_trans(root) < trans->transid) || force) {
WARN_ON(!force && root->commit_root != root->node);
/*
@@ -421,7 +421,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
smp_wmb();
spin_lock(&fs_info->fs_roots_radix_lock);
- if (root->last_trans == trans->transid && !force) {
+ if (btrfs_get_root_last_trans(root) == trans->transid && !force) {
spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
@@ -429,7 +429,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
- root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(root, trans->transid);
/* this is pretty tricky. We don't want to
* take the relocation lock in btrfs_record_root_in_trans
@@ -491,7 +491,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
* and barriers
*/
smp_rmb();
- if (root->last_trans == trans->transid &&
+ if (btrfs_get_root_last_trans(root) == trans->transid &&
!test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
return 0;
@@ -1637,7 +1637,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
- struct inode *parent_inode = pending->dir;
+ struct inode *parent_inode = &pending->dir->vfs_inode;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *tmp;
@@ -1989,6 +1989,25 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
btrfs_put_transaction(cur_trans);
}
+/*
+ * If there is a running transaction, commit it, or if it's already
+ * committing, wait for its commit to complete. Does not start and commit a
+ * new transaction if there isn't one running.
+ */
+int btrfs_commit_current_transaction(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ int ret = PTR_ERR(trans);
+
+ return (ret == -ENOENT) ? 0 : ret;
+ }
+
+ return btrfs_commit_transaction(trans);
+}
+
static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2110,7 +2129,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
}
/*
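
The new helper btrfs_commit_current_transaction() packages the attach-barrier-then-commit pattern, treating -ENOENT from btrfs_attach_transaction_barrier() as "nothing to commit". A sketch of a caller under that assumption; the function below is hypothetical and not part of this patch.

/* Hypothetical caller, for illustration only: a sync-style path that wants
 * the current transaction persisted, without starting a new one. */
static int example_sync_fs(struct btrfs_fs_info *fs_info)
{
	/* Commits a running transaction, waits for one that is already
	 * committing, and returns 0 if no transaction is running at all. */
	return btrfs_commit_current_transaction(fs_info->tree_root);
}
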
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4e451ab173b1..98c03ddc760b 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -172,7 +172,7 @@ struct btrfs_trans_handle {
struct btrfs_pending_snapshot {
struct dentry *dentry;
- struct inode *dir;
+ struct btrfs_inode *dir;
struct btrfs_root *root;
struct btrfs_root_item *root_item;
struct btrfs_root *snap;
@@ -229,11 +229,11 @@ bool __cold abort_should_print_stack(int error);
*/
#define btrfs_abort_transaction(trans, error) \
do { \
- bool first = false; \
+ bool __first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
- first = true; \
+ __first = true; \
if (WARN(abort_should_print_stack(error), \
KERN_ERR \
"BTRFS: Transaction aborted (error %d)\n", \
@@ -246,7 +246,7 @@ do { \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (error), first); \
+ __LINE__, (error), __first); \
} while (0)
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
@@ -268,6 +268,7 @@ void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
+int btrfs_commit_current_transaction(struct btrfs_root *root);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
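
The rename of the macro-local variable to __first presumably avoids shadowing an identically named variable at a call site, e.g. the hypothetical caller below (for illustration only, not from this patch).

/* With the old macro, the "bool first" declared inside
 * btrfs_abort_transaction() would shadow the caller's own "first" and
 * trigger -Wshadow warnings. */
static void example_abort_on_first_error(struct btrfs_trans_handle *trans,
					 bool first, int error)
{
	if (first && error)
		btrfs_abort_transaction(trans, error);
}
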
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index a2c3651a3d8f..6388786fd8b5 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -340,6 +340,24 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
}
+ /*
+ * For non-compressed data extents, ram_bytes should match
+ * disk_num_bytes.
+ * However we do not really utilize ram_bytes in this case, so this
+ * check is optional and only done for DEBUG builds, to help developers
+ * catch unexpected behavior.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG) &&
+ btrfs_file_extent_compression(leaf, fi) == BTRFS_COMPRESS_NONE &&
+ btrfs_file_extent_disk_bytenr(leaf, fi)) {
+ if (WARN_ON(btrfs_file_extent_ram_bytes(leaf, fi) !=
+ btrfs_file_extent_disk_num_bytes(leaf, fi)))
+ file_extent_err(leaf, slot,
+"mismatch ram_bytes (%llu) and disk_num_bytes (%llu) for non-compressed extent",
+ btrfs_file_extent_ram_bytes(leaf, fi),
+ btrfs_file_extent_disk_num_bytes(leaf, fi));
+ }
+
return 0;
}
@@ -1682,9 +1700,6 @@ static int check_inode_ref(struct extent_buffer *leaf,
static int check_raid_stripe_extent(const struct extent_buffer *leaf,
const struct btrfs_key *key, int slot)
{
- struct btrfs_stripe_extent *stripe_extent =
- btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
-
if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid for raid stripe extent, have %llu expect aligned to %u",
@@ -1698,22 +1713,6 @@ static int check_raid_stripe_extent(const struct extent_buffer *leaf,
return -EUCLEAN;
}
- switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) {
- case BTRFS_STRIPE_RAID0:
- case BTRFS_STRIPE_RAID1:
- case BTRFS_STRIPE_DUP:
- case BTRFS_STRIPE_RAID10:
- case BTRFS_STRIPE_RAID5:
- case BTRFS_STRIPE_RAID6:
- case BTRFS_STRIPE_RAID1C3:
- case BTRFS_STRIPE_RAID1C4:
- break;
- default:
- generic_err(leaf, slot, "invalid raid stripe encoding %u",
- btrfs_stripe_extent_encoding(leaf, stripe_extent));
- return -EUCLEAN;
- }
-
return 0;
}
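
The new DEBUG-only check encodes the invariant that ram_bytes and disk_num_bytes only diverge for compressed extents. Illustration with hypothetical numbers:

/*
 * A 16K range of file data:
 *
 *   not compressed:  ram_bytes = 16K, disk_num_bytes = 16K (must match)
 *   zlib compressed: ram_bytes = 16K, disk_num_bytes =  4K (on-disk size)
 *
 * The check above therefore only warns for uncompressed extents whose two
 * fields disagree; compressed extents are expected to differ.
 */
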
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0bce1d45e252..f0cf8ce26f01 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -151,7 +151,7 @@ static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
* attempt a transaction commit, resulting in a deadlock.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget(root->fs_info->sb, objectid, root);
+ inode = btrfs_iget(objectid, root);
memalloc_nofs_restore(nofs_flag);
return inode;
@@ -1644,7 +1644,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = (u64)-1;
if (inode->i_nlink == 0) {
if (S_ISDIR(inode->i_mode)) {
@@ -2839,7 +2840,7 @@ static void wait_for_writer(struct btrfs_root *root)
finish_wait(&root->log_writer_wait, &wait);
}
-void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode)
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
{
ctx->log_ret = 0;
ctx->log_transid = 0;
@@ -2858,7 +2859,7 @@ void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode)
void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
{
- struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_inode *inode = ctx->inode;
if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
!test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
@@ -2876,7 +2877,7 @@ void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_extent *tmp;
- ASSERT(inode_is_locked(ctx->inode));
+ ASSERT(inode_is_locked(&ctx->inode->vfs_inode));
list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
list_del_init(&ordered->log_list);
@@ -4253,8 +4254,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, bool inode_item_dropped)
{
struct btrfs_inode_item *inode_item;
+ struct btrfs_key key;
int ret;
+ btrfs_get_inode_key(inode, &key);
/*
* If we are doing a fast fsync and the inode was logged before in the
* current transaction, then we know the inode was previously logged and
@@ -4266,7 +4269,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
* already exists can also result in unnecessarily splitting a leaf.
*/
if (!inode_item_dropped && inode->logged_trans == trans->transid) {
- ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
+ ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
ASSERT(ret <= 0);
if (ret > 0)
ret = -ENOENT;
@@ -4280,7 +4283,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
* the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
* flags and set ->logged_trans to 0.
*/
- ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
sizeof(*inode_item));
ASSERT(ret != -EEXIST);
}
@@ -4594,6 +4597,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
{
struct btrfs_ordered_extent *ordered;
struct btrfs_root *csum_root;
+ u64 block_start;
u64 csum_offset;
u64 csum_len;
u64 mod_start = em->start;
@@ -4603,7 +4607,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
if (inode->flags & BTRFS_INODE_NODATASUM ||
(em->flags & EXTENT_FLAG_PREALLOC) ||
- em->block_start == EXTENT_MAP_HOLE)
+ em->disk_bytenr == EXTENT_MAP_HOLE)
return 0;
list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
@@ -4667,17 +4671,18 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
/* If we're compressed we have to save the entire range of csums. */
if (extent_map_is_compressed(em)) {
csum_offset = 0;
- csum_len = max(em->block_len, em->orig_block_len);
+ csum_len = em->disk_num_bytes;
} else {
csum_offset = mod_start - em->start;
csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
- csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
- ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
- em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, false);
+ block_start = extent_map_block_start(em);
+ csum_root = btrfs_csum_root(trans->fs_info, block_start);
+ ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
+ block_start + csum_offset + csum_len - 1,
+ &ordered_sums, false);
if (ret < 0)
return ret;
ret = 0;
@@ -4707,7 +4712,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
enum btrfs_compression_type compress_type;
- u64 extent_offset = em->start - em->orig_start;
+ u64 extent_offset = em->offset;
+ u64 block_start = extent_map_block_start(em);
u64 block_len;
int ret;
@@ -4717,14 +4723,13 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
else
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
- block_len = max(em->block_len, em->orig_block_len);
+ block_len = em->disk_num_bytes;
compress_type = extent_map_compression(em);
if (compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
- extent_offset);
+ } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
}
@@ -5927,7 +5932,7 @@ again:
if (ret < 0) {
return ret;
} else if (ret > 0 &&
- other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
+ other_ino != btrfs_ino(ctx->inode)) {
if (ins_nr > 0) {
ins_nr++;
} else {
@@ -7074,6 +7079,15 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
}
/*
+ * If we're logging an inode from a subvolume created in the current
+ * transaction, we must force a commit since the root is not persisted.
+ */
+ if (btrfs_root_generation(&root->root_item) == trans->transid) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto end_no_trans;
+ }
+
+ /*
* Skip already logged inodes or inodes corresponding to tmpfiles
* (since logging them is pointless, a link count of 0 means they
* will never be accessible).
@@ -7454,6 +7468,24 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
}
/*
+ * Call this when creating a subvolume in a directory.
+ * Because we don't commit a transaction when creating a subvolume, we can't
+ * allow the directory pointing to the subvolume to be logged with an entry that
+ * points to an unpersisted root if we are still in the transaction used to
+ * create the subvolume, so make any attempt to log the directory result in a
+ * full log sync.
+ * Also we don't need to worry about renames, since btrfs_rename() marks the log
+ * for full commit when renaming a subvolume.
+ */
+void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir)
+{
+ mutex_lock(&dir->log_mutex);
+ dir->last_unlink_trans = trans->transid;
+ mutex_unlock(&dir->log_mutex);
+}
+
+/*
* Update the log after adding a new name for an inode.
*
* @trans: Transaction handle.
@@ -7585,7 +7617,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
goto out;
}
- btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ btrfs_init_log_ctx(&ctx, inode);
ctx.logging_new_name = true;
btrfs_init_log_ctx_scratch_eb(&ctx);
/*
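
btrfs_record_new_subvolume() is exported for the subvolume creation path; a sketch of the expected call site follows (hypothetical, since the creation side is not part of this hunk).

/* Hypothetical call site, for illustration only: called after linking the new
 * subvolume into its parent directory, still inside the transaction that
 * created the root, so that any later fsync of the directory in this
 * transaction falls back to a full transaction commit. */
static void example_after_subvol_create(struct btrfs_trans_handle *trans,
					struct btrfs_inode *parent_dir)
{
	btrfs_record_new_subvolume(trans, parent_dir);
}
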
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 22e9cbc81577..dc313e6bb2fa 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -37,7 +37,7 @@ struct btrfs_log_ctx {
bool logging_new_delayed_dentries;
/* Indicate if the inode being logged was logged before. */
bool logged_before;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct list_head list;
/* Only used for fast fsyncs. */
struct list_head ordered_extents;
@@ -55,7 +55,7 @@ struct btrfs_log_ctx {
struct extent_buffer *scratch_eb;
};
-void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode);
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode);
void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx);
void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx);
@@ -94,6 +94,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
bool for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
+void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir);
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct dentry *old_dentry, struct btrfs_inode *old_dir,
u64 old_dir_index, struct dentry *parent);
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 183863f4bfa4..fc59b57257d6 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -50,6 +50,7 @@ void ulist_init(struct ulist *ulist)
INIT_LIST_HEAD(&ulist->nodes);
ulist->root = RB_ROOT;
ulist->nnodes = 0;
+ ulist->prealloc = NULL;
}
/*
@@ -68,6 +69,8 @@ void ulist_release(struct ulist *ulist)
list_for_each_entry_safe(node, next, &ulist->nodes, list) {
kfree(node);
}
+ kfree(ulist->prealloc);
+ ulist->prealloc = NULL;
ulist->root = RB_ROOT;
INIT_LIST_HEAD(&ulist->nodes);
}
@@ -105,6 +108,12 @@ struct ulist *ulist_alloc(gfp_t gfp_mask)
return ulist;
}
+void ulist_prealloc(struct ulist *ulist, gfp_t gfp_mask)
+{
+ if (!ulist->prealloc)
+ ulist->prealloc = kzalloc(sizeof(*ulist->prealloc), gfp_mask);
+}
+
/*
* Free dynamically allocated ulist.
*
@@ -206,9 +215,15 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
*old_aux = node->aux;
return 0;
}
- node = kmalloc(sizeof(*node), gfp_mask);
- if (!node)
- return -ENOMEM;
+
+ if (ulist->prealloc) {
+ node = ulist->prealloc;
+ ulist->prealloc = NULL;
+ } else {
+ node = kmalloc(sizeof(*node), gfp_mask);
+ if (!node)
+ return -ENOMEM;
+ }
node->val = val;
node->aux = aux;
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 8e200fe1a2dd..c62a372f1462 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -41,12 +41,14 @@ struct ulist {
struct list_head nodes;
struct rb_root root;
+ struct ulist_node *prealloc;
};
void ulist_init(struct ulist *ulist);
void ulist_release(struct ulist *ulist);
void ulist_reinit(struct ulist *ulist);
struct ulist *ulist_alloc(gfp_t gfp_mask);
+void ulist_prealloc(struct ulist *ulist, gfp_t mask);
void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
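
ulist_prealloc() lets a caller allocate the next node ahead of time so that a later ulist_add() can succeed without allocating, presumably for call sites that add entries while holding a spinlock. A sketch under that assumption; the helper below is hypothetical and not from this patch.

/* Preallocate with GFP_KERNEL outside the lock; the ulist_add() under the
 * lock then consumes ulist->prealloc instead of calling kmalloc(). */
static int example_add_under_lock(struct ulist *ulist, spinlock_t *lock,
				  u64 val, u64 aux)
{
	int ret;

	ulist_prealloc(ulist, GFP_KERNEL);
	spin_lock(lock);
	ret = ulist_add(ulist, val, aux, GFP_ATOMIC);
	spin_unlock(lock);
	return ret;
}
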
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index b0aff297d67d..eae75bb572b9 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -13,7 +13,7 @@
#include "accessors.h"
#include "uuid-tree.h"
-static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
+static void btrfs_uuid_to_key(const u8 *uuid, u8 type, struct btrfs_key *key)
{
key->type = type;
key->objectid = get_unaligned_le64(uuid);
@@ -21,7 +21,7 @@ static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
}
/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */
-static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
+static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid,
u8 type, u64 subid)
{
int ret;
@@ -81,7 +81,7 @@ out:
return ret;
}
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid_cpu)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -145,7 +145,7 @@ out:
return ret;
}
-int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -256,7 +256,7 @@ out:
* < 0 if an error occurred
*/
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
- u8 *uuid, u8 type, u64 subvolid)
+ const u8 *uuid, u8 type, u64 subvolid)
{
int ret = 0;
struct btrfs_root *subvol_root;
diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h
index 080ede0227ae..a3f5757cc7cf 100644
--- a/fs/btrfs/uuid-tree.h
+++ b/fs/btrfs/uuid-tree.h
@@ -8,9 +8,9 @@
struct btrfs_trans_handle;
struct btrfs_fs_info;
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid);
-int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c39145e8c4ad..fcedc43ef291 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -722,7 +722,7 @@ error_free_page:
return -EINVAL;
}
-u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
+const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
{
bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
@@ -1380,20 +1380,13 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct file *bdev_file;
- u64 bytenr, bytenr_orig;
+ u64 bytenr;
dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
/*
- * we would like to check all the supers, but that would make
- * a btrfs mount succeed after a mkfs from a different FS.
- * So, we need to add a special mount option to scan for
- * later supers, using BTRFS_SUPER_MIRROR_MAX instead
- */
-
- /*
* Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command,
* resulting in failure.
@@ -1407,7 +1400,12 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
if (IS_ERR(bdev_file))
return ERR_CAST(bdev_file);
- bytenr_orig = btrfs_sb_offset(0);
+ /*
+ * We would like to check all the super blocks, but doing so would
+ * allow a mount to succeed after a mkfs from a different filesystem.
+ * Currently, recovery from a bad primary btrfs superblock is done
+ * using the userspace command 'btrfs check --super'.
+ */
ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
@@ -1415,7 +1413,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
}
disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
- bytenr_orig);
+ btrfs_sb_offset(0));
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
@@ -2991,16 +2989,19 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_handle_fs_error(fs_info, -ENOENT,
- "Failed lookup while freeing chunk.");
- ret = -ENOENT;
+ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
+ chunk_offset);
+ btrfs_abort_transaction(trans, -ENOENT);
+ ret = -EUCLEAN;
goto out;
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0)
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to delete chunk item.");
+ if (ret < 0) {
+ btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
out:
btrfs_free_path(path);
return ret;
@@ -5628,8 +5629,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
u64 start = ctl->start;
u64 type = ctl->type;
int ret;
- int i;
- int j;
map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
if (!map)
@@ -5644,8 +5643,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
map->sub_stripes = ctl->sub_stripes;
map->num_stripes = ctl->num_stripes;
- for (i = 0; i < ctl->ndevs; ++i) {
- for (j = 0; j < ctl->dev_stripes; ++j) {
+ for (int i = 0; i < ctl->ndevs; i++) {
+ for (int j = 0; j < ctl->dev_stripes; j++) {
int s = i * ctl->dev_stripes + j;
map->stripes[s].dev = devices_info[i].dev;
map->stripes[s].physical = devices_info[i].dev_offset +
@@ -6288,20 +6287,19 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
return ret;
}
-static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_io_context *bioc,
+static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
struct btrfs_dev_replace *dev_replace,
u64 logical,
- int *num_stripes_ret, int *max_errors_ret)
+ struct btrfs_io_geometry *io_geom)
{
u64 srcdev_devid = dev_replace->srcdev->devid;
/*
* At this stage, num_stripes is still the real number of stripes,
* excluding the duplicated stripes.
*/
- int num_stripes = *num_stripes_ret;
+ int num_stripes = io_geom->num_stripes;
+ int max_errors = io_geom->max_errors;
int nr_extra_stripes = 0;
- int max_errors = *max_errors_ret;
int i;
/*
@@ -6342,7 +6340,7 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
* replace.
* If we have 2 extra stripes, only choose the one with smaller physical.
*/
- if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
+ if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
@@ -6360,8 +6358,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
}
}
- *num_stripes_ret = num_stripes + nr_extra_stripes;
- *max_errors_ret = max_errors + nr_extra_stripes;
+ io_geom->num_stripes = num_stripes + nr_extra_stripes;
+ io_geom->max_errors = max_errors + nr_extra_stripes;
bioc->replace_nr_stripes = nr_extra_stripes;
}
@@ -6624,7 +6622,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_chunk_map *map;
struct btrfs_io_geometry io_geom = { 0 };
u64 map_offset;
- int i;
int ret = 0;
int num_copies;
struct btrfs_io_context *bioc = NULL;
@@ -6770,7 +6767,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
- for (i = 0; i < io_geom.num_stripes; i++) {
+ for (int i = 0; i < io_geom.num_stripes; i++) {
ret = set_io_stripe(fs_info, logical, length,
&bioc->stripes[i], map, &io_geom);
if (ret < 0)
@@ -6790,8 +6787,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ) {
- handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
- &io_geom.num_stripes, &io_geom.max_errors);
+ handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom);
}
*bioc_ret = bioc;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 66e6fc481ecd..37a09ebb34dd 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -834,6 +834,6 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb);
+const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 15d0999e340e..738c7bb8ea7c 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -24,7 +24,7 @@
#include "accessors.h"
#include "dir-item.h"
-int btrfs_getxattr(struct inode *inode, const char *name,
+int btrfs_getxattr(const struct inode *inode, const char *name,
void *buffer, size_t size)
{
struct btrfs_dir_item *di;
@@ -451,7 +451,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_set_prop(trans, inode, name, value, size, flags);
+ ret = btrfs_set_prop(trans, BTRFS_I(inode), name, value, size, flags);
if (!ret) {
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index b9376ea258ff..8dc4cf49f6f0 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -14,7 +14,7 @@ struct btrfs_trans_handle;
extern const struct xattr_handler * const btrfs_xattr_handlers[];
-int btrfs_getxattr(struct inode *inode, const char *name,
+int btrfs_getxattr(const struct inode *inode, const char *name,
void *buffer, size_t size);
int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const void *value, size_t size, int flags);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index d9e5c88a0f85..30971dd741e2 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -18,6 +18,7 @@
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/refcount.h>
+#include "btrfs_inode.h"
#include "compression.h"
/* workspace buffer size for s390 zlib hardware support */
@@ -112,8 +113,13 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
*total_out = 0;
*total_in = 0;
- if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) {
- pr_warn("BTRFS: deflateInit failed\n");
+ ret = zlib_deflateInit(&workspace->strm, workspace->level);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zlib compression init failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode), start);
ret = -EIO;
goto out;
}
@@ -182,9 +188,13 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
}
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
- if (ret != Z_OK) {
- pr_debug("BTRFS: deflate in loop returned %d\n",
- ret);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_warn(inode->root->fs_info,
+ "zlib compression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ start);
zlib_deflateEnd(&workspace->strm);
ret = -EIO;
goto out;
@@ -307,9 +317,14 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
- pr_warn("BTRFS: inflateInit failed\n");
+ ret = zlib_inflateInit2(&workspace->strm, wbits);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
kunmap_local(data_in);
+ btrfs_err(inode->root->fs_info,
+ "zlib decompression init failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode), cb->start);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -348,10 +363,15 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
}
}
- if (ret != Z_STREAM_END)
+ if (unlikely(ret != Z_STREAM_END)) {
+ btrfs_err(cb->bbio.inode->root->fs_info,
+ "zlib decompression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(cb->bbio.inode->root),
+ btrfs_ino(cb->bbio.inode), cb->start);
ret = -EIO;
- else
+ } else {
ret = 0;
+ }
done:
zlib_inflateEnd(&workspace->strm);
if (data_in)
@@ -386,8 +406,14 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
- pr_warn("BTRFS: inflateInit failed\n");
+ ret = zlib_inflateInit2(&workspace->strm, wbits);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zlib decompression init failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ page_offset(dest_page));
return -EIO;
}
@@ -404,8 +430,12 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
out:
if (unlikely(to_copy != destlen)) {
- pr_warn_ratelimited("BTRFS: inflate failed, decompressed=%lu expected=%zu\n",
- to_copy, destlen);
+ struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+"zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ page_offset(dest_page), to_copy, destlen);
ret = -EIO;
} else {
ret = 0;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 947a87576f6c..df7733044f7e 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -87,9 +87,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
bool empty[BTRFS_NR_SB_LOG_ZONES];
bool full[BTRFS_NR_SB_LOG_ZONES];
sector_t sector;
- int i;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
full[i] = sb_zone_is_full(&zones[i]);
@@ -121,9 +120,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
struct address_space *mapping = bdev->bd_mapping;
struct page *page[BTRFS_NR_SB_LOG_ZONES];
struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
- int i;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
BTRFS_SUPER_INFO_SIZE;
@@ -144,7 +142,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
else
sector = zones[0].start;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
btrfs_release_disk_super(super[i]);
} else if (!full[0] && (empty[1] || full[1])) {
sector = zones[0].wp;
@@ -652,8 +650,7 @@ out:
return NULL;
}
-int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
- struct blk_zone *zone)
+static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone)
{
unsigned int nr_zones = 1;
int ret;
@@ -770,7 +767,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt)
+int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info, unsigned long *mount_opt)
{
if (!btrfs_is_zoned(info))
return 0;
@@ -1726,7 +1723,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
if (!btrfs_is_zoned(fs_info))
return false;
- if (!inode || !is_data_inode(&inode->vfs_inode))
+ if (!inode || !is_data_inode(inode))
return false;
if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
@@ -1768,7 +1765,7 @@ void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
u64 logical)
{
- struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
+ struct extent_map_tree *em_tree = &ordered->inode->extent_tree;
struct extent_map *em;
ordered->disk_bytenr = logical;
@@ -1776,7 +1773,9 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
write_lock(&em_tree->lock);
em = search_extent_mapping(em_tree, ordered->file_offset,
ordered->num_bytes);
- em->block_start = logical;
+ /* The em should be a new COW extent, thus it should not have an offset. */
+ ASSERT(em->offset == 0);
+ em->disk_bytenr = logical;
free_extent_map(em);
write_unlock(&em_tree->lock);
}
@@ -1787,7 +1786,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
struct btrfs_ordered_extent *new;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
- split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
+ split_extent_map(ordered->inode, ordered->file_offset,
ordered->num_bytes, len, logical))
return false;
@@ -1801,7 +1800,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *sum;
u64 logical, len;
@@ -1845,7 +1844,7 @@ out:
* here so that we don't attempt to log the csums later.
*/
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
+ test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) {
while ((sum = list_first_entry_or_null(&ordered->list,
typeof(*sum), list))) {
list_del(&sum->list);
@@ -2215,8 +2214,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
/* Ensure all writes in this block group finish */
btrfs_wait_block_group_reservations(block_group);
/* No need to wait for NOCOW writers. Zoned mode does not allow that */
- btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
- block_group->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group);
/* Wait for extent buffers to be written. */
if (is_metadata)
wait_eb_writebacks(block_group);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 77c4321e331f..d66d00c08001 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -53,14 +53,12 @@ struct btrfs_zoned_device_info {
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered);
#ifdef CONFIG_BLK_DEV_ZONED
-int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
- struct blk_zone *zone);
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt);
+int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info, unsigned long *mount_opt);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
@@ -98,11 +96,6 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
-static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
- struct blk_zone *zone)
-{
- return 0;
-}
static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
@@ -136,7 +129,7 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
return -EOPNOTSUPP;
}
-static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info,
+static inline int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
unsigned long *mount_opt)
{
return 0;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 2b232b82c3a8..2a079561b2b1 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -19,6 +19,7 @@
#include <linux/zstd.h>
#include "misc.h"
#include "fs.h"
+#include "btrfs_inode.h"
#include "compression.h"
#include "super.h"
@@ -399,8 +400,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Initialize the stream */
stream = zstd_init_cstream(&params, len, workspace->mem,
workspace->size);
- if (!stream) {
- pr_warn("BTRFS: zstd_init_cstream failed\n");
+ if (unlikely(!stream)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zstd compression init level %d failed, root %llu inode %llu offset %llu",
+ workspace->req_level, btrfs_root_id(inode->root),
+ btrfs_ino(inode), start);
ret = -EIO;
goto out;
}
@@ -429,9 +435,14 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_compress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_compress_stream returned %d\n",
- zstd_get_error_code(ret2));
+ if (unlikely(zstd_is_error(ret2))) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_warn(inode->root->fs_info,
+"zstd compression level %d failed, error %d root %llu inode %llu offset %llu",
+ workspace->req_level, zstd_get_error_code(ret2),
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ start);
ret = -EIO;
goto out;
}
@@ -497,9 +508,14 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
size_t ret2;
ret2 = zstd_end_stream(stream, &workspace->out_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_end_stream returned %d\n",
- zstd_get_error_code(ret2));
+ if (unlikely(zstd_is_error(ret2))) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu",
+ workspace->req_level, zstd_get_error_code(ret2),
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ start);
ret = -EIO;
goto out;
}
@@ -561,8 +577,12 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
- if (!stream) {
- pr_debug("BTRFS: zstd_init_dstream failed\n");
+ if (unlikely(!stream)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression init failed, root %llu inode %llu offset %llu",
+ btrfs_root_id(inode->root), btrfs_ino(inode), cb->start);
ret = -EIO;
goto done;
}
@@ -580,9 +600,13 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
- zstd_get_error_code(ret2));
+ if (unlikely(zstd_is_error(ret2))) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression failed, error %d root %llu inode %llu offset %llu",
+ zstd_get_error_code(ret2), btrfs_root_id(inode->root),
+ btrfs_ino(inode), cb->start);
ret = -EIO;
goto done;
}
@@ -637,8 +661,14 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in,
stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
- if (!stream) {
- pr_warn("BTRFS: zstd_init_dstream failed\n");
+ if (unlikely(!stream)) {
+ struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression init failed, root %llu inode %llu offset %llu",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ page_offset(dest_page));
+ ret = -EIO;
goto finish;
}
@@ -655,9 +685,13 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in,
* one call should end the decompression.
*/
ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf);
- if (zstd_is_error(ret)) {
- pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n",
- zstd_get_error_code(ret));
+ if (unlikely(zstd_is_error(ret))) {
+ struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression failed, error %d root %llu inode %llu offset %llu",
+ zstd_get_error_code(ret), btrfs_root_id(inode->root),
+ btrfs_ino(inode), page_offset(dest_page));
goto finish;
}
to_copy = workspace->out_buf.pos;
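Note on the zstd.c hunks above: they replace bare pr_warn()/pr_debug() messages with btrfs_err()/btrfs_warn() calls that name the filesystem, root, inode and file offset, so a compression or decompression failure can be traced to a specific file. The streaming flow being instrumented (init, per-chunk compress, end, with an error check after each step) can be exercised from user space. The following is a minimal standalone sketch against libzstd, not the kernel's zstd_* wrappers, and the message wording is only illustrative.

/*
 * Userspace sketch of the streaming-compress error checks touched above:
 * init the stream, compress one chunk, end the stream, and check each step.
 */
#include <stdio.h>
#include <zstd.h>

int main(void)
{
	const char msg[] = "hello btrfs zstd";
	char dst[256];
	ZSTD_CStream *cs = ZSTD_createCStream();
	size_t ret;

	if (!cs) {			/* mirrors the "if (!stream)" check */
		fprintf(stderr, "zstd compression init failed\n");
		return 1;
	}
	/* Level 3, the btrfs default zstd level. */
	ZSTD_initCStream(cs, 3);

	ZSTD_inBuffer in = { msg, sizeof(msg), 0 };
	ZSTD_outBuffer out = { dst, sizeof(dst), 0 };

	ret = ZSTD_compressStream(cs, &out, &in);
	if (ZSTD_isError(ret)) {	/* mirrors zstd_is_error(ret2) */
		fprintf(stderr, "zstd compression failed: %s\n",
			ZSTD_getErrorName(ret));
		ZSTD_freeCStream(cs);
		return 1;
	}

	ret = ZSTD_endStream(cs, &out);	/* mirrors zstd_end_stream() */
	if (ZSTD_isError(ret)) {
		fprintf(stderr, "zstd compression end failed: %s\n",
			ZSTD_getErrorName(ret));
		ZSTD_freeCStream(cs);
		return 1;
	}

	printf("compressed %zu -> %zu bytes\n", sizeof(msg), out.pos);
	ZSTD_freeCStream(cs);
	return 0;
}

Build with "gcc demo.c -lzstd". The kernel code differs mainly in using a preallocated per-CPU workspace and a bounded window (ZSTD_BTRFS_MAX_INPUT) instead of heap-allocated contexts, and in reporting errors through the btrfs message helpers shown in the diff.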
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index c978fa2893a5..246c0fbd582e 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -291,9 +291,6 @@ TRACE_EVENT_CONDITION(btrfs_get_extent,
__field( u64, ino )
__field( u64, start )
__field( u64, len )
- __field( u64, orig_start )
- __field( u64, block_start )
- __field( u64, block_len )
__field( u32, flags )
__field( int, refs )
),
@@ -303,23 +300,15 @@ TRACE_EVENT_CONDITION(btrfs_get_extent,
__entry->ino = btrfs_ino(inode);
__entry->start = map->start;
__entry->len = map->len;
- __entry->orig_start = map->orig_start;
- __entry->block_start = map->block_start;
- __entry->block_len = map->block_len;
__entry->flags = map->flags;
__entry->refs = refcount_read(&map->refs);
),
- TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu "
- "orig_start=%llu block_start=%llu(%s) "
- "block_len=%llu flags=%s refs=%u",
+ TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu flags=%s refs=%u",
show_root_type(__entry->root_objectid),
__entry->ino,
__entry->start,
__entry->len,
- __entry->orig_start,
- show_map_type(__entry->block_start),
- __entry->block_len,
show_map_flags(__entry->flags),
__entry->refs)
);
@@ -2617,7 +2606,6 @@ TRACE_EVENT(btrfs_extent_map_shrinker_remove_em,
__field( u64, root_id )
__field( u64, start )
__field( u64, len )
- __field( u64, block_start )
__field( u32, flags )
),
@@ -2626,15 +2614,12 @@ TRACE_EVENT(btrfs_extent_map_shrinker_remove_em,
__entry->root_id = inode->root->root_key.objectid;
__entry->start = em->start;
__entry->len = em->len;
- __entry->block_start = em->block_start;
__entry->flags = em->flags;
),
- TP_printk_btrfs(
-"ino=%llu root=%llu(%s) start=%llu len=%llu block_start=%llu(%s) flags=%s",
+ TP_printk_btrfs("ino=%llu root=%llu(%s) start=%llu len=%llu flags=%s",
__entry->ino, show_root_type(__entry->root_id),
__entry->start, __entry->len,
- show_map_type(__entry->block_start),
show_map_flags(__entry->flags))
);
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index d24e8e121507..fc29d273845d 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -747,21 +747,9 @@ struct btrfs_raid_stride {
__le64 physical;
} __attribute__ ((__packed__));
-/* The stripe_extent::encoding, 1:1 mapping of enum btrfs_raid_types. */
-#define BTRFS_STRIPE_RAID0 1
-#define BTRFS_STRIPE_RAID1 2
-#define BTRFS_STRIPE_DUP 3
-#define BTRFS_STRIPE_RAID10 4
-#define BTRFS_STRIPE_RAID5 5
-#define BTRFS_STRIPE_RAID6 6
-#define BTRFS_STRIPE_RAID1C3 7
-#define BTRFS_STRIPE_RAID1C4 8
-
struct btrfs_stripe_extent {
- __u8 encoding;
- __u8 reserved[7];
/* An array of raid strides this stripe is composed of. */
- struct btrfs_raid_stride strides[];
+ __DECLARE_FLEX_ARRAY(struct btrfs_raid_stride, strides);
} __attribute__ ((__packed__));
#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
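Note on the hunk above: with the encoding/reserved fields gone, struct btrfs_stripe_extent would consist of nothing but the strides[] flexible array, and ISO C does not allow a flexible array member as the sole member of a struct, hence the switch to the UAPI __DECLARE_FLEX_ARRAY() helper. Below is a simplified, standalone illustration of the idea; the macro shown is a sketch in the same spirit as the one in include/uapi/linux/stddef.h, not a verbatim copy, and like the kernel's it relies on the GNU C zero-sized-struct extension.

#include <stdio.h>
#include <stddef.h>

/* Sketch in the spirit of the UAPI helper, not a verbatim copy. */
#define DECLARE_FLEX_ARRAY(TYPE, NAME)		\
	struct {				\
		struct { } __empty_ ## NAME;	\
		TYPE NAME[];			\
	}

struct raid_stride {
	unsigned long long devid;
	unsigned long long physical;
} __attribute__((__packed__));

struct stripe_extent {
	/* "struct raid_stride strides[];" alone would not be valid ISO C here. */
	DECLARE_FLEX_ARRAY(struct raid_stride, strides);
} __attribute__((__packed__));

int main(void)
{
	/* Under GCC/Clang the struct stays zero-sized and strides[] sits at offset 0. */
	printf("sizeof(struct stripe_extent) = %zu\n", sizeof(struct stripe_extent));
	printf("offsetof(strides)           = %zu\n",
	       offsetof(struct stripe_extent, strides));
	return 0;
}

With the wrapper the struct behaves like one containing only a flexible array: sizeof() is 0 and strides[] starts at offset 0, so code that computes the number of strides from the item size keeps working.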
@@ -777,6 +765,14 @@ struct btrfs_stripe_extent {
#define BTRFS_SUPER_FLAG_CHANGING_FSID (1ULL << 35)
#define BTRFS_SUPER_FLAG_CHANGING_FSID_V2 (1ULL << 36)
+/*
+ * These are temporary flags used by btrfs-progs for offline conversion.
+ * They are rejected by the kernel.
+ * But they are still kept here to avoid conflicts.
+ */
+#define BTRFS_SUPER_FLAG_CHANGING_BG_TREE (1ULL << 38)
+#define BTRFS_SUPER_FLAG_CHANGING_DATA_CSUM (1ULL << 39)
+#define BTRFS_SUPER_FLAG_CHANGING_META_CSUM (1ULL << 40)
/*
* items in the extent btree are used to record the objectid of the