summaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile6
-rw-r--r--fs/btrfs/accessors.c (renamed from fs/btrfs/struct-funcs.c)12
-rw-r--r--fs/btrfs/accessors.h1073
-rw-r--r--fs/btrfs/acl.c5
-rw-r--r--fs/btrfs/acl.h27
-rw-r--r--fs/btrfs/backref.c1001
-rw-r--r--fs/btrfs/backref.h195
-rw-r--r--fs/btrfs/bio.c381
-rw-r--r--fs/btrfs/bio.h127
-rw-r--r--fs/btrfs/block-group.c152
-rw-r--r--fs/btrfs/block-group.h30
-rw-r--r--fs/btrfs/block-rsv.c43
-rw-r--r--fs/btrfs/block-rsv.h6
-rw-r--r--fs/btrfs/btrfs_inode.h161
-rw-r--r--fs/btrfs/check-integrity.c4
-rw-r--r--fs/btrfs/compression.c18
-rw-r--r--fs/btrfs/compression.h11
-rw-r--r--fs/btrfs/ctree.c363
-rw-r--r--fs/btrfs/ctree.h3462
-rw-r--r--fs/btrfs/defrag.c1376
-rw-r--r--fs/btrfs/defrag.h22
-rw-r--r--fs/btrfs/delalloc-space.c61
-rw-r--r--fs/btrfs/delalloc-space.h3
-rw-r--r--fs/btrfs/delayed-inode.c17
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/delayed-ref.c21
-rw-r--r--fs/btrfs/dev-replace.c28
-rw-r--r--fs/btrfs/dev-replace.h8
-rw-r--r--fs/btrfs/dir-item.c60
-rw-r--r--fs/btrfs/dir-item.h42
-rw-r--r--fs/btrfs/discard.c112
-rw-r--r--fs/btrfs/disk-io.c251
-rw-r--r--fs/btrfs/disk-io.h35
-rw-r--r--fs/btrfs/export.c25
-rw-r--r--fs/btrfs/export.h3
-rw-r--r--fs/btrfs/extent-io-tree.c192
-rw-r--r--fs/btrfs/extent-io-tree.h100
-rw-r--r--fs/btrfs/extent-tree.c55
-rw-r--r--fs/btrfs/extent-tree.h78
-rw-r--r--fs/btrfs/extent_io.c482
-rw-r--r--fs/btrfs/extent_io.h67
-rw-r--r--fs/btrfs/extent_map.c75
-rw-r--r--fs/btrfs/file-item.c258
-rw-r--r--fs/btrfs/file-item.h69
-rw-r--r--fs/btrfs/file.c621
-rw-r--r--fs/btrfs/file.h33
-rw-r--r--fs/btrfs/free-space-cache.c52
-rw-r--r--fs/btrfs/free-space-cache.h13
-rw-r--r--fs/btrfs/free-space-tree.c15
-rw-r--r--fs/btrfs/fs.c94
-rw-r--r--fs/btrfs/fs.h976
-rw-r--r--fs/btrfs/inode-item.c79
-rw-r--r--fs/btrfs/inode-item.h20
-rw-r--r--fs/btrfs/inode.c912
-rw-r--r--fs/btrfs/ioctl.c972
-rw-r--r--fs/btrfs/ioctl.h17
-rw-r--r--fs/btrfs/locking.c1
-rw-r--r--fs/btrfs/locking.h76
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/messages.c353
-rw-r--r--fs/btrfs/messages.h245
-rw-r--r--fs/btrfs/misc.h24
-rw-r--r--fs/btrfs/ordered-data.c31
-rw-r--r--fs/btrfs/ordered-data.h3
-rw-r--r--fs/btrfs/orphan.c1
-rw-r--r--fs/btrfs/orphan.h11
-rw-r--r--fs/btrfs/print-tree.c21
-rw-r--r--fs/btrfs/props.c8
-rw-r--r--fs/btrfs/props.h2
-rw-r--r--fs/btrfs/qgroup.c87
-rw-r--r--fs/btrfs/qgroup.h11
-rw-r--r--fs/btrfs/raid56.c2052
-rw-r--r--fs/btrfs/raid56.h33
-rw-r--r--fs/btrfs/rcu-string.h6
-rw-r--r--fs/btrfs/ref-verify.c3
-rw-r--r--fs/btrfs/reflink.c30
-rw-r--r--fs/btrfs/relocation.c94
-rw-r--r--fs/btrfs/relocation.h23
-rw-r--r--fs/btrfs/root-tree.c24
-rw-r--r--fs/btrfs/root-tree.h34
-rw-r--r--fs/btrfs/scrub.c84
-rw-r--r--fs/btrfs/scrub.h16
-rw-r--r--fs/btrfs/send.c512
-rw-r--r--fs/btrfs/send.h6
-rw-r--r--fs/btrfs/space-info.c86
-rw-r--r--fs/btrfs/space-info.h78
-rw-r--r--fs/btrfs/subpage.c1
-rw-r--r--fs/btrfs/super.c554
-rw-r--r--fs/btrfs/super.h29
-rw-r--r--fs/btrfs/sysfs.c23
-rw-r--r--fs/btrfs/tests/btrfs-tests.c5
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c1
-rw-r--r--fs/btrfs/tests/extent-io-tests.c4
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c3
-rw-r--r--fs/btrfs/tests/inode-tests.c58
-rw-r--r--fs/btrfs/tests/qgroup-tests.c52
-rw-r--r--fs/btrfs/transaction.c92
-rw-r--r--fs/btrfs/transaction.h22
-rw-r--r--fs/btrfs/tree-checker.c10
-rw-r--r--fs/btrfs/tree-checker.h35
-rw-r--r--fs/btrfs/tree-defrag.c132
-rw-r--r--fs/btrfs/tree-log.c511
-rw-r--r--fs/btrfs/tree-log.h5
-rw-r--r--fs/btrfs/tree-mod-log.c36
-rw-r--r--fs/btrfs/tree-mod-log.h4
-rw-r--r--fs/btrfs/ulist.c38
-rw-r--r--fs/btrfs/ulist.h2
-rw-r--r--fs/btrfs/uuid-tree.c5
-rw-r--r--fs/btrfs/uuid-tree.h12
-rw-r--r--fs/btrfs/verity.c6
-rw-r--r--fs/btrfs/verity.h28
-rw-r--r--fs/btrfs/volumes.c481
-rw-r--r--fs/btrfs/volumes.h118
-rw-r--r--fs/btrfs/xattr.c4
-rw-r--r--fs/btrfs/zlib.c6
-rw-r--r--fs/btrfs/zoned.c67
-rw-r--r--fs/btrfs/zoned.h12
-rw-r--r--fs/btrfs/zstd.c4
118 files changed, 11163 insertions, 9479 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index fa9ddcc9eb0b..555c962fdad6 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -23,15 +23,15 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o
btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
file-item.o inode-item.o disk-io.o \
- transaction.o inode.o file.o tree-defrag.o \
- extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+ transaction.o inode.o file.o defrag.o \
+ extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o tree-mod-log.o extent-io-tree.o
+ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/accessors.c
index 12455b2b41de..206cf1612c1d 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/accessors.c
@@ -4,8 +4,9 @@
*/
#include <asm/unaligned.h>
-
+#include "messages.h"
#include "ctree.h"
+#include "accessors.h"
static bool check_setget_bounds(const struct extent_buffer *eb,
const void *ptr, unsigned off, int size)
@@ -23,6 +24,13 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
return true;
}
+void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
+{
+ token->eb = eb;
+ token->kaddr = page_address(eb->pages[0]);
+ token->offset = 0;
+}
+
/*
* Macro templates that define helpers to read/write extent buffer data of a
* given size, that are also used via ctree.h for access to item members by
@@ -160,7 +168,7 @@ DEFINE_BTRFS_SETGET_BITS(64)
void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
- unsigned long ptr = btrfs_node_key_ptr_offset(nr);
+ unsigned long ptr = btrfs_node_key_ptr_offset(eb, nr);
read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
struct btrfs_key_ptr, key, disk_key);
}
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
new file mode 100644
index 000000000000..ceadfc5d6c66
--- /dev/null
+++ b/fs/btrfs/accessors.h
@@ -0,0 +1,1073 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ACCESSORS_H
+#define BTRFS_ACCESSORS_H
+
+struct btrfs_map_token {
+ struct extent_buffer *eb;
+ char *kaddr;
+ unsigned long offset;
+};
+
+void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb);
+
+/*
+ * Some macros to generate set/get functions for the struct fields. This
+ * assumes there is a lefoo_to_cpu for every type, so lets make a simple one
+ * for u8:
+ */
+#define le8_to_cpu(v) (v)
+#define cpu_to_le8(v) (v)
+#define __le8 u8
+
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
+#define read_eb_member(eb, ptr, type, member, result) (\
+ read_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#define write_eb_member(eb, ptr, type, member, result) (\
+ write_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#define DECLARE_BTRFS_SETGET_BITS(bits) \
+u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off); \
+void btrfs_set_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off, \
+ u##bits val); \
+u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
+ const void *ptr, unsigned long off); \
+void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val);
+
+DECLARE_BTRFS_SETGET_BITS(8)
+DECLARE_BTRFS_SETGET_BITS(16)
+DECLARE_BTRFS_SETGET_BITS(32)
+DECLARE_BTRFS_SETGET_BITS(64)
+
+#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
+ const type *s) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ return btrfs_get_##bits(eb, s, offsetof(type, member)); \
+} \
+static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
+ u##bits val) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ btrfs_set_##bits(eb, s, offsetof(type, member), val); \
+} \
+static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
+ const type *s) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ return btrfs_get_token_##bits(token, s, offsetof(type, member));\
+} \
+static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
+ type *s, u##bits val) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
+}
+
+#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
+{ \
+ const type *p = page_address(eb->pages[0]) + \
+ offset_in_page(eb->start); \
+ return get_unaligned_le##bits(&p->member); \
+} \
+static inline void btrfs_set_##name(const struct extent_buffer *eb, \
+ u##bits val) \
+{ \
+ type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+ put_unaligned_le##bits(val, &p->member); \
+}
+
+#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(const type *s) \
+{ \
+ return get_unaligned_le##bits(&s->member); \
+} \
+static inline void btrfs_set_##name(type *s, u##bits val) \
+{ \
+ put_unaligned_le##bits(val, &s->member); \
+}
+
+static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
+ struct btrfs_dev_item *s)
+{
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
+ total_bytes));
+}
+static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
+ struct btrfs_dev_item *s,
+ u64 val)
+{
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
+ btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
+}
+
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
+BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
+BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
+ io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
+ io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
+ seek_speed, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
+ bandwidth, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
+ generation, 64);
+
+static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+ return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid);
+}
+
+static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
+{
+ return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid);
+}
+
+BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
+{
+ return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
+}
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+ stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+ num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
+ sub_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr)
+{
+ unsigned long offset = (unsigned long)c;
+
+ offset += offsetof(struct btrfs_chunk, stripe);
+ offset += nr * sizeof(struct btrfs_stripe);
+ return (struct btrfs_stripe *)offset;
+}
+
+static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
+}
+
+static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr,
+ u64 val)
+{
+ btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr,
+ u64 val)
+{
+ btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+/* struct btrfs_block_group_item */
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item,
+ used, 64);
+BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+
+BTRFS_SETGET_FUNCS(block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
+ struct btrfs_block_group_item, flags, 64);
+
+/* struct btrfs_free_space_info */
+BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+ extent_count, 32);
+BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
+
+/* struct btrfs_inode_ref */
+BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64);
+
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+ parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+ name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
+/* struct btrfs_inode_item */
+BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
+BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
+BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
+BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
+BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
+ sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
+ transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
+ block_group, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+
+/* struct btrfs_dev_extent */
+BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
+ chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
+ chunk_offset, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree, struct btrfs_dev_extent,
+ chunk_tree, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid, struct btrfs_dev_extent,
+ chunk_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset, struct btrfs_dev_extent,
+ chunk_offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length, struct btrfs_dev_extent, length, 64);
+
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
+BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64);
+BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
+
+BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
+
+static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
+ struct btrfs_tree_block_info *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
+ struct btrfs_tree_block_info *item,
+ struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
+ objectid, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
+ offset, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32);
+
+BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32);
+
+BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
+ type, 8);
+BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
+ offset, 64);
+
+static inline u32 btrfs_extent_inline_ref_size(int type)
+{
+ if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+ type == BTRFS_SHARED_BLOCK_REF_KEY)
+ return sizeof(struct btrfs_extent_inline_ref);
+ if (type == BTRFS_SHARED_DATA_REF_KEY)
+ return sizeof(struct btrfs_shared_data_ref) +
+ sizeof(struct btrfs_extent_inline_ref);
+ if (type == BTRFS_EXTENT_DATA_REF_KEY)
+ return sizeof(struct btrfs_extent_data_ref) +
+ offsetof(struct btrfs_extent_inline_ref, offset);
+ return 0;
+}
+
+/* struct btrfs_node */
+BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
+ generation, 64);
+
+static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr)
+{
+ return offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+}
+
+void btrfs_node_key(const struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr);
+
+static inline void btrfs_set_node_key(const struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ unsigned long ptr;
+
+ ptr = btrfs_node_key_ptr_offset(eb, nr);
+ write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+ struct btrfs_key_ptr, key, disk_key);
+}
+
+/* struct btrfs_item */
+BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
+
+static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr)
+{
+ return offsetof(struct btrfs_leaf, items) +
+ sizeof(struct btrfs_item) * nr;
+}
+
+static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr)
+{
+ return (struct btrfs_item *)btrfs_item_nr_offset(eb, nr);
+}
+
+#define BTRFS_ITEM_SETGET_FUNCS(member) \
+static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \
+{ \
+ return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot)); \
+} \
+static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
+ int slot, u32 val) \
+{ \
+ btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \
+} \
+static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
+ int slot) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
+ return btrfs_token_raw_item_##member(token, item); \
+} \
+static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
+ int slot, u32 val) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
+ btrfs_set_token_raw_item_##member(token, item, val); \
+}
+
+BTRFS_ITEM_SETGET_FUNCS(offset)
+BTRFS_ITEM_SETGET_FUNCS(size);
+
+static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
+}
+
+static inline void btrfs_item_key(const struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(eb, nr);
+
+ read_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+static inline void btrfs_set_item_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(eb, nr);
+
+ write_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
+
+/* struct btrfs_root_ref */
+BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+
+/* struct btrfs_dir_item */
+BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
+BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64);
+
+static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item)
+{
+ return btrfs_dir_flags_to_ftype(btrfs_dir_flags(eb, item));
+}
+
+static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item)
+{
+ return btrfs_dir_flags_to_ftype(btrfs_stack_dir_flags(item));
+}
+
+static inline void btrfs_dir_item_key(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ const struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+ num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+ num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+ generation, 64);
+
+static inline void btrfs_free_space_key(const struct extent_buffer *eb,
+ const struct btrfs_free_space_header *h,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+ struct btrfs_free_space_header *h,
+ const struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+/* struct btrfs_disk_key */
+BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Optimized helpers for little-endian architectures where CPU and on-disk
+ * structures have the same endianness and we can skip conversions.
+ */
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
+ const struct btrfs_disk_key *disk_key)
+{
+ memcpy(cpu_key, disk_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *cpu_key)
+{
+ memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_node_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_item_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *cpu_key)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_dir_item_key(eb, item, disk_key);
+}
+
+#else
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
+ const struct btrfs_disk_key *disk)
+{
+ cpu->offset = le64_to_cpu(disk->offset);
+ cpu->type = disk->type;
+ cpu->objectid = le64_to_cpu(disk->objectid);
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
+ const struct btrfs_key *cpu)
+{
+ disk->offset = cpu_to_le64(cpu->offset);
+ disk->type = cpu->type;
+ disk->objectid = cpu_to_le64(cpu->objectid);
+}
+
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+
+ btrfs_node_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+
+ btrfs_item_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *key)
+{
+ struct btrfs_disk_key disk_key;
+
+ btrfs_dir_item_key(eb, item, &disk_key);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+#endif
+
+/* struct btrfs_header */
+BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
+
+static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag)
+{
+ return (btrfs_header_flags(eb) & flag) == flag;
+}
+
+static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+
+ btrfs_set_header_flags(eb, flags | flag);
+}
+
+static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+
+ btrfs_set_header_flags(eb, flags & ~flag);
+}
+
+static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
+{
+ u64 flags = btrfs_header_flags(eb);
+
+ return flags >> BTRFS_BACKREF_REV_SHIFT;
+}
+
+static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev)
+{
+ u64 flags = btrfs_header_flags(eb);
+
+ flags &= ~BTRFS_BACKREF_REV_MASK;
+ flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
+ btrfs_set_header_flags(eb, flags);
+}
+
+static inline int btrfs_is_leaf(const struct extent_buffer *eb)
+{
+ return btrfs_header_level(eb) == 0;
+}
+
+/* struct btrfs_root_item */
+BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64);
+BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
+BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
+ last_snapshot, 64);
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+ generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64);
+
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+ tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+ tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+ tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+ chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+ chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+ extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+ fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+ fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+ fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+ dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+ dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+ dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+ csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+ csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+ csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+ num_devices, 64);
+
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ const struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
+ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
+ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
+}
+
+static inline void btrfs_cpu_balance_args_to_disk(
+ struct btrfs_disk_balance_args *disk,
+ const struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
+ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
+ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
+}
+
+/* struct btrfs_super_block */
+BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+ struct btrfs_super_block, sys_chunk_array_size, 32);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
+ struct btrfs_super_block, chunk_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
+ root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+ chunk_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
+ log_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
+ sectorsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
+ nodesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
+ stripesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
+ root_dir_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
+ num_devices, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
+ compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
+ compat_ro_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
+ incompat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
+ csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+ cache_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
+BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
+ uuid_tree_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block,
+ nr_global_roots, 64);
+
+/* struct btrfs_file_extent_item */
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
+ type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
+ struct btrfs_file_extent_item, disk_bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
+ struct btrfs_file_extent_item, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
+ struct btrfs_file_extent_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
+ struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
+ struct btrfs_file_extent_item, ram_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
+ struct btrfs_file_extent_item, disk_num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
+ struct btrfs_file_extent_item, compression, 8);
+
+
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
+ disk_bytenr, 64);
+BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
+ disk_num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
+ offset, 64);
+BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
+ num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+ ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+ compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+ other_encoding, 16);
+
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+ version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
+ rescan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+ rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+ excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+ struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+ rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+ struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+ excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+ struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+ max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+ max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+ rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+ rsv_excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags,
+ struct btrfs_qgroup_limit_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer,
+ struct btrfs_qgroup_limit_item, max_rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl,
+ struct btrfs_qgroup_limit_item, max_excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer,
+ struct btrfs_qgroup_limit_item, rsv_rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl,
+ struct btrfs_qgroup_limit_item, rsv_excl, 64);
+
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+ replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+ time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+ time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+ num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+ cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+ cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item,
+ cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+ struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+ struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+ struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+ struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item,
+ num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+ struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+ struct btrfs_dev_replace_item, cursor_right, 64);
+
+/* btrfs_verity_descriptor_item */
+BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
+ size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
+ struct btrfs_verity_descriptor_item, encryption, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
+ struct btrfs_verity_descriptor_item, size, 64);
+
+/* Cast into the data area of the leaf. */
+#define btrfs_item_ptr(leaf, slot, type) \
+ ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))
+
+#define btrfs_item_ptr_offset(leaf, slot) \
+ ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))
+
+#endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 548d6a5477b4..3da1779e8b79 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -11,10 +11,10 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
-
#include "ctree.h"
#include "btrfs_inode.h"
#include "xattr.h"
+#include "acl.h"
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
{
@@ -110,10 +110,11 @@ out:
return ret;
}
-int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int ret;
+ struct inode *inode = d_inode(dentry);
umode_t old_mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
new file mode 100644
index 000000000000..39bd36e6eeb7
--- /dev/null
+++ b/fs/btrfs/acl.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ACL_H
+#define BTRFS_ACL_H
+
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type);
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type);
+
+#else
+
+#define btrfs_get_acl NULL
+#define btrfs_set_acl NULL
+static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl,
+ int type)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif
+
+#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 18374a6d05bd..21c92c74bf71 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -15,49 +15,76 @@
#include "locking.h"
#include "misc.h"
#include "tree-mod-log.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "relocation.h"
+#include "tree-checker.h"
-/* Just an arbitrary number so we can be sure this happened */
-#define BACKREF_FOUND_SHARED 6
+/* Just arbitrary numbers so we can be sure one of these happened. */
+#define BACKREF_FOUND_SHARED 6
+#define BACKREF_FOUND_NOT_SHARED 7
struct extent_inode_elem {
u64 inum;
u64 offset;
+ u64 num_bytes;
struct extent_inode_elem *next;
};
-static int check_extent_in_eb(const struct btrfs_key *key,
+static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx,
+ const struct btrfs_key *key,
const struct extent_buffer *eb,
const struct btrfs_file_extent_item *fi,
- u64 extent_item_pos,
- struct extent_inode_elem **eie,
- bool ignore_offset)
+ struct extent_inode_elem **eie)
{
- u64 offset = 0;
+ const u64 data_len = btrfs_file_extent_num_bytes(eb, fi);
+ u64 offset = key->offset;
struct extent_inode_elem *e;
+ const u64 *root_ids;
+ int root_count;
+ bool cached;
- if (!ignore_offset &&
- !btrfs_file_extent_compression(eb, fi) &&
+ if (!btrfs_file_extent_compression(eb, fi) &&
!btrfs_file_extent_encryption(eb, fi) &&
!btrfs_file_extent_other_encoding(eb, fi)) {
u64 data_offset;
- u64 data_len;
data_offset = btrfs_file_extent_offset(eb, fi);
- data_len = btrfs_file_extent_num_bytes(eb, fi);
- if (extent_item_pos < data_offset ||
- extent_item_pos >= data_offset + data_len)
+ if (ctx->extent_item_pos < data_offset ||
+ ctx->extent_item_pos >= data_offset + data_len)
return 1;
- offset = extent_item_pos - data_offset;
+ offset += ctx->extent_item_pos - data_offset;
}
+ if (!ctx->indirect_ref_iterator || !ctx->cache_lookup)
+ goto add_inode_elem;
+
+ cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids,
+ &root_count);
+ if (!cached)
+ goto add_inode_elem;
+
+ for (int i = 0; i < root_count; i++) {
+ int ret;
+
+ ret = ctx->indirect_ref_iterator(key->objectid, offset,
+ data_len, root_ids[i],
+ ctx->user_ctx);
+ if (ret)
+ return ret;
+ }
+
+add_inode_elem:
e = kmalloc(sizeof(*e), GFP_NOFS);
if (!e)
return -ENOMEM;
e->next = *eie;
e->inum = key->objectid;
- e->offset = key->offset + offset;
+ e->offset = offset;
+ e->num_bytes = data_len;
*eie = e;
return 0;
@@ -73,10 +100,9 @@ static void free_inode_elem_list(struct extent_inode_elem *eie)
}
}
-static int find_extent_in_eb(const struct extent_buffer *eb,
- u64 wanted_disk_byte, u64 extent_item_pos,
- struct extent_inode_elem **eie,
- bool ignore_offset)
+static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx,
+ const struct extent_buffer *eb,
+ struct extent_inode_elem **eie)
{
u64 disk_byte;
struct btrfs_key key;
@@ -102,11 +128,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
continue;
/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte != wanted_disk_byte)
+ if (disk_byte != ctx->bytenr)
continue;
- ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset);
- if (ret < 0)
+ ret = check_extent_in_eb(ctx, &key, eb, fi, eie);
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
return ret;
}
@@ -135,9 +161,29 @@ struct preftrees {
* - decremented when a ref->count transitions to <1
*/
struct share_check {
- u64 root_objectid;
+ struct btrfs_backref_share_check_ctx *ctx;
+ struct btrfs_root *root;
u64 inum;
+ u64 data_bytenr;
+ u64 data_extent_gen;
+ /*
+ * Counts number of inodes that refer to an extent (different inodes in
+ * the same root or different roots) that we could find. The sharedness
+ * check typically stops once this counter gets greater than 1, so it
+ * may not reflect the total number of inodes.
+ */
int share_count;
+ /*
+ * The number of times we found our inode refers to the data extent we
+ * are determining the sharedness. In other words, how many file extent
+ * items we could find for our inode that point to our target data
+ * extent. The value we get here after finishing the extent sharedness
+ * check may be smaller than reality, but if it ends up being greater
+ * than 1, then we know for sure the inode has multiple file extent
+ * items that point to our inode, and we can safely assume it's useful
+ * to cache the sharedness check result.
+ */
+ int self_ref_count;
bool have_delayed_delete_refs;
};
@@ -207,7 +253,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
}
static void update_share_count(struct share_check *sc, int oldcount,
- int newcount)
+ int newcount, struct prelim_ref *newref)
{
if ((!sc) || (oldcount == 0 && newcount < 1))
return;
@@ -216,6 +262,11 @@ static void update_share_count(struct share_check *sc, int oldcount,
sc->share_count--;
else if (oldcount < 1 && newcount > 0)
sc->share_count++;
+
+ if (newref->root_id == sc->root->root_key.objectid &&
+ newref->wanted_disk_byte == sc->data_bytenr &&
+ newref->key_for_search.objectid == sc->inum)
+ sc->self_ref_count += newref->count;
}
/*
@@ -266,14 +317,14 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
* BTRFS_[ADD|DROP]_DELAYED_REF actions.
*/
update_share_count(sc, ref->count,
- ref->count + newref->count);
+ ref->count + newref->count, newref);
ref->count += newref->count;
free_pref(newref);
return;
}
}
- update_share_count(sc, 0, newref->count);
+ update_share_count(sc, 0, newref->count, newref);
preftree->count++;
trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
rb_link_node(&newref->rbnode, parent, p);
@@ -416,11 +467,11 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
return 0;
}
-static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+static int add_all_parents(struct btrfs_backref_walk_ctx *ctx,
+ struct btrfs_root *root, struct btrfs_path *path,
struct ulist *parents,
struct preftrees *preftrees, struct prelim_ref *ref,
- int level, u64 time_seq, const u64 *extent_item_pos,
- bool ignore_offset)
+ int level)
{
int ret = 0;
int slot;
@@ -456,10 +507,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] >= btrfs_header_nritems(eb) ||
is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb)) {
- if (time_seq == BTRFS_SEQ_LAST)
+ if (ctx->time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
- ret = btrfs_next_old_leaf(root, path, time_seq);
+ ret = btrfs_next_old_leaf(root, path, ctx->time_seq);
}
while (!ret && count < ref->count) {
@@ -480,10 +531,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (slot == 0 &&
(is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb))) {
- if (time_seq == BTRFS_SEQ_LAST)
+ if (ctx->time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
- ret = btrfs_next_old_leaf(root, path, time_seq);
+ ret = btrfs_next_old_leaf(root, path, ctx->time_seq);
continue;
}
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
@@ -497,11 +548,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
count++;
else
goto next;
- if (extent_item_pos) {
- ret = check_extent_in_eb(&key, eb, fi,
- *extent_item_pos,
- &eie, ignore_offset);
- if (ret < 0)
+ if (!ctx->ignore_extent_item_pos) {
+ ret = check_extent_in_eb(ctx, &key, eb, fi, &eie);
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||
+ ret < 0)
break;
}
if (ret > 0)
@@ -510,7 +560,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie, (void **)&old, GFP_NOFS);
if (ret < 0)
break;
- if (!ret && extent_item_pos) {
+ if (!ret && !ctx->ignore_extent_item_pos) {
while (old->next)
old = old->next;
old->next = eie;
@@ -518,16 +568,17 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- if (time_seq == BTRFS_SEQ_LAST)
+ if (ctx->time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_item(root, path);
else
- ret = btrfs_next_old_item(root, path, time_seq);
+ ret = btrfs_next_old_item(root, path, ctx->time_seq);
}
- if (ret > 0)
- ret = 0;
- else if (ret < 0)
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
free_inode_elem_list(eie);
+ else if (ret > 0)
+ ret = 0;
+
return ret;
}
@@ -535,11 +586,10 @@ next:
* resolve an indirect backref in the form (root_id, key, level)
* to a logical address
*/
-static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, u64 time_seq,
+static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx,
+ struct btrfs_path *path,
struct preftrees *preftrees,
- struct prelim_ref *ref, struct ulist *parents,
- const u64 *extent_item_pos, bool ignore_offset)
+ struct prelim_ref *ref, struct ulist *parents)
{
struct btrfs_root *root;
struct extent_buffer *eb;
@@ -557,9 +607,9 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
* here.
*/
if (path->search_commit_root)
- root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id);
+ root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id);
else
- root = btrfs_get_fs_root(fs_info, ref->root_id, false);
+ root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out_free;
@@ -571,17 +621,17 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
- if (btrfs_is_testing(fs_info)) {
+ if (btrfs_is_testing(ctx->fs_info)) {
ret = -ENOENT;
goto out;
}
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
- else if (time_seq == BTRFS_SEQ_LAST)
+ else if (ctx->time_seq == BTRFS_SEQ_LAST)
root_level = btrfs_header_level(root->node);
else
- root_level = btrfs_old_root_level(root, time_seq);
+ root_level = btrfs_old_root_level(root, ctx->time_seq);
if (root_level + 1 == level)
goto out;
@@ -609,12 +659,12 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
search_key.offset >= LLONG_MAX)
search_key.offset = 0;
path->lowest_level = level;
- if (time_seq == BTRFS_SEQ_LAST)
+ if (ctx->time_seq == BTRFS_SEQ_LAST)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
else
- ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
+ ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq);
- btrfs_debug(fs_info,
+ btrfs_debug(ctx->fs_info,
"search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
ref->root_id, level, ref->count, ret,
ref->key_for_search.objectid, ref->key_for_search.type,
@@ -632,8 +682,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
eb = path->nodes[level];
}
- ret = add_all_parents(root, path, parents, preftrees, ref, level,
- time_seq, extent_item_pos, ignore_offset);
+ ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level);
out:
btrfs_put_root(root);
out_free:
@@ -678,11 +727,10 @@ static void free_leaf_list(struct ulist *ulist)
* rbtree as they are encountered. The new backrefs are subsequently
* resolved as above.
*/
-static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, u64 time_seq,
+static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
+ struct btrfs_path *path,
struct preftrees *preftrees,
- const u64 *extent_item_pos,
- struct share_check *sc, bool ignore_offset)
+ struct share_check *sc)
{
int err;
int ret = 0;
@@ -719,21 +767,18 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
continue;
}
- if (sc && sc->root_objectid &&
- ref->root_id != sc->root_objectid) {
+ if (sc && ref->root_id != sc->root->root_key.objectid) {
free_pref(ref);
ret = BACKREF_FOUND_SHARED;
goto out;
}
- err = resolve_indirect_ref(fs_info, path, time_seq, preftrees,
- ref, parents, extent_item_pos,
- ignore_offset);
+ err = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
*/
if (err == -ENOENT) {
- prelim_ref_insert(fs_info, &preftrees->direct, ref,
+ prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref,
NULL);
continue;
} else if (err) {
@@ -762,7 +807,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
memcpy(new_ref, ref, sizeof(*ref));
new_ref->parent = node->val;
new_ref->inode_list = unode_aux_to_inode_list(node);
- prelim_ref_insert(fs_info, &preftrees->direct,
+ prelim_ref_insert(ctx->fs_info, &preftrees->direct,
new_ref, NULL);
}
@@ -770,7 +815,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
* Now it's a direct ref, put it in the direct tree. We must
* do this last because the ref could be merged/freed here.
*/
- prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL);
+ prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL);
ulist_reinit(parents);
cond_resched();
@@ -796,6 +841,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
struct rb_node *node;
while ((node = rb_first_cached(&tree->root))) {
+ struct btrfs_tree_parent_check check = { 0 };
+
ref = rb_entry(node, struct prelim_ref, rbnode);
rb_erase_cached(node, &tree->root);
@@ -803,8 +850,10 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(ref->key_for_search.type);
BUG_ON(!ref->wanted_disk_byte);
- eb = read_tree_block(fs_info, ref->wanted_disk_byte,
- ref->root_id, 0, ref->level - 1, NULL);
+ check.level = ref->level - 1;
+ check.owner_root = ref->root_id;
+
+ eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check);
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
@@ -959,8 +1008,8 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
*
* Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
*/
-static int add_inline_refs(const struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, u64 bytenr,
+static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
+ struct btrfs_path *path,
int *info_level, struct preftrees *preftrees,
struct share_check *sc)
{
@@ -985,6 +1034,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+
+ if (ctx->check_extent_item) {
+ ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx);
+ if (ret)
+ return ret;
+ }
+
flags = btrfs_extent_flags(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -1020,9 +1076,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
switch (type) {
case BTRFS_SHARED_BLOCK_REF_KEY:
- ret = add_direct_ref(fs_info, preftrees,
+ ret = add_direct_ref(ctx->fs_info, preftrees,
*info_level + 1, offset,
- bytenr, 1, NULL, GFP_NOFS);
+ ctx->bytenr, 1, NULL, GFP_NOFS);
break;
case BTRFS_SHARED_DATA_REF_KEY: {
struct btrfs_shared_data_ref *sdref;
@@ -1031,14 +1087,14 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
sdref = (struct btrfs_shared_data_ref *)(iref + 1);
count = btrfs_shared_data_ref_count(leaf, sdref);
- ret = add_direct_ref(fs_info, preftrees, 0, offset,
- bytenr, count, sc, GFP_NOFS);
+ ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset,
+ ctx->bytenr, count, sc, GFP_NOFS);
break;
}
case BTRFS_TREE_BLOCK_REF_KEY:
- ret = add_indirect_ref(fs_info, preftrees, offset,
+ ret = add_indirect_ref(ctx->fs_info, preftrees, offset,
NULL, *info_level + 1,
- bytenr, 1, NULL, GFP_NOFS);
+ ctx->bytenr, 1, NULL, GFP_NOFS);
break;
case BTRFS_EXTENT_DATA_REF_KEY: {
struct btrfs_extent_data_ref *dref;
@@ -1052,7 +1108,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum &&
+ if (sc && key.objectid != sc->inum &&
!sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
@@ -1060,10 +1116,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
root = btrfs_extent_data_ref_root(leaf, dref);
- ret = add_indirect_ref(fs_info, preftrees, root,
- &key, 0, bytenr, count,
- sc, GFP_NOFS);
-
+ if (!ctx->skip_data_ref ||
+ !ctx->skip_data_ref(root, key.objectid, key.offset,
+ ctx->user_ctx))
+ ret = add_indirect_ref(ctx->fs_info, preftrees,
+ root, &key, 0, ctx->bytenr,
+ count, sc, GFP_NOFS);
break;
}
default:
@@ -1082,8 +1140,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
*
* Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
*/
-static int add_keyed_refs(struct btrfs_root *extent_root,
- struct btrfs_path *path, u64 bytenr,
+static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_path *path,
int info_level, struct preftrees *preftrees,
struct share_check *sc)
{
@@ -1106,7 +1165,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid != bytenr)
+ if (key.objectid != ctx->bytenr)
break;
if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
continue;
@@ -1118,7 +1177,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
/* SHARED DIRECT METADATA backref */
ret = add_direct_ref(fs_info, preftrees,
info_level + 1, key.offset,
- bytenr, 1, NULL, GFP_NOFS);
+ ctx->bytenr, 1, NULL, GFP_NOFS);
break;
case BTRFS_SHARED_DATA_REF_KEY: {
/* SHARED DIRECT FULL backref */
@@ -1129,14 +1188,14 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
struct btrfs_shared_data_ref);
count = btrfs_shared_data_ref_count(leaf, sdref);
ret = add_direct_ref(fs_info, preftrees, 0,
- key.offset, bytenr, count,
+ key.offset, ctx->bytenr, count,
sc, GFP_NOFS);
break;
}
case BTRFS_TREE_BLOCK_REF_KEY:
/* NORMAL INDIRECT METADATA backref */
ret = add_indirect_ref(fs_info, preftrees, key.offset,
- NULL, info_level + 1, bytenr,
+ NULL, info_level + 1, ctx->bytenr,
1, NULL, GFP_NOFS);
break;
case BTRFS_EXTENT_DATA_REF_KEY: {
@@ -1153,16 +1212,20 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum &&
+ if (sc && key.objectid != sc->inum &&
!sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
root = btrfs_extent_data_ref_root(leaf, dref);
- ret = add_indirect_ref(fs_info, preftrees, root,
- &key, 0, bytenr, count,
- sc, GFP_NOFS);
+
+ if (!ctx->skip_data_ref ||
+ !ctx->skip_data_ref(root, key.objectid, key.offset,
+ ctx->user_ctx))
+ ret = add_indirect_ref(fs_info, preftrees, root,
+ &key, 0, ctx->bytenr,
+ count, sc, GFP_NOFS);
break;
}
default:
@@ -1177,34 +1240,141 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
}
/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool *is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+
+ if (!ctx->use_path_cache)
+ return false;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return false;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ entry = &ctx->path_cache_entries[level];
+
+ /* Unused cache entry or being used for some other extent buffer. */
+ if (entry->bytenr != bytenr)
+ return false;
+
+ /*
+ * We cached a false result, but the last snapshot generation of the
+ * root changed, so we now have a snapshot. Don't trust the result.
+ */
+ if (!entry->is_shared &&
+ entry->gen != btrfs_root_last_snapshot(&root->root_item))
+ return false;
+
+ /*
+ * If we cached a true result and the last generation used for dropping
+ * a root changed, we can not trust the result, because the dropped root
+ * could be a snapshot sharing this extent buffer.
+ */
+ if (entry->is_shared &&
+ entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+ return false;
+
+ *is_shared = entry->is_shared;
+ /*
+ * If the node at this level is shared, than all nodes below are also
+ * shared. Currently some of the nodes below may be marked as not shared
+ * because we have just switched from one leaf to another, and switched
+ * also other nodes above the leaf and below the current level, so mark
+ * them as shared.
+ */
+ if (*is_shared) {
+ for (int i = 0; i < level; i++) {
+ ctx->path_cache_entries[i].is_shared = true;
+ ctx->path_cache_entries[i].gen = entry->gen;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+ u64 gen;
+
+ if (!ctx->use_path_cache)
+ return;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ if (is_shared)
+ gen = btrfs_get_last_root_drop_gen(root->fs_info);
+ else
+ gen = btrfs_root_last_snapshot(&root->root_item);
+
+ entry = &ctx->path_cache_entries[level];
+ entry->bytenr = bytenr;
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+
+ /*
+ * If we found an extent buffer is shared, set the cache result for all
+ * extent buffers below it to true. As nodes in the path are COWed,
+ * their sharedness is moved to their children, and if a leaf is COWed,
+ * then the sharedness of a data extent becomes direct, the refcount of
+ * data extent is increased in the extent item at the extent tree.
+ */
+ if (is_shared) {
+ for (int i = 0; i < level; i++) {
+ entry = &ctx->path_cache_entries[i];
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+ }
+ }
+}
+
+/*
* this adds all existing backrefs (inline backrefs, backrefs and delayed
* refs) for the given bytenr to the refs list, merges duplicates and resolves
* indirect refs to their parent bytenr.
* When roots are found, they're added to the roots list
*
- * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
- * behave much like trans == NULL case, the difference only lies in it will not
- * commit root.
- * The special case is for qgroup to search roots in commit_transaction().
- *
- * @sc - if !NULL, then immediately return BACKREF_FOUND_SHARED when a
- * shared extent is detected.
+ * @ctx: Backref walking context object, must be not NULL.
+ * @sc: If !NULL, then immediately return BACKREF_FOUND_SHARED when a
+ * shared extent is detected.
*
* Otherwise this returns 0 for success and <0 for an error.
*
- * If ignore_offset is set to false, only extent refs whose offsets match
- * extent_item_pos are returned. If true, every extent ref is returned
- * and extent_item_pos is ignored.
- *
* FIXME some caching might speed things up
*/
-static int find_parent_nodes(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist *refs,
- struct ulist *roots, const u64 *extent_item_pos,
- struct share_check *sc, bool ignore_offset)
+static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
+ struct share_check *sc)
{
- struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
+ struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr);
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_delayed_ref_root *delayed_refs = NULL;
@@ -1220,9 +1390,13 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
.indirect_missing_keys = PREFTREE_INIT
};
- key.objectid = bytenr;
+ /* Roots ulist is not needed when using a sharedness check context. */
+ if (sc)
+ ASSERT(ctx->roots == NULL);
+
+ key.objectid = ctx->bytenr;
key.offset = (u64)-1;
- if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -1230,12 +1404,12 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- if (!trans) {
+ if (!ctx->trans) {
path->search_commit_root = 1;
path->skip_locking = 1;
}
- if (time_seq == BTRFS_SEQ_LAST)
+ if (ctx->time_seq == BTRFS_SEQ_LAST)
path->skip_locking = 1;
again:
@@ -1251,17 +1425,17 @@ again:
goto out;
}
- if (trans && likely(trans->type != __TRANS_DUMMY) &&
- time_seq != BTRFS_SEQ_LAST) {
+ if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) &&
+ ctx->time_seq != BTRFS_SEQ_LAST) {
/*
* We have a specific time_seq we care about and trans which
* means we have the path lock, we need to grab the ref head and
* lock it so we have a consistent view of the refs at the given
* time.
*/
- delayed_refs = &trans->transaction->delayed_refs;
+ delayed_refs = &ctx->trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs);
@@ -1279,7 +1453,7 @@ again:
goto again;
}
spin_unlock(&delayed_refs->lock);
- ret = add_delayed_refs(fs_info, head, time_seq,
+ ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq,
&preftrees, sc);
mutex_unlock(&head->mutex);
if (ret)
@@ -1297,30 +1471,96 @@ again:
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid == bytenr &&
+ if (key.objectid == ctx->bytenr &&
(key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) {
- ret = add_inline_refs(fs_info, path, bytenr,
- &info_level, &preftrees, sc);
+ ret = add_inline_refs(ctx, path, &info_level,
+ &preftrees, sc);
if (ret)
goto out;
- ret = add_keyed_refs(root, path, bytenr, info_level,
+ ret = add_keyed_refs(ctx, root, path, info_level,
&preftrees, sc);
if (ret)
goto out;
}
}
+ /*
+ * If we have a share context and we reached here, it means the extent
+ * is not directly shared (no multiple reference items for it),
+ * otherwise we would have exited earlier with a return value of
+ * BACKREF_FOUND_SHARED after processing delayed references or while
+ * processing inline or keyed references from the extent tree.
+ * The extent may however be indirectly shared through shared subtrees
+ * as a result from creating snapshots, so we determine below what is
+ * its parent node, in case we are dealing with a metadata extent, or
+ * what's the leaf (or leaves), from a fs tree, that has a file extent
+ * item pointing to it in case we are dealing with a data extent.
+ */
+ ASSERT(extent_is_shared(sc) == 0);
+
+ /*
+ * If we are here for a data extent and we have a share_check structure
+ * it means the data extent is not directly shared (does not have
+ * multiple reference items), so we have to check if a path in the fs
+ * tree (going from the root node down to the leaf that has the file
+ * extent item pointing to the data extent) is shared, that is, if any
+ * of the extent buffers in the path is referenced by other trees.
+ */
+ if (sc && ctx->bytenr == sc->data_bytenr) {
+ /*
+ * If our data extent is from a generation more recent than the
+ * last generation used to snapshot the root, then we know that
+ * it can not be shared through subtrees, so we can skip
+ * resolving indirect references, there's no point in
+ * determining the extent buffers for the path from the fs tree
+ * root node down to the leaf that has the file extent item that
+ * points to the data extent.
+ */
+ if (sc->data_extent_gen >
+ btrfs_root_last_snapshot(&sc->root->root_item)) {
+ ret = BACKREF_FOUND_NOT_SHARED;
+ goto out;
+ }
+
+ /*
+ * If we are only determining if a data extent is shared or not
+ * and the corresponding file extent item is located in the same
+ * leaf as the previous file extent item, we can skip resolving
+ * indirect references for a data extent, since the fs tree path
+ * is the same (same leaf, so same path). We skip as long as the
+ * cached result for the leaf is valid and only if there's only
+ * one file extent item pointing to the data extent, because in
+ * the case of multiple file extent items, they may be located
+ * in different leaves and therefore we have multiple paths.
+ */
+ if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr &&
+ sc->self_ref_count == 1) {
+ bool cached;
+ bool is_shared;
+
+ cached = lookup_backref_shared_cache(sc->ctx, sc->root,
+ sc->ctx->curr_leaf_bytenr,
+ 0, &is_shared);
+ if (cached) {
+ if (is_shared)
+ ret = BACKREF_FOUND_SHARED;
+ else
+ ret = BACKREF_FOUND_NOT_SHARED;
+ goto out;
+ }
+ }
+ }
+
btrfs_release_path(path);
- ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
+ ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0);
if (ret)
goto out;
WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
- ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
- extent_item_pos, sc, ignore_offset);
+ ret = resolve_indirect_refs(ctx, path, &preftrees, sc);
if (ret)
goto out;
@@ -1347,25 +1587,22 @@ again:
* e.g. different offsets would not be merged,
* and would retain their original ref->count < 0.
*/
- if (roots && ref->count && ref->root_id && ref->parent == 0) {
- if (sc && sc->root_objectid &&
- ref->root_id != sc->root_objectid) {
- ret = BACKREF_FOUND_SHARED;
- goto out;
- }
-
+ if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) {
/* no parent == root of tree */
- ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+ ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS);
if (ret < 0)
goto out;
}
if (ref->count && ref->parent) {
- if (extent_item_pos && !ref->inode_list &&
+ if (!ctx->ignore_extent_item_pos && !ref->inode_list &&
ref->level == 0) {
+ struct btrfs_tree_parent_check check = { 0 };
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, ref->parent, 0,
- 0, ref->level, NULL);
+ check.level = ref->level;
+
+ eb = read_tree_block(ctx->fs_info, ref->parent,
+ &check);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
@@ -1378,12 +1615,12 @@ again:
if (!path->skip_locking)
btrfs_tree_read_lock(eb);
- ret = find_extent_in_eb(eb, bytenr,
- *extent_item_pos, &eie, ignore_offset);
+ ret = find_extent_in_eb(ctx, eb, &eie);
if (!path->skip_locking)
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
- if (ret < 0)
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||
+ ret < 0)
goto out;
ref->inode_list = eie;
/*
@@ -1393,12 +1630,12 @@ again:
*/
eie = NULL;
}
- ret = ulist_add_merge_ptr(refs, ref->parent,
+ ret = ulist_add_merge_ptr(ctx->refs, ref->parent,
ref->inode_list,
(void **)&eie, GFP_NOFS);
if (ret < 0)
goto out;
- if (!ret && extent_item_pos) {
+ if (!ret && !ctx->ignore_extent_item_pos) {
/*
* We've recorded that parent, so we must extend
* its inode list here.
@@ -1436,34 +1673,36 @@ out:
prelim_release(&preftrees.indirect);
prelim_release(&preftrees.indirect_missing_keys);
- if (ret < 0)
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
free_inode_elem_list(eie);
return ret;
}
/*
- * Finds all leafs with a reference to the specified combination of bytenr and
- * offset. key_list_head will point to a list of corresponding keys (caller must
- * free each list element). The leafs will be stored in the leafs ulist, which
- * must be freed with ulist_free.
+ * Finds all leaves with a reference to the specified combination of
+ * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are
+ * added to the ulist at @ctx->refs, and that ulist is allocated by this
+ * function. The caller should free the ulist with free_leaf_list() if
+ * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is
+ * enough.
*
- * returns 0 on success, <0 on error
+ * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
*/
-int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **leafs,
- const u64 *extent_item_pos, bool ignore_offset)
+int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx)
{
int ret;
- *leafs = ulist_alloc(GFP_NOFS);
- if (!*leafs)
+ ASSERT(ctx->refs == NULL);
+
+ ctx->refs = ulist_alloc(GFP_NOFS);
+ if (!ctx->refs)
return -ENOMEM;
- ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
- *leafs, NULL, extent_item_pos, NULL, ignore_offset);
- if (ret < 0 && ret != -ENOENT) {
- free_leaf_list(*leafs);
+ ret = find_parent_nodes(ctx, NULL);
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||
+ (ret < 0 && ret != -ENOENT)) {
+ free_leaf_list(ctx->refs);
+ ctx->refs = NULL;
return ret;
}
@@ -1471,7 +1710,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
}
/*
- * walk all backrefs for a given extent to find all roots that reference this
+ * Walk all backrefs for a given extent to find all roots that reference this
* extent. Walking a backref means finding all extents that reference this
* extent and in turn walk the backrefs of those, too. Naturally this is a
* recursive process, but here it is implemented in an iterative fashion: We
@@ -1479,195 +1718,113 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
* list. In turn, we find all referencing extents for those, further appending
* to the list. The way we iterate the list allows adding more elements after
* the current while iterating. The process stops when we reach the end of the
- * list. Found roots are added to the roots list.
+ * list.
+ *
+ * Found roots are added to @ctx->roots, which is allocated by this function if
+ * it points to NULL, in which case the caller is responsible for freeing it
+ * after it's not needed anymore.
+ * This function requires @ctx->refs to be NULL, as it uses it for allocating a
+ * ulist to do temporary work, and frees it before returning.
*
- * returns 0 on success, < 0 on error.
+ * Returns 0 on success, < 0 on error.
*/
-static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots,
- bool ignore_offset)
+static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx)
{
- struct ulist *tmp;
- struct ulist_node *node = NULL;
+ const u64 orig_bytenr = ctx->bytenr;
+ const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos;
+ bool roots_ulist_allocated = false;
struct ulist_iterator uiter;
- int ret;
+ int ret = 0;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
- *roots = ulist_alloc(GFP_NOFS);
- if (!*roots) {
- ulist_free(tmp);
+ ASSERT(ctx->refs == NULL);
+
+ ctx->refs = ulist_alloc(GFP_NOFS);
+ if (!ctx->refs)
return -ENOMEM;
+
+ if (!ctx->roots) {
+ ctx->roots = ulist_alloc(GFP_NOFS);
+ if (!ctx->roots) {
+ ulist_free(ctx->refs);
+ ctx->refs = NULL;
+ return -ENOMEM;
+ }
+ roots_ulist_allocated = true;
}
+ ctx->ignore_extent_item_pos = true;
+
ULIST_ITER_INIT(&uiter);
while (1) {
- ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
- tmp, *roots, NULL, NULL, ignore_offset);
+ struct ulist_node *node;
+
+ ret = find_parent_nodes(ctx, NULL);
if (ret < 0 && ret != -ENOENT) {
- ulist_free(tmp);
- ulist_free(*roots);
- *roots = NULL;
- return ret;
+ if (roots_ulist_allocated) {
+ ulist_free(ctx->roots);
+ ctx->roots = NULL;
+ }
+ break;
}
- node = ulist_next(tmp, &uiter);
+ ret = 0;
+ node = ulist_next(ctx->refs, &uiter);
if (!node)
break;
- bytenr = node->val;
+ ctx->bytenr = node->val;
cond_resched();
}
- ulist_free(tmp);
- return 0;
+ ulist_free(ctx->refs);
+ ctx->refs = NULL;
+ ctx->bytenr = orig_bytenr;
+ ctx->ignore_extent_item_pos = orig_ignore_extent_item_pos;
+
+ return ret;
}
-int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots,
+int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx,
bool skip_commit_root_sem)
{
int ret;
- if (!trans && !skip_commit_root_sem)
- down_read(&fs_info->commit_root_sem);
- ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
- time_seq, roots, false);
- if (!trans && !skip_commit_root_sem)
- up_read(&fs_info->commit_root_sem);
+ if (!ctx->trans && !skip_commit_root_sem)
+ down_read(&ctx->fs_info->commit_root_sem);
+ ret = btrfs_find_all_roots_safe(ctx);
+ if (!ctx->trans && !skip_commit_root_sem)
+ up_read(&ctx->fs_info->commit_root_sem);
return ret;
}
-/*
- * The caller has joined a transaction or is holding a read lock on the
- * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
- * snapshot field changing while updating or checking the cache.
- */
-static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
- struct btrfs_root *root,
- u64 bytenr, int level, bool *is_shared)
+struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void)
{
- struct btrfs_backref_shared_cache_entry *entry;
-
- if (!cache->use_cache)
- return false;
-
- if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
- return false;
-
- /*
- * Level -1 is used for the data extent, which is not reliable to cache
- * because its reference count can increase or decrease without us
- * realizing. We cache results only for extent buffers that lead from
- * the root node down to the leaf with the file extent item.
- */
- ASSERT(level >= 0);
-
- entry = &cache->entries[level];
+ struct btrfs_backref_share_check_ctx *ctx;
- /* Unused cache entry or being used for some other extent buffer. */
- if (entry->bytenr != bytenr)
- return false;
-
- /*
- * We cached a false result, but the last snapshot generation of the
- * root changed, so we now have a snapshot. Don't trust the result.
- */
- if (!entry->is_shared &&
- entry->gen != btrfs_root_last_snapshot(&root->root_item))
- return false;
-
- /*
- * If we cached a true result and the last generation used for dropping
- * a root changed, we can not trust the result, because the dropped root
- * could be a snapshot sharing this extent buffer.
- */
- if (entry->is_shared &&
- entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
- return false;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
- *is_shared = entry->is_shared;
- /*
- * If the node at this level is shared, than all nodes below are also
- * shared. Currently some of the nodes below may be marked as not shared
- * because we have just switched from one leaf to another, and switched
- * also other nodes above the leaf and below the current level, so mark
- * them as shared.
- */
- if (*is_shared) {
- for (int i = 0; i < level; i++) {
- cache->entries[i].is_shared = true;
- cache->entries[i].gen = entry->gen;
- }
- }
+ ulist_init(&ctx->refs);
- return true;
+ return ctx;
}
-/*
- * The caller has joined a transaction or is holding a read lock on the
- * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
- * snapshot field changing while updating or checking the cache.
- */
-static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
- struct btrfs_root *root,
- u64 bytenr, int level, bool is_shared)
+void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx)
{
- struct btrfs_backref_shared_cache_entry *entry;
- u64 gen;
-
- if (!cache->use_cache)
- return;
-
- if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ if (!ctx)
return;
- /*
- * Level -1 is used for the data extent, which is not reliable to cache
- * because its reference count can increase or decrease without us
- * realizing. We cache results only for extent buffers that lead from
- * the root node down to the leaf with the file extent item.
- */
- ASSERT(level >= 0);
-
- if (is_shared)
- gen = btrfs_get_last_root_drop_gen(root->fs_info);
- else
- gen = btrfs_root_last_snapshot(&root->root_item);
-
- entry = &cache->entries[level];
- entry->bytenr = bytenr;
- entry->is_shared = is_shared;
- entry->gen = gen;
-
- /*
- * If we found an extent buffer is shared, set the cache result for all
- * extent buffers below it to true. As nodes in the path are COWed,
- * their sharedness is moved to their children, and if a leaf is COWed,
- * then the sharedness of a data extent becomes direct, the refcount of
- * data extent is increased in the extent item at the extent tree.
- */
- if (is_shared) {
- for (int i = 0; i < level; i++) {
- entry = &cache->entries[i];
- entry->is_shared = is_shared;
- entry->gen = gen;
- }
- }
+ ulist_release(&ctx->refs);
+ kfree(ctx);
}
/*
* Check if a data extent is shared or not.
*
- * @root: The root the inode belongs to.
- * @inum: Number of the inode whose extent we are checking.
+ * @inode: The inode whose extent we are checking.
* @bytenr: Logical bytenr of the extent we are checking.
* @extent_gen: Generation of the extent (file extent item) or 0 if it is
* not known.
- * @roots: List of roots this extent is shared among.
- * @tmp: Temporary list used for iteration.
- * @cache: A backref lookup result cache.
+ * @ctx: A backref sharedness check context.
*
* btrfs_is_data_extent_shared uses the backref walking code but will short
* circuit as soon as it finds a root or inode that doesn't match the
@@ -1680,11 +1837,12 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
*
* Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
*/
-int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
u64 extent_gen,
- struct ulist *roots, struct ulist *tmp,
- struct btrfs_backref_shared_cache *cache)
+ struct btrfs_backref_share_check_ctx *ctx)
{
+ struct btrfs_backref_walk_ctx walk_ctx = { 0 };
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct ulist_iterator uiter;
@@ -1692,15 +1850,23 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
- .root_objectid = root->root_key.objectid,
- .inum = inum,
+ .ctx = ctx,
+ .root = root,
+ .inum = btrfs_ino(inode),
+ .data_bytenr = bytenr,
+ .data_extent_gen = extent_gen,
.share_count = 0,
+ .self_ref_count = 0,
.have_delayed_delete_refs = false,
};
int level;
- ulist_init(roots);
- ulist_init(tmp);
+ for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) {
+ if (ctx->prev_extents_cache[i].bytenr == bytenr)
+ return ctx->prev_extents_cache[i].is_shared;
+ }
+
+ ulist_init(&ctx->refs);
trans = btrfs_join_transaction_nostart(root);
if (IS_ERR(trans)) {
@@ -1712,40 +1878,36 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
down_read(&fs_info->commit_root_sem);
} else {
btrfs_get_tree_mod_seq(fs_info, &elem);
+ walk_ctx.time_seq = elem.seq;
}
+ walk_ctx.ignore_extent_item_pos = true;
+ walk_ctx.trans = trans;
+ walk_ctx.fs_info = fs_info;
+ walk_ctx.refs = &ctx->refs;
+
/* -1 means we are in the bytenr of the data extent. */
level = -1;
ULIST_ITER_INIT(&uiter);
- cache->use_cache = true;
+ ctx->use_path_cache = true;
while (1) {
bool is_shared;
bool cached;
- ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
- roots, NULL, &shared, false);
- if (ret == BACKREF_FOUND_SHARED) {
- /* this is the only condition under which we return 1 */
- ret = 1;
+ walk_ctx.bytenr = bytenr;
+ ret = find_parent_nodes(&walk_ctx, &shared);
+ if (ret == BACKREF_FOUND_SHARED ||
+ ret == BACKREF_FOUND_NOT_SHARED) {
+ /* If shared must return 1, otherwise return 0. */
+ ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0;
if (level >= 0)
- store_backref_shared_cache(cache, root, bytenr,
- level, true);
+ store_backref_shared_cache(ctx, root, bytenr,
+ level, ret == 1);
break;
}
if (ret < 0 && ret != -ENOENT)
break;
ret = 0;
- /*
- * If our data extent is not shared through reflinks and it was
- * created in a generation after the last one used to create a
- * snapshot of the inode's root, then it can not be shared
- * indirectly through subtrees, as that can only happen with
- * snapshots. In this case bail out, no need to check for the
- * sharedness of extent buffers.
- */
- if (level == -1 &&
- extent_gen > btrfs_root_last_snapshot(&root->root_item))
- break;
/*
* If our data extent was not directly shared (without multiple
@@ -1762,18 +1924,18 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
* deal with), we can not use it if we have multiple leaves
* (which implies multiple paths).
*/
- if (level == -1 && tmp->nnodes > 1)
- cache->use_cache = false;
+ if (level == -1 && ctx->refs.nnodes > 1)
+ ctx->use_path_cache = false;
if (level >= 0)
- store_backref_shared_cache(cache, root, bytenr,
+ store_backref_shared_cache(ctx, root, bytenr,
level, false);
- node = ulist_next(tmp, &uiter);
+ node = ulist_next(&ctx->refs, &uiter);
if (!node)
break;
bytenr = node->val;
level++;
- cached = lookup_backref_shared_cache(cache, root, bytenr, level,
+ cached = lookup_backref_shared_cache(ctx, root, bytenr, level,
&is_shared);
if (cached) {
ret = (is_shared ? 1 : 0);
@@ -1784,6 +1946,20 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
cond_resched();
}
+ /*
+ * Cache the sharedness result for the data extent if we know our inode
+ * has more than 1 file extent item that refers to the data extent.
+ */
+ if (ret >= 0 && shared.self_ref_count > 1) {
+ int slot = ctx->prev_extents_cache_slot;
+
+ ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr;
+ ctx->prev_extents_cache[slot].is_shared = (ret == 1);
+
+ slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE;
+ ctx->prev_extents_cache_slot = slot;
+ }
+
if (trans) {
btrfs_put_tree_mod_seq(fs_info, &elem);
btrfs_end_transaction(trans);
@@ -1791,8 +1967,9 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
up_read(&fs_info->commit_root_sem);
}
out:
- ulist_release(roots);
- ulist_release(tmp);
+ ulist_release(&ctx->refs);
+ ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr;
+
return ret;
}
@@ -2139,7 +2316,7 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
"ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu",
extent_item_objectid, eie->inum,
eie->offset, root);
- ret = iterate(eie->inum, eie->offset, root, ctx);
+ ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx);
if (ret) {
btrfs_debug(fs_info,
"stopping iteration for %llu due to ret=%d",
@@ -2156,82 +2333,128 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
* the given parameters.
* when the iterator function returns a non-zero value, iteration stops.
*/
-int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
- u64 extent_item_objectid, u64 extent_item_pos,
- int search_commit_root,
- iterate_extent_inodes_t *iterate, void *ctx,
- bool ignore_offset)
+int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
+ bool search_commit_root,
+ iterate_extent_inodes_t *iterate, void *user_ctx)
{
int ret;
- struct btrfs_trans_handle *trans = NULL;
- struct ulist *refs = NULL;
- struct ulist *roots = NULL;
- struct ulist_node *ref_node = NULL;
- struct ulist_node *root_node = NULL;
+ struct ulist *refs;
+ struct ulist_node *ref_node;
struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
struct ulist_iterator ref_uiter;
- struct ulist_iterator root_uiter;
- btrfs_debug(fs_info, "resolving all inodes for extent %llu",
- extent_item_objectid);
+ btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu",
+ ctx->bytenr);
+
+ ASSERT(ctx->trans == NULL);
+ ASSERT(ctx->roots == NULL);
if (!search_commit_root) {
- trans = btrfs_attach_transaction(fs_info->tree_root);
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_attach_transaction(ctx->fs_info->tree_root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT &&
PTR_ERR(trans) != -EROFS)
return PTR_ERR(trans);
trans = NULL;
}
+ ctx->trans = trans;
}
- if (trans)
- btrfs_get_tree_mod_seq(fs_info, &seq_elem);
- else
- down_read(&fs_info->commit_root_sem);
+ if (ctx->trans) {
+ btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem);
+ ctx->time_seq = seq_elem.seq;
+ } else {
+ down_read(&ctx->fs_info->commit_root_sem);
+ }
- ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
- seq_elem.seq, &refs,
- &extent_item_pos, ignore_offset);
+ ret = btrfs_find_all_leafs(ctx);
if (ret)
goto out;
+ refs = ctx->refs;
+ ctx->refs = NULL;
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
- ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
- seq_elem.seq, &roots,
- ignore_offset);
+ const u64 leaf_bytenr = ref_node->val;
+ struct ulist_node *root_node;
+ struct ulist_iterator root_uiter;
+ struct extent_inode_elem *inode_list;
+
+ inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux;
+
+ if (ctx->cache_lookup) {
+ const u64 *root_ids;
+ int root_count;
+ bool cached;
+
+ cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx,
+ &root_ids, &root_count);
+ if (cached) {
+ for (int i = 0; i < root_count; i++) {
+ ret = iterate_leaf_refs(ctx->fs_info,
+ inode_list,
+ root_ids[i],
+ leaf_bytenr,
+ iterate,
+ user_ctx);
+ if (ret)
+ break;
+ }
+ continue;
+ }
+ }
+
+ if (!ctx->roots) {
+ ctx->roots = ulist_alloc(GFP_NOFS);
+ if (!ctx->roots) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+
+ ctx->bytenr = leaf_bytenr;
+ ret = btrfs_find_all_roots_safe(ctx);
if (ret)
break;
+
+ if (ctx->cache_store)
+ ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx);
+
ULIST_ITER_INIT(&root_uiter);
- while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
- btrfs_debug(fs_info,
+ while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) {
+ btrfs_debug(ctx->fs_info,
"root %llu references leaf %llu, data list %#llx",
root_node->val, ref_node->val,
ref_node->aux);
- ret = iterate_leaf_refs(fs_info,
- (struct extent_inode_elem *)
- (uintptr_t)ref_node->aux,
- root_node->val,
- extent_item_objectid,
- iterate, ctx);
+ ret = iterate_leaf_refs(ctx->fs_info, inode_list,
+ root_node->val, ctx->bytenr,
+ iterate, user_ctx);
}
- ulist_free(roots);
+ ulist_reinit(ctx->roots);
}
free_leaf_list(refs);
out:
- if (trans) {
- btrfs_put_tree_mod_seq(fs_info, &seq_elem);
- btrfs_end_transaction(trans);
+ if (ctx->trans) {
+ btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem);
+ btrfs_end_transaction(ctx->trans);
+ ctx->trans = NULL;
} else {
- up_read(&fs_info->commit_root_sem);
+ up_read(&ctx->fs_info->commit_root_sem);
}
+ ulist_free(ctx->roots);
+ ctx->roots = NULL;
+
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP)
+ ret = 0;
+
return ret;
}
-static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx)
{
struct btrfs_data_container *inodes = ctx;
const size_t c = 3 * sizeof(u64);
@@ -2255,8 +2478,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
void *ctx, bool ignore_offset)
{
+ struct btrfs_backref_walk_ctx walk_ctx = { 0 };
int ret;
- u64 extent_item_pos;
u64 flags = 0;
struct btrfs_key found_key;
int search_commit_root = path->search_commit_root;
@@ -2268,12 +2491,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
return -EINVAL;
- extent_item_pos = logical - found_key.objectid;
- ret = iterate_extent_inodes(fs_info, found_key.objectid,
- extent_item_pos, search_commit_root,
- build_ino_list, ctx, ignore_offset);
+ walk_ctx.bytenr = found_key.objectid;
+ if (ignore_offset)
+ walk_ctx.ignore_extent_item_pos = true;
+ else
+ walk_ctx.extent_item_pos = logical - found_key.objectid;
+ walk_ctx.fs_info = fs_info;
- return ret;
+ return iterate_extent_inodes(&walk_ctx, search_commit_root,
+ build_ino_list, ctx);
}
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
@@ -2526,12 +2752,11 @@ void free_ipath(struct inode_fs_paths *ipath)
kfree(ipath);
}
-struct btrfs_backref_iter *btrfs_backref_iter_alloc(
- struct btrfs_fs_info *fs_info, gfp_t gfp_flag)
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info)
{
struct btrfs_backref_iter *ret;
- ret = kzalloc(sizeof(*ret), gfp_flag);
+ ret = kzalloc(sizeof(*ret), GFP_NOFS);
if (!ret)
return NULL;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 8e69584d538d..ef6bbea3f456 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -7,10 +7,128 @@
#define BTRFS_BACKREF_H
#include <linux/btrfs.h>
+#include "messages.h"
#include "ulist.h"
#include "disk-io.h"
#include "extent_io.h"
+/*
+ * Used by implementations of iterate_extent_inodes_t (see definition below) to
+ * signal that backref iteration can stop immediately and no error happened.
+ * The value must be non-negative and must not be 0, 1 (which is a common return
+ * value from things like btrfs_search_slot() and used internally in the backref
+ * walking code) and different from BACKREF_FOUND_SHARED and
+ * BACKREF_FOUND_NOT_SHARED
+ */
+#define BTRFS_ITERATE_EXTENT_INODES_STOP 5
+
+/*
+ * Should return 0 if no errors happened and iteration of backrefs should
+ * continue. Can return BTRFS_ITERATE_EXTENT_INODES_STOP or any other non-zero
+ * value to immediately stop iteration and possibly signal an error back to
+ * the caller.
+ */
+typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes,
+ u64 root, void *ctx);
+
+/*
+ * Context and arguments for backref walking functions. Some of the fields are
+ * to be filled by the caller of such functions while other are filled by the
+ * functions themselves, as described below.
+ */
+struct btrfs_backref_walk_ctx {
+ /*
+ * The address of the extent for which we are doing backref walking.
+ * Can be either a data extent or a metadata extent.
+ *
+ * Must always be set by the top level caller.
+ */
+ u64 bytenr;
+ /*
+ * Offset relative to the target extent. This is only used for data
+ * extents, and it's meaningful because we can have file extent items
+ * that point only to a section of a data extent ("bookend" extents),
+ * and we want to filter out any that don't point to a section of the
+ * data extent containing the given offset.
+ *
+ * Must always be set by the top level caller.
+ */
+ u64 extent_item_pos;
+ /*
+ * If true and bytenr corresponds to a data extent, then references from
+ * all file extent items that point to the data extent are considered,
+ * @extent_item_pos is ignored.
+ */
+ bool ignore_extent_item_pos;
+ /* A valid transaction handle or NULL. */
+ struct btrfs_trans_handle *trans;
+ /*
+ * The file system's info object, can not be NULL.
+ *
+ * Must always be set by the top level caller.
+ */
+ struct btrfs_fs_info *fs_info;
+ /*
+ * Time sequence acquired from btrfs_get_tree_mod_seq(), in case the
+ * caller joined the tree mod log to get a consistent view of b+trees
+ * while we do backref walking, or BTRFS_SEQ_LAST.
+ * When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses
+ * commit roots when searching b+trees - this is a special case for
+ * qgroups used during a transaction commit.
+ */
+ u64 time_seq;
+ /*
+ * Used to collect the bytenr of metadata extents that point to the
+ * target extent.
+ */
+ struct ulist *refs;
+ /*
+ * List used to collect the IDs of the roots from which the target
+ * extent is accessible. Can be NULL in case the caller does not care
+ * about collecting root IDs.
+ */
+ struct ulist *roots;
+ /*
+ * Used by iterate_extent_inodes() and the main backref walk code
+ * (find_parent_nodes()). Lookup and store functions for an optional
+ * cache which maps the logical address (bytenr) of leaves to an array
+ * of root IDs.
+ */
+ bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx,
+ const u64 **root_ids_ret, int *root_count_ret);
+ void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids,
+ void *user_ctx);
+ /*
+ * If this is not NULL, then the backref walking code will call this
+ * for each indirect data extent reference as soon as it finds one,
+ * before collecting all the remaining backrefs and before resolving
+ * indirect backrefs. This allows for the caller to terminate backref
+ * walking as soon as it finds one backref that matches some specific
+ * criteria. The @cache_lookup and @cache_store callbacks should not
+ * be NULL in order to use this callback.
+ */
+ iterate_extent_inodes_t *indirect_ref_iterator;
+ /*
+ * If this is not NULL, then the backref walking code will call this for
+ * each extent item it's meant to process before it actually starts
+ * processing it. If this returns anything other than 0, then it stops
+ * the backref walking code immediately.
+ */
+ int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei,
+ const struct extent_buffer *leaf, void *user_ctx);
+ /*
+ * If this is not NULL, then the backref walking code will call this for
+ * each extent data ref it finds (BTRFS_EXTENT_DATA_REF_KEY keys) before
+ * processing that data ref. If this callback return false, then it will
+ * ignore this data ref and it will never resolve the indirect data ref,
+ * saving time searching for leaves in a fs tree with file extent items
+ * matching the data ref.
+ */
+ bool (*skip_data_ref)(u64 root, u64 ino, u64 offset, void *user_ctx);
+ /* Context object to pass to the callbacks defined above. */
+ void *user_ctx;
+};
+
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
struct btrfs_root *fs_root;
@@ -23,17 +141,59 @@ struct btrfs_backref_shared_cache_entry {
bool is_shared;
};
-struct btrfs_backref_shared_cache {
+#define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8
+
+struct btrfs_backref_share_check_ctx {
+ /* Ulists used during backref walking. */
+ struct ulist refs;
+ /*
+ * The current leaf the caller of btrfs_is_data_extent_shared() is at.
+ * Typically the caller (at the moment only fiemap) tries to determine
+ * the sharedness of data extents point by file extent items from entire
+ * leaves.
+ */
+ u64 curr_leaf_bytenr;
+ /*
+ * The previous leaf the caller was at in the previous call to
+ * btrfs_is_data_extent_shared(). This may be the same as the current
+ * leaf. On the first call it must be 0.
+ */
+ u64 prev_leaf_bytenr;
/*
* A path from a root to a leaf that has a file extent item pointing to
* a given data extent should never exceed the maximum b+tree height.
*/
- struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
- bool use_cache;
+ struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL];
+ bool use_path_cache;
+ /*
+ * Cache the sharedness result for the last few extents we have found,
+ * but only for extents for which we have multiple file extent items
+ * that point to them.
+ * It's very common to have several file extent items that point to the
+ * same extent (bytenr) but with different offsets and lengths. This
+ * typically happens for COW writes, partial writes into prealloc
+ * extents, NOCOW writes after snapshoting a root, hole punching or
+ * reflinking within the same file (less common perhaps).
+ * So keep a small cache with the lookup results for the extent pointed
+ * by the last few file extent items. This cache is checked, with a
+ * linear scan, whenever btrfs_is_data_extent_shared() is called, so
+ * it must be small so that it does not negatively affect performance in
+ * case we don't have multiple file extent items that point to the same
+ * data extent.
+ */
+ struct {
+ u64 bytenr;
+ bool is_shared;
+ } prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE];
+ /*
+ * The slot in the prev_extents_cache array that will be used for
+ * storing the sharedness result of a new data extent.
+ */
+ int prev_extents_cache_slot;
};
-typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
- void *ctx);
+struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void);
+void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx);
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
@@ -43,11 +203,9 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_key *key, struct btrfs_extent_item *ei,
u32 item_size, u64 *out_root, u8 *out_level);
-int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
- u64 extent_item_objectid,
- u64 extent_offset, int search_commit_root,
- iterate_extent_inodes_t *iterate, void *ctx,
- bool ignore_offset);
+int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
+ bool search_commit_root,
+ iterate_extent_inodes_t *iterate, void *user_ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path, void *ctx,
@@ -55,13 +213,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
-int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **leafs,
- const u64 *extent_item_pos, bool ignore_offset);
-int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots,
+int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx);
+int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx,
bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
@@ -77,10 +230,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
-int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
u64 extent_gen,
- struct ulist *roots, struct ulist *tmp,
- struct btrfs_backref_shared_cache *cache);
+ struct btrfs_backref_share_check_ctx *ctx);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
@@ -111,8 +263,7 @@ struct btrfs_backref_iter {
u32 end_ptr;
};
-struct btrfs_backref_iter *btrfs_backref_iter_alloc(
- struct btrfs_fs_info *fs_info, gfp_t gfp_flag);
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info);
static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
{
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
new file mode 100644
index 000000000000..b8fb7ef6b520
--- /dev/null
+++ b/fs/btrfs/bio.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2022 Christoph Hellwig.
+ */
+
+#include <linux/bio.h>
+#include "bio.h"
+#include "ctree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "dev-replace.h"
+#include "rcu-string.h"
+#include "zoned.h"
+
+static struct bio_set btrfs_bioset;
+
+/*
+ * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
+ * is already initialized by the block layer.
+ */
+static inline void btrfs_bio_init(struct btrfs_bio *bbio,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
+ bbio->end_io = end_io;
+ bbio->private = private;
+}
+
+/*
+ * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
+ * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
+ *
+ * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
+ * a mempool.
+ */
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio), end_io, private);
+ return bio;
+}
+
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct bio *bio;
+ struct btrfs_bio *bbio;
+
+ ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
+ bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio, end_io, private);
+
+ bio_trim(bio, offset >> 9, size >> 9);
+ bbio->iter = bio->bi_iter;
+ return bio;
+}
+
+static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+{
+ if (!dev || !dev->bdev)
+ return;
+ if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
+ return;
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ if (!(bio->bi_opf & REQ_RAHEAD))
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ if (bio->bi_opf & REQ_PREFLUSH)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
+}
+
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
+ struct bio *bio)
+{
+ if (bio->bi_opf & REQ_META)
+ return fs_info->endio_meta_workers;
+ return fs_info->endio_workers;
+}
+
+static void btrfs_end_bio_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
+
+ bbio->end_io(bbio);
+}
+
+static void btrfs_simple_end_io(struct bio *bio)
+{
+ struct btrfs_fs_info *fs_info = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(fs_info);
+
+ if (bio->bi_status)
+ btrfs_log_dev_io_error(bio, bbio->device);
+
+ if (bio_op(bio) == REQ_OP_READ) {
+ INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
+ queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
+ } else {
+ bbio->end_io(bbio);
+ }
+}
+
+static void btrfs_raid56_end_io(struct bio *bio)
+{
+ struct btrfs_io_context *bioc = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
+ bbio->mirror_num = bioc->mirror_num;
+ bbio->end_io(bbio);
+
+ btrfs_put_bioc(bioc);
+}
+
+static void btrfs_orig_write_end_io(struct bio *bio)
+{
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+ struct btrfs_io_context *bioc = stripe->bioc;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
+
+ if (bio->bi_status) {
+ atomic_inc(&bioc->error);
+ btrfs_log_dev_io_error(bio, stripe->dev);
+ }
+
+ /*
+ * Only send an error to the higher layers if it is beyond the tolerance
+ * threshold.
+ */
+ if (atomic_read(&bioc->error) > bioc->max_errors)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_status = BLK_STS_OK;
+
+ bbio->end_io(bbio);
+ btrfs_put_bioc(bioc);
+}
+
+static void btrfs_clone_write_end_io(struct bio *bio)
+{
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+
+ if (bio->bi_status) {
+ atomic_inc(&stripe->bioc->error);
+ btrfs_log_dev_io_error(bio, stripe->dev);
+ }
+
+ /* Pass on control to the original bio this one was cloned from */
+ bio_endio(stripe->bioc->orig_bio);
+ bio_put(bio);
+}
+
+static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
+{
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+ (btrfs_op(bio) == BTRFS_MAP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+ bio_io_error(bio);
+ return;
+ }
+
+ bio_set_dev(bio, dev->bdev);
+
+ /*
+ * For zone append writing, bi_sector must point the beginning of the
+ * zone
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+ if (btrfs_dev_is_sequential(dev, physical)) {
+ u64 zone_start = round_down(physical,
+ dev->fs_info->zone_size);
+
+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
+ } else {
+ bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
+ bio->bi_opf |= REQ_OP_WRITE;
+ }
+ }
+ btrfs_debug_in_rcu(dev->fs_info,
+ "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
+ __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
+ (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
+ dev->devid, bio->bi_iter.bi_size);
+
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
+}
+
+static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
+{
+ struct bio *orig_bio = bioc->orig_bio, *bio;
+
+ ASSERT(bio_op(orig_bio) != REQ_OP_READ);
+
+ /* Reuse the bio embedded into the btrfs_bio for the last mirror */
+ if (dev_nr == bioc->num_stripes - 1) {
+ bio = orig_bio;
+ bio->bi_end_io = btrfs_orig_write_end_io;
+ } else {
+ bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+ bio_inc_remaining(orig_bio);
+ bio->bi_end_io = btrfs_clone_write_end_io;
+ }
+
+ bio->bi_private = &bioc->stripes[dev_nr];
+ bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
+ bioc->stripes[dev_nr].bioc = bioc;
+ btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
+}
+
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
+{
+ u64 logical = bio->bi_iter.bi_sector << 9;
+ u64 length = bio->bi_iter.bi_size;
+ u64 map_length = length;
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_io_stripe smap;
+ int ret;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num, 1);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ return;
+ }
+
+ if (map_length < length) {
+ btrfs_crit(fs_info,
+ "mapping failed logical %llu bio len %llu len %llu",
+ logical, length, map_length);
+ BUG();
+ }
+
+ if (!bioc) {
+ /* Single mirror read/write fast path */
+ btrfs_bio(bio)->mirror_num = mirror_num;
+ btrfs_bio(bio)->device = smap.dev;
+ bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
+ bio->bi_private = fs_info;
+ bio->bi_end_io = btrfs_simple_end_io;
+ btrfs_submit_dev_bio(smap.dev, bio);
+ } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ /* Parity RAID write or read recovery */
+ bio->bi_private = bioc;
+ bio->bi_end_io = btrfs_raid56_end_io;
+ if (bio_op(bio) == REQ_OP_READ)
+ raid56_parity_recover(bio, bioc, mirror_num);
+ else
+ raid56_parity_write(bio, bioc);
+ } else {
+ /* Write to multiple mirrors */
+ int total_devs = bioc->num_stripes;
+ int dev_nr;
+
+ bioc->orig_bio = bio;
+ for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+ btrfs_submit_mirrored_bio(bioc, dev_nr);
+ }
+}
+
+/*
+ * Submit a repair write.
+ *
+ * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
+ * RAID setup. Here we only want to write the one bad copy, so we do the
+ * mapping ourselves and submit the bio directly.
+ *
+ * The I/O is issued sychronously to block the repair read completion from
+ * freeing the bio.
+ */
+int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
+{
+ struct btrfs_device *dev;
+ struct bio_vec bvec;
+ struct bio bio;
+ u64 map_length = 0;
+ u64 sector;
+ struct btrfs_io_context *bioc = NULL;
+ int ret = 0;
+
+ ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
+ BUG_ON(!mirror_num);
+
+ if (btrfs_repair_one_zone(fs_info, logical))
+ return 0;
+
+ map_length = length;
+
+ /*
+ * Avoid races with device replace and make sure our bioc has devices
+ * associated to its stripes that don't go away while we are doing the
+ * read repair operation.
+ */
+ btrfs_bio_counter_inc_blocked(fs_info);
+ if (btrfs_is_parity_mirror(fs_info, logical, length)) {
+ /*
+ * Note that we don't use BTRFS_MAP_WRITE because it's supposed
+ * to update all raid stripes, but here we just want to correct
+ * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
+ * stripe's dev and sector.
+ */
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &map_length, &bioc, 0);
+ if (ret)
+ goto out_counter_dec;
+ ASSERT(bioc->mirror_num == 1);
+ } else {
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
+ &map_length, &bioc, mirror_num);
+ if (ret)
+ goto out_counter_dec;
+ BUG_ON(mirror_num != bioc->mirror_num);
+ }
+
+ sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
+ dev = bioc->stripes[bioc->mirror_num - 1].dev;
+ btrfs_put_bioc(bioc);
+
+ if (!dev || !dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+ ret = -EIO;
+ goto out_counter_dec;
+ }
+
+ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+ bio.bi_iter.bi_sector = sector;
+ __bio_add_page(&bio, page, length, pg_offset);
+
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ if (ret) {
+ /* try to remap that extent elsewhere? */
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ goto out_bio_uninit;
+ }
+
+ btrfs_info_rl_in_rcu(fs_info,
+ "read error corrected: ino %llu off %llu (dev %s sector %llu)",
+ ino, start, btrfs_dev_name(dev), sector);
+ ret = 0;
+
+out_bio_uninit:
+ bio_uninit(&bio);
+out_counter_dec:
+ btrfs_bio_counter_dec(fs_info);
+ return ret;
+}
+
+int __init btrfs_bioset_init(void)
+{
+ if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold btrfs_bioset_exit(void)
+{
+ bioset_exit(&btrfs_bioset);
+}
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
new file mode 100644
index 000000000000..b12f84b3b341
--- /dev/null
+++ b/fs/btrfs/bio.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2022 Christoph Hellwig.
+ */
+
+#ifndef BTRFS_BIO_H
+#define BTRFS_BIO_H
+
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include "tree-checker.h"
+
+struct btrfs_bio;
+struct btrfs_fs_info;
+
+#define BTRFS_BIO_INLINE_CSUM_SIZE 64
+
+/*
+ * Maximum number of sectors for a single bio to limit the size of the
+ * checksum array. This matches the number of bio_vecs per bio and thus the
+ * I/O size for buffered I/O.
+ */
+#define BTRFS_MAX_BIO_SECTORS (256)
+
+typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
+
+/*
+ * Additional info to pass along bio.
+ *
+ * Mostly for btrfs specific features like csum and mirror_num.
+ */
+struct btrfs_bio {
+ unsigned int mirror_num:7;
+
+ /*
+ * Extra indicator for metadata bios.
+ * For some btrfs bios they use pages without a mapping, thus
+ * we can not rely on page->mapping->host to determine if
+ * it's a metadata bio.
+ */
+ unsigned int is_metadata:1;
+ struct bvec_iter iter;
+
+ /* for direct I/O */
+ u64 file_offset;
+
+ /* @device is for stripe IO submission. */
+ struct btrfs_device *device;
+ union {
+ /* For data checksum verification. */
+ struct {
+ u8 *csum;
+ u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+ };
+
+ /* For metadata parentness verification. */
+ struct btrfs_tree_parent_check parent_check;
+ };
+
+ /* End I/O information supplied to btrfs_bio_alloc */
+ btrfs_bio_end_io_t end_io;
+ void *private;
+
+ /* For read end I/O handling */
+ struct work_struct end_io_work;
+
+ /*
+ * This member must come last, bio_alloc_bioset will allocate enough
+ * bytes for entire btrfs_bio but relies on bio being last.
+ */
+ struct bio bio;
+};
+
+static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
+{
+ return container_of(bio, struct btrfs_bio, bio);
+}
+
+int __init btrfs_bioset_init(void);
+void __cold btrfs_bioset_exit(void);
+
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private);
+
+
+static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
+{
+ bbio->bio.bi_status = status;
+ bbio->end_io(bbio);
+}
+
+static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
+{
+ if (bbio->is_metadata)
+ return;
+ if (bbio->csum != bbio->csum_inline) {
+ kfree(bbio->csum);
+ bbio->csum = NULL;
+ }
+}
+
+/*
+ * Iterate through a btrfs_bio (@bbio) on a per-sector basis.
+ *
+ * bvl - struct bio_vec
+ * bbio - struct btrfs_bio
+ * iters - struct bvec_iter
+ * bio_offset - unsigned int
+ */
+#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
+ for ((iter) = (bbio)->iter, (bio_offset) = 0; \
+ (iter).bi_size && \
+ (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
+ (bio_offset) += fs_info->sectorsize, \
+ bio_advance_iter_single(&(bbio)->bio, &(iter), \
+ (fs_info)->sectorsize))
+
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+ int mirror_num);
+int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num);
+
+#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index deebc8ddbd93..708d843daa72 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -17,6 +17,21 @@
#include "discard.h"
#include "raid56.h"
#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+
+#ifdef CONFIG_BTRFS_DEBUG
+int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+
+ return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -284,7 +299,7 @@ struct btrfs_block_group *btrfs_next_block_group(
return cache;
}
-/**
+/*
* Check if we can do a NOCOW write for a given extent.
*
* @fs_info: The filesystem information object.
@@ -325,11 +340,9 @@ struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
return bg;
}
-/**
+/*
* Decrement the number of NOCOW writers in a block group.
*
- * @bg: The block group.
- *
* This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
* and on the block group returned by that call. Typically this is called after
* creating an ordered extent for a NOCOW write, to prevent races with scrub and
@@ -1527,6 +1540,30 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
return true;
}
+static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
+{
+ const struct btrfs_space_info *space_info = bg->space_info;
+ const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const u64 new_val = bg->used;
+ const u64 old_val = new_val + bytes_freed;
+ u64 thresh;
+
+ if (reclaim_thresh == 0)
+ return false;
+
+ thresh = mult_perc(bg->length, reclaim_thresh);
+
+ /*
+ * If we were below the threshold before don't reclaim, we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh)
+ return false;
+ if (new_val >= thresh)
+ return false;
+ return true;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@@ -1594,6 +1631,40 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
up_write(&space_info->groups_sem);
goto next;
}
+ if (bg->used == 0) {
+ /*
+ * It is possible that we trigger relocation on a block
+ * group as its extents are deleted and it first goes
+ * below the threshold, then shortly after goes empty.
+ *
+ * In this case, relocating it does delete it, but has
+ * some overhead in relocation specific metadata, looking
+ * for the non-existent extents and running some extra
+ * transactions, which we can avoid by using one of the
+ * other mechanisms for dealing with empty block groups.
+ */
+ if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(bg);
+ spin_unlock(&bg->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+
+ }
+ /*
+ * The block group might no longer meet the reclaim condition by
+ * the time we get around to reclaiming it, so to avoid
+ * reclaiming overly full block_groups, skip reclaiming them.
+ *
+ * Since the decision making process also depends on the amount
+ * being freed, pass in a fake giant value to skip that extra
+ * check, which is more meaningful when adding to the list in
+ * the first place.
+ */
+ if (!should_reclaim_block_group(bg, bg->length)) {
+ spin_unlock(&bg->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
spin_unlock(&bg->lock);
/* Get out fast, in case we're unmounting the filesystem */
@@ -1740,8 +1811,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
write_sequnlock(&fs_info->profiles_lock);
}
-/**
- * Map a physical disk address to a list of logical addresses
+/*
+ * Map a physical disk address to a list of logical addresses.
*
* @fs_info: the filesystem
* @chunk_start: logical address of block group
@@ -2001,6 +2072,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->length = key->offset;
cache->used = btrfs_stack_block_group_used(bgi);
+ cache->commit_used = cache->used;
cache->flags = btrfs_stack_block_group_flags(bgi);
cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
@@ -2481,7 +2553,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
- cache->needs_free_space = 1;
+ set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
ret = btrfs_load_block_group_zone_info(cache, true);
if (ret) {
@@ -2692,6 +2764,25 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_block_group_item bgi;
struct btrfs_key key;
+ u64 old_commit_used;
+ u64 used;
+
+ /*
+ * Block group items update can be triggered out of commit transaction
+ * critical section, thus we need a consistent view of used bytes.
+ * We cannot use cache->used directly outside of the spin lock, as it
+ * may be changed.
+ */
+ spin_lock(&cache->lock);
+ old_commit_used = cache->commit_used;
+ used = cache->used;
+ /* No change in used bytes, can safely skip it. */
+ if (cache->commit_used == used) {
+ spin_unlock(&cache->lock);
+ return 0;
+ }
+ cache->commit_used = used;
+ spin_unlock(&cache->lock);
key.objectid = cache->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -2706,7 +2797,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
- btrfs_set_stack_block_group_used(&bgi, cache->used);
+ btrfs_set_stack_block_group_used(&bgi, used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
@@ -2714,6 +2805,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
fail:
btrfs_release_path(path);
+ /* We didn't update the block group item, need to revert @commit_used. */
+ if (ret < 0) {
+ spin_lock(&cache->lock);
+ cache->commit_used = old_commit_used;
+ spin_unlock(&cache->lock);
+ }
return ret;
}
@@ -3211,31 +3308,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
return ret;
}
-static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
- u64 bytes_freed)
-{
- const struct btrfs_space_info *space_info = bg->space_info;
- const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
- const u64 new_val = bg->used;
- const u64 old_val = new_val + bytes_freed;
- u64 thresh;
-
- if (reclaim_thresh == 0)
- return false;
-
- thresh = div_factor_fine(bg->length, reclaim_thresh);
-
- /*
- * If we were below the threshold before don't reclaim, we are likely a
- * brand new block group and we don't want to relocate new block groups.
- */
- if (old_val < thresh)
- return false;
- if (new_val >= thresh)
- return false;
- return true;
-}
-
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
@@ -3347,8 +3419,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
return ret;
}
-/**
- * btrfs_add_reserved_bytes - update the block_group and space info counters
+/*
+ * Update the block_group and space info counters.
+ *
* @cache: The cache we are manipulating
* @ram_bytes: The number of bytes of file content, and will be same to
* @num_bytes except for the compress path.
@@ -3391,8 +3464,9 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
return ret;
}
-/**
- * btrfs_free_reserved_bytes - update the block_group and space info counters
+/*
+ * Update the block_group and space info counters.
+ *
* @cache: The cache we are manipulating
* @num_bytes: The number of bytes in question
* @delalloc: The blocks are allocated for the delalloc write
@@ -3449,13 +3523,13 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
*/
if (force == CHUNK_ALLOC_LIMITED) {
thresh = btrfs_super_total_bytes(fs_info->super_copy);
- thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
+ thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
if (sinfo->total_bytes - bytes_used < thresh)
return 1;
}
- if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
+ if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
return 0;
return 1;
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 8fb14b99a1d1..a02ea76fd6cf 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -55,6 +55,10 @@ enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ /* Does the block group need to be added to the free space tree? */
+ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ /* Indicate that the block group is placed on a sequential zone */
+ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
};
enum btrfs_caching_type {
@@ -100,6 +104,12 @@ struct btrfs_block_group {
u64 global_root_id;
/*
+ * The last committed used bytes of this block group, if the above @used
+ * is still the same as @commit_used, we don't need to update block
+ * group item of this block group.
+ */
+ u64 commit_used;
+ /*
* If the free space extent count exceeds this number, convert the block
* group to bitmaps.
*/
@@ -203,15 +213,6 @@ struct btrfs_block_group {
struct mutex free_space_lock;
/*
- * Does the block group need to be added to the free space tree?
- * Protected by free_space_lock.
- */
- int needs_free_space;
-
- /* Flag indicating this block group is placed on a sequential zone */
- bool seq_zone;
-
- /*
* Number of extents in this block group used for swap files.
* All accesses protected by the spinlock 'lock'.
*/
@@ -251,16 +252,7 @@ static inline bool btrfs_is_block_group_data_only(
}
#ifdef CONFIG_BTRFS_DEBUG
-static inline int btrfs_should_fragment_free_space(
- struct btrfs_block_group *block_group)
-{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
-
- return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
- block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
- (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
- block_group->flags & BTRFS_BLOCK_GROUP_DATA);
-}
+int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group);
#endif
struct btrfs_block_group *btrfs_lookup_first_block_group(
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index ec96285357e0..5367a14d44d2 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -7,6 +7,8 @@
#include "transaction.h"
#include "block-group.h"
#include "disk-io.h"
+#include "fs.h"
+#include "accessors.h"
/*
* HOW DO BLOCK RESERVES WORK
@@ -225,7 +227,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
return ret;
}
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
@@ -234,7 +236,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
return 0;
spin_lock(&block_rsv->lock);
- num_bytes = div_factor(block_rsv->size, min_factor);
+ num_bytes = mult_perc(block_rsv->size, min_percent);
if (block_rsv->reserved >= num_bytes)
ret = 0;
spin_unlock(&block_rsv->lock);
@@ -323,31 +325,6 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *dest, u64 num_bytes,
- int min_factor)
-{
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- u64 min_bytes;
-
- if (global_rsv->space_info != dest->space_info)
- return -ENOSPC;
-
- spin_lock(&global_rsv->lock);
- min_bytes = div_factor(global_rsv->size, min_factor);
- if (global_rsv->reserved < min_bytes + num_bytes) {
- spin_unlock(&global_rsv->lock);
- return -ENOSPC;
- }
- global_rsv->reserved -= num_bytes;
- if (global_rsv->reserved < global_rsv->size)
- global_rsv->full = false;
- spin_unlock(&global_rsv->lock);
-
- btrfs_block_rsv_add_bytes(dest, num_bytes, true);
- return 0;
-}
-
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
@@ -552,5 +529,17 @@ try_reserve:
if (!ret)
return global_rsv;
}
+
+ /*
+ * All hope is lost, but of course our reservations are overly
+ * pessimistic, so instead of possibly having an ENOSPC abort here, try
+ * one last time to force a reservation if there's enough actual space
+ * on disk to make the reservation.
+ */
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
+ BTRFS_RESERVE_FLUSH_EMERGENCY);
+ if (!ret)
+ return block_rsv;
+
return ERR_PTR(ret);
}
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 578c3497a455..4cc41c9aaa82 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -4,6 +4,7 @@
#define BTRFS_BLOCK_RSV_H
struct btrfs_trans_handle;
+struct btrfs_root;
enum btrfs_reserve_flush_enum;
/*
@@ -62,7 +63,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent);
int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
@@ -70,9 +71,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
bool update_size);
int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes);
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *dest, u64 num_bytes,
- int min_factor);
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes, bool update_size);
u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 54c2ccb36b61..195c09e20609 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -411,29 +411,142 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
-static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
- u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
-{
- struct btrfs_root *root = inode->root;
- const u32 csum_size = root->fs_info->csum_size;
-
- /* Output minus objectid, which is more meaningful */
- if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
- btrfs_warn_rl(root->fs_info,
-"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
- logical_start,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
- mirror_num);
- else
- btrfs_warn_rl(root->fs_info,
-"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
- logical_start,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
- mirror_num);
-}
+void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
+void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
+void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
+blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio);
+blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode,
+ struct bio *bio,
+ u64 dio_file_offset);
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected);
+int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end);
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes, bool nowait, bool strict);
+
+void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode);
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
+int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir, struct btrfs_inode *inode,
+ const struct fscrypt_str *name);
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
+ const struct fscrypt_str *name, int add_backref, u64 index);
+int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+ int front);
+
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
+ bool in_reclaim_context);
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+ unsigned int extra_bits,
+ struct extent_state **cached_state);
+
+struct btrfs_new_inode_args {
+ /* Input */
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool orphan;
+ bool subvol;
+
+ /* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). */
+ struct posix_acl *default_acl;
+ struct posix_acl *acl;
+ struct fscrypt_name fname;
+};
+
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items);
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args);
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir);
+ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
+ u32 bits);
+void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
+ struct extent_state *state, u32 bits);
+void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
+ struct extent_state *other);
+void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
+ struct extent_state *orig, u64 split);
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
+vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
+void btrfs_evict_inode(struct inode *inode);
+struct inode *btrfs_alloc_inode(struct super_block *sb);
+void btrfs_destroy_inode(struct inode *inode);
+void btrfs_free_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
+int __init btrfs_init_cachep(void);
+void __cold btrfs_destroy_cachep(void);
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
+ struct btrfs_root *root, struct btrfs_path *path);
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
+struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
+ struct page *page, size_t pg_offset,
+ u64 start, u64 end);
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode);
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode);
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
+int btrfs_orphan_cleanup(struct btrfs_root *root);
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
+void btrfs_add_delayed_iput(struct btrfs_inode *inode);
+void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+ struct btrfs_trans_handle *trans, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written, struct writeback_control *wbc);
+int btrfs_writepage_cow_fixup(struct page *page);
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
+ u64 end, bool uptodate);
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type);
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages);
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+
+ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
+
+extern const struct dentry_operations btrfs_dentry_operations;
+
+/* Inode locking type flags, by default the exclusive lock is taken. */
+enum btrfs_ilock_type {
+ ENUM_BIT(BTRFS_ILOCK_SHARED),
+ ENUM_BIT(BTRFS_ILOCK_TRY),
+ ENUM_BIT(BTRFS_ILOCK_MMAP),
+};
+
+int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags);
+void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags);
+void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes,
+ const u64 del_bytes);
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 98c6e5feab19..82e49d985019 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -82,6 +82,7 @@
#include <linux/mm.h>
#include <linux/string.h>
#include <crypto/hash.h>
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -92,6 +93,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "compression.h"
+#include "accessors.h"
#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
@@ -755,7 +757,7 @@ static int btrfsic_process_superblock_dev_mirror(
btrfs_info_in_rcu(fs_info,
"new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
superblock_bdev,
- rcu_str_deref(device->name), dev_bytenr,
+ btrfs_dev_name(device), dev_bytenr,
dev_state->bdev, dev_bytenr,
superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e6635fe70067..5122ca79f7ea 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -23,16 +23,19 @@
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
+#include "fs.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "volumes.h"
+#include "bio.h"
#include "ordered-data.h"
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
#include "subpage.h"
#include "zoned.h"
+#include "file-item.h"
+#include "super.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -116,7 +119,7 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
- unsigned char *data_in, struct page *dest_page,
+ const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen)
{
switch (type) {
@@ -183,7 +186,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
u64 start = bbio->file_offset + offset;
if (!status &&
- (!csum || !btrfs_check_data_csum(inode, bbio, offset,
+ (!csum || !btrfs_check_data_csum(bi, bbio, offset,
bv.bv_page, bv.bv_offset))) {
btrfs_clean_io_failure(bi, start, bv.bv_page,
bv.bv_offset);
@@ -191,9 +194,9 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
int ret;
refcount_inc(&cb->pending_ios);
- ret = btrfs_repair_one_sector(inode, bbio, offset,
+ ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset,
bv.bv_page, bv.bv_offset,
- btrfs_submit_data_read_bio);
+ true);
if (ret) {
refcount_dec(&cb->pending_ios);
status = errno_to_blk_status(ret);
@@ -1229,7 +1232,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* single page, and we want to read a single page out of it.
* start_byte tells us the offset into the compressed data we're interested in
*/
-int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen)
{
struct list_head *workspace;
@@ -1243,12 +1246,13 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
return ret;
}
-void __init btrfs_init_compress(void)
+int __init btrfs_init_compress(void)
{
btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_init_workspace_manager();
+ return 0;
}
void __cold btrfs_exit_compress(void)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 1aa02903de69..6209d40a1e08 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -6,6 +6,7 @@
#ifndef BTRFS_COMPRESSION_H
#define BTRFS_COMPRESSION_H
+#include <linux/blk_types.h>
#include <linux/sizes.h>
struct btrfs_inode;
@@ -77,7 +78,7 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level)
return ((type_level & 0xF0) >> 4);
}
-void __init btrfs_init_compress(void);
+int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
@@ -85,7 +86,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out);
-int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
@@ -149,7 +150,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
@@ -160,7 +161,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
@@ -170,7 +171,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen);
void zstd_init_workspace_manager(void);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b39b339fbf96..4754c9101a4c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -8,6 +8,7 @@
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -17,6 +18,13 @@
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "relocation.h"
+#include "file-item.h"
+
+static struct kmem_cache *btrfs_path_cachep;
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -44,6 +52,104 @@ static const struct btrfs_csums {
.driver = "blake2b-256" },
};
+/*
+ * The leaf data grows from end-to-front in the node. this returns the address
+ * of the start of the last item, which is the stop of the leaf data stack.
+ */
+static unsigned int leaf_data_end(const struct extent_buffer *leaf)
+{
+ u32 nr = btrfs_header_nritems(leaf);
+
+ if (nr == 0)
+ return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
+ return btrfs_item_offset(leaf, nr - 1);
+}
+
+/*
+ * Move data in a @leaf (using memmove, safe for overlapping ranges).
+ *
+ * @leaf: leaf that we're doing a memmove on
+ * @dst_offset: item data offset we're moving to
+ * @src_offset: item data offset were' moving from
+ * @len: length of the data we're moving
+ *
+ * Wrapper around memmove_extent_buffer() that takes into account the header on
+ * the leaf. The btrfs_item offset's start directly after the header, so we
+ * have to adjust any offsets to account for the header in the leaf. This
+ * handles that math to simplify the callers.
+ */
+static inline void memmove_leaf_data(const struct extent_buffer *leaf,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ unsigned long len)
+{
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
+ btrfs_item_nr_offset(leaf, 0) + src_offset, len);
+}
+
+/*
+ * Copy item data from @src into @dst at the given @offset.
+ *
+ * @dst: destination leaf that we're copying into
+ * @src: source leaf that we're copying from
+ * @dst_offset: item data offset we're copying to
+ * @src_offset: item data offset were' copying from
+ * @len: length of the data we're copying
+ *
+ * Wrapper around copy_extent_buffer() that takes into account the header on
+ * the leaf. The btrfs_item offset's start directly after the header, so we
+ * have to adjust any offsets to account for the header in the leaf. This
+ * handles that math to simplify the callers.
+ */
+static inline void copy_leaf_data(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
+ unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len)
+{
+ copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
+ btrfs_item_nr_offset(src, 0) + src_offset, len);
+}
+
+/*
+ * Move items in a @leaf (using memmove).
+ *
+ * @dst: destination leaf for the items
+ * @dst_item: the item nr we're copying into
+ * @src_item: the item nr we're copying from
+ * @nr_items: the number of items to copy
+ *
+ * Wrapper around memmove_extent_buffer() that does the math to get the
+ * appropriate offsets into the leaf from the item numbers.
+ */
+static inline void memmove_leaf_items(const struct extent_buffer *leaf,
+ int dst_item, int src_item, int nr_items)
+{
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
+ btrfs_item_nr_offset(leaf, src_item),
+ nr_items * sizeof(struct btrfs_item));
+}
+
+/*
+ * Copy items from @src into @dst at the given @offset.
+ *
+ * @dst: destination leaf for the items
+ * @src: source leaf for the items
+ * @dst_item: the item nr we're copying into
+ * @src_item: the item nr we're copying from
+ * @nr_items: the number of items to copy
+ *
+ * Wrapper around copy_extent_buffer() that does the math to get the
+ * appropriate offsets into the leaf from the item numbers.
+ */
+static inline void copy_leaf_items(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
+ int dst_item, int src_item, int nr_items)
+{
+ copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
+ btrfs_item_nr_offset(src, src_item),
+ nr_items * sizeof(struct btrfs_item));
+}
+
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
u16 t = btrfs_super_csum_type(s);
@@ -78,6 +184,8 @@ size_t __attribute_const__ btrfs_get_num_csums(void)
struct btrfs_path *btrfs_alloc_path(void)
{
+ might_sleep();
+
return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
@@ -114,6 +222,22 @@ noinline void btrfs_release_path(struct btrfs_path *p)
}
/*
+ * We want the transaction abort to print stack trace only for errors where the
+ * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
+ * caused by external factors.
+ */
+bool __cold abort_should_print_stack(int errno)
+{
+ switch (errno) {
+ case -EIO:
+ case -EROFS:
+ case -ENOMEM:
+ return false;
+ }
+ return true;
+}
+
+/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
* at the root of the tree. See btrfs_lock_root_node for the
@@ -471,7 +595,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
btrfs_tree_mod_log_insert_key(parent, parent_slot,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REPLACE);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
@@ -834,19 +958,22 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int slot)
{
int level = btrfs_header_level(parent);
+ struct btrfs_tree_parent_check check = { 0 };
struct extent_buffer *eb;
- struct btrfs_key first_key;
if (slot < 0 || slot >= btrfs_header_nritems(parent))
return ERR_PTR(-ENOENT);
BUG_ON(level == 0);
- btrfs_node_key_to_cpu(parent, &first_key, slot);
+ check.level = level - 1;
+ check.transid = btrfs_node_ptr_generation(parent, slot);
+ check.owner_root = btrfs_header_owner(parent);
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(parent, &check.first_key, slot);
+
eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
- btrfs_header_owner(parent),
- btrfs_node_ptr_generation(parent, slot),
- level - 1, &first_key);
+ &check);
if (IS_ERR(eb))
return eb;
if (!extent_buffer_uptodate(eb)) {
@@ -1000,7 +1127,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -1046,7 +1173,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -1148,7 +1275,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -1202,7 +1329,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -1405,10 +1532,10 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
const struct btrfs_key *key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_tree_parent_check check = { 0 };
u64 blocknr;
u64 gen;
struct extent_buffer *tmp;
- struct btrfs_key first_key;
int ret;
int parent_level;
bool unlock_up;
@@ -1417,7 +1544,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
blocknr = btrfs_node_blockptr(*eb_ret, slot);
gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
- btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
+ btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
+ check.has_first_key = true;
+ check.level = parent_level - 1;
+ check.transid = gen;
+ check.owner_root = root->root_key.objectid;
/*
* If we need to read an extent buffer from disk and we are holding locks
@@ -1439,7 +1570,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* parents (shared tree blocks).
*/
if (btrfs_verify_level_key(tmp,
- parent_level - 1, &first_key, gen)) {
+ parent_level - 1, &check.first_key, gen)) {
free_extent_buffer(tmp);
return -EUCLEAN;
}
@@ -1456,7 +1587,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_unlock_up_safe(p, level + 1);
/* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(tmp, &check);
if (ret) {
free_extent_buffer(tmp);
btrfs_release_path(p);
@@ -1486,8 +1617,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
- tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
- gen, parent_level - 1, &first_key);
+ tmp = read_tree_block(fs_info, blocknr, &check);
if (IS_ERR(tmp)) {
btrfs_release_path(p);
return PTR_ERR(tmp);
@@ -1918,6 +2048,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int min_write_lock_level;
int prev_cmp;
+ might_sleep();
+
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
WARN_ON(p->nodes[0] != NULL);
@@ -2341,7 +2473,7 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
return ret;
}
-/**
+/*
* Search for a valid slot for the given path.
*
* @root: The root node of the tree.
@@ -2400,7 +2532,7 @@ static void fixup_low_keys(struct btrfs_path *path,
break;
t = path->nodes[i];
ret = btrfs_tree_mod_log_insert_key(t, tslot,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
+ BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
@@ -2569,8 +2701,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
return ret;
}
copy_extent_buffer(dst, src,
- btrfs_node_key_ptr_offset(dst_nritems),
- btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(dst, dst_nritems),
+ btrfs_node_key_ptr_offset(src, 0),
push_items * sizeof(struct btrfs_key_ptr));
if (push_items < src_nritems) {
@@ -2578,8 +2710,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
* Don't call btrfs_tree_mod_log_insert_move() here, key removal
* was already fully logged by btrfs_tree_mod_log_eb_copy() above.
*/
- memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
- btrfs_node_key_ptr_offset(push_items),
+ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
+ btrfs_node_key_ptr_offset(src, push_items),
(src_nritems - push_items) *
sizeof(struct btrfs_key_ptr));
}
@@ -2639,8 +2771,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
}
ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
- memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
- btrfs_node_key_ptr_offset(0),
+ memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
+ btrfs_node_key_ptr_offset(dst, 0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
@@ -2651,8 +2783,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
return ret;
}
copy_extent_buffer(dst, src,
- btrfs_node_key_ptr_offset(0),
- btrfs_node_key_ptr_offset(src_nritems - push_items),
+ btrfs_node_key_ptr_offset(dst, 0),
+ btrfs_node_key_ptr_offset(src, src_nritems - push_items),
push_items * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(src, src_nritems - push_items);
@@ -2755,13 +2887,13 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
BUG_ON(ret < 0);
}
memmove_extent_buffer(lower,
- btrfs_node_key_ptr_offset(slot + 1),
- btrfs_node_key_ptr_offset(slot),
+ btrfs_node_key_ptr_offset(lower, slot + 1),
+ btrfs_node_key_ptr_offset(lower, slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) {
ret = btrfs_tree_mod_log_insert_key(lower, slot,
- BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_ADD);
BUG_ON(ret < 0);
}
btrfs_set_node_key(lower, key, slot);
@@ -2838,8 +2970,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
return ret;
}
copy_extent_buffer(split, c,
- btrfs_node_key_ptr_offset(0),
- btrfs_node_key_ptr_offset(mid),
+ btrfs_node_key_ptr_offset(split, 0),
+ btrfs_node_key_ptr_offset(c, mid),
(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
@@ -2979,25 +3111,17 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
/* make room in the right data area */
data_end = leaf_data_end(right);
- memmove_extent_buffer(right,
- BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
- BTRFS_LEAF_DATA_OFFSET + data_end,
- BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
+ memmove_leaf_data(right, data_end - push_space, data_end,
+ BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
/* copy from the left data area */
- copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
- BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
- BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
- push_space);
+ copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
+ leaf_data_end(left), push_space);
- memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
- btrfs_item_nr_offset(0),
- right_nritems * sizeof(struct btrfs_item));
+ memmove_leaf_items(right, push_items, 0, right_nritems);
/* copy the items from left to right */
- copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
- btrfs_item_nr_offset(left_nritems - push_items),
- push_items * sizeof(struct btrfs_item));
+ copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
/* update the item pointers */
btrfs_init_map_token(&token, right);
@@ -3189,19 +3313,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
WARN_ON(!empty && push_items == btrfs_header_nritems(right));
/* push data from right to left */
- copy_extent_buffer(left, right,
- btrfs_item_nr_offset(btrfs_header_nritems(left)),
- btrfs_item_nr_offset(0),
- push_items * sizeof(struct btrfs_item));
+ copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
btrfs_item_offset(right, push_items - 1);
- copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
- leaf_data_end(left) - push_space,
- BTRFS_LEAF_DATA_OFFSET +
- btrfs_item_offset(right, push_items - 1),
- push_space);
+ copy_leaf_data(left, right, leaf_data_end(left) - push_space,
+ btrfs_item_offset(right, push_items - 1), push_space);
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
@@ -3224,15 +3342,12 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
if (push_items < right_nritems) {
push_space = btrfs_item_offset(right, push_items - 1) -
leaf_data_end(right);
- memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
- BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
- BTRFS_LEAF_DATA_OFFSET +
- leaf_data_end(right), push_space);
+ memmove_leaf_data(right,
+ BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
+ leaf_data_end(right), push_space);
- memmove_extent_buffer(right, btrfs_item_nr_offset(0),
- btrfs_item_nr_offset(push_items),
- (btrfs_header_nritems(right) - push_items) *
- sizeof(struct btrfs_item));
+ memmove_leaf_items(right, 0, push_items,
+ btrfs_header_nritems(right) - push_items);
}
btrfs_init_map_token(&token, right);
@@ -3364,14 +3479,10 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(right, nritems);
data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
- copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
- btrfs_item_nr_offset(mid),
- nritems * sizeof(struct btrfs_item));
+ copy_leaf_items(right, l, 0, mid, nritems);
- copy_extent_buffer(right, l,
- BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
- data_copy_size, BTRFS_LEAF_DATA_OFFSET +
- leaf_data_end(l), data_copy_size);
+ copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
+ leaf_data_end(l), data_copy_size);
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
@@ -3741,9 +3852,7 @@ static noinline int split_item(struct btrfs_path *path,
nritems = btrfs_header_nritems(leaf);
if (slot != nritems) {
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
- btrfs_item_nr_offset(slot),
- (nritems - slot) * sizeof(struct btrfs_item));
+ memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -3854,9 +3963,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
/* shift the data */
if (from_end) {
- memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data_start + new_size - data_end);
+ memmove_leaf_data(leaf, data_end + size_diff, data_end,
+ old_data_start + new_size - data_end);
} else {
struct btrfs_disk_key disk_key;
u64 offset;
@@ -3881,9 +3989,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
}
}
- memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data_start - data_end);
+ memmove_leaf_data(leaf, data_end + size_diff, data_end,
+ old_data_start - data_end);
offset = btrfs_disk_key_offset(&disk_key);
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
@@ -3948,9 +4055,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
/* shift the data */
- memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data - data_end);
+ memmove_leaf_data(leaf, data_end - data_size, data_end,
+ old_data - data_end);
data_end = old_data;
old_size = btrfs_item_size(leaf, slot);
@@ -3963,14 +4069,15 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
}
-/**
- * setup_items_for_insert - Helper called before inserting one or more items
- * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
- * in a function that doesn't call btrfs_search_slot
+/*
+ * Make space in the node before inserting one or more items.
*
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
* @batch: information about the batch of items to insert
+ *
+ * Main purpose is to save stack depth by doing the bulk of the work in a
+ * function that doesn't call btrfs_search_slot
*/
static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_item_batch *batch)
@@ -4033,15 +4140,11 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
ioff - batch->total_data_size);
}
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
- btrfs_item_nr_offset(slot),
- (nritems - slot) * sizeof(struct btrfs_item));
+ memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
/* shift the data */
- memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - batch->total_data_size,
- BTRFS_LEAF_DATA_OFFSET + data_end,
- old_data - data_end);
+ memmove_leaf_data(leaf, data_end - batch->total_data_size,
+ data_end, old_data - data_end);
data_end = old_data;
}
@@ -4195,13 +4298,13 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
BUG_ON(ret < 0);
}
memmove_extent_buffer(parent,
- btrfs_node_key_ptr_offset(slot),
- btrfs_node_key_ptr_offset(slot + 1),
+ btrfs_node_key_ptr_offset(parent, slot),
+ btrfs_node_key_ptr_offset(parent, slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
} else if (level) {
ret = btrfs_tree_mod_log_insert_key(parent, slot,
- BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REMOVE);
BUG_ON(ret < 0);
}
@@ -4276,10 +4379,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
for (i = 0; i < nr; i++)
dsize += btrfs_item_size(leaf, slot + i);
- memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end + dsize,
- BTRFS_LEAF_DATA_OFFSET + data_end,
- last_off - data_end);
+ memmove_leaf_data(leaf, data_end + dsize, data_end,
+ last_off - data_end);
btrfs_init_map_token(&token, leaf);
for (i = slot + nr; i < nritems; i++) {
@@ -4289,10 +4390,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_token_item_offset(&token, i, ioff + dsize);
}
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
- btrfs_item_nr_offset(slot + nr),
- sizeof(struct btrfs_item) *
- (nritems - slot - nr));
+ memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
}
btrfs_set_header_nritems(leaf, nritems - nr);
nritems -= nr;
@@ -4647,7 +4745,12 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int ret;
int i;
- ASSERT(!path->nowait);
+ /*
+ * The nowait semantics are used only for write paths, where we don't
+ * use the tree mod log and sequence numbers.
+ */
+ if (time_seq)
+ ASSERT(!path->nowait);
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
@@ -4667,7 +4770,14 @@ again:
if (path->need_commit_sem) {
path->need_commit_sem = 0;
need_commit_sem = true;
- down_read(&fs_info->commit_root_sem);
+ if (path->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem)) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
}
@@ -4743,7 +4853,7 @@ again:
next = c;
ret = read_block_for_search(root, path, &next, level,
slot, &key);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN && !path->nowait)
goto again;
if (ret < 0) {
@@ -4753,6 +4863,10 @@ again:
if (!path->skip_locking) {
ret = btrfs_try_tree_read_lock(next);
+ if (!ret && path->nowait) {
+ ret = -EAGAIN;
+ goto done;
+ }
if (!ret && time_seq) {
/*
* If we don't get the lock, we may be racing
@@ -4783,7 +4897,7 @@ again:
ret = read_block_for_search(root, path, &next, level,
0, &key);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN && !path->nowait)
goto again;
if (ret < 0) {
@@ -4791,8 +4905,16 @@ again:
goto done;
}
- if (!path->skip_locking)
- btrfs_tree_read_lock(next);
+ if (!path->skip_locking) {
+ if (path->nowait) {
+ if (!btrfs_try_tree_read_lock(next)) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ btrfs_tree_read_lock(next);
+ }
+ }
}
ret = 0;
done:
@@ -4810,6 +4932,14 @@ done:
return ret;
}
+int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
+{
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
+ return btrfs_next_old_leaf(root, path, time_seq);
+ return 0;
+}
+
/*
* this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
* searching until it gets past min_objectid or finds an item of 'type'
@@ -4893,3 +5023,18 @@ int btrfs_previous_extent_item(struct btrfs_root *root,
}
return 1;
}
+
+int __init btrfs_ctree_init(void)
+{
+ btrfs_path_cachep = kmem_cache_create("btrfs_path",
+ sizeof(struct btrfs_path), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_path_cachep)
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold btrfs_ctree_exit(void)
+{
+ kmem_cache_destroy(btrfs_path_cachep);
+}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f677b49df8ae..6965703a81b6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,12 +28,15 @@
#include <linux/refcount.h>
#include <linux/crc32c.h>
#include <linux/iomap.h>
+#include <linux/fscrypt.h>
#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
#include "locking.h"
+#include "misc.h"
+#include "fs.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -41,10 +44,6 @@ struct btrfs_pending_snapshot;
struct btrfs_delayed_ref_root;
struct btrfs_space_info;
struct btrfs_block_group;
-extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_path_cachep;
-extern struct kmem_cache *btrfs_free_space_cachep;
-extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
struct btrfs_bio;
@@ -55,340 +54,6 @@ struct btrfs_balance_control;
struct btrfs_delayed_root;
struct reloc_control;
-#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
-
-/*
- * Maximum number of mirrors that can be available for all profiles counting
- * the target device of dev-replace as one. During an active device replace
- * procedure, the target device of the copy operation is a mirror for the
- * filesystem data as well that can be used to read data in order to repair
- * read errors on other disks.
- *
- * Current value is derived from RAID1C4 with 4 copies.
- */
-#define BTRFS_MAX_MIRRORS (4 + 1)
-
-#define BTRFS_MAX_LEVEL 8
-
-#define BTRFS_OLDEST_GENERATION 0ULL
-
-/*
- * we can actually store much bigger names, but lets not confuse the rest
- * of linux
- */
-#define BTRFS_NAME_LEN 255
-
-/*
- * Theoretical limit is larger, but we keep this down to a sane
- * value. That should limit greatly the possibility of collisions on
- * inode ref items.
- */
-#define BTRFS_LINK_MAX 65535U
-
-#define BTRFS_EMPTY_DIR_SIZE 0
-
-/* ioprio of readahead is set to idle */
-#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
-
-#define BTRFS_DIRTY_METADATA_THRESH SZ_32M
-
-/*
- * Use large batch size to reduce overhead of metadata updates. On the reader
- * side, we only read it when we are close to ENOSPC and the read overhead is
- * mostly related to the number of CPUs, so it is OK to use arbitrary large
- * value here.
- */
-#define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M
-
-#define BTRFS_MAX_EXTENT_SIZE SZ_128M
-
-/*
- * Deltas are an effective way to populate global statistics. Give macro names
- * to make it clear what we're doing. An example is discard_extents in
- * btrfs_free_space_ctl.
- */
-#define BTRFS_STAT_NR_ENTRIES 2
-#define BTRFS_STAT_CURR 0
-#define BTRFS_STAT_PREV 1
-
-static inline unsigned long btrfs_chunk_item_size(int num_stripes)
-{
- BUG_ON(num_stripes == 0);
- return sizeof(struct btrfs_chunk) +
- sizeof(struct btrfs_stripe) * (num_stripes - 1);
-}
-
-/*
- * Runtime (in-memory) states of filesystem
- */
-enum {
- /* Global indicator of serious filesystem errors */
- BTRFS_FS_STATE_ERROR,
- /*
- * Filesystem is being remounted, allow to skip some operations, like
- * defrag
- */
- BTRFS_FS_STATE_REMOUNTING,
- /* Filesystem in RO mode */
- BTRFS_FS_STATE_RO,
- /* Track if a transaction abort has been reported on this filesystem */
- BTRFS_FS_STATE_TRANS_ABORTED,
- /*
- * Bio operations should be blocked on this filesystem because a source
- * or target device is being destroyed as part of a device replace
- */
- BTRFS_FS_STATE_DEV_REPLACING,
- /* The btrfs_fs_info created for self-tests */
- BTRFS_FS_STATE_DUMMY_FS_INFO,
-
- BTRFS_FS_STATE_NO_CSUMS,
-
- /* Indicates there was an error cleaning up a log tree. */
- BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
-
- BTRFS_FS_STATE_COUNT
-};
-
-#define BTRFS_BACKREF_REV_MAX 256
-#define BTRFS_BACKREF_REV_SHIFT 56
-#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
- BTRFS_BACKREF_REV_SHIFT)
-
-#define BTRFS_OLD_BACKREF_REV 0
-#define BTRFS_MIXED_BACKREF_REV 1
-
-/*
- * every tree block (leaf or node) starts with this header.
- */
-struct btrfs_header {
- /* these first four must match the super block */
- u8 csum[BTRFS_CSUM_SIZE];
- u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
- __le64 bytenr; /* which block this node is supposed to live in */
- __le64 flags;
-
- /* allowed to be different from the super from here on down */
- u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
- __le64 generation;
- __le64 owner;
- __le32 nritems;
- u8 level;
-} __attribute__ ((__packed__));
-
-/*
- * this is a very generous portion of the super block, giving us
- * room to translate 14 chunks with 3 stripes each.
- */
-#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
-
-/*
- * just in case we somehow lose the roots and are not able to mount,
- * we store an array of the roots from previous transactions
- * in the super.
- */
-#define BTRFS_NUM_BACKUP_ROOTS 4
-struct btrfs_root_backup {
- __le64 tree_root;
- __le64 tree_root_gen;
-
- __le64 chunk_root;
- __le64 chunk_root_gen;
-
- __le64 extent_root;
- __le64 extent_root_gen;
-
- __le64 fs_root;
- __le64 fs_root_gen;
-
- __le64 dev_root;
- __le64 dev_root_gen;
-
- __le64 csum_root;
- __le64 csum_root_gen;
-
- __le64 total_bytes;
- __le64 bytes_used;
- __le64 num_devices;
- /* future */
- __le64 unused_64[4];
-
- u8 tree_root_level;
- u8 chunk_root_level;
- u8 extent_root_level;
- u8 fs_root_level;
- u8 dev_root_level;
- u8 csum_root_level;
- /* future and to align */
- u8 unused_8[10];
-} __attribute__ ((__packed__));
-
-#define BTRFS_SUPER_INFO_OFFSET SZ_64K
-#define BTRFS_SUPER_INFO_SIZE 4096
-
-/*
- * The reserved space at the beginning of each device.
- * It covers the primary super block and leaves space for potential use by other
- * tools like bootloaders or to lower potential damage of accidental overwrite.
- */
-#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
-
-/*
- * the super block basically lists the main trees of the FS
- * it currently lacks any block count etc etc
- */
-struct btrfs_super_block {
- /* the first 4 fields must match struct btrfs_header */
- u8 csum[BTRFS_CSUM_SIZE];
- /* FS specific UUID, visible to user */
- u8 fsid[BTRFS_FSID_SIZE];
- __le64 bytenr; /* this block number */
- __le64 flags;
-
- /* allowed to be different from the btrfs_header from here own down */
- __le64 magic;
- __le64 generation;
- __le64 root;
- __le64 chunk_root;
- __le64 log_root;
-
- /*
- * This member has never been utilized since the very beginning, thus
- * it's always 0 regardless of kernel version. We always use
- * generation + 1 to read log tree root. So here we mark it deprecated.
- */
- __le64 __unused_log_root_transid;
- __le64 total_bytes;
- __le64 bytes_used;
- __le64 root_dir_objectid;
- __le64 num_devices;
- __le32 sectorsize;
- __le32 nodesize;
- __le32 __unused_leafsize;
- __le32 stripesize;
- __le32 sys_chunk_array_size;
- __le64 chunk_root_generation;
- __le64 compat_flags;
- __le64 compat_ro_flags;
- __le64 incompat_flags;
- __le16 csum_type;
- u8 root_level;
- u8 chunk_root_level;
- u8 log_root_level;
- struct btrfs_dev_item dev_item;
-
- char label[BTRFS_LABEL_SIZE];
-
- __le64 cache_generation;
- __le64 uuid_tree_generation;
-
- /* the UUID written into btree blocks */
- u8 metadata_uuid[BTRFS_FSID_SIZE];
-
- /* future expansion */
- u8 reserved8[8];
- __le64 reserved[27];
- u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
- struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
-
- /* Padded to 4096 bytes */
- u8 padding[565];
-} __attribute__ ((__packed__));
-static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
-
-/*
- * Compat flags that we support. If any incompat flags are set other than the
- * ones specified below then we will fail to mount
- */
-#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
-#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
-#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
-
-#define BTRFS_FEATURE_COMPAT_RO_SUPP \
- (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
- BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
- BTRFS_FEATURE_COMPAT_RO_VERITY | \
- BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)
-
-#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
-#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
-
-#ifdef CONFIG_BTRFS_DEBUG
-/*
- * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
- */
-#define BTRFS_FEATURE_INCOMPAT_SUPP \
- (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
- BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
- BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
- BTRFS_FEATURE_INCOMPAT_RAID56 | \
- BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
- BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
- BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
- BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
- BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED | \
- BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
-#else
-#define BTRFS_FEATURE_INCOMPAT_SUPP \
- (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
- BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
- BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
- BTRFS_FEATURE_INCOMPAT_RAID56 | \
- BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
- BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
- BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
- BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
- BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED)
-#endif
-
-#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
- (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
-#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
-
-/*
- * A leaf is full of items. offset and size tell us where to find
- * the item in the leaf (relative to the start of the data area)
- */
-struct btrfs_item {
- struct btrfs_disk_key key;
- __le32 offset;
- __le32 size;
-} __attribute__ ((__packed__));
-
-/*
- * leaves have an item area and a data area:
- * [item0, item1....itemN] [free space] [dataN...data1, data0]
- *
- * The data is separate from the items to get the keys closer together
- * during searches.
- */
-struct btrfs_leaf {
- struct btrfs_header header;
- struct btrfs_item items[];
-} __attribute__ ((__packed__));
-
-/*
- * all non-leaf blocks are nodes, they hold only keys and pointers to
- * other blocks
- */
-struct btrfs_key_ptr {
- struct btrfs_disk_key key;
- __le64 blockptr;
- __le64 generation;
-} __attribute__ ((__packed__));
-
-struct btrfs_node {
- struct btrfs_header header;
- struct btrfs_key_ptr ptrs[];
-} __attribute__ ((__packed__));
-
/* Read ahead values for struct btrfs_path.reada */
enum {
READA_NONE,
@@ -447,712 +112,6 @@ struct btrfs_path {
unsigned int nowait:1;
};
-struct btrfs_dev_replace {
- u64 replace_state; /* see #define above */
- time64_t time_started; /* seconds since 1-Jan-1970 */
- time64_t time_stopped; /* seconds since 1-Jan-1970 */
- atomic64_t num_write_errors;
- atomic64_t num_uncorrectable_read_errors;
-
- u64 cursor_left;
- u64 committed_cursor_left;
- u64 cursor_left_last_write_of_item;
- u64 cursor_right;
-
- u64 cont_reading_from_srcdev_mode; /* see #define above */
-
- int is_valid;
- int item_needs_writeback;
- struct btrfs_device *srcdev;
- struct btrfs_device *tgtdev;
-
- struct mutex lock_finishing_cancel_unmount;
- struct rw_semaphore rwsem;
-
- struct btrfs_scrub_progress scrub_progress;
-
- struct percpu_counter bio_counter;
- wait_queue_head_t replace_wait;
-};
-
-/*
- * free clusters are used to claim free space in relatively large chunks,
- * allowing us to do less seeky writes. They are used for all metadata
- * allocations. In ssd_spread mode they are also used for data allocations.
- */
-struct btrfs_free_cluster {
- spinlock_t lock;
- spinlock_t refill_lock;
- struct rb_root root;
-
- /* largest extent in this cluster */
- u64 max_size;
-
- /* first extent starting offset */
- u64 window_start;
-
- /* We did a full search and couldn't create a cluster */
- bool fragmented;
-
- struct btrfs_block_group *block_group;
- /*
- * when a cluster is allocated from a block group, we put the
- * cluster onto a list in the block group so that it can
- * be freed before the block group is freed.
- */
- struct list_head block_group_list;
-};
-
-/* Discard control. */
-/*
- * Async discard uses multiple lists to differentiate the discard filter
- * parameters. Index 0 is for completely free block groups where we need to
- * ensure the entire block group is trimmed without being lossy. Indices
- * afterwards represent monotonically decreasing discard filter sizes to
- * prioritize what should be discarded next.
- */
-#define BTRFS_NR_DISCARD_LISTS 3
-#define BTRFS_DISCARD_INDEX_UNUSED 0
-#define BTRFS_DISCARD_INDEX_START 1
-
-struct btrfs_discard_ctl {
- struct workqueue_struct *discard_workers;
- struct delayed_work work;
- spinlock_t lock;
- struct btrfs_block_group *block_group;
- struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
- u64 prev_discard;
- u64 prev_discard_time;
- atomic_t discardable_extents;
- atomic64_t discardable_bytes;
- u64 max_discard_size;
- u64 delay_ms;
- u32 iops_limit;
- u32 kbps_limit;
- u64 discard_extent_bytes;
- u64 discard_bitmap_bytes;
- atomic64_t discard_bytes_saved;
-};
-
-enum {
- BTRFS_FS_CLOSING_START,
- BTRFS_FS_CLOSING_DONE,
- BTRFS_FS_LOG_RECOVERING,
- BTRFS_FS_OPEN,
- BTRFS_FS_QUOTA_ENABLED,
- BTRFS_FS_UPDATE_UUID_TREE_GEN,
- BTRFS_FS_CREATING_FREE_SPACE_TREE,
- BTRFS_FS_BTREE_ERR,
- BTRFS_FS_LOG1_ERR,
- BTRFS_FS_LOG2_ERR,
- BTRFS_FS_QUOTA_OVERRIDE,
- /* Used to record internally whether fs has been frozen */
- BTRFS_FS_FROZEN,
- /*
- * Indicate that balance has been set up from the ioctl and is in the
- * main phase. The fs_info::balance_ctl is initialized.
- */
- BTRFS_FS_BALANCE_RUNNING,
-
- /*
- * Indicate that relocation of a chunk has started, it's set per chunk
- * and is toggled between chunks.
- */
- BTRFS_FS_RELOC_RUNNING,
-
- /* Indicate that the cleaner thread is awake and doing something. */
- BTRFS_FS_CLEANER_RUNNING,
-
- /*
- * The checksumming has an optimized version and is considered fast,
- * so we don't need to offload checksums to workqueues.
- */
- BTRFS_FS_CSUM_IMPL_FAST,
-
- /* Indicate that the discard workqueue can service discards. */
- BTRFS_FS_DISCARD_RUNNING,
-
- /* Indicate that we need to cleanup space cache v1 */
- BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
-
- /* Indicate that we can't trust the free space tree for caching yet */
- BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
-
- /* Indicate whether there are any tree modification log users */
- BTRFS_FS_TREE_MOD_LOG_USERS,
-
- /* Indicate that we want the transaction kthread to commit right now. */
- BTRFS_FS_COMMIT_TRANS,
-
- /* Indicate we have half completed snapshot deletions pending. */
- BTRFS_FS_UNFINISHED_DROPS,
-
- /* Indicate we have to finish a zone to do next allocation. */
- BTRFS_FS_NEED_ZONE_FINISH,
-
-#if BITS_PER_LONG == 32
- /* Indicate if we have error/warn message printed on 32bit systems */
- BTRFS_FS_32BIT_ERROR,
- BTRFS_FS_32BIT_WARN,
-#endif
-};
-
-/*
- * Exclusive operations (device replace, resize, device add/remove, balance)
- */
-enum btrfs_exclusive_operation {
- BTRFS_EXCLOP_NONE,
- BTRFS_EXCLOP_BALANCE_PAUSED,
- BTRFS_EXCLOP_BALANCE,
- BTRFS_EXCLOP_DEV_ADD,
- BTRFS_EXCLOP_DEV_REMOVE,
- BTRFS_EXCLOP_DEV_REPLACE,
- BTRFS_EXCLOP_RESIZE,
- BTRFS_EXCLOP_SWAP_ACTIVATE,
-};
-
-/* Store data about transaction commits, exported via sysfs. */
-struct btrfs_commit_stats {
- /* Total number of commits */
- u64 commit_count;
- /* The maximum commit duration so far in ns */
- u64 max_commit_dur;
- /* The last commit duration in ns */
- u64 last_commit_dur;
- /* The total commit duration in ns */
- u64 total_commit_dur;
-};
-
-struct btrfs_fs_info {
- u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
- unsigned long flags;
- struct btrfs_root *tree_root;
- struct btrfs_root *chunk_root;
- struct btrfs_root *dev_root;
- struct btrfs_root *fs_root;
- struct btrfs_root *quota_root;
- struct btrfs_root *uuid_root;
- struct btrfs_root *data_reloc_root;
- struct btrfs_root *block_group_root;
-
- /* the log root tree is a directory of all the other log roots */
- struct btrfs_root *log_root_tree;
-
- /* The tree that holds the global roots (csum, extent, etc) */
- rwlock_t global_root_lock;
- struct rb_root global_root_tree;
-
- spinlock_t fs_roots_radix_lock;
- struct radix_tree_root fs_roots_radix;
-
- /* block group cache stuff */
- rwlock_t block_group_cache_lock;
- struct rb_root_cached block_group_cache_tree;
-
- /* keep track of unallocated space */
- atomic64_t free_chunk_space;
-
- /* Track ranges which are used by log trees blocks/logged data extents */
- struct extent_io_tree excluded_extents;
-
- /* logical->physical extent mapping */
- struct extent_map_tree mapping_tree;
-
- /*
- * block reservation for extent, checksum, root tree and
- * delayed dir index item
- */
- struct btrfs_block_rsv global_block_rsv;
- /* block reservation for metadata operations */
- struct btrfs_block_rsv trans_block_rsv;
- /* block reservation for chunk tree */
- struct btrfs_block_rsv chunk_block_rsv;
- /* block reservation for delayed operations */
- struct btrfs_block_rsv delayed_block_rsv;
- /* block reservation for delayed refs */
- struct btrfs_block_rsv delayed_refs_rsv;
-
- struct btrfs_block_rsv empty_block_rsv;
-
- u64 generation;
- u64 last_trans_committed;
- /*
- * Generation of the last transaction used for block group relocation
- * since the filesystem was last mounted (or 0 if none happened yet).
- * Must be written and read while holding btrfs_fs_info::commit_root_sem.
- */
- u64 last_reloc_trans;
- u64 avg_delayed_ref_runtime;
-
- /*
- * this is updated to the current trans every time a full commit
- * is required instead of the faster short fsync log commits
- */
- u64 last_trans_log_full_commit;
- unsigned long mount_opt;
- /*
- * Track requests for actions that need to be done during transaction
- * commit (like for some mount options).
- */
- unsigned long pending_changes;
- unsigned long compress_type:4;
- unsigned int compress_level;
- u32 commit_interval;
- /*
- * It is a suggestive number, the read side is safe even it gets a
- * wrong number because we will write out the data into a regular
- * extent. The write side(mount/remount) is under ->s_umount lock,
- * so it is also safe.
- */
- u64 max_inline;
-
- struct btrfs_transaction *running_transaction;
- wait_queue_head_t transaction_throttle;
- wait_queue_head_t transaction_wait;
- wait_queue_head_t transaction_blocked_wait;
- wait_queue_head_t async_submit_wait;
-
- /*
- * Used to protect the incompat_flags, compat_flags, compat_ro_flags
- * when they are updated.
- *
- * Because we do not clear the flags for ever, so we needn't use
- * the lock on the read side.
- *
- * We also needn't use the lock when we mount the fs, because
- * there is no other task which will update the flag.
- */
- spinlock_t super_lock;
- struct btrfs_super_block *super_copy;
- struct btrfs_super_block *super_for_commit;
- struct super_block *sb;
- struct inode *btree_inode;
- struct mutex tree_log_mutex;
- struct mutex transaction_kthread_mutex;
- struct mutex cleaner_mutex;
- struct mutex chunk_mutex;
-
- /*
- * this is taken to make sure we don't set block groups ro after
- * the free space cache has been allocated on them
- */
- struct mutex ro_block_group_mutex;
-
- /* this is used during read/modify/write to make sure
- * no two ios are trying to mod the same stripe at the same
- * time
- */
- struct btrfs_stripe_hash_table *stripe_hash_table;
-
- /*
- * this protects the ordered operations list only while we are
- * processing all of the entries on it. This way we make
- * sure the commit code doesn't find the list temporarily empty
- * because another function happens to be doing non-waiting preflush
- * before jumping into the main commit.
- */
- struct mutex ordered_operations_mutex;
-
- struct rw_semaphore commit_root_sem;
-
- struct rw_semaphore cleanup_work_sem;
-
- struct rw_semaphore subvol_sem;
-
- spinlock_t trans_lock;
- /*
- * the reloc mutex goes with the trans lock, it is taken
- * during commit to protect us from the relocation code
- */
- struct mutex reloc_mutex;
-
- struct list_head trans_list;
- struct list_head dead_roots;
- struct list_head caching_block_groups;
-
- spinlock_t delayed_iput_lock;
- struct list_head delayed_iputs;
- atomic_t nr_delayed_iputs;
- wait_queue_head_t delayed_iputs_wait;
-
- atomic64_t tree_mod_seq;
-
- /* this protects tree_mod_log and tree_mod_seq_list */
- rwlock_t tree_mod_log_lock;
- struct rb_root tree_mod_log;
- struct list_head tree_mod_seq_list;
-
- atomic_t async_delalloc_pages;
-
- /*
- * this is used to protect the following list -- ordered_roots.
- */
- spinlock_t ordered_root_lock;
-
- /*
- * all fs/file tree roots in which there are data=ordered extents
- * pending writeback are added into this list.
- *
- * these can span multiple transactions and basically include
- * every dirty data page that isn't from nodatacow
- */
- struct list_head ordered_roots;
-
- struct mutex delalloc_root_mutex;
- spinlock_t delalloc_root_lock;
- /* all fs/file tree roots that have delalloc inodes. */
- struct list_head delalloc_roots;
-
- /*
- * there is a pool of worker threads for checksumming during writes
- * and a pool for checksumming after reads. This is because readers
- * can run with FS locks held, and the writers may be waiting for
- * those locks. We don't want ordering in the pending list to cause
- * deadlocks, and so the two are serviced separately.
- *
- * A third pool does submit_bio to avoid deadlocking with the other
- * two
- */
- struct btrfs_workqueue *workers;
- struct btrfs_workqueue *hipri_workers;
- struct btrfs_workqueue *delalloc_workers;
- struct btrfs_workqueue *flush_workers;
- struct workqueue_struct *endio_workers;
- struct workqueue_struct *endio_meta_workers;
- struct workqueue_struct *endio_raid56_workers;
- struct workqueue_struct *rmw_workers;
- struct workqueue_struct *compressed_write_workers;
- struct btrfs_workqueue *endio_write_workers;
- struct btrfs_workqueue *endio_freespace_worker;
- struct btrfs_workqueue *caching_workers;
-
- /*
- * fixup workers take dirty pages that didn't properly go through
- * the cow mechanism and make them safe to write. It happens
- * for the sys_munmap function call path
- */
- struct btrfs_workqueue *fixup_workers;
- struct btrfs_workqueue *delayed_workers;
-
- struct task_struct *transaction_kthread;
- struct task_struct *cleaner_kthread;
- u32 thread_pool_size;
-
- struct kobject *space_info_kobj;
- struct kobject *qgroups_kobj;
- struct kobject *discard_kobj;
-
- /* used to keep from writing metadata until there is a nice batch */
- struct percpu_counter dirty_metadata_bytes;
- struct percpu_counter delalloc_bytes;
- struct percpu_counter ordered_bytes;
- s32 dirty_metadata_batch;
- s32 delalloc_batch;
-
- struct list_head dirty_cowonly_roots;
-
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * The space_info list is effectively read only after initial
- * setup. It is populated at mount time and cleaned up after
- * all block groups are removed. RCU is used to protect it.
- */
- struct list_head space_info;
-
- struct btrfs_space_info *data_sinfo;
-
- struct reloc_control *reloc_ctl;
-
- /* data_alloc_cluster is only used in ssd_spread mode */
- struct btrfs_free_cluster data_alloc_cluster;
-
- /* all metadata allocations go through this cluster */
- struct btrfs_free_cluster meta_alloc_cluster;
-
- /* auto defrag inodes go here */
- spinlock_t defrag_inodes_lock;
- struct rb_root defrag_inodes;
- atomic_t defrag_running;
-
- /* Used to protect avail_{data, metadata, system}_alloc_bits */
- seqlock_t profiles_lock;
- /*
- * these three are in extended format (availability of single
- * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
- * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
- */
- u64 avail_data_alloc_bits;
- u64 avail_metadata_alloc_bits;
- u64 avail_system_alloc_bits;
-
- /* restriper state */
- spinlock_t balance_lock;
- struct mutex balance_mutex;
- atomic_t balance_pause_req;
- atomic_t balance_cancel_req;
- struct btrfs_balance_control *balance_ctl;
- wait_queue_head_t balance_wait_q;
-
- /* Cancellation requests for chunk relocation */
- atomic_t reloc_cancel_req;
-
- u32 data_chunk_allocations;
- u32 metadata_ratio;
-
- void *bdev_holder;
-
- /* private scrub information */
- struct mutex scrub_lock;
- atomic_t scrubs_running;
- atomic_t scrub_pause_req;
- atomic_t scrubs_paused;
- atomic_t scrub_cancel_req;
- wait_queue_head_t scrub_pause_wait;
- /*
- * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
- * running.
- */
- refcount_t scrub_workers_refcnt;
- struct workqueue_struct *scrub_workers;
- struct workqueue_struct *scrub_wr_completion_workers;
- struct workqueue_struct *scrub_parity_workers;
- struct btrfs_subpage_info *subpage_info;
-
- struct btrfs_discard_ctl discard_ctl;
-
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- u32 check_integrity_print_mask;
-#endif
- /* is qgroup tracking in a consistent state? */
- u64 qgroup_flags;
-
- /* holds configuration and tracking. Protected by qgroup_lock */
- struct rb_root qgroup_tree;
- spinlock_t qgroup_lock;
-
- /*
- * used to avoid frequently calling ulist_alloc()/ulist_free()
- * when doing qgroup accounting, it must be protected by qgroup_lock.
- */
- struct ulist *qgroup_ulist;
-
- /*
- * Protect user change for quota operations. If a transaction is needed,
- * it must be started before locking this lock.
- */
- struct mutex qgroup_ioctl_lock;
-
- /* list of dirty qgroups to be written at next commit */
- struct list_head dirty_qgroups;
-
- /* used by qgroup for an efficient tree traversal */
- u64 qgroup_seq;
-
- /* qgroup rescan items */
- struct mutex qgroup_rescan_lock; /* protects the progress item */
- struct btrfs_key qgroup_rescan_progress;
- struct btrfs_workqueue *qgroup_rescan_workers;
- struct completion qgroup_rescan_completion;
- struct btrfs_work qgroup_rescan_work;
- bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
- u8 qgroup_drop_subtree_thres;
-
- /* filesystem state */
- unsigned long fs_state;
-
- struct btrfs_delayed_root *delayed_root;
-
- /* Extent buffer radix tree */
- spinlock_t buffer_lock;
- /* Entries are eb->start / sectorsize */
- struct radix_tree_root buffer_radix;
-
- /* next backup root to be overwritten */
- int backup_root_index;
-
- /* device replace state */
- struct btrfs_dev_replace dev_replace;
-
- struct semaphore uuid_tree_rescan_sem;
-
- /* Used to reclaim the metadata space in the background. */
- struct work_struct async_reclaim_work;
- struct work_struct async_data_reclaim_work;
- struct work_struct preempt_reclaim_work;
-
- /* Reclaim partially filled block groups in the background */
- struct work_struct reclaim_bgs_work;
- struct list_head reclaim_bgs;
- int bg_reclaim_threshold;
-
- spinlock_t unused_bgs_lock;
- struct list_head unused_bgs;
- struct mutex unused_bg_unpin_mutex;
- /* Protect block groups that are going to be deleted */
- struct mutex reclaim_bgs_lock;
-
- /* Cached block sizes */
- u32 nodesize;
- u32 sectorsize;
- /* ilog2 of sectorsize, use to avoid 64bit division */
- u32 sectorsize_bits;
- u32 csum_size;
- u32 csums_per_leaf;
- u32 stripesize;
-
- /*
- * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
- * filesystem, on zoned it depends on the device constraints.
- */
- u64 max_extent_size;
-
- /* Block groups and devices containing active swapfiles. */
- spinlock_t swapfile_pins_lock;
- struct rb_root swapfile_pins;
-
- struct crypto_shash *csum_shash;
-
- /* Type of exclusive operation running, protected by super_lock */
- enum btrfs_exclusive_operation exclusive_operation;
-
- /*
- * Zone size > 0 when in ZONED mode, otherwise it's used for a check
- * if the mode is enabled
- */
- u64 zone_size;
-
- /* Max size to emit ZONE_APPEND write command */
- u64 max_zone_append_size;
- struct mutex zoned_meta_io_lock;
- spinlock_t treelog_bg_lock;
- u64 treelog_bg;
-
- /*
- * Start of the dedicated data relocation block group, protected by
- * relocation_bg_lock.
- */
- spinlock_t relocation_bg_lock;
- u64 data_reloc_bg;
- struct mutex zoned_data_reloc_io_lock;
-
- u64 nr_global_roots;
-
- spinlock_t zone_active_bgs_lock;
- struct list_head zone_active_bgs;
-
- /* Updates are not protected by any lock */
- struct btrfs_commit_stats commit_stats;
-
- /*
- * Last generation where we dropped a non-relocation root.
- * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
- * to change it and to read it, respectively.
- */
- u64 last_root_drop_gen;
-
- /*
- * Annotations for transaction events (structures are empty when
- * compiled without lockdep).
- */
- struct lockdep_map btrfs_trans_num_writers_map;
- struct lockdep_map btrfs_trans_num_extwriters_map;
- struct lockdep_map btrfs_state_change_map[4];
- struct lockdep_map btrfs_trans_pending_ordered_map;
- struct lockdep_map btrfs_ordered_extent_map;
-
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- spinlock_t ref_verify_lock;
- struct rb_root block_tree;
-#endif
-
-#ifdef CONFIG_BTRFS_DEBUG
- struct kobject *debug_kobj;
- struct list_head allocated_roots;
-
- spinlock_t eb_leak_lock;
- struct list_head allocated_ebs;
-#endif
-};
-
-static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
- u64 gen)
-{
- WRITE_ONCE(fs_info->last_root_drop_gen, gen);
-}
-
-static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
-{
- return READ_ONCE(fs_info->last_root_drop_gen);
-}
-
-static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-/*
- * Take the number of bytes to be checksummed and figure out how many leaves
- * it would require to store the csums for that many bytes.
- */
-static inline u64 btrfs_csum_bytes_to_leaves(
- const struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
- const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
-
- return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
-}
-
-/*
- * Use this if we would be adding new items, as we could split nodes as we cow
- * down the tree.
- */
-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
-}
-
-/*
- * Doing a truncate or a modification won't result in new nodes or leaves, just
- * what we need for COW.
- */
-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
-}
-
-#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
- sizeof(struct btrfs_item))
-
-static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
-{
- return fs_info->zone_size > 0;
-}
-
-/*
- * Count how many fs_info->max_extent_size cover the @size
- */
-static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
-{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (!fs_info)
- return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
-#endif
-
- return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
-}
-
-bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type);
-bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type);
-void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation op);
-
/*
* The state of btrfs root
*/
@@ -1215,87 +174,6 @@ enum {
BTRFS_ROOT_RESET_LOCKDEP_CLASS,
};
-enum btrfs_lockdep_trans_states {
- BTRFS_LOCKDEP_TRANS_COMMIT_START,
- BTRFS_LOCKDEP_TRANS_UNBLOCKED,
- BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
- BTRFS_LOCKDEP_TRANS_COMPLETED,
-};
-
-/*
- * Lockdep annotation for wait events.
- *
- * @owner: The struct where the lockdep map is defined
- * @lock: The lockdep map corresponding to a wait event
- *
- * This macro is used to annotate a wait event. In this case a thread acquires
- * the lockdep map as writer (exclusive lock) because it has to block until all
- * the threads that hold the lock as readers signal the condition for the wait
- * event and release their locks.
- */
-#define btrfs_might_wait_for_event(owner, lock) \
- do { \
- rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \
- rwsem_release(&owner->lock##_map, _THIS_IP_); \
- } while (0)
-
-/*
- * Protection for the resource/condition of a wait event.
- *
- * @owner: The struct where the lockdep map is defined
- * @lock: The lockdep map corresponding to a wait event
- *
- * Many threads can modify the condition for the wait event at the same time
- * and signal the threads that block on the wait event. The threads that modify
- * the condition and do the signaling acquire the lock as readers (shared
- * lock).
- */
-#define btrfs_lockdep_acquire(owner, lock) \
- rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_)
-
-/*
- * Used after signaling the condition for a wait event to release the lockdep
- * map held by a reader thread.
- */
-#define btrfs_lockdep_release(owner, lock) \
- rwsem_release(&owner->lock##_map, _THIS_IP_)
-
-/*
- * Macros for the transaction states wait events, similar to the generic wait
- * event macros.
- */
-#define btrfs_might_wait_for_state(owner, i) \
- do { \
- rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
- rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \
- } while (0)
-
-#define btrfs_trans_state_lockdep_acquire(owner, i) \
- rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_)
-
-#define btrfs_trans_state_lockdep_release(owner, i) \
- rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_)
-
-/* Initialization of the lockdep map */
-#define btrfs_lockdep_init_map(owner, lock) \
- do { \
- static struct lock_class_key lock##_key; \
- lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \
- } while (0)
-
-/* Initialization of the transaction states lockdep maps. */
-#define btrfs_state_lockdep_init_map(owner, lock, state) \
- do { \
- static struct lock_class_key lock##_key; \
- lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \
- &lock##_key, 0); \
- } while (0)
-
-static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
-{
- clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
-}
-
/*
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
* code. For detail check comment in fs/btrfs/qgroup.c.
@@ -1457,6 +335,23 @@ struct btrfs_root {
#endif
};
+static inline bool btrfs_root_readonly(const struct btrfs_root *root)
+{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
+}
+
+static inline bool btrfs_root_dead(const struct btrfs_root *root)
+{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
+static inline u64 btrfs_root_id(const struct btrfs_root *root)
+{
+ return root->root_key.objectid;
+}
+
/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
@@ -1548,17 +443,14 @@ struct btrfs_drop_extents_args {
struct btrfs_file_private {
void *filldir_buf;
+ struct extent_state *llseek_cached_state;
};
-
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
-
return info->nodesize - sizeof(struct btrfs_header);
}
-#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items)
-
static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info)
{
return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item);
@@ -1569,1254 +461,14 @@ static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info)
return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr);
}
-#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
- (offsetof(struct btrfs_file_extent_item, disk_bytenr))
-static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info)
-{
- return BTRFS_MAX_ITEM_SIZE(info) -
- BTRFS_FILE_EXTENT_INLINE_DATA_START;
-}
-
static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
{
return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);
}
-/*
- * Flags for mount options.
- *
- * Note: don't forget to add new options to btrfs_show_options()
- */
-enum {
- BTRFS_MOUNT_NODATASUM = (1UL << 0),
- BTRFS_MOUNT_NODATACOW = (1UL << 1),
- BTRFS_MOUNT_NOBARRIER = (1UL << 2),
- BTRFS_MOUNT_SSD = (1UL << 3),
- BTRFS_MOUNT_DEGRADED = (1UL << 4),
- BTRFS_MOUNT_COMPRESS = (1UL << 5),
- BTRFS_MOUNT_NOTREELOG = (1UL << 6),
- BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
- BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
- BTRFS_MOUNT_NOSSD = (1UL << 9),
- BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
- BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
- BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
- BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
- BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
- BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
- BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
- BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
- BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
- BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
- BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
- BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
- BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
- BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
- BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
- BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
- BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
- BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
- BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
- BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
- BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
-};
-
-#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
-#define BTRFS_DEFAULT_MAX_INLINE (2048)
-
-#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
-#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
-#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
-#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
- BTRFS_MOUNT_##opt)
-
-#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (!btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_set_opt(fs_info->mount_opt, opt); \
-} while (0)
-
-#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_clear_opt(fs_info->mount_opt, opt); \
-} while (0)
-
-/*
- * Requests for changes that need to be done during transaction commit.
- *
- * Internal mount options that are used for special handling of the real
- * mount options (eg. cannot be set during remount and have to be set during
- * transaction commit)
- */
-
-#define BTRFS_PENDING_COMMIT (0)
-
-#define btrfs_test_pending(info, opt) \
- test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
-#define btrfs_set_pending(info, opt) \
- set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
-#define btrfs_clear_pending(info, opt) \
- clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
-
-/*
- * Helpers for setting pending mount option changes.
- *
- * Expects corresponding macros
- * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
- */
-#define btrfs_set_pending_and_info(info, opt, fmt, args...) \
-do { \
- if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \
- btrfs_info((info), fmt, ##args); \
- btrfs_set_pending((info), SET_##opt); \
- btrfs_clear_pending((info), CLEAR_##opt); \
- } \
-} while(0)
-
-#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \
-do { \
- if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \
- btrfs_info((info), fmt, ##args); \
- btrfs_set_pending((info), CLEAR_##opt); \
- btrfs_clear_pending((info), SET_##opt); \
- } \
-} while(0)
-
-/*
- * Inode flags
- */
-#define BTRFS_INODE_NODATASUM (1U << 0)
-#define BTRFS_INODE_NODATACOW (1U << 1)
-#define BTRFS_INODE_READONLY (1U << 2)
-#define BTRFS_INODE_NOCOMPRESS (1U << 3)
-#define BTRFS_INODE_PREALLOC (1U << 4)
-#define BTRFS_INODE_SYNC (1U << 5)
-#define BTRFS_INODE_IMMUTABLE (1U << 6)
-#define BTRFS_INODE_APPEND (1U << 7)
-#define BTRFS_INODE_NODUMP (1U << 8)
-#define BTRFS_INODE_NOATIME (1U << 9)
-#define BTRFS_INODE_DIRSYNC (1U << 10)
-#define BTRFS_INODE_COMPRESS (1U << 11)
-
-#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31)
-
-#define BTRFS_INODE_FLAG_MASK \
- (BTRFS_INODE_NODATASUM | \
- BTRFS_INODE_NODATACOW | \
- BTRFS_INODE_READONLY | \
- BTRFS_INODE_NOCOMPRESS | \
- BTRFS_INODE_PREALLOC | \
- BTRFS_INODE_SYNC | \
- BTRFS_INODE_IMMUTABLE | \
- BTRFS_INODE_APPEND | \
- BTRFS_INODE_NODUMP | \
- BTRFS_INODE_NOATIME | \
- BTRFS_INODE_DIRSYNC | \
- BTRFS_INODE_COMPRESS | \
- BTRFS_INODE_ROOT_ITEM_INIT)
-
-#define BTRFS_INODE_RO_VERITY (1U << 0)
-
-#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY)
-
-struct btrfs_map_token {
- struct extent_buffer *eb;
- char *kaddr;
- unsigned long offset;
-};
-
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
((bytes) >> (fs_info)->sectorsize_bits)
-static inline void btrfs_init_map_token(struct btrfs_map_token *token,
- struct extent_buffer *eb)
-{
- token->eb = eb;
- token->kaddr = page_address(eb->pages[0]);
- token->offset = 0;
-}
-
-/* some macros to generate set/get functions for the struct fields. This
- * assumes there is a lefoo_to_cpu for every type, so lets make a simple
- * one for u8:
- */
-#define le8_to_cpu(v) (v)
-#define cpu_to_le8(v) (v)
-#define __le8 u8
-
-static inline u8 get_unaligned_le8(const void *p)
-{
- return *(u8 *)p;
-}
-
-static inline void put_unaligned_le8(u8 val, void *p)
-{
- *(u8 *)p = val;
-}
-
-#define read_eb_member(eb, ptr, type, member, result) (\
- read_extent_buffer(eb, (char *)(result), \
- ((unsigned long)(ptr)) + \
- offsetof(type, member), \
- sizeof(((type *)0)->member)))
-
-#define write_eb_member(eb, ptr, type, member, result) (\
- write_extent_buffer(eb, (char *)(result), \
- ((unsigned long)(ptr)) + \
- offsetof(type, member), \
- sizeof(((type *)0)->member)))
-
-#define DECLARE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off); \
-void btrfs_set_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off, \
- u##bits val); \
-u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
- const void *ptr, unsigned long off); \
-void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
- unsigned long off, u##bits val);
-
-DECLARE_BTRFS_SETGET_BITS(8)
-DECLARE_BTRFS_SETGET_BITS(16)
-DECLARE_BTRFS_SETGET_BITS(32)
-DECLARE_BTRFS_SETGET_BITS(64)
-
-#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
- const type *s) \
-{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
- return btrfs_get_##bits(eb, s, offsetof(type, member)); \
-} \
-static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
- u##bits val) \
-{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
- btrfs_set_##bits(eb, s, offsetof(type, member), val); \
-} \
-static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
- const type *s) \
-{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
- return btrfs_get_token_##bits(token, s, offsetof(type, member));\
-} \
-static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
- type *s, u##bits val) \
-{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
- btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
-}
-
-#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
-{ \
- const type *p = page_address(eb->pages[0]) + \
- offset_in_page(eb->start); \
- return get_unaligned_le##bits(&p->member); \
-} \
-static inline void btrfs_set_##name(const struct extent_buffer *eb, \
- u##bits val) \
-{ \
- type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
- put_unaligned_le##bits(val, &p->member); \
-}
-
-#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(const type *s) \
-{ \
- return get_unaligned_le##bits(&s->member); \
-} \
-static inline void btrfs_set_##name(type *s, u##bits val) \
-{ \
- put_unaligned_le##bits(val, &s->member); \
-}
-
-static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
- struct btrfs_dev_item *s)
-{
- static_assert(sizeof(u64) ==
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
- return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
- total_bytes));
-}
-static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
- struct btrfs_dev_item *s,
- u64 val)
-{
- static_assert(sizeof(u64) ==
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
- WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
- btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
-}
-
-
-BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
-BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
-BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
-BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
-BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
- start_offset, 64);
-BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
-BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
-BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
-BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
-BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
-BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
-
-BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
- total_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
- bytes_used, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
- io_align, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
- io_width, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
- sector_size, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
- dev_group, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
- seek_speed, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
- bandwidth, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
- generation, 64);
-
-static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
-{
- return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid);
-}
-
-static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
-{
- return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid);
-}
-
-BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
-BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
-BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
-BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
-BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
-BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
-BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
-BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
-BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
-BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
-BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
-
-static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
-{
- return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
-}
-
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
- stripe_len, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
- io_align, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
- io_width, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
- sector_size, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
- num_stripes, 16);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
- sub_stripes, 16);
-BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
-
-static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
- int nr)
-{
- unsigned long offset = (unsigned long)c;
- offset += offsetof(struct btrfs_chunk, stripe);
- offset += nr * sizeof(struct btrfs_stripe);
- return (struct btrfs_stripe *)offset;
-}
-
-static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
-{
- return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
-}
-
-static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb,
- struct btrfs_chunk *c, int nr)
-{
- return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
-}
-
-static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb,
- struct btrfs_chunk *c, int nr)
-{
- return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
-}
-
-/* struct btrfs_block_group_item */
-BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item,
- used, 64);
-BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item,
- used, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid,
- struct btrfs_block_group_item, chunk_objectid, 64);
-
-BTRFS_SETGET_FUNCS(block_group_chunk_objectid,
- struct btrfs_block_group_item, chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(block_group_flags,
- struct btrfs_block_group_item, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
- struct btrfs_block_group_item, flags, 64);
-
-/* struct btrfs_free_space_info */
-BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
- extent_count, 32);
-BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
-
-/* struct btrfs_inode_ref */
-BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
-BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
-
-/* struct btrfs_inode_extref */
-BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
- parent_objectid, 64);
-BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
- name_len, 16);
-BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
-
-/* struct btrfs_inode_item */
-BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
-BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
-BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
-BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
-BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
-BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
-BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
-BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
-BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
-BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
-BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
-BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
- generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
- sequence, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
- transid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
- nbytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
- block_group, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
-BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
-BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
-
-/* struct btrfs_dev_extent */
-BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
- chunk_tree, 64);
-BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
- chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
- chunk_offset, 64);
-BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
-BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
-BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
- generation, 64);
-BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
-
-BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
-
-static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
- struct btrfs_tree_block_info *item,
- struct btrfs_disk_key *key)
-{
- read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
-}
-
-static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
- struct btrfs_tree_block_info *item,
- struct btrfs_disk_key *key)
-{
- write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
-}
-
-BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
- root, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
- objectid, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
- offset, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
- count, 32);
-
-BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
- count, 32);
-
-BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
- type, 8);
-BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
- offset, 64);
-
-static inline u32 btrfs_extent_inline_ref_size(int type)
-{
- if (type == BTRFS_TREE_BLOCK_REF_KEY ||
- type == BTRFS_SHARED_BLOCK_REF_KEY)
- return sizeof(struct btrfs_extent_inline_ref);
- if (type == BTRFS_SHARED_DATA_REF_KEY)
- return sizeof(struct btrfs_shared_data_ref) +
- sizeof(struct btrfs_extent_inline_ref);
- if (type == BTRFS_EXTENT_DATA_REF_KEY)
- return sizeof(struct btrfs_extent_data_ref) +
- offsetof(struct btrfs_extent_inline_ref, offset);
- return 0;
-}
-
-/* struct btrfs_node */
-BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
-BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr,
- blockptr, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
- generation, 64);
-
-static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr)
-{
- unsigned long ptr;
- ptr = offsetof(struct btrfs_node, ptrs) +
- sizeof(struct btrfs_key_ptr) * nr;
- return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
-}
-
-static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb,
- int nr, u64 val)
-{
- unsigned long ptr;
- ptr = offsetof(struct btrfs_node, ptrs) +
- sizeof(struct btrfs_key_ptr) * nr;
- btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
-}
-
-static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr)
-{
- unsigned long ptr;
- ptr = offsetof(struct btrfs_node, ptrs) +
- sizeof(struct btrfs_key_ptr) * nr;
- return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
-}
-
-static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb,
- int nr, u64 val)
-{
- unsigned long ptr;
- ptr = offsetof(struct btrfs_node, ptrs) +
- sizeof(struct btrfs_key_ptr) * nr;
- btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
-}
-
-static inline unsigned long btrfs_node_key_ptr_offset(int nr)
-{
- return offsetof(struct btrfs_node, ptrs) +
- sizeof(struct btrfs_key_ptr) * nr;
-}
-
-void btrfs_node_key(const struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr);
-
-static inline void btrfs_set_node_key(const struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
-{
- unsigned long ptr;
- ptr = btrfs_node_key_ptr_offset(nr);
- write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
- struct btrfs_key_ptr, key, disk_key);
-}
-
-/* struct btrfs_item */
-BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
-BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
-
-static inline unsigned long btrfs_item_nr_offset(int nr)
-{
- return offsetof(struct btrfs_leaf, items) +
- sizeof(struct btrfs_item) * nr;
-}
-
-static inline struct btrfs_item *btrfs_item_nr(int nr)
-{
- return (struct btrfs_item *)btrfs_item_nr_offset(nr);
-}
-
-#define BTRFS_ITEM_SETGET_FUNCS(member) \
-static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \
- int slot) \
-{ \
- return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \
-} \
-static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
- int slot, u32 val) \
-{ \
- btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \
-} \
-static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
- int slot) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(slot); \
- return btrfs_token_raw_item_##member(token, item); \
-} \
-static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
- int slot, u32 val) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(slot); \
- btrfs_set_token_raw_item_##member(token, item, val); \
-}
-
-BTRFS_ITEM_SETGET_FUNCS(offset)
-BTRFS_ITEM_SETGET_FUNCS(size);
-
-static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
-{
- return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
-}
-
-static inline void btrfs_item_key(const struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
-{
- struct btrfs_item *item = btrfs_item_nr(nr);
- read_eb_member(eb, item, struct btrfs_item, key, disk_key);
-}
-
-static inline void btrfs_set_item_key(struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
-{
- struct btrfs_item *item = btrfs_item_nr(nr);
- write_eb_member(eb, item, struct btrfs_item, key, disk_key);
-}
-
-BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
-
-/*
- * struct btrfs_root_ref
- */
-BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
-BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
-BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
-
-/* struct btrfs_dir_item */
-BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
-BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
-BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
-BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item,
- data_len, 16);
-BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
- name_len, 16);
-BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
- transid, 64);
-
-static inline void btrfs_dir_item_key(const struct extent_buffer *eb,
- const struct btrfs_dir_item *item,
- struct btrfs_disk_key *key)
-{
- read_eb_member(eb, item, struct btrfs_dir_item, location, key);
-}
-
-static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
- struct btrfs_dir_item *item,
- const struct btrfs_disk_key *key)
-{
- write_eb_member(eb, item, struct btrfs_dir_item, location, key);
-}
-
-BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
- num_entries, 64);
-BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
- num_bitmaps, 64);
-BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
- generation, 64);
-
-static inline void btrfs_free_space_key(const struct extent_buffer *eb,
- const struct btrfs_free_space_header *h,
- struct btrfs_disk_key *key)
-{
- read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
-}
-
-static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
- struct btrfs_free_space_header *h,
- const struct btrfs_disk_key *key)
-{
- write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
-}
-
-/* struct btrfs_disk_key */
-BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
- objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
-BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
-
-#ifdef __LITTLE_ENDIAN
-
-/*
- * Optimized helpers for little-endian architectures where CPU and on-disk
- * structures have the same endianness and we can skip conversions.
- */
-
-static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
- const struct btrfs_disk_key *disk_key)
-{
- memcpy(cpu_key, disk_key, sizeof(struct btrfs_key));
-}
-
-static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
- const struct btrfs_key *cpu_key)
-{
- memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
-}
-
-static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
- struct btrfs_key *cpu_key, int nr)
-{
- struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
-
- btrfs_node_key(eb, disk_key, nr);
-}
-
-static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
- struct btrfs_key *cpu_key, int nr)
-{
- struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
-
- btrfs_item_key(eb, disk_key, nr);
-}
-
-static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
- const struct btrfs_dir_item *item,
- struct btrfs_key *cpu_key)
-{
- struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
-
- btrfs_dir_item_key(eb, item, disk_key);
-}
-
-#else
-
-static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
- const struct btrfs_disk_key *disk)
-{
- cpu->offset = le64_to_cpu(disk->offset);
- cpu->type = disk->type;
- cpu->objectid = le64_to_cpu(disk->objectid);
-}
-
-static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
- const struct btrfs_key *cpu)
-{
- disk->offset = cpu_to_le64(cpu->offset);
- disk->type = cpu->type;
- disk->objectid = cpu_to_le64(cpu->objectid);
-}
-
-static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
- struct btrfs_key *key, int nr)
-{
- struct btrfs_disk_key disk_key;
- btrfs_node_key(eb, &disk_key, nr);
- btrfs_disk_key_to_cpu(key, &disk_key);
-}
-
-static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
- struct btrfs_key *key, int nr)
-{
- struct btrfs_disk_key disk_key;
- btrfs_item_key(eb, &disk_key, nr);
- btrfs_disk_key_to_cpu(key, &disk_key);
-}
-
-static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
- const struct btrfs_dir_item *item,
- struct btrfs_key *key)
-{
- struct btrfs_disk_key disk_key;
- btrfs_dir_item_key(eb, item, &disk_key);
- btrfs_disk_key_to_cpu(key, &disk_key);
-}
-
-#endif
-
-/* struct btrfs_header */
-BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
-BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
- generation, 64);
-BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
-BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
-BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
-BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
- generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
- nritems, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
-
-static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag)
-{
- return (btrfs_header_flags(eb) & flag) == flag;
-}
-
-static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
-{
- u64 flags = btrfs_header_flags(eb);
- btrfs_set_header_flags(eb, flags | flag);
-}
-
-static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
-{
- u64 flags = btrfs_header_flags(eb);
- btrfs_set_header_flags(eb, flags & ~flag);
-}
-
-static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
-{
- u64 flags = btrfs_header_flags(eb);
- return flags >> BTRFS_BACKREF_REV_SHIFT;
-}
-
-static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
- int rev)
-{
- u64 flags = btrfs_header_flags(eb);
- flags &= ~BTRFS_BACKREF_REV_MASK;
- flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
- btrfs_set_header_flags(eb, flags);
-}
-
-static inline int btrfs_is_leaf(const struct extent_buffer *eb)
-{
- return btrfs_header_level(eb) == 0;
-}
-
-/* struct btrfs_root_item */
-BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
- generation, 64);
-BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
-BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
-BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
- generation, 64);
-BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
-BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8);
-BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
-BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
-BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
-BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
-BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
-BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
- last_snapshot, 64);
-BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
- generation_v2, 64);
-BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
- ctransid, 64);
-BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
- otransid, 64);
-BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
- stransid, 64);
-BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
- rtransid, 64);
-
-static inline bool btrfs_root_readonly(const struct btrfs_root *root)
-{
- /* Byte-swap the constant at compile time, root_item::flags is LE */
- return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
-}
-
-static inline bool btrfs_root_dead(const struct btrfs_root *root)
-{
- /* Byte-swap the constant at compile time, root_item::flags is LE */
- return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
-}
-
-static inline u64 btrfs_root_id(const struct btrfs_root *root)
-{
- return root->root_key.objectid;
-}
-
-/* struct btrfs_root_backup */
-BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
- tree_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
- tree_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
- tree_root_level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
- chunk_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
- chunk_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
- chunk_root_level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
- extent_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
- extent_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
- extent_root_level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
- fs_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
- fs_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
- fs_root_level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
- dev_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
- dev_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
- dev_root_level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
- csum_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
- csum_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
- csum_root_level, 8);
-BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
- total_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
- bytes_used, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
- num_devices, 64);
-
-/* struct btrfs_balance_item */
-BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
-
-static inline void btrfs_balance_data(const struct extent_buffer *eb,
- const struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
-{
- read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
-}
-
-static inline void btrfs_set_balance_data(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- const struct btrfs_disk_balance_args *ba)
-{
- write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
-}
-
-static inline void btrfs_balance_meta(const struct extent_buffer *eb,
- const struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
-{
- read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
-}
-
-static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- const struct btrfs_disk_balance_args *ba)
-{
- write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
-}
-
-static inline void btrfs_balance_sys(const struct extent_buffer *eb,
- const struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
-{
- read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
-}
-
-static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- const struct btrfs_disk_balance_args *ba)
-{
- write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
-}
-
-static inline void
-btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
- const struct btrfs_disk_balance_args *disk)
-{
- memset(cpu, 0, sizeof(*cpu));
-
- cpu->profiles = le64_to_cpu(disk->profiles);
- cpu->usage = le64_to_cpu(disk->usage);
- cpu->devid = le64_to_cpu(disk->devid);
- cpu->pstart = le64_to_cpu(disk->pstart);
- cpu->pend = le64_to_cpu(disk->pend);
- cpu->vstart = le64_to_cpu(disk->vstart);
- cpu->vend = le64_to_cpu(disk->vend);
- cpu->target = le64_to_cpu(disk->target);
- cpu->flags = le64_to_cpu(disk->flags);
- cpu->limit = le64_to_cpu(disk->limit);
- cpu->stripes_min = le32_to_cpu(disk->stripes_min);
- cpu->stripes_max = le32_to_cpu(disk->stripes_max);
-}
-
-static inline void
-btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
- const struct btrfs_balance_args *cpu)
-{
- memset(disk, 0, sizeof(*disk));
-
- disk->profiles = cpu_to_le64(cpu->profiles);
- disk->usage = cpu_to_le64(cpu->usage);
- disk->devid = cpu_to_le64(cpu->devid);
- disk->pstart = cpu_to_le64(cpu->pstart);
- disk->pend = cpu_to_le64(cpu->pend);
- disk->vstart = cpu_to_le64(cpu->vstart);
- disk->vend = cpu_to_le64(cpu->vend);
- disk->target = cpu_to_le64(cpu->target);
- disk->flags = cpu_to_le64(cpu->flags);
- disk->limit = cpu_to_le64(cpu->limit);
- disk->stripes_min = cpu_to_le32(cpu->stripes_min);
- disk->stripes_max = cpu_to_le32(cpu->stripes_max);
-}
-
-/* struct btrfs_super_block */
-BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
-BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
- generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
- struct btrfs_super_block, sys_chunk_array_size, 32);
-BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
- struct btrfs_super_block, chunk_root_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
- root_level, 8);
-BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
- chunk_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
- chunk_root_level, 8);
-BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
- log_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
- log_root_level, 8);
-BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
- total_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
- bytes_used, 64);
-BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
- sectorsize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
- nodesize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
- stripesize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
- root_dir_objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
- num_devices, 64);
-BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
- compat_flags, 64);
-BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
- compat_ro_flags, 64);
-BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
- incompat_flags, 64);
-BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
- csum_type, 16);
-BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
- cache_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
-BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
- uuid_tree_generation, 64);
-
-int btrfs_super_csum_size(const struct btrfs_super_block *s);
-const char *btrfs_super_csum_name(u16 csum_type);
-const char *btrfs_super_csum_driver(u16 csum_type);
-size_t __attribute_const__ btrfs_get_num_csums(void);
-
-
-/*
- * The leaf data grows from end-to-front in the node.
- * this returns the address of the start of the last item,
- * which is the stop of the leaf data stack
- */
-static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
-{
- u32 nr = btrfs_header_nritems(leaf);
-
- if (nr == 0)
- return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
- return btrfs_item_offset(leaf, nr - 1);
-}
-
-/* struct btrfs_file_extent_item */
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
- type, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
- struct btrfs_file_extent_item, disk_bytenr, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
- struct btrfs_file_extent_item, offset, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
- struct btrfs_file_extent_item, generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
- struct btrfs_file_extent_item, num_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
- struct btrfs_file_extent_item, ram_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
- struct btrfs_file_extent_item, disk_num_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
- struct btrfs_file_extent_item, compression, 8);
-
-static inline unsigned long
-btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e)
-{
- return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
-}
-
-static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
-{
- return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
-}
-
-BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
-BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
- disk_bytenr, 64);
-BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
- generation, 64);
-BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
- disk_num_bytes, 64);
-BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
- offset, 64);
-BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
- num_bytes, 64);
-BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
- ram_bytes, 64);
-BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
- compression, 8);
-BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
- encryption, 8);
-BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
- other_encoding, 16);
-
-/*
- * this returns the number of bytes used by the item on disk, minus the
- * size of any extent headers. If a file is compressed on disk, this is
- * the compressed size
- */
-static inline u32 btrfs_file_extent_inline_item_len(
- const struct extent_buffer *eb,
- int nr)
-{
- return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
-}
-
-/* btrfs_qgroup_status_item */
-BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
- generation, 64);
-BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
- version, 64);
-BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
- flags, 64);
-BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
- rescan, 64);
-
-/* btrfs_qgroup_info_item */
-BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
- generation, 64);
-BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
-BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
- rfer_cmpr, 64);
-BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
-BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
- excl_cmpr, 64);
-
-BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
- struct btrfs_qgroup_info_item, generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
- rfer, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
- struct btrfs_qgroup_info_item, rfer_cmpr, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
- excl, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
- struct btrfs_qgroup_info_item, excl_cmpr, 64);
-
-/* btrfs_qgroup_limit_item */
-BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
- flags, 64);
-BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
- max_rfer, 64);
-BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
- max_excl, 64);
-BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
- rsv_rfer, 64);
-BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
- rsv_excl, 64);
-
-/* btrfs_dev_replace_item */
-BTRFS_SETGET_FUNCS(dev_replace_src_devid,
- struct btrfs_dev_replace_item, src_devid, 64);
-BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
- struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
- 64);
-BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
- replace_state, 64);
-BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
- time_started, 64);
-BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
- time_stopped, 64);
-BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
- num_write_errors, 64);
-BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
- struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
- 64);
-BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
- cursor_left, 64);
-BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
- cursor_right, 64);
-
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
- struct btrfs_dev_replace_item, src_devid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
- struct btrfs_dev_replace_item,
- cont_reading_from_srcdev_mode, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
- struct btrfs_dev_replace_item, replace_state, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
- struct btrfs_dev_replace_item, time_started, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
- struct btrfs_dev_replace_item, time_stopped, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
- struct btrfs_dev_replace_item, num_write_errors, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
- struct btrfs_dev_replace_item,
- num_uncorrectable_read_errors, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
- struct btrfs_dev_replace_item, cursor_left, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
- struct btrfs_dev_replace_item, cursor_right, 64);
-
-/* helper function to cast into the data area of the leaf. */
-#define btrfs_item_ptr(leaf, slot, type) \
- ((type *)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset(leaf, slot)))
-
-#define btrfs_item_ptr_offset(leaf, slot) \
- ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset(leaf, slot)))
-
static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
{
return crc32c(crc, address, length);
@@ -2846,163 +498,15 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
-/* extent-tree.c */
-
-enum btrfs_inline_ref_type {
- BTRFS_REF_TYPE_INVALID,
- BTRFS_REF_TYPE_BLOCK,
- BTRFS_REF_TYPE_DATA,
- BTRFS_REF_TYPE_ANY,
-};
-
-int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
- struct btrfs_extent_inline_ref *iref,
- enum btrfs_inline_ref_type is_data);
-u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-
-
-int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 num_bytes);
-void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long count);
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head);
-int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags);
-int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
- int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes);
-int btrfs_exclude_logged_extents(struct extent_buffer *eb);
-int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr, bool strict,
- struct btrfs_path *path);
-struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 parent, u64 root_objectid,
- const struct btrfs_disk_key *key,
- int level, u64 hint,
- u64 empty_size,
- enum btrfs_lock_nesting nest);
-void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- u64 root_id,
- struct extent_buffer *buf,
- u64 parent, int last_ref);
-int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 owner,
- u64 offset, u64 ram_bytes,
- struct btrfs_key *ins);
-int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
- u64 root_objectid, u64 owner, u64 offset,
- struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
- u64 min_alloc_size, u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data, int delalloc);
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
-int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
-int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct extent_buffer *eb, u64 flags, int level);
-int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
-
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len, int delalloc);
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
- u64 len);
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
-int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_ref *generic_ref);
-
-void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-
-/*
- * Different levels for to flush space when doing space reservations.
- *
- * The higher the level, the more methods we try to reclaim space.
- */
-enum btrfs_reserve_flush_enum {
- /* If we are in the transaction, we can't flush anything.*/
- BTRFS_RESERVE_NO_FLUSH,
-
- /*
- * Flush space by:
- * - Running delayed inode items
- * - Allocating a new chunk
- */
- BTRFS_RESERVE_FLUSH_LIMIT,
-
- /*
- * Flush space by:
- * - Running delayed inode items
- * - Running delayed refs
- * - Running delalloc and waiting for ordered extents
- * - Allocating a new chunk
- */
- BTRFS_RESERVE_FLUSH_EVICT,
-
- /*
- * Flush space by above mentioned methods and by:
- * - Running delayed iputs
- * - Committing transaction
- *
- * Can be interrupted by a fatal signal.
- */
- BTRFS_RESERVE_FLUSH_DATA,
- BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
- BTRFS_RESERVE_FLUSH_ALL,
-
- /*
- * Pretty much the same as FLUSH_ALL, but can also steal space from
- * global rsv.
- *
- * Can be interrupted by a fatal signal.
- */
- BTRFS_RESERVE_FLUSH_ALL_STEAL,
-};
-
-enum btrfs_flush_state {
- FLUSH_DELAYED_ITEMS_NR = 1,
- FLUSH_DELAYED_ITEMS = 2,
- FLUSH_DELAYED_REFS_NR = 3,
- FLUSH_DELAYED_REFS = 4,
- FLUSH_DELALLOC = 5,
- FLUSH_DELALLOC_WAIT = 6,
- FLUSH_DELALLOC_FULL = 7,
- ALLOC_CHUNK = 8,
- ALLOC_CHUNK_FORCE = 9,
- RUN_DELAYED_IPUTS = 10,
- COMMIT_TRANS = 11,
-};
-
-int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv,
- int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
-
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
- u64 disk_num_bytes, bool noflush);
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
-int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
-int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
-
/* ctree.c */
+int __init btrfs_ctree_init(void);
+void __cold btrfs_ctree_exit(void);
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
@@ -3163,14 +667,7 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
(path)->slots[0]++ \
)
-static inline int btrfs_next_old_item(struct btrfs_root *root,
- struct btrfs_path *p, u64 time_seq)
-{
- ++p->slots[0];
- if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
- return btrfs_next_old_leaf(root, p, time_seq);
- return 0;
-}
+int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq);
/*
* Search the tree again to find a leaf with greater keys.
@@ -3188,862 +685,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
return btrfs_next_old_item(root, p, 0);
}
int btrfs_leaf_free_space(struct extent_buffer *leaf);
-int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
- int for_reloc);
-int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *node,
- struct extent_buffer *parent);
-static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
-{
- /*
- * Do it this way so we only ever do one test_bit in the normal case.
- */
- if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
- if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags))
- return 2;
- return 1;
- }
- return 0;
-}
-
-/*
- * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
- * anything except sleeping. This function is used to check the status of
- * the fs.
- * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount,
- * since setting and checking for SB_RDONLY in the superblock's flags is not
- * atomic.
- */
-static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
-{
- return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
- btrfs_fs_closing(fs_info);
-}
-
-static inline void btrfs_set_sb_rdonly(struct super_block *sb)
-{
- sb->s_flags |= SB_RDONLY;
- set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
-}
-
-static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
-{
- sb->s_flags &= ~SB_RDONLY;
- clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
-}
-
-/* root-item.c */
-int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
- u64 ref_id, u64 dirid, u64 sequence, const char *name,
- int name_len);
-int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
- u64 ref_id, u64 dirid, u64 *sequence, const char *name,
- int name_len);
-int btrfs_del_root(struct btrfs_trans_handle *trans,
- const struct btrfs_key *key);
-int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- const struct btrfs_key *key,
- struct btrfs_root_item *item);
-int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_key *key,
- struct btrfs_root_item *item);
-int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
- struct btrfs_path *path, struct btrfs_root_item *root_item,
- struct btrfs_key *root_key);
-int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info);
-void btrfs_set_root_node(struct btrfs_root_item *item,
- struct extent_buffer *node);
-void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
-void btrfs_update_root_times(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-
-/* uuid-tree.c */
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
- u64 subid);
-int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
- u64 subid);
-int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
-
-/* dir-item.c */
-int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
- const char *name, int name_len);
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
- int name_len, struct btrfs_inode *dir,
- struct btrfs_key *location, u8 type, u64 index);
-struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 dir,
- const char *name, int name_len,
- int mod);
-struct btrfs_dir_item *
-btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 dir,
- u64 index, const char *name, int name_len,
- int mod);
-struct btrfs_dir_item *
-btrfs_search_dir_index_item(struct btrfs_root *root,
- struct btrfs_path *path, u64 dirid,
- const char *name, int name_len);
-int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_dir_item *di);
-int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid,
- const char *name, u16 name_len,
- const void *data, u16 data_len);
-struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 dir,
- const char *name, u16 name_len,
- int mod);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- const char *name,
- int name_len);
-
-/* orphan.c */
-int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 offset);
-int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 offset);
-int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
-
-/* file-item.c */
-int btrfs_del_csums(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
-int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 objectid, u64 pos,
- u64 num_bytes);
-int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid,
- u64 bytenr, int mod);
-int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_ordered_sum *sums);
-blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 offset, bool one_ordered);
-int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit,
- bool nowait);
-void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
- const struct btrfs_path *path,
- struct btrfs_file_extent_item *fi,
- const bool new_inline,
- struct extent_map *em);
-int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
- u64 len);
-int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
- u64 len);
-void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size);
-u64 btrfs_file_extent_end(const struct btrfs_path *path);
-
-/* inode.c */
-void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num);
-void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
- int mirror_num, enum btrfs_compression_type compress_type);
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
- u32 pgoff, u8 *csum, const u8 * const csum_expected);
-int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page, u32 pgoff);
-unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page,
- u64 start, u64 end);
-int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page, u32 pgoff);
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool nowait, bool strict);
-
-void __btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode);
-struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
-int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
-int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_inode *dir, struct btrfs_inode *inode,
- const char *name, int name_len);
-int btrfs_add_link(struct btrfs_trans_handle *trans,
- struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const char *name, int name_len, int add_backref, u64 index);
-int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
-int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
- int front);
-
-int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
- bool in_reclaim_context);
-int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
- unsigned int extra_bits,
- struct extent_state **cached_state);
-struct btrfs_new_inode_args {
- /* Input */
- struct inode *dir;
- struct dentry *dentry;
- struct inode *inode;
- bool orphan;
- bool subvol;
-
- /*
- * Output from btrfs_new_inode_prepare(), input to
- * btrfs_create_new_inode().
- */
- struct posix_acl *default_acl;
- struct posix_acl *acl;
-};
-int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
- unsigned int *trans_num_items);
-int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
- struct btrfs_new_inode_args *args);
-void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
-struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
- struct inode *dir);
- void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- u32 bits);
-void btrfs_clear_delalloc_extent(struct inode *inode,
- struct extent_state *state, u32 bits);
-void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
- struct extent_state *other);
-void btrfs_split_delalloc_extent(struct inode *inode,
- struct extent_state *orig, u64 split);
-void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
-vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
-void btrfs_evict_inode(struct inode *inode);
-struct inode *btrfs_alloc_inode(struct super_block *sb);
-void btrfs_destroy_inode(struct inode *inode);
-void btrfs_free_inode(struct inode *inode);
-int btrfs_drop_inode(struct inode *inode);
-int __init btrfs_init_cachep(void);
-void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
- struct btrfs_root *root, struct btrfs_path *path);
-struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
-struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 end);
-int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
-int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
-int btrfs_orphan_add(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode);
-int btrfs_orphan_cleanup(struct btrfs_root *root);
-int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
-void btrfs_add_delayed_iput(struct inode *inode);
-void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
-int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
-int btrfs_prealloc_file_range(struct inode *inode, int mode,
- u64 start, u64 num_bytes, u64 min_size,
- loff_t actual_len, u64 *alloc_hint);
-int btrfs_prealloc_file_range_trans(struct inode *inode,
- struct btrfs_trans_handle *trans, int mode,
- u64 start, u64 num_bytes, u64 min_size,
- loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
- u64 start, u64 end, int *page_started, unsigned long *nr_written,
- struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct page *page);
-void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
- struct page *page, u64 start,
- u64 end, bool uptodate);
-int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
- int compress_type);
-int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size,
- struct page **pages);
-ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded);
-ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
- const struct btrfs_ioctl_encoded_io_args *encoded);
-
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before);
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before);
-
-extern const struct dentry_operations btrfs_dentry_operations;
-
-/* Inode locking type flags, by default the exclusive lock is taken */
-#define BTRFS_ILOCK_SHARED (1U << 0)
-#define BTRFS_ILOCK_TRY (1U << 1)
-#define BTRFS_ILOCK_MMAP (1U << 2)
-
-int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags);
-void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
-void btrfs_update_inode_bytes(struct btrfs_inode *inode,
- const u64 add_bytes,
- const u64 del_bytes);
-void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
-
-/* ioctl.c */
-long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
-int btrfs_fileattr_set(struct user_namespace *mnt_userns,
- struct dentry *dentry, struct fileattr *fa);
-int btrfs_ioctl_get_supported_features(void __user *arg);
-void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
-int __pure btrfs_is_empty_uuid(u8 *uuid);
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
- struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_to_defrag);
-void btrfs_get_block_group_info(struct list_head *groups_list,
- struct btrfs_ioctl_space_info *space);
-void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
- struct btrfs_ioctl_balance_args *bargs);
-
-/* file.c */
-int __init btrfs_auto_defrag_init(void);
-void __cold btrfs_auto_defrag_exit(void);
-int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u32 extent_thresh);
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
-void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
-int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
- struct btrfs_drop_extents_args *args);
-int btrfs_replace_file_extents(struct btrfs_inode *inode,
- struct btrfs_path *path, const u64 start,
- const u64 end,
- struct btrfs_replace_extent_info *extent_info,
- struct btrfs_trans_handle **trans_out);
-int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u64 start, u64 end);
-ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
- const struct btrfs_ioctl_encoded_io_args *encoded);
-int btrfs_release_file(struct inode *inode, struct file *file);
-int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached, bool noreserve);
-int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes, bool nowait);
-void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
-bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
- u64 *delalloc_start_ret, u64 *delalloc_end_ret);
-
-/* tree-defrag.c */
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-
-/* super.c */
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags);
-int btrfs_sync_fs(struct super_block *sb, int wait);
-char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
- u64 subvol_objectid);
-
-static inline __printf(2, 3) __cold
-void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
-{
-}
-
-#ifdef CONFIG_PRINTK_INDEX
-
-#define btrfs_printk(fs_info, fmt, args...) \
-do { \
- printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \
- _btrfs_printk(fs_info, fmt, ##args); \
-} while (0)
-
-__printf(2, 3)
-__cold
-void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
-
-#elif defined(CONFIG_PRINTK)
-
-#define btrfs_printk(fs_info, fmt, args...) \
- _btrfs_printk(fs_info, fmt, ##args)
-
-__printf(2, 3)
-__cold
-void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
-
-#else
-
-#define btrfs_printk(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, fmt, ##args)
-#endif
-
-#define btrfs_emerg(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_INFO fmt, ##args)
-
-/*
- * Wrappers that use printk_in_rcu
- */
-#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
-
-/*
- * Wrappers that use a ratelimited printk_in_rcu
- */
-#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
-
-/*
- * Wrappers that use a ratelimited printk
- */
-#define btrfs_emerg_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
-
-#if defined(CONFIG_DYNAMIC_DEBUG)
-#define btrfs_debug(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk, \
- fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
- fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
- fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
- fs_info, KERN_DEBUG fmt, ##args)
-#elif defined(DEBUG)
-#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
-#else
-#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
-#endif
-
-#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_printk(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
-} while (0)
-
-#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_no_printk(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
-} while (0)
-
-#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
-do { \
- static DEFINE_RATELIMIT_STATE(_rs, \
- DEFAULT_RATELIMIT_INTERVAL, \
- DEFAULT_RATELIMIT_BURST); \
- if (__ratelimit(&_rs)) \
- btrfs_printk(fs_info, fmt, ##args); \
-} while (0)
-
-#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_printk_ratelimited(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
-} while (0)
-
-#ifdef CONFIG_BTRFS_ASSERT
-__cold __noreturn
-static inline void assertfail(const char *expr, const char *file, int line)
-{
- pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
- BUG();
-}
-
-#define ASSERT(expr) \
- (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__))
-
-#else
-static inline void assertfail(const char *expr, const char* file, int line) { }
-#define ASSERT(expr) (void)(expr)
-#endif
-
-#if BITS_PER_LONG == 32
-#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
-/*
- * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
- * addresses of extents.
- *
- * For 4K page size it's about 10T, for 64K it's 160T.
- */
-#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
-void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
-void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
-#endif
-
-/*
- * Get the correct offset inside the page of extent buffer.
- *
- * @eb: target extent buffer
- * @start: offset inside the extent buffer
- *
- * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
- */
-static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
- unsigned long offset)
-{
- /*
- * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
- * to PAGE_SIZE, thus adding it won't cause any difference.
- *
- * For sectorsize < PAGE_SIZE, we must only read the data that belongs
- * to the eb, thus we have to take the eb->start into consideration.
- */
- return offset_in_page(offset + eb->start);
-}
-
-static inline unsigned long get_eb_page_index(unsigned long offset)
-{
- /*
- * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
- *
- * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
- * and have ensured that all tree blocks are contained in one page,
- * thus we always get index == 0.
- */
- return offset >> PAGE_SHIFT;
-}
-
-/*
- * Use that for functions that are conditionally exported for sanity tests but
- * otherwise static
- */
-#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-#define EXPORT_FOR_TESTS static
-#else
-#define EXPORT_FOR_TESTS
-#endif
-
-__cold
-static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info)
-{
- btrfs_err(fs_info,
-"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel");
-}
-
-__printf(5, 6)
-__cold
-void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
-
-const char * __attribute_const__ btrfs_decode_error(int errno);
-
-__cold
-void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
- const char *function,
- unsigned int line, int errno, bool first_hit);
-
-/*
- * Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact line number is reported.
- */
-#define btrfs_abort_transaction(trans, errno) \
-do { \
- bool first = false; \
- /* Report first abort since mount */ \
- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
- &((trans)->fs_info->fs_state))) { \
- first = true; \
- if ((errno) != -EIO && (errno) != -EROFS) { \
- WARN(1, KERN_DEBUG \
- "BTRFS: Transaction aborted (error %d)\n", \
- (errno)); \
- } else { \
- btrfs_debug((trans)->fs_info, \
- "Transaction aborted (error %d)", \
- (errno)); \
- } \
- } \
- __btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno), first); \
-} while (0)
-
-#ifdef CONFIG_PRINTK_INDEX
-
-#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
-do { \
- printk_index_subsys_emit( \
- "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \
- KERN_CRIT, fmt); \
- __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args); \
-} while (0)
-
-#else
-
-#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
- __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args)
-
-#endif
-
-#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
- &(fs_info)->fs_state)))
-#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
- (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
- &(fs_info)->fs_state)))
-
-__printf(5, 6)
-__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
-/*
- * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
- * will panic(). Otherwise we BUG() here.
- */
-#define btrfs_panic(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
- BUG(); \
-} while (0)
-
-
-/* compatibility and incompatibility defines */
-
-#define btrfs_set_fs_incompat(__fs_info, opt) \
- __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
- #opt)
-
-static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
- u64 flag, const char* name)
-{
- struct btrfs_super_block *disk_super;
- u64 features;
-
- disk_super = fs_info->super_copy;
- features = btrfs_super_incompat_flags(disk_super);
- if (!(features & flag)) {
- spin_lock(&fs_info->super_lock);
- features = btrfs_super_incompat_flags(disk_super);
- if (!(features & flag)) {
- features |= flag;
- btrfs_set_super_incompat_flags(disk_super, features);
- btrfs_info(fs_info,
- "setting incompat feature flag for %s (0x%llx)",
- name, flag);
- }
- spin_unlock(&fs_info->super_lock);
- }
-}
-
-#define btrfs_clear_fs_incompat(__fs_info, opt) \
- __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
- #opt)
-
-static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
- u64 flag, const char* name)
-{
- struct btrfs_super_block *disk_super;
- u64 features;
-
- disk_super = fs_info->super_copy;
- features = btrfs_super_incompat_flags(disk_super);
- if (features & flag) {
- spin_lock(&fs_info->super_lock);
- features = btrfs_super_incompat_flags(disk_super);
- if (features & flag) {
- features &= ~flag;
- btrfs_set_super_incompat_flags(disk_super, features);
- btrfs_info(fs_info,
- "clearing incompat feature flag for %s (0x%llx)",
- name, flag);
- }
- spin_unlock(&fs_info->super_lock);
- }
-}
-
-#define btrfs_fs_incompat(fs_info, opt) \
- __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
-
-static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
-{
- struct btrfs_super_block *disk_super;
- disk_super = fs_info->super_copy;
- return !!(btrfs_super_incompat_flags(disk_super) & flag);
-}
-
-#define btrfs_set_fs_compat_ro(__fs_info, opt) \
- __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
- #opt)
-
-static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
- u64 flag, const char *name)
-{
- struct btrfs_super_block *disk_super;
- u64 features;
-
- disk_super = fs_info->super_copy;
- features = btrfs_super_compat_ro_flags(disk_super);
- if (!(features & flag)) {
- spin_lock(&fs_info->super_lock);
- features = btrfs_super_compat_ro_flags(disk_super);
- if (!(features & flag)) {
- features |= flag;
- btrfs_set_super_compat_ro_flags(disk_super, features);
- btrfs_info(fs_info,
- "setting compat-ro feature flag for %s (0x%llx)",
- name, flag);
- }
- spin_unlock(&fs_info->super_lock);
- }
-}
-
-#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
- __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
- #opt)
-
-static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
- u64 flag, const char *name)
-{
- struct btrfs_super_block *disk_super;
- u64 features;
-
- disk_super = fs_info->super_copy;
- features = btrfs_super_compat_ro_flags(disk_super);
- if (features & flag) {
- spin_lock(&fs_info->super_lock);
- features = btrfs_super_compat_ro_flags(disk_super);
- if (features & flag) {
- features &= ~flag;
- btrfs_set_super_compat_ro_flags(disk_super, features);
- btrfs_info(fs_info,
- "clearing compat-ro feature flag for %s (0x%llx)",
- name, flag);
- }
- spin_unlock(&fs_info->super_lock);
- }
-}
-
-#define btrfs_fs_compat_ro(fs_info, opt) \
- __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
-
-static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
-{
- struct btrfs_super_block *disk_super;
- disk_super = fs_info->super_copy;
- return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
-}
-
-/* acl.c */
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
-int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
- struct posix_acl *acl, int type);
-int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
- struct posix_acl *acl, int type);
-#else
-#define btrfs_get_acl NULL
-#define btrfs_set_acl NULL
-static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct posix_acl *acl,
- int type)
-{
- return -EOPNOTSUPP;
-}
-#endif
-
-/* relocation.c */
-int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
-int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
-int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
-int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf,
- struct extent_buffer *cow);
-void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
- u64 *bytes_to_reserve);
-int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending);
-int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
-struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info,
- u64 bytenr);
-int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
-
-/* scrub.c */
-int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
- u64 end, struct btrfs_scrub_progress *progress,
- int readonly, int is_dev_replace);
-void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
-void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
-int btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
-int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
- struct btrfs_scrub_progress *progress);
-
-/* dev-replace.c */
-void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
-
-static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
-{
- btrfs_bio_counter_sub(fs_info, 1);
-}
static inline int is_fstree(u64 rootid)
{
@@ -4054,61 +695,16 @@ static inline int is_fstree(u64 rootid)
return 0;
}
-static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
-{
- return signal_pending(current);
-}
-
-/* verity.c */
-#ifdef CONFIG_FS_VERITY
-
-extern const struct fsverity_operations btrfs_verityops;
-int btrfs_drop_verity_items(struct btrfs_inode *inode);
-int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size);
-
-BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
- encryption, 8);
-BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
- size, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
- struct btrfs_verity_descriptor_item, encryption, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
- struct btrfs_verity_descriptor_item, size, 64);
-
-#else
-
-static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
-{
- return 0;
-}
-
-static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
- size_t buf_size)
-{
- return -EPERM;
-}
-
-#endif
-
-/* Sanity test specific functions */
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_destroy_inode(struct inode *inode);
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
-{
- return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
-}
-#else
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
-{
- return 0;
-}
-#endif
-
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
{
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
}
+int btrfs_super_csum_size(const struct btrfs_super_block *s);
+const char *btrfs_super_csum_name(u16 csum_type);
+const char *btrfs_super_csum_driver(u16 csum_type);
+size_t __attribute_const__ btrfs_get_num_csums(void);
+
/*
* We use page status Private2 to indicate there is an ordered extent with
* unfinished IO.
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
new file mode 100644
index 000000000000..0a3c261b69c9
--- /dev/null
+++ b/fs/btrfs/defrag.c
@@ -0,0 +1,1376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "locking.h"
+#include "accessors.h"
+#include "messages.h"
+#include "delalloc-space.h"
+#include "subpage.h"
+#include "defrag.h"
+#include "file-item.h"
+#include "super.h"
+
+static struct kmem_cache *btrfs_inode_defrag_cachep;
+
+/*
+ * When auto defrag is enabled we queue up these defrag structs to remember
+ * which inodes need defragging passes.
+ */
+struct inode_defrag {
+ struct rb_node rb_node;
+ /* Inode number */
+ u64 ino;
+ /*
+ * Transid where the defrag was added, we search for extents newer than
+ * this.
+ */
+ u64 transid;
+
+ /* Root objectid */
+ u64 root;
+
+ /*
+ * The extent size threshold for autodefrag.
+ *
+ * This value is different for compressed/non-compressed extents, thus
+ * needs to be passed from higher layer.
+ * (aka, inode_should_defrag())
+ */
+ u32 extent_thresh;
+};
+
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+ struct inode_defrag *defrag2)
+{
+ if (defrag1->root > defrag2->root)
+ return 1;
+ else if (defrag1->root < defrag2->root)
+ return -1;
+ else if (defrag1->ino > defrag2->ino)
+ return 1;
+ else if (defrag1->ino < defrag2->ino)
+ return -1;
+ else
+ return 0;
+}
+
+/*
+ * Pop a record for an inode into the defrag tree. The lock must be held
+ * already.
+ *
+ * If you're inserting a record for an older transid than an existing record,
+ * the transid already in the tree is lowered.
+ *
+ * If an existing record is found the defrag item you pass in is freed.
+ */
+static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct inode_defrag *entry;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ p = &fs_info->defrag_inodes.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(defrag, entry);
+ if (ret < 0)
+ p = &parent->rb_left;
+ else if (ret > 0)
+ p = &parent->rb_right;
+ else {
+ /*
+ * If we're reinserting an entry for an old defrag run,
+ * make sure to lower the transid of our existing
+ * record.
+ */
+ if (defrag->transid < entry->transid)
+ entry->transid = defrag->transid;
+ entry->extent_thresh = min(defrag->extent_thresh,
+ entry->extent_thresh);
+ return -EEXIST;
+ }
+ }
+ set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
+ rb_link_node(&defrag->rb_node, parent, p);
+ rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
+ return 0;
+}
+
+static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
+ return 0;
+
+ if (btrfs_fs_closing(fs_info))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Insert a defrag record for this inode if auto defrag is enabled.
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u32 extent_thresh)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode_defrag *defrag;
+ u64 transid;
+ int ret;
+
+ if (!__need_auto_defrag(fs_info))
+ return 0;
+
+ if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
+ return 0;
+
+ if (trans)
+ transid = trans->transid;
+ else
+ transid = inode->root->last_trans;
+
+ defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
+ if (!defrag)
+ return -ENOMEM;
+
+ defrag->ino = btrfs_ino(inode);
+ defrag->transid = transid;
+ defrag->root = root->root_key.objectid;
+ defrag->extent_thresh = extent_thresh;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
+ /*
+ * If we set IN_DEFRAG flag and evict the inode from memory,
+ * and then re-read this inode, this new inode doesn't have
+ * IN_DEFRAG flag. At the case, we may find the existed defrag.
+ */
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ if (ret)
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return 0;
+}
+
+/*
+ * Pick the defragable inode that we want, if it doesn't exist, we will get the
+ * next one.
+ */
+static struct inode_defrag *btrfs_pick_defrag_inode(
+ struct btrfs_fs_info *fs_info, u64 root, u64 ino)
+{
+ struct inode_defrag *entry = NULL;
+ struct inode_defrag tmp;
+ struct rb_node *p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ tmp.ino = ino;
+ tmp.root = root;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ p = fs_info->defrag_inodes.rb_node;
+ while (p) {
+ parent = p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(&tmp, entry);
+ if (ret < 0)
+ p = parent->rb_left;
+ else if (ret > 0)
+ p = parent->rb_right;
+ else
+ goto out;
+ }
+
+ if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ parent = rb_next(parent);
+ if (parent)
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+ else
+ entry = NULL;
+ }
+out:
+ if (entry)
+ rb_erase(parent, &fs_info->defrag_inodes);
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return entry;
+}
+
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ node = rb_first(&fs_info->defrag_inodes);
+ while (node) {
+ rb_erase(node, &fs_info->defrag_inodes);
+ defrag = rb_entry(node, struct inode_defrag, rb_node);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+ cond_resched_lock(&fs_info->defrag_inodes_lock);
+
+ node = rb_first(&fs_info->defrag_inodes);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH 1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *inode_root;
+ struct inode *inode;
+ struct btrfs_ioctl_defrag_range_args range;
+ int ret = 0;
+ u64 cur = 0;
+
+again:
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ goto cleanup;
+ if (!__need_auto_defrag(fs_info))
+ goto cleanup;
+
+ /* Get the inode */
+ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
+ if (IS_ERR(inode_root)) {
+ ret = PTR_ERR(inode_root);
+ goto cleanup;
+ }
+
+ inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+ btrfs_put_root(inode_root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto cleanup;
+ }
+
+ if (cur >= i_size_read(inode)) {
+ iput(inode);
+ goto cleanup;
+ }
+
+ /* Do a chunk of defrag */
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ memset(&range, 0, sizeof(range));
+ range.len = (u64)-1;
+ range.start = cur;
+ range.extent_thresh = defrag->extent_thresh;
+
+ sb_start_write(fs_info->sb);
+ ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ BTRFS_DEFRAG_BATCH);
+ sb_end_write(fs_info->sb);
+ iput(inode);
+
+ if (ret < 0)
+ goto cleanup;
+
+ cur = max(cur + fs_info->sectorsize, range.start);
+ goto again;
+
+cleanup:
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return ret;
+}
+
+/*
+ * Run through the list of inodes in the FS that need defragging.
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ u64 first_ino = 0;
+ u64 root_objectid = 0;
+
+ atomic_inc(&fs_info->defrag_running);
+ while (1) {
+ /* Pause the auto defragger. */
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ break;
+
+ if (!__need_auto_defrag(fs_info))
+ break;
+
+ /* find an inode to defrag */
+ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
+ if (!defrag) {
+ if (root_objectid || first_ino) {
+ root_objectid = 0;
+ first_ino = 0;
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ first_ino = defrag->ino + 1;
+ root_objectid = defrag->root;
+
+ __btrfs_run_defrag_inode(fs_info, defrag);
+ }
+ atomic_dec(&fs_info->defrag_running);
+
+ /*
+ * During unmount, we use the transaction_wait queue to wait for the
+ * defragger to stop.
+ */
+ wake_up(&fs_info->transaction_wait);
+ return 0;
+}
+
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
+ * better reflect disk order
+ */
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ int ret = 0;
+ int wret;
+ int level;
+ int next_key_ret = 0;
+ u64 last_ret = 0;
+
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ level = btrfs_header_level(root->node);
+
+ if (level == 0)
+ goto out;
+
+ if (root->defrag_progress.objectid == 0) {
+ struct extent_buffer *root_node;
+ u32 nritems;
+
+ root_node = btrfs_lock_root_node(root);
+ nritems = btrfs_header_nritems(root_node);
+ root->defrag_max.objectid = 0;
+ /* from above we know this is not a leaf */
+ btrfs_node_key_to_cpu(root_node, &root->defrag_max,
+ nritems - 1);
+ btrfs_tree_unlock(root_node);
+ free_extent_buffer(root_node);
+ memset(&key, 0, sizeof(key));
+ } else {
+ memcpy(&key, &root->defrag_progress, sizeof(key));
+ }
+
+ path->keep_locks = 1;
+
+ ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(path);
+ /*
+ * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+ * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
+ * a deadlock (attempting to write lock an already write locked leaf).
+ */
+ path->lowest_level = 1;
+ wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ if (!path->nodes[1]) {
+ ret = 0;
+ goto out;
+ }
+ /*
+ * The node at level 1 must always be locked when our path has
+ * keep_locks set and lowest_level is 1, regardless of the value of
+ * path->slots[1].
+ */
+ BUG_ON(path->locks[1] == 0);
+ ret = btrfs_realloc_node(trans, root,
+ path->nodes[1], 0,
+ &last_ret,
+ &root->defrag_progress);
+ if (ret) {
+ WARN_ON(ret == -EAGAIN);
+ goto out;
+ }
+ /*
+ * Now that we reallocated the node we can find the next key. Note that
+ * btrfs_find_next_key() can release our path and do another search
+ * without COWing, this is because even with path->keep_locks = 1,
+ * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
+ * node when path->slots[node_level - 1] does not point to the last
+ * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+ * we search for the next key after reallocating our node.
+ */
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+ BTRFS_OLDEST_GENERATION);
+ if (next_key_ret == 0) {
+ memcpy(&root->defrag_progress, &key, sizeof(key));
+ ret = -EAGAIN;
+ }
+out:
+ btrfs_free_path(path);
+ if (ret == -EAGAIN) {
+ if (root->defrag_max.objectid > root->defrag_progress.objectid)
+ goto done;
+ if (root->defrag_max.type > root->defrag_progress.type)
+ goto done;
+ if (root->defrag_max.offset > root->defrag_progress.offset)
+ goto done;
+ ret = 0;
+ }
+done:
+ if (ret != -EAGAIN)
+ memset(&root->defrag_progress, 0,
+ sizeof(root->defrag_progress));
+
+ return ret;
+}
+
+/*
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ * To reduce memory usage in the long run.
+ *
+ * - Extra optimization to skip file extents older than @newer_than
+ * By using btrfs_search_forward() we can skip entire file ranges that
+ * have extents created in past transactions, because btrfs_search_forward()
+ * will not visit leaves and nodes with a generation smaller than given
+ * minimal generation threshold (@newer_than).
+ *
+ * Return valid em if we find a file extent matching the requirement.
+ * Return NULL if we can not find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
+ */
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+ u64 start, u64 newer_than)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path path = { 0 };
+ struct extent_map *em;
+ struct btrfs_key key;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ if (newer_than) {
+ ret = btrfs_search_forward(root, &key, &path, newer_than);
+ if (ret < 0)
+ goto err;
+ /* Can't find anything newer */
+ if (ret > 0)
+ goto not_found;
+ } else {
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ goto err;
+ }
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+ /*
+ * If btrfs_search_slot() makes path to point beyond nritems,
+ * we should not have an empty leaf, as this inode must at
+ * least have its INODE_ITEM.
+ */
+ ASSERT(btrfs_header_nritems(path.nodes[0]));
+ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+ }
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ /* Perfect match, no need to go one slot back */
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+ key.offset == start)
+ goto iterate;
+
+ /* We didn't find a perfect match, needs to go one slot back */
+ if (path.slots[0] > 0) {
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path.slots[0]--;
+ }
+
+iterate:
+ /* Iterate through the path to find a file extent covering @start */
+ while (true) {
+ u64 extent_end;
+
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+ goto next;
+
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+ /*
+ * We may go one slot back to INODE_REF/XATTR item, then
+ * need to go forward until we reach an EXTENT_DATA.
+ * But we should still has the correct ino as key.objectid.
+ */
+ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+ goto next;
+
+ /* It's beyond our target range, definitely not extent found */
+ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+
+ /*
+ * | |<- File extent ->|
+ * \- start
+ *
+ * This means there is a hole between start and key.offset.
+ */
+ if (key.offset > start) {
+ em->start = start;
+ em->orig_start = start;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->len = key.offset - start;
+ break;
+ }
+
+ fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_file_extent_item);
+ extent_end = btrfs_file_extent_end(&path);
+
+ /*
+ * |<- file extent ->| |
+ * \- start
+ *
+ * We haven't reached start, search next slot.
+ */
+ if (extent_end <= start)
+ goto next;
+
+ /* Now this extent covers @start, convert it to em */
+ btrfs_extent_item_to_extent_map(inode, &path, fi, em);
+ break;
+next:
+ ret = btrfs_next_item(root, &path);
+ if (ret < 0)
+ goto err;
+ if (ret > 0)
+ goto not_found;
+ }
+ btrfs_release_path(&path);
+ return em;
+
+not_found:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return NULL;
+
+err:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return ERR_PTR(ret);
+}
+
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ u64 newer_than, bool locked)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
+
+ /*
+ * Hopefully we have this extent in the tree already, try without the
+ * full extent lock.
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
+ read_unlock(&em_tree->lock);
+
+ /*
+ * We can get a merged extent, in that case, we need to re-search
+ * tree to get the original em for defrag.
+ *
+ * If @newer_than is 0 or em::generation < newer_than, we can trust
+ * this em, as either we don't care about the generation, or the
+ * merged extent map will be rejected anyway.
+ */
+ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ newer_than && em->generation >= newer_than) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
+ if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + sectorsize - 1;
+
+ /* Get the big lock and read metadata off disk. */
+ if (!locked)
+ lock_extent(io_tree, start, end, &cached);
+ em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
+ if (!locked)
+ unlock_extent(io_tree, start, end, &cached);
+
+ if (IS_ERR(em))
+ return NULL;
+ }
+
+ return em;
+}
+
+static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
+ const struct extent_map *em)
+{
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ return BTRFS_MAX_COMPRESSED;
+ return fs_info->max_extent_size;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ u32 extent_thresh, u64 newer_than, bool locked)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map *next;
+ bool ret = false;
+
+ /* This is the last extent */
+ if (em->start + em->len >= i_size_read(inode))
+ return false;
+
+ /*
+ * Here we need to pass @newer_then when checking the next extent, or
+ * we will hit a case we mark current extent for defrag, but the next
+ * one will not be a target.
+ * This will just cause extra IO without really reducing the fragments.
+ */
+ next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
+ /* No more em or hole */
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ goto out;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ goto out;
+ /*
+ * If the next extent is at its max capacity, defragging current extent
+ * makes no sense, as the total number of extents won't change.
+ */
+ if (next->len >= get_extent_max_capacity(fs_info, em))
+ goto out;
+ /* Skip older extent */
+ if (next->generation < newer_than)
+ goto out;
+ /* Also check extent size */
+ if (next->len >= extent_thresh)
+ goto out;
+
+ ret = true;
+out:
+ free_extent_map(next);
+ return ret;
+}
+
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: Caller should also wait for page writeback after the cluster is
+ * prepared, here we don't do writeback wait for each page.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
+{
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
+ */
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
+
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
+
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need check if it was
+ * released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
+
+ /*
+ * Now the page range has no ordered extent any more. Read the page to
+ * make it uptodate.
+ */
+ if (!PageUptodate(page)) {
+ btrfs_read_folio(NULL, page_folio(page));
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
+/*
+ * Collect all valid target extents.
+ *
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold, any extent size >= this value
+ * will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression
+ * if true, @extent_thresh will be ignored and all regular
+ * file extents meeting @newer_than will be targets.
+ * @locked: if the range has already held extent lock
+ * @target_list: list of targets file extents
+ */
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list,
+ u64 *last_scanned_ret)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ bool last_is_target = false;
+ u64 cur = start;
+ int ret = 0;
+
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
+
+ last_is_target = false;
+ em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
+ if (!em)
+ break;
+
+ /*
+ * If the file extent is an inlined one, we may still want to
+ * defrag it (fallthrough) if it will cause a regular extent.
+ * This is for users who want to convert inline extents to
+ * regular ones through max_inline= mount option.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE &&
+ em->len <= inode->root->fs_info->max_inline)
+ goto next;
+
+ /* Skip hole/delalloc/preallocated extents */
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ em->block_start == EXTENT_MAP_DELALLOC ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
+
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
+
+ /* This em is under writeback, no need to defrag */
+ if (em->generation == (u64)-1)
+ goto next;
+
+ /*
+ * Our start offset might be in the middle of an existing extent
+ * map, so take that into account.
+ */
+ range_len = em->len - (cur - em->start);
+ /*
+ * If this range of the extent map is already flagged for delalloc,
+ * skip it, because:
+ *
+ * 1) We could deadlock later, when trying to reserve space for
+ * delalloc, because in case we can't immediately reserve space
+ * the flusher can start delalloc and wait for the respective
+ * ordered extents to complete. The deadlock would happen
+ * because we do the space reservation while holding the range
+ * locked, and starting writeback, or finishing an ordered
+ * extent, requires locking the range;
+ *
+ * 2) If there's delalloc there, it means there's dirty pages for
+ * which writeback has not started yet (we clean the delalloc
+ * flag when starting writeback and after creating an ordered
+ * extent). If we mark pages in an adjacent range for defrag,
+ * then we will have a larger contiguous range for delalloc,
+ * very likely resulting in a larger extent after writeback is
+ * triggered (except in a case of free space fragmentation).
+ */
+ if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC, 0, NULL))
+ goto next;
+
+ /*
+ * For do_compress case, we want to compress all valid file
+ * extents, thus no @extent_thresh or mergeable check.
+ */
+ if (do_compress)
+ goto add;
+
+ /* Skip too large extent */
+ if (range_len >= extent_thresh)
+ goto next;
+
+ /*
+ * Skip extents already at its max capacity, this is mostly for
+ * compressed extents, which max cap is only 128K.
+ */
+ if (em->len >= get_extent_max_capacity(fs_info, em))
+ goto next;
+
+ /*
+ * Normally there are no more extents after an inline one, thus
+ * @next_mergeable will normally be false and not defragged.
+ * So if an inline extent passed all above checks, just add it
+ * for defrag, and be converted to regular extents.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ goto add;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ extent_thresh, newer_than, locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
+ }
+
+add:
+ last_is_target = true;
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+ * last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
+ }
+ /* Fall through to allocate a new entry */
+ }
+
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
+ }
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
+
+next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
+ }
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ }
+ if (!ret && last_scanned_ret) {
+ /*
+ * If the last extent is not a target, the caller can skip to
+ * the end of that extent.
+ * Otherwise, we can only go the end of the specified range.
+ */
+ if (!last_is_target)
+ *last_scanned_ret = max(cur, *last_scanned_ret);
+ else
+ *last_scanned_ret = max(start + len, *last_scanned_ret);
+ }
+ return ret;
+}
+
+#define CLUSTER_SIZE (SZ_256K)
+static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the pages range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
+
+ ASSERT(last_index - first_index + 1 <= nr_pages);
+
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
+
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ }
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress,
+ u64 *last_scanned_ret)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
+
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
+
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(pages[i]);
+
+ /* Lock the pages range */
+ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+ /*
+ * Now we have a consistent view about the extent map, re-check
+ * which range really needs to be defragged.
+ *
+ * And this time we have extent locked already, pass @locked = true
+ * so that we won't relock the extent range and cause deadlock.
+ */
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list, last_scanned_ret);
+ if (ret < 0)
+ goto unlock_extent;
+
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ }
+ kfree(pages);
+ return ret;
+}
+
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors,
+ u64 *last_scanned_ret)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
+
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list, NULL);
+ if (ret < 0)
+ goto out;
+
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
+
+ /* Reached or beyond the limit */
+ if (max_sectors && *sectors_defragged >= max_sectors) {
+ ret = 1;
+ break;
+ }
+
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
+
+ /*
+ * If defrag_one_range() has updated last_scanned_ret,
+ * our range may already be invalid (e.g. hole punched).
+ * Skip if our range is before last_scanned_ret, as there is
+ * no need to defrag the range anymore.
+ */
+ if (entry->start + range_len <= *last_scanned_ret)
+ continue;
+
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * Here we may not defrag any range if holes are punched before
+ * we locked the pages.
+ * But that's fine, it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress,
+ last_scanned_ret);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len >>
+ inode->root->fs_info->sectorsize_bits;
+ }
+out:
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ if (ret >= 0)
+ *last_scanned_ret = max(*last_scanned_ret, start + len);
+ return ret;
+}
+
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NUL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+ * will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be updated
+ * to indicate the file offset where next defrag should be started at.
+ * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
+ * defragging all the range).
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ unsigned long sectors_defragged = 0;
+ u64 isize = i_size_read(inode);
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+ bool ra_allocated = false;
+ int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
+ u32 extent_thresh = range->extent_thresh;
+ pgoff_t start_index;
+
+ if (isize == 0)
+ return 0;
+
+ if (range->start >= isize)
+ return -EINVAL;
+
+ if (do_compress) {
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
+
+ if (extent_thresh == 0)
+ extent_thresh = SZ_256K;
+
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len);
+ } else {
+ /* Defrag until file end */
+ last_byte = isize;
+ }
+
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
+ /*
+ * If we were not given a ra, allocate a readahead context. As
+ * readahead is just an optimization, defrag will work without it so
+ * we don't error out.
+ */
+ if (!ra) {
+ ra_allocated = true;
+ ra = kzalloc(sizeof(*ra), GFP_KERNEL);
+ if (ra)
+ file_ra_state_init(ra, inode->i_mapping);
+ }
+
+ /*
+ * Make writeback start from the beginning of the range, so that the
+ * defrag range can be written sequentially.
+ */
+ start_index = cur >> PAGE_SHIFT;
+ if (start_index < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = start_index;
+
+ while (cur < last_byte) {
+ const unsigned long prev_sectors_defragged = sectors_defragged;
+ u64 last_scanned = cur;
+ u64 cluster_end;
+
+ if (btrfs_defrag_cancelled(fs_info)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ /* We want the cluster end at page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
+
+ btrfs_inode_lock(BTRFS_I(inode), 0);
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
+ break;
+ }
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
+ break;
+ }
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress, &sectors_defragged,
+ max_to_defrag, &last_scanned);
+
+ if (sectors_defragged > prev_sectors_defragged)
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
+ if (ret < 0)
+ break;
+ cur = max(cluster_end + 1, last_scanned);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ cond_resched();
+ }
+
+ if (ra_allocated)
+ kfree(ra);
+ /*
+ * Update range.start for autodefrag, this will indicate where to start
+ * in next run.
+ */
+ range->start = cur;
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors, for compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
+ }
+ if (do_compress) {
+ btrfs_inode_lock(BTRFS_I(inode), 0);
+ BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
+ }
+ return ret;
+}
+
+void __cold btrfs_auto_defrag_exit(void)
+{
+ kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int __init btrfs_auto_defrag_init(void)
+{
+ btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+ sizeof(struct inode_defrag), 0,
+ SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_inode_defrag_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
new file mode 100644
index 000000000000..5305f2283b5e
--- /dev/null
+++ b/fs/btrfs/defrag.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DEFRAG_H
+#define BTRFS_DEFRAG_H
+
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag);
+int __init btrfs_auto_defrag_init(void);
+void __cold btrfs_auto_defrag_exit(void);
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u32 extent_thresh);
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+ return signal_pending(current);
+}
+
+#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 118b2e20b2e1..7ddb1d104e8e 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include "messages.h"
#include "ctree.h"
#include "delalloc-space.h"
#include "block-rsv.h"
@@ -8,6 +9,7 @@
#include "transaction.h"
#include "qgroup.h"
#include "block-group.h"
+#include "fs.h"
/*
* HOW DOES THIS WORK
@@ -200,8 +202,8 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
btrfs_qgroup_free_data(inode, reserved, start, len);
}
-/**
- * Release any excessive reservation
+/*
+ * Release any excessive reservations for an inode.
*
* @inode: the inode we need to release from
* @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup
@@ -375,12 +377,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
return 0;
}
-/**
- * Release a metadata reservation for an inode
+/*
+ * Release a metadata reservation for an inode.
*
- * @inode: the inode to release the reservation for.
- * @num_bytes: the number of bytes we are releasing.
- * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
+ * @inode: the inode to release the reservation for.
+ * @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
*
* This will release the metadata reservation for an inode. This can be called
* once we complete IO for a given set of bytes to release their metadata
@@ -403,10 +405,11 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
btrfs_inode_rsv_release(inode, qgroup_free);
}
-/**
- * btrfs_delalloc_release_extents - release our outstanding_extents
- * @inode: the inode to balance the reservation for.
- * @num_bytes: the number of bytes we originally reserved with
+/*
+ * Release our outstanding_extents for an inode.
+ *
+ * @inode: the inode to balance the reservation for.
+ * @num_bytes: the number of bytes we originally reserved with
*
* When we reserve space we increase outstanding_extents for the extents we may
* add. Once we've set the range as delalloc or created our ordered extents we
@@ -431,30 +434,30 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
btrfs_inode_rsv_release(inode, true);
}
-/**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for
- * delalloc
- * @inode: inode we're writing to
- * @start: start range we are writing to
- * @len: how long the range we are writing to
- * @reserved: mandatory parameter, record actually reserved qgroup ranges of
- * current reservation.
+/*
+ * Reserve data and metadata space for delalloc
+ *
+ * @inode: inode we're writing to
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ * @reserved: mandatory parameter, record actually reserved qgroup ranges of
+ * current reservation.
*
* This will do the following things
*
- * - reserve space in data space info for num bytes
- * and reserve precious corresponding qgroup space
+ * - reserve space in data space info for num bytes and reserve precious
+ * corresponding qgroup space
* (Done in check_data_free_space)
*
* - reserve space for metadata space, based on the number of outstanding
- * extents and how much csums will be needed
- * also reserve metadata space in a per root over-reserve method.
+ * extents and how much csums will be needed also reserve metadata space in a
+ * per root over-reserve method.
* - add to the inodes->delalloc_bytes
* - add it to the fs_info's delalloc inodes list.
* (Above 3 all done in delalloc_reserve_metadata)
*
* Return 0 for success
- * Return <0 for error(-ENOSPC or -EQUOT)
+ * Return <0 for error(-ENOSPC or -EDQUOT)
*/
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
@@ -473,7 +476,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
return ret;
}
-/**
+/*
* Release data and metadata space for delalloc
*
* @inode: inode we're releasing space for
@@ -482,10 +485,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
* @len: length of the space already reserved
* @qgroup_free: should qgroup reserved-space also be freed
*
- * This function will release the metadata space that was not used and will
- * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
- * list if there are no delalloc bytes left.
- * Also it will handle the qgroup reserved space.
+ * Release the metadata space that was not used and will decrement
+ * ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if
+ * there are no delalloc bytes left. Also it will handle the qgroup reserved
+ * space.
*/
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index e07d46043455..c5d573f2366e 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -20,5 +20,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes, bool noflush);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index cac5169eaf8d..0095c6e4c3d1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,14 +6,19 @@
#include <linux/slab.h>
#include <linux/iversion.h>
+#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
-#include "ctree.h"
#include "qgroup.h"
#include "locking.h"
#include "inode-item.h"
+#include "space-info.h"
+#include "accessors.h"
+#include "file-item.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -1412,7 +1417,7 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
- struct btrfs_disk_key *disk_key, u8 type,
+ struct btrfs_disk_key *disk_key, u8 flags,
u64 index)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1443,7 +1448,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
btrfs_set_stack_dir_transid(dir_item, trans->transid);
btrfs_set_stack_dir_data_len(dir_item, 0);
btrfs_set_stack_dir_name_len(dir_item, name_len);
- btrfs_set_stack_dir_type(dir_item, type);
+ btrfs_set_stack_dir_flags(dir_item, flags);
memcpy((char *)(dir_item + 1), name, name_len);
data_len = delayed_item->data_len + sizeof(struct btrfs_item);
@@ -1641,8 +1646,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* We can only do one readdir with delayed items at a time because of
* item->readdir_list.
*/
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
- btrfs_inode_lock(inode, 0);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(BTRFS_I(inode), 0);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@@ -1753,7 +1758,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
name = (char *)(di + 1);
name_len = btrfs_stack_dir_name_len(di);
- d_type = fs_ftype_to_dtype(di->type);
+ d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
btrfs_disk_key_to_cpu(&location, &di->location);
over = !dir_emit(ctx, name, name_len,
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 0163ca637a96..4f21daa3dbc7 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -113,7 +113,7 @@ static inline void btrfs_init_delayed_root(
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
- struct btrfs_disk_key *disk_key, u8 type,
+ struct btrfs_disk_key *disk_key, u8 flags,
u64 index);
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 36a3debe9493..573ebab886e2 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -6,12 +6,14 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
+#include "messages.h"
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
#include "tree-mod-log.h"
+#include "fs.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -69,14 +71,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
return btrfs_check_space_for_delayed_refs(trans->fs_info);
}
-/**
- * Release a ref head's reservation
+/*
+ * Release a ref head's reservation.
*
* @fs_info: the filesystem
* @nr: number of items to drop
*
- * This drops the delayed ref head's count from the delayed refs rsv and frees
- * any excess reservation we had.
+ * Drops the delayed ref head's count from the delayed refs rsv and free any
+ * excess reservation we had.
*/
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
{
@@ -102,8 +104,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
}
/*
- * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
- * @trans - the trans that may have generated delayed refs
+ * Adjust the size of the delayed refs rsv.
*
* This is to be called anytime we may have adjusted trans->delayed_ref_updates,
* it'll calculate the additional size and add it to the delayed_refs_rsv.
@@ -137,8 +138,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
trans->delayed_ref_updates = 0;
}
-/**
- * Transfer bytes to our delayed refs rsv
+/*
+ * Transfer bytes to our delayed refs rsv.
*
* @fs_info: the filesystem
* @src: source block rsv to transfer from
@@ -186,8 +187,8 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
delayed_refs_rsv->space_info, to_free);
}
-/**
- * Refill based on our delayed refs usage
+/*
+ * Refill based on our delayed refs usage.
*
* @fs_info: the filesystem
* @flush: control how we can flush for this reservation.
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 61e58066b5fd..78696d331639 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -18,11 +18,13 @@
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
#include "block-group.h"
+#include "fs.h"
+#include "accessors.h"
+#include "scrub.h"
/*
* Device replace overview
@@ -246,7 +248,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
struct block_device *bdev;
- struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -290,19 +291,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
- device = btrfs_alloc_device(NULL, &devid, NULL);
+ device = btrfs_alloc_device(NULL, &devid, NULL, device_path);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
- name = rcu_string_strdup(device_path, GFP_KERNEL);
- if (!name) {
- btrfs_free_device(device);
- ret = -ENOMEM;
- goto error;
- }
- rcu_assign_pointer(device->name, name);
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error;
@@ -456,14 +450,6 @@ out:
return ret;
}
-static char* btrfs_dev_name(struct btrfs_device *device)
-{
- if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
- return "<missing disk>";
- else
- return rcu_str_deref(device->name);
-}
-
static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_device *src_dev)
{
@@ -679,7 +665,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
"dev_replace from %s (devid %llu) to %s started",
btrfs_dev_name(src_device),
src_device->devid,
- rcu_str_deref(tgt_device->name));
+ btrfs_dev_name(tgt_device));
/*
* from now on, the writes to the srcdev are all duplicated to
@@ -938,7 +924,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
- rcu_str_deref(tgt_device->name), scrub_ret);
+ btrfs_dev_name(tgt_device), scrub_ret);
error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
@@ -956,7 +942,7 @@ error:
"dev_replace from %s (devid %llu) to %s finished",
btrfs_dev_name(src_device),
src_device->devid,
- rcu_str_deref(tgt_device->name));
+ btrfs_dev_name(tgt_device));
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 6084b313056a..675082ccec89 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -25,5 +25,13 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
struct btrfs_block_group *cache,
u64 physical);
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
+
+static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+ btrfs_bio_counter_sub(fs_info, 1);
+}
+
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 72fb2c518a2b..082eb0e19598 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -3,9 +3,12 @@
* Copyright (C) 2007 Oracle. All rights reserved.
*/
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
+#include "accessors.h"
+#include "dir-item.h"
/*
* insert a name into a directory, doing overflow properly if there is a hash
@@ -81,7 +84,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
btrfs_cpu_key_to_disk(&disk_key, &location);
btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
- btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
+ btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR);
btrfs_set_dir_name_len(leaf, dir_item, name_len);
btrfs_set_dir_transid(leaf, dir_item, trans->transid);
btrfs_set_dir_data_len(leaf, dir_item, data_len);
@@ -103,8 +106,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* to use for the second index (if one is created).
* Will return 0 or -ENOMEM
*/
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
- int name_len, struct btrfs_inode *dir,
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+ const struct fscrypt_str *name, struct btrfs_inode *dir,
struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
@@ -120,7 +123,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
key.objectid = btrfs_ino(dir);
key.type = BTRFS_DIR_ITEM_KEY;
- key.offset = btrfs_name_hash(name, name_len);
+ key.offset = btrfs_name_hash(name->name, name->len);
path = btrfs_alloc_path();
if (!path)
@@ -128,9 +131,9 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
btrfs_cpu_key_to_disk(&disk_key, location);
- data_size = sizeof(*dir_item) + name_len;
+ data_size = sizeof(*dir_item) + name->len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
- name, name_len);
+ name->name, name->len);
if (IS_ERR(dir_item)) {
ret = PTR_ERR(dir_item);
if (ret == -EEXIST)
@@ -138,15 +141,18 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
goto out_free;
}
+ if (IS_ENCRYPTED(&dir->vfs_inode))
+ type |= BTRFS_FT_ENCRYPTED;
+
leaf = path->nodes[0];
btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
- btrfs_set_dir_type(leaf, dir_item, type);
+ btrfs_set_dir_flags(leaf, dir_item, type);
btrfs_set_dir_data_len(leaf, dir_item, 0);
- btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_name_len(leaf, dir_item, name->len);
btrfs_set_dir_transid(leaf, dir_item, trans->transid);
name_ptr = (unsigned long)(dir_item + 1);
- write_extent_buffer(leaf, name, name_ptr, name_len);
+ write_extent_buffer(leaf, name->name, name_ptr, name->len);
btrfs_mark_buffer_dirty(leaf);
second_insert:
@@ -157,7 +163,7 @@ second_insert:
}
btrfs_release_path(path);
- ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir,
+ ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir,
&disk_key, type, index);
out_free:
btrfs_free_path(path);
@@ -206,7 +212,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir(
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
int mod)
{
struct btrfs_key key;
@@ -214,9 +220,10 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
- key.offset = btrfs_name_hash(name, name_len);
+ key.offset = btrfs_name_hash(name->name, name->len);
- di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name->name,
+ name->len, mod);
if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
@@ -224,7 +231,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
}
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
- const char *name, int name_len)
+ const struct fscrypt_str *name)
{
int ret;
struct btrfs_key key;
@@ -240,9 +247,10 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
- key.offset = btrfs_name_hash(name, name_len);
+ key.offset = btrfs_name_hash(name->name, name->len);
- di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
+ di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name,
+ name->len, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
/* Nothing found, we're safe */
@@ -262,11 +270,8 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
goto out;
}
- /*
- * see if there is room in the item to insert this
- * name
- */
- data_size = sizeof(*di) + name_len;
+ /* See if there is room in the item to insert this name. */
+ data_size = sizeof(*di) + name->len;
leaf = path->nodes[0];
slot = path->slots[0];
if (data_size + btrfs_item_size(leaf, slot) +
@@ -303,8 +308,7 @@ struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 index, const char *name, int name_len,
- int mod)
+ u64 index, const struct fscrypt_str *name, int mod)
{
struct btrfs_dir_item *di;
struct btrfs_key key;
@@ -313,7 +317,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = index;
- di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name->name,
+ name->len, mod);
if (di == ERR_PTR(-ENOENT))
return NULL;
@@ -321,9 +326,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
}
struct btrfs_dir_item *
-btrfs_search_dir_index_item(struct btrfs_root *root,
- struct btrfs_path *path, u64 dirid,
- const char *name, int name_len)
+btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path,
+ u64 dirid, const struct fscrypt_str *name)
{
struct btrfs_dir_item *di;
struct btrfs_key key;
@@ -338,7 +342,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
break;
di = btrfs_match_dir_item_name(root->fs_info, path,
- name, name_len);
+ name->name, name->len);
if (di)
return di;
}
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
new file mode 100644
index 000000000000..aab4b7cc7fa0
--- /dev/null
+++ b/fs/btrfs/dir-item.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DIR_ITEM_H
+#define BTRFS_DIR_ITEM_H
+
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const struct fscrypt_str *name);
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+ const struct fscrypt_str *name, struct btrfs_inode *dir,
+ struct btrfs_key *location, u8 type, u64 index);
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const struct fscrypt_str *name, int mod);
+struct btrfs_dir_item *btrfs_lookup_dir_index_item(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ u64 index, const struct fscrypt_str *name, int mod);
+struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const struct fscrypt_str *name);
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *di);
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name, u16 name_len,
+ const void *data, u16 data_len);
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, u16 name_len,
+ int mod);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ const char *name,
+ int name_len);
+
+#endif
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index e1b7bd927d69..ff2e524d9937 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -11,6 +11,7 @@
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
+#include "fs.h"
/*
* This contains the logic to handle async discard.
@@ -61,7 +62,7 @@
#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
#define BTRFS_DISCARD_MAX_IOPS (10U)
-/* Montonically decreasing minimum length filters after index 0 */
+/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
0,
BTRFS_ASYNC_DISCARD_MAX_FILTER,
@@ -146,10 +147,11 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
return running;
}
-/**
- * find_next_block_group - find block_group that's up next for discarding
- * @discard_ctl: discard control
- * @now: current time
+/*
+ * Find block_group that's up next for discarding.
+ *
+ * @discard_ctl: discard control
+ * @now: current time
*
* Iterate over the discard lists to find the next block_group up for
* discarding checking the discard_eligible_time of block_group.
@@ -184,17 +186,17 @@ static struct btrfs_block_group *find_next_block_group(
return ret_block_group;
}
-/**
- * Wrap find_next_block_group()
+/*
+ * Look up next block group and set it for use.
*
* @discard_ctl: discard control
* @discard_state: the discard_state of the block_group after state management
* @discard_index: the discard_index of the block_group after state management
* @now: time when discard was invoked, in ns
*
- * This wraps find_next_block_group() and sets the block_group to be in use.
- * discard_state's control flow is managed here. Variables related to
- * discard_state are reset here as needed (eg discard_cursor). @discard_state
+ * Wrap find_next_block_group() and set the block_group to be in use.
+ * @discard_state's control flow is managed here. Variables related to
+ * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state
* and @discard_index are remembered as it may change while we're discarding,
* but we want the discard to execute in the context determined here.
*/
@@ -233,10 +235,11 @@ again:
return block_group;
}
-/**
- * btrfs_discard_check_filter - updates a block groups filters
- * @block_group: block group of interest
- * @bytes: recently freed region size after coalescing
+/*
+ * Update a block group's filters.
+ *
+ * @block_group: block group of interest
+ * @bytes: recently freed region size after coalescing
*
* Async discard maintains multiple lists with progressively smaller filters
* to prioritize discarding based on size. Should a free space that matches
@@ -271,8 +274,9 @@ void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
}
}
-/**
- * btrfs_update_discard_index - moves a block group along the discard lists
+/*
+ * Move a block group along the discard lists.
+ *
* @discard_ctl: discard control
* @block_group: block_group of interest
*
@@ -291,13 +295,14 @@ static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
add_to_discard_list(discard_ctl, block_group);
}
-/**
- * btrfs_discard_cancel_work - remove a block_group from the discard lists
+/*
+ * Remove a block_group from the discard lists.
+ *
* @discard_ctl: discard control
* @block_group: block_group of interest
*
- * This removes @block_group from the discard lists. If necessary, it waits on
- * the current work and then reschedules the delayed work.
+ * Remove @block_group from the discard lists. If necessary, wait on the
+ * current work and then reschedule the delayed work.
*/
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
@@ -308,12 +313,13 @@ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
}
}
-/**
- * btrfs_discard_queue_work - handles queuing the block_groups
+/*
+ * Handles queuing the block_groups.
+ *
* @discard_ctl: discard control
* @block_group: block_group of interest
*
- * This maintains the LRU order of the discard lists.
+ * Maintain the LRU order of the discard lists.
*/
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
@@ -383,7 +389,8 @@ static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
}
/*
- * btrfs_discard_schedule_work - responsible for scheduling the discard work
+ * Responsible for scheduling the discard work.
+ *
* @discard_ctl: discard control
* @override: override the current timer
*
@@ -401,15 +408,16 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
spin_unlock(&discard_ctl->lock);
}
-/**
- * btrfs_finish_discard_pass - determine next step of a block_group
+/*
+ * Determine next step of a block_group.
+ *
* @discard_ctl: discard control
* @block_group: block_group of interest
*
- * This determines the next step for a block group after it's finished going
- * through a pass on a discard list. If it is unused and fully trimmed, we can
- * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto
- * the appropriate filter list or let it fall off.
+ * Determine the next step for a block group after it's finished going through
+ * a pass on a discard list. If it is unused and fully trimmed, we can mark it
+ * unused and send it to the unused_bgs path. Otherwise, pass it onto the
+ * appropriate filter list or let it fall off.
*/
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
@@ -426,12 +434,13 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
}
}
-/**
- * btrfs_discard_workfn - discard work function
+/*
+ * Discard work queue callback
+ *
* @work: work
*
- * This finds the next block_group to start discarding and then discards a
- * single region. It does this in a two-pass fashion: first extents and second
+ * Find the next block_group to start discarding and then discard a single
+ * region. It does this in a two-pass fashion: first extents and second
* bitmaps. Completely discarded block groups are sent to the unused_bgs path.
*/
static void btrfs_discard_workfn(struct work_struct *work)
@@ -507,11 +516,12 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_unlock(&discard_ctl->lock);
}
-/**
- * btrfs_run_discard_work - determines if async discard should be running
+/*
+ * Determine if async discard should be running.
+ *
* @discard_ctl: discard control
*
- * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
+ * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
*/
bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
{
@@ -523,8 +533,9 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}
-/**
- * btrfs_discard_calc_delay - recalculate the base delay
+/*
+ * Recalculate the base delay.
+ *
* @discard_ctl: discard control
*
* Recalculate the base delay which is based off the total number of
@@ -545,7 +556,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
spin_lock(&discard_ctl->lock);
/*
- * The following is to fix a potential -1 discrepenancy that we're not
+ * The following is to fix a potential -1 discrepancy that we're not
* sure how to reproduce. But given that this is the only place that
* utilizes these numbers and this is only called by from
* btrfs_finish_extent_commit() which is synchronized, we can correct
@@ -578,13 +589,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
spin_unlock(&discard_ctl->lock);
}
-/**
- * btrfs_discard_update_discardable - propagate discard counters
+/*
+ * Propagate discard counters.
+ *
* @block_group: block_group of interest
*
- * This propagates deltas of counters up to the discard_ctl. It maintains a
- * current counter and a previous counter passing the delta up to the global
- * stat. Then the current counter value becomes the previous counter value.
+ * Propagate deltas of counters up to the discard_ctl. It maintains a current
+ * counter and a previous counter passing the delta up to the global stat.
+ * Then the current counter value becomes the previous counter value.
*/
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
@@ -619,8 +631,9 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
}
}
-/**
- * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists
+/*
+ * Punt unused_bgs list to discard lists.
+ *
* @fs_info: fs_info of interest
*
* The unused_bgs list needs to be punted to the discard lists because the
@@ -644,8 +657,9 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->unused_bgs_lock);
}
-/**
- * btrfs_discard_purge_list - purge discard lists
+/*
+ * Purge discard lists.
+ *
* @discard_ctl: discard control
*
* If we are disabling async discard, we may have intercepted block groups that
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b28263c3d32..0888d484df80 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -23,7 +23,7 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "volumes.h"
+#include "bio.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
@@ -43,6 +43,15 @@
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "defrag.h"
+#include "uuid-tree.h"
+#include "relocation.h"
+#include "scrub.h"
+#include "super.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -75,12 +84,12 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
- struct inode *inode;
+ struct btrfs_inode *inode;
struct bio *bio;
- extent_submit_bio_start_t *submit_bio_start;
+ enum btrfs_wq_submit_cmd submit_cmd;
int mirror_num;
- /* Optional parameter for submit_bio_start used by direct io */
+ /* Optional parameter for used by direct io */
u64 dio_file_offset;
struct btrfs_work work;
blk_status_t status;
@@ -246,40 +255,54 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
return ret;
}
+static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u64 start = eb->start;
+ int i, num_pages = num_extent_pages(eb);
+ int ret = 0;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE,
+ start, p, start - page_offset(p), mirror_num);
+ if (ret)
+ break;
+ start += PAGE_SIZE;
+ }
+
+ return ret;
+}
+
/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
*
- * @parent_transid: expected transid, skip check if 0
- * @level: expected level, mandatory check
- * @first_key: expected key of first slot, skip check if NULL
+ * @check: expected tree parentness check, see the comments of the
+ * structure for details.
*/
int btrfs_read_extent_buffer(struct extent_buffer *eb,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+ struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct extent_io_tree *io_tree;
int failed = 0;
int ret;
int num_copies = 0;
int mirror_num = 0;
int failed_mirror = 0;
- io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+ ASSERT(check);
+
while (1) {
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
- ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
- if (!ret) {
- if (verify_parent_transid(io_tree, eb,
- parent_transid, 0))
- ret = -EIO;
- else if (btrfs_verify_level_key(eb, level,
- first_key, parent_transid))
- ret = -EUCLEAN;
- else
- break;
- }
+ ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check);
+ if (!ret)
+ break;
num_copies = btrfs_num_copies(fs_info,
eb->start, eb->len);
@@ -455,7 +478,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
}
/* Do basic extent buffer checks at read time */
-static int validate_extent_buffer(struct extent_buffer *eb)
+static int validate_extent_buffer(struct extent_buffer *eb,
+ struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
@@ -465,6 +489,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
const u8 *header_csum;
int ret = 0;
+ ASSERT(check);
+
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
btrfs_err_rl(fs_info,
@@ -503,6 +529,45 @@ static int validate_extent_buffer(struct extent_buffer *eb)
goto out;
}
+ if (found_level != check->level) {
+ ret = -EIO;
+ goto out;
+ }
+ if (unlikely(check->transid &&
+ btrfs_header_generation(eb) != check->transid)) {
+ btrfs_err_rl(eb->fs_info,
+"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
+ eb->start, eb->read_mirror, check->transid,
+ btrfs_header_generation(eb));
+ ret = -EIO;
+ goto out;
+ }
+ if (check->has_first_key) {
+ struct btrfs_key *expect_key = &check->first_key;
+ struct btrfs_key found_key;
+
+ if (found_level)
+ btrfs_node_key_to_cpu(eb, &found_key, 0);
+ else
+ btrfs_item_key_to_cpu(eb, &found_key, 0);
+ if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
+ btrfs_err(fs_info,
+"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
+ eb->start, check->transid,
+ expect_key->objectid,
+ expect_key->type, expect_key->offset,
+ found_key.objectid, found_key.type,
+ found_key.offset);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ if (check->owner_root) {
+ ret = btrfs_check_eb_owner(eb, check->owner_root);
+ if (ret < 0)
+ goto out;
+ }
+
/*
* If this is a leaf block and it is corrupt, set the corrupt bit so
* that we don't try and read the other copies of this block, just
@@ -527,13 +592,15 @@ out:
}
static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
- int mirror)
+ int mirror, struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct extent_buffer *eb;
bool reads_done;
int ret = 0;
+ ASSERT(check);
+
/*
* We don't allow bio merge for subpage metadata read, so we should
* only get one eb for each endio hook.
@@ -557,7 +624,7 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
ret = -EIO;
goto err;
}
- ret = validate_extent_buffer(eb);
+ ret = validate_extent_buffer(eb, check);
if (ret < 0)
goto err;
@@ -587,7 +654,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ASSERT(page->private);
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return validate_subpage_buffer(page, start, end, mirror);
+ return validate_subpage_buffer(page, start, end, mirror,
+ &bbio->parent_check);
eb = (struct extent_buffer *)page->private;
@@ -606,7 +674,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ret = -EIO;
goto err;
}
- ret = validate_extent_buffer(eb);
+ ret = validate_extent_buffer(eb, &bbio->parent_check);
err:
if (ret) {
/*
@@ -628,8 +696,18 @@ static void run_one_async_start(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- ret = async->submit_bio_start(async->inode, async->bio,
- async->dio_file_offset);
+ switch (async->submit_cmd) {
+ case WQ_SUBMIT_METADATA:
+ ret = btree_submit_bio_start(async->bio);
+ break;
+ case WQ_SUBMIT_DATA:
+ ret = btrfs_submit_bio_start(async->inode, async->bio);
+ break;
+ case WQ_SUBMIT_DATA_DIO:
+ ret = btrfs_submit_bio_start_direct_io(async->inode,
+ async->bio, async->dio_file_offset);
+ break;
+ }
if (ret)
async->status = ret;
}
@@ -646,7 +724,7 @@ static void run_one_async_done(struct btrfs_work *work)
{
struct async_submit_bio *async =
container_of(work, struct async_submit_bio, work);
- struct inode *inode = async->inode;
+ struct btrfs_inode *inode = async->inode;
struct btrfs_bio *bbio = btrfs_bio(async->bio);
/* If an error occurred we just want to clean up the bio and move on */
@@ -661,7 +739,7 @@ static void run_one_async_done(struct btrfs_work *work)
* This changes nothing when cgroups aren't in use.
*/
async->bio->bi_opf |= REQ_CGROUP_PUNT;
- btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
+ btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -679,11 +757,10 @@ static void run_one_async_free(struct btrfs_work *work)
* - true if the work has been succesfuly submitted
* - false in case of error
*/
-bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
- u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start)
+bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -693,7 +770,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
async->inode = inode;
async->bio = bio;
async->mirror_num = mirror_num;
- async->submit_bio_start = submit_bio_start;
+ async->submit_cmd = cmd;
btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
run_one_async_free);
@@ -727,8 +804,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
-static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
- u64 dio_file_offset)
+blk_status_t btree_submit_bio_start(struct bio *bio)
{
/*
* when we're called for a write, we're already in the async
@@ -749,13 +825,14 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
return true;
}
-void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
+void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t ret;
bio->bi_opf |= REQ_META;
+ bbio->is_metadata = 1;
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
btrfs_submit_bio(fs_info, bio, mirror_num);
@@ -766,8 +843,8 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
* Kthread helpers are used to submit writes so that checksumming can
* happen in parallel across all CPUs.
*/
- if (should_async_write(fs_info, BTRFS_I(inode)) &&
- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
+ if (should_async_write(fs_info, inode) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA))
return;
ret = btree_csum_one_bio(bio);
@@ -919,28 +996,28 @@ struct extent_buffer *btrfs_find_create_tree_block(
* Read tree block at logical address @bytenr and do variant basic but critical
* verification.
*
- * @owner_root: the objectid of the root owner for this block.
- * @parent_transid: expected transid of this tree block, skip check if 0
- * @level: expected level, mandatory check
- * @first_key: expected key in slot 0, skip check if NULL
+ * @check: expected tree parentness check, see comments of the
+ * structure for details.
*/
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 owner_root, u64 parent_transid,
- int level, struct btrfs_key *first_key)
+ struct btrfs_tree_parent_check *check)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ ASSERT(check);
+
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
+ check->level);
if (IS_ERR(buf))
return buf;
- ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
+ ret = btrfs_read_extent_buffer(buf, check);
if (ret) {
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
- if (btrfs_check_eb_owner(buf, owner_root)) {
+ if (btrfs_check_eb_owner(buf, check->owner_root)) {
free_extent_buffer_stale(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -1027,9 +1104,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
- IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
+ IO_TREE_ROOT_DIRTY_LOG_PAGES);
extent_io_tree_init(fs_info, &root->log_csum_range,
- IO_TREE_LOG_CSUM_RANGE, NULL);
+ IO_TREE_LOG_CSUM_RANGE);
}
spin_lock_init(&root->root_item_lock);
@@ -1167,6 +1244,13 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
return btrfs_global_root(fs_info, &key);
}
+struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
+ return fs_info->block_group_root;
+ return btrfs_extent_root(fs_info, 0);
+}
+
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@@ -1197,7 +1281,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
- goto fail_unlock;
+ goto fail;
}
root->node = leaf;
@@ -1232,9 +1316,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
return root;
-fail_unlock:
- if (leaf)
- btrfs_tree_unlock(leaf);
fail:
btrfs_put_root(root);
@@ -1352,6 +1433,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
struct btrfs_key *key)
{
struct btrfs_root *root;
+ struct btrfs_tree_parent_check check = { 0 };
struct btrfs_fs_info *fs_info = tree_root->fs_info;
u64 generation;
int ret;
@@ -1371,9 +1453,11 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
level = btrfs_root_level(&root->root_item);
- root->node = read_tree_block(fs_info,
- btrfs_root_bytenr(&root->root_item),
- key->objectid, generation, level, NULL);
+ check.level = level;
+ check.transid = generation;
+ check.owner_root = key->objectid;
+ root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
+ &check);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
@@ -2084,8 +2168,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->workers);
if (fs_info->endio_workers)
destroy_workqueue(fs_info->endio_workers);
- if (fs_info->endio_raid56_workers)
- destroy_workqueue(fs_info->endio_raid56_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
if (fs_info->compressed_write_workers)
@@ -2231,7 +2313,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_BTREE_INODE_IO, NULL);
+ IO_TREE_BTREE_INODE_IO);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
@@ -2291,8 +2373,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
alloc_workqueue("btrfs-endio-meta", flags, max_active);
- fs_info->endio_raid56_workers =
- alloc_workqueue("btrfs-endio-raid56", flags, max_active);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
@@ -2314,7 +2394,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->compressed_write_workers &&
- fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_write_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
@@ -2350,6 +2430,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
int ret;
+ struct btrfs_tree_parent_check check = { 0 };
struct btrfs_root *log_tree_root;
struct btrfs_super_block *disk_super = fs_info->super_copy;
u64 bytenr = btrfs_super_log_root(disk_super);
@@ -2365,10 +2446,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
if (!log_tree_root)
return -ENOMEM;
- log_tree_root->node = read_tree_block(fs_info, bytenr,
- BTRFS_TREE_LOG_OBJECTID,
- fs_info->generation + 1, level,
- NULL);
+ check.level = level;
+ check.transid = fs_info->generation + 1;
+ check.owner_root = BTRFS_TREE_LOG_OBJECTID;
+ log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@@ -2551,7 +2632,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->dev_root = root;
}
/* Initialize fs_info for all devices in any case */
- btrfs_init_devices_late(fs_info);
+ ret = btrfs_init_devices_late(fs_info);
+ if (ret)
+ goto out;
/*
* This tree can share blocks with some other fs tree during relocation
@@ -2844,10 +2927,14 @@ out:
static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
{
+ struct btrfs_tree_parent_check check = {
+ .level = level,
+ .transid = gen,
+ .owner_root = root->root_key.objectid
+ };
int ret = 0;
- root->node = read_tree_block(root->fs_info, bytenr,
- root->root_key.objectid, gen, level, NULL);
+ root->node = read_tree_block(root->fs_info, bytenr, &check);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
@@ -3055,7 +3142,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
- IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
+ IO_TREE_FS_EXCLUDED_EXTENTS);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -3741,10 +3828,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Mount does not set all options immediately, we can do it now and do
- * not have to wait for transaction commit
+ * For devices supporting discard turn on discard=async automatically,
+ * unless it's already set or disabled. This could be turned off by
+ * nodiscard for the same mount.
*/
- btrfs_apply_pending_changes(fs_info);
+ if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
+ btrfs_test_opt(fs_info, NODISCARD)) &&
+ fs_info->fs_devices->discardable) {
+ btrfs_set_and_info(fs_info, DISCARD_ASYNC,
+ "auto enabling async discard");
+ btrfs_clear_opt(fs_info->mount_opt, NODISCARD);
+ }
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
@@ -3873,7 +3968,7 @@ static void btrfs_end_super_write(struct bio *bio)
if (bio->bi_status) {
btrfs_warn_rl_in_rcu(device->fs_info,
"lost page write due to IO error on %s (%d)",
- rcu_str_deref(device->name),
+ btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
ClearPageUptodate(page);
SetPageError(page);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 9fa923e005a3..363935cfc084 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -27,14 +27,14 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;
+struct btrfs_tree_parent_check;
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 owner_root, u64 parent_transid,
- int level, struct btrfs_key *first_key);
+ struct btrfs_tree_parent_check *check);
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root,
@@ -75,6 +75,7 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *key);
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
@@ -85,7 +86,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
-void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num);
+void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -106,24 +107,22 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
return NULL;
}
-static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
-{
- if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
- return fs_info->block_group_root;
- return btrfs_extent_root(fs_info, 0);
-}
-
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
- int level, struct btrfs_key *first_key);
-bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
- u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start);
-blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
- int mirror_num);
+int btrfs_read_extent_buffer(struct extent_buffer *buf,
+ struct btrfs_tree_parent_check *check);
+
+enum btrfs_wq_submit_cmd {
+ WQ_SUBMIT_METADATA,
+ WQ_SUBMIT_DATA,
+ WQ_SUBMIT_DATA_DIO,
+};
+
+bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd);
+blk_status_t btree_submit_bio_start(struct bio *bio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -136,8 +135,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
-int btree_lock_page_hook(struct page *page, void *data,
- void (*flush_fn)(void *));
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index fab7eb76e53b..744a02b7fd67 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -7,6 +7,8 @@
#include "btrfs_inode.h"
#include "print-tree.h"
#include "export.h"
+#include "accessors.h"
+#include "super.h"
#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
parent_objectid) / 4)
@@ -57,9 +59,20 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return type;
}
+/*
+ * Read dentry of inode with @objectid from filesystem root @root_objectid.
+ *
+ * @sb: the filesystem super block
+ * @objectid: inode objectid
+ * @root_objectid: object id of the subvolume root where to look up the inode
+ * @generation: optional, if not zero, verify that the found inode
+ * generation matches
+ *
+ * Return dentry alias for the inode, otherwise an error. In case the
+ * generation does not match return ESTALE.
+ */
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u64 generation,
- int check_generation)
+ u64 root_objectid, u64 generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
@@ -77,7 +90,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (check_generation && generation != inode->i_generation) {
+ if (generation != 0 && generation != inode->i_generation) {
iput(inode);
return ERR_PTR(-ESTALE);
}
@@ -106,7 +119,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
objectid = fid->parent_objectid;
generation = fid->parent_gen;
- return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation);
}
static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
@@ -128,7 +141,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
root_objectid = fid->root_objectid;
generation = fid->gen;
- return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation);
}
struct dentry *btrfs_get_parent(struct dentry *child)
@@ -188,7 +201,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
return btrfs_get_dentry(fs_info->sb, key.objectid,
- found_key.offset, 0, 0);
+ found_key.offset, 0);
}
return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root));
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index 5afb7ca42828..eba6bc4f5a61 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -19,8 +19,7 @@ struct btrfs_fid {
} __attribute__ ((packed));
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u64 generation,
- int check_generation);
+ u64 root_objectid, u64 generation);
struct dentry *btrfs_get_parent(struct dentry *child);
#endif
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 83cb0378096f..9ae9cd1e7035 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -2,6 +2,7 @@
#include <linux/slab.h>
#include <trace/events/btrfs.h>
+#include "messages.h"
#include "ctree.h"
#include "extent-io-tree.h"
#include "btrfs_inode.h"
@@ -57,17 +58,17 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree,
u64 start, u64 end)
{
- struct inode *inode = tree->private_data;
+ struct btrfs_inode *inode = tree->inode;
u64 isize;
if (!inode)
return;
- isize = i_size_read(inode);
+ isize = i_size_read(&inode->vfs_inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ btrfs_debug_rl(inode->root->fs_info,
"%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ caller, btrfs_ino(inode), isize, start, end);
}
}
#else
@@ -93,13 +94,12 @@ struct tree_entry {
};
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner,
- void *private_data)
+ struct extent_io_tree *tree, unsigned int owner)
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
- tree->private_data = private_data;
+ tree->inode = NULL;
tree->owner = owner;
if (owner == IO_TREE_INODE_FILE_EXTENT)
lockdep_set_class(&tree->lock, &file_extent_tree_class);
@@ -346,9 +346,8 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
other = prev_state(state);
if (other && other->end == state->start - 1 &&
other->state == state->state) {
- if (tree->private_data)
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode, state, other);
state->start = other->start;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
@@ -357,9 +356,8 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
other = next_state(state);
if (other && other->start == state->end + 1 &&
other->state == state->state) {
- if (tree->private_data)
- btrfs_merge_delalloc_extent(tree->private_data, state,
- other);
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode, state, other);
state->end = other->end;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
@@ -374,8 +372,8 @@ static void set_state_bits(struct extent_io_tree *tree,
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->private_data)
- btrfs_set_delalloc_extent(tree->private_data, state, bits);
+ if (tree->inode)
+ btrfs_set_delalloc_extent(tree->inode, state, bits);
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0);
@@ -397,7 +395,7 @@ static int insert_state(struct extent_io_tree *tree,
u32 bits, struct extent_changeset *changeset)
{
struct rb_node **node;
- struct rb_node *parent;
+ struct rb_node *parent = NULL;
const u64 end = state->end;
set_state_bits(tree, state, bits, changeset);
@@ -462,8 +460,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct rb_node *parent = NULL;
struct rb_node **node;
- if (tree->private_data)
- btrfs_split_delalloc_extent(tree->private_data, orig, split);
+ if (tree->inode)
+ btrfs_split_delalloc_extent(tree->inode, orig, split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -510,8 +508,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->private_data)
- btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+ if (tree->inode)
+ btrfs_clear_delalloc_extent(tree->inode, state, bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
@@ -572,7 +570,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
clear = 1;
again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
+ if (!prealloc) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
@@ -636,7 +634,8 @@ hit_next:
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
+ if (!prealloc)
+ goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
extent_io_tree_panic(tree, err);
@@ -657,7 +656,8 @@ hit_next:
*/
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
+ if (!prealloc)
+ goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
extent_io_tree_panic(tree, err);
@@ -714,7 +714,8 @@ static void wait_on_state(struct extent_io_tree *tree,
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits)
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
@@ -722,6 +723,16 @@ void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits)
spin_lock(&tree->lock);
again:
+ /*
+ * Maintain cached_state, as we may not remove it from the tree if there
+ * are more bits than the bits we're waiting on set on this state.
+ */
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (extent_state_in_tree(state) &&
+ state->start <= start && start < state->end)
+ goto process_node;
+ }
while (1) {
/*
* This search will find all the extents that end after our
@@ -752,6 +763,12 @@ process_node:
}
}
out:
+ /* This state is no longer useful, clear it and free it up. */
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ *cached_state = NULL;
+ free_extent_state(state);
+ }
spin_unlock(&tree->lock);
}
@@ -939,13 +956,17 @@ out:
* sleeping, so the gfp mask is used to indicate what is allowed.
*
* If any of the exclusive bits are set, this will fail with -EEXIST if some
- * part of the range already has the desired bits set. The start of the
- * existing range is returned in failed_start in this case.
+ * part of the range already has the desired bits set. The extent_state of the
+ * existing range is returned in failed_state in this case, and the start of the
+ * existing range is returned in failed_start. failed_state is used as an
+ * optimization for wait_extent_bit, failed_start must be used as the source of
+ * truth as failed_state may have changed since we returned.
*
* [start, end] is inclusive This takes the tree lock.
*/
static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, u64 *failed_start,
+ struct extent_state **failed_state,
struct extent_state **cached_state,
struct extent_changeset *changeset, gfp_t mask)
{
@@ -964,9 +985,9 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (exclusive_bits)
ASSERT(failed_start);
else
- ASSERT(failed_start == NULL);
+ ASSERT(failed_start == NULL && failed_state == NULL);
again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
+ if (!prealloc) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
@@ -991,7 +1012,8 @@ again:
state = tree_search_for_insert(tree, start, &p, &parent);
if (!state) {
prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
+ if (!prealloc)
+ goto search_again;
prealloc->start = start;
prealloc->end = end;
insert_state_fast(tree, prealloc, p, parent, bits, changeset);
@@ -1012,6 +1034,7 @@ hit_next:
if (state->start == start && state->end <= end) {
if (state->state & exclusive_bits) {
*failed_start = state->start;
+ cache_state(state, failed_state);
err = -EEXIST;
goto out;
}
@@ -1047,6 +1070,7 @@ hit_next:
if (state->start < start) {
if (state->state & exclusive_bits) {
*failed_start = start;
+ cache_state(state, failed_state);
err = -EEXIST;
goto out;
}
@@ -1062,7 +1086,8 @@ hit_next:
}
prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
+ if (!prealloc)
+ goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
extent_io_tree_panic(tree, err);
@@ -1099,7 +1124,8 @@ hit_next:
this_end = last_start - 1;
prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
+ if (!prealloc)
+ goto search_again;
/*
* Avoid to free 'prealloc' if it can be merged with the later
@@ -1125,12 +1151,14 @@ hit_next:
if (state->start <= end && state->end > end) {
if (state->state & exclusive_bits) {
*failed_start = start;
+ cache_state(state, failed_state);
err = -EEXIST;
goto out;
}
prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
+ if (!prealloc)
+ goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
extent_io_tree_panic(tree, err);
@@ -1162,8 +1190,8 @@ out:
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_state **cached_state, gfp_t mask)
{
- return __set_extent_bit(tree, start, end, bits, NULL, cached_state,
- NULL, mask);
+ return __set_extent_bit(tree, start, end, bits, NULL, NULL,
+ cached_state, NULL, mask);
}
/*
@@ -1397,7 +1425,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
- struct extent_state *prev = NULL, *next;
+ struct extent_state *prev = NULL, *next = NULL;
spin_lock(&tree->lock);
@@ -1487,15 +1515,37 @@ out:
}
/*
- * Count the number of bytes in the tree that have a given bit(s) set. This
- * can be fairly slow, except for EXTENT_DIRTY which is cached. The total
- * number found is returned.
+ * Count the number of bytes in the tree that have a given bit(s) set for a
+ * given range.
+ *
+ * @tree: The io tree to search.
+ * @start: The start offset of the range. This value is updated to the
+ * offset of the first byte found with the given bit(s), so it
+ * can end up being bigger than the initial value.
+ * @search_end: The end offset (inclusive value) of the search range.
+ * @max_bytes: The maximum byte count we are interested. The search stops
+ * once it reaches this count.
+ * @bits: The bits the range must have in order to be accounted for.
+ * If multiple bits are set, then only subranges that have all
+ * the bits set are accounted for.
+ * @contig: Indicate if we should ignore holes in the range or not. If
+ * this is true, then stop once we find a hole.
+ * @cached_state: A cached state to be used across multiple calls to this
+ * function in order to speedup searches. Use NULL if this is
+ * called only once or if each call does not start where the
+ * previous one ended.
+ *
+ * Returns the total number of bytes found within the given range that have
+ * all given bits set. If the returned number of bytes is greater than zero
+ * then @start is updated with the offset of the first byte with the bits set.
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- u32 bits, int contig)
+ u32 bits, int contig,
+ struct extent_state **cached_state)
{
- struct extent_state *state;
+ struct extent_state *state = NULL;
+ struct extent_state *cached;
u64 cur_start = *start;
u64 total_bytes = 0;
u64 last = 0;
@@ -1506,11 +1556,41 @@ u64 count_range_bits(struct extent_io_tree *tree,
spin_lock(&tree->lock);
+ if (!cached_state || !*cached_state)
+ goto search;
+
+ cached = *cached_state;
+
+ if (!extent_state_in_tree(cached))
+ goto search;
+
+ if (cached->start <= cur_start && cur_start <= cached->end) {
+ state = cached;
+ } else if (cached->start > cur_start) {
+ struct extent_state *prev;
+
+ /*
+ * The cached state starts after our search range's start. Check
+ * if the previous state record starts at or before the range we
+ * are looking for, and if so, use it - this is a common case
+ * when there are holes between records in the tree. If there is
+ * no previous state record, we can start from our cached state.
+ */
+ prev = prev_state(cached);
+ if (!prev)
+ state = cached;
+ else if (prev->start <= cur_start && cur_start <= prev->end)
+ state = prev;
+ }
+
/*
* This search will find all the extents that end after our range
* starts.
*/
- state = tree_search(tree, cur_start);
+search:
+ if (!state)
+ state = tree_search(tree, cur_start);
+
while (state) {
if (state->start > search_end)
break;
@@ -1531,7 +1611,16 @@ u64 count_range_bits(struct extent_io_tree *tree,
}
state = next_state(state);
}
+
+ if (cached_state) {
+ free_extent_state(*cached_state);
+ *cached_state = state;
+ if (state)
+ refcount_inc(&state->refs);
+ }
+
spin_unlock(&tree->lock);
+
return total_bytes;
}
@@ -1598,8 +1687,8 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
ASSERT(!(bits & EXTENT_LOCKED));
- return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset,
- GFP_NOFS);
+ return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL,
+ changeset, GFP_NOFS);
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1615,17 +1704,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
changeset);
}
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
int err;
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
- NULL, NULL, GFP_NOFS);
+ NULL, cached, NULL, GFP_NOFS);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, NULL);
+ EXTENT_LOCKED, cached);
return 0;
}
return 1;
@@ -1638,20 +1728,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state)
{
+ struct extent_state *failed_state = NULL;
int err;
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
- cached_state, NULL, GFP_NOFS);
+ &failed_state, cached_state, NULL, GFP_NOFS);
while (err == -EEXIST) {
if (failed_start != start)
clear_extent_bit(tree, start, failed_start - 1,
EXTENT_LOCKED, cached_state);
- wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
+ &failed_state);
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
- &failed_start, cached_state, NULL,
- GFP_NOFS);
+ &failed_start, &failed_state,
+ cached_state, NULL, GFP_NOFS);
}
return err;
}
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index a855f40dd61d..e3eeec380844 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -3,43 +3,48 @@
#ifndef BTRFS_EXTENT_IO_TREE_H
#define BTRFS_EXTENT_IO_TREE_H
+#include "misc.h"
+
struct extent_changeset;
struct io_failure_record;
/* Bits for the extent state */
-#define EXTENT_DIRTY (1U << 0)
-#define EXTENT_UPTODATE (1U << 1)
-#define EXTENT_LOCKED (1U << 2)
-#define EXTENT_NEW (1U << 3)
-#define EXTENT_DELALLOC (1U << 4)
-#define EXTENT_DEFRAG (1U << 5)
-#define EXTENT_BOUNDARY (1U << 6)
-#define EXTENT_NODATASUM (1U << 7)
-#define EXTENT_CLEAR_META_RESV (1U << 8)
-#define EXTENT_NEED_WAIT (1U << 9)
-#define EXTENT_NORESERVE (1U << 11)
-#define EXTENT_QGROUP_RESERVED (1U << 12)
-#define EXTENT_CLEAR_DATA_RESV (1U << 13)
-/*
- * Must be cleared only during ordered extent completion or on error paths if we
- * did not manage to submit bios and create the ordered extents for the range.
- * Should not be cleared during page release and page invalidation (if there is
- * an ordered extent in flight), that is left for the ordered extent completion.
- */
-#define EXTENT_DELALLOC_NEW (1U << 14)
-/*
- * When an ordered extent successfully completes for a region marked as a new
- * delalloc range, use this flag when clearing a new delalloc range to indicate
- * that the VFS' inode number of bytes should be incremented and the inode's new
- * delalloc bytes decremented, in an atomic way to prevent races with stat(2).
- */
-#define EXTENT_ADD_INODE_BYTES (1U << 15)
-
-/*
- * Set during truncate when we're clearing an entire range and we just want the
- * extent states to go away.
- */
-#define EXTENT_CLEAR_ALL_BITS (1U << 16)
+enum {
+ ENUM_BIT(EXTENT_DIRTY),
+ ENUM_BIT(EXTENT_UPTODATE),
+ ENUM_BIT(EXTENT_LOCKED),
+ ENUM_BIT(EXTENT_NEW),
+ ENUM_BIT(EXTENT_DELALLOC),
+ ENUM_BIT(EXTENT_DEFRAG),
+ ENUM_BIT(EXTENT_BOUNDARY),
+ ENUM_BIT(EXTENT_NODATASUM),
+ ENUM_BIT(EXTENT_CLEAR_META_RESV),
+ ENUM_BIT(EXTENT_NEED_WAIT),
+ ENUM_BIT(EXTENT_NORESERVE),
+ ENUM_BIT(EXTENT_QGROUP_RESERVED),
+ ENUM_BIT(EXTENT_CLEAR_DATA_RESV),
+ /*
+ * Must be cleared only during ordered extent completion or on error
+ * paths if we did not manage to submit bios and create the ordered
+ * extents for the range. Should not be cleared during page release
+ * and page invalidation (if there is an ordered extent in flight),
+ * that is left for the ordered extent completion.
+ */
+ ENUM_BIT(EXTENT_DELALLOC_NEW),
+ /*
+ * When an ordered extent successfully completes for a region marked as
+ * a new delalloc range, use this flag when clearing a new delalloc
+ * range to indicate that the VFS' inode number of bytes should be
+ * incremented and the inode's new delalloc bytes decremented, in an
+ * atomic way to prevent races with stat(2).
+ */
+ ENUM_BIT(EXTENT_ADD_INODE_BYTES),
+ /*
+ * Set during truncate when we're clearing an entire range and we just
+ * want the extent states to go away.
+ */
+ ENUM_BIT(EXTENT_CLEAR_ALL_BITS),
+};
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
@@ -75,7 +80,8 @@ enum {
struct extent_io_tree {
struct rb_root state;
struct btrfs_fs_info *fs_info;
- void *private_data;
+ /* Inode associated with this tree, or NULL. */
+ struct btrfs_inode *inode;
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -99,21 +105,22 @@ struct extent_state {
};
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner,
- void *private_data);
+ struct extent_io_tree *tree, unsigned int owner);
void extent_io_tree_release(struct extent_io_tree *tree);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached);
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
int __init extent_state_init_cachep(void);
void __cold extent_state_free_cachep(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, u32 bits, int contig);
+ u64 max_bytes, u32 bits, int contig,
+ struct extent_state **cached_state);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -139,13 +146,6 @@ static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
GFP_NOFS, NULL);
}
-static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
- GFP_ATOMIC, NULL);
-}
-
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
@@ -217,13 +217,6 @@ static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
}
-static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
- cached_state, mask);
-}
-
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state);
@@ -234,6 +227,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached_state);
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2801c991814f..892d78c1853c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -36,6 +36,13 @@
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "file-item.h"
+#include "orphan.h"
+#include "tree-checker.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -5255,8 +5262,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 bytenr;
u64 generation;
u64 parent;
+ struct btrfs_tree_parent_check check = { 0 };
struct btrfs_key key;
- struct btrfs_key first_key;
struct btrfs_ref ref = { 0 };
struct extent_buffer *next;
int level = wc->level;
@@ -5278,7 +5285,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+
+ check.level = level - 1;
+ check.transid = generation;
+ check.owner_root = root->root_key.objectid;
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
path->slots[level]);
next = find_extent_buffer(fs_info, bytenr);
@@ -5340,8 +5352,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
- generation, level - 1, &first_key);
+ next = read_tree_block(fs_info, bytenr, &check);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -5973,40 +5984,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return ret;
}
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
- struct btrfs_block_group *block_group;
- u64 free_bytes = 0;
- int factor;
-
- /* It's df, we don't care if it's racy */
- if (list_empty(&sinfo->ro_bgs))
- return 0;
-
- spin_lock(&sinfo->lock);
- list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
- spin_lock(&block_group->lock);
-
- if (!block_group->ro) {
- spin_unlock(&block_group->lock);
- continue;
- }
-
- factor = btrfs_bg_type_to_factor(block_group->flags);
- free_bytes += (block_group->length -
- block_group->used) * factor;
-
- spin_unlock(&block_group->lock);
- }
- spin_unlock(&sinfo->lock);
-
- return free_bytes;
-}
-
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end)
{
@@ -6072,7 +6049,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
start, end - start + 1,
- rcu_str_deref(device->name),
+ btrfs_dev_name(device),
device->total_bytes);
mutex_unlock(&fs_info->chunk_mutex);
ret = 0;
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
new file mode 100644
index 000000000000..ae5425253603
--- /dev/null
+++ b/fs/btrfs/extent-tree.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_EXTENT_TREE_H
+#define BTRFS_EXTENT_TREE_H
+
+enum btrfs_inline_ref_type {
+ BTRFS_REF_TYPE_INVALID,
+ BTRFS_REF_TYPE_BLOCK,
+ BTRFS_REF_TYPE_DATA,
+ BTRFS_REF_TYPE_ANY,
+};
+
+int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
+ struct btrfs_extent_inline_ref *iref,
+ enum btrfs_inline_ref_type is_data);
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
+
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 num_bytes);
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
+int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 offset, int metadata, u64 *refs, u64 *flags);
+int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
+ int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct extent_buffer *eb);
+int btrfs_cross_ref_exist(struct btrfs_root *root,
+ u64 objectid, u64 offset, u64 bytenr, bool strict,
+ struct btrfs_path *path);
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ const struct btrfs_disk_key *key,
+ int level, u64 hint,
+ u64 empty_size,
+ enum btrfs_lock_nesting nest);
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref);
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 owner,
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins);
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+ u64 root_objectid, u64 owner, u64 offset,
+ struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
+ u64 min_alloc_size, u64 empty_size, u64 hint_byte,
+ struct btrfs_key *ins, int is_data, int delalloc);
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref);
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref);
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb, u64 flags, int level);
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
+
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len, int delalloc);
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+ int for_reloc);
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent);
+
+#endif
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dcf22e051ff..83dd3aa59663 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,7 +20,7 @@
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
-#include "volumes.h"
+#include "bio.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
@@ -30,6 +30,12 @@
#include "zoned.h"
#include "block-group.h"
#include "compression.h"
+#include "fs.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "file.h"
+#include "dev-replace.h"
+#include "super.h"
static struct kmem_cache *extent_buffer_cache;
@@ -96,24 +102,22 @@ struct btrfs_bio_ctrl {
u32 len_to_stripe_boundary;
u32 len_to_oe_boundary;
btrfs_bio_end_io_t end_io_func;
-};
-struct extent_page_data {
- struct btrfs_bio_ctrl bio_ctrl;
- /* tells writepage not to lock the state bits for this range
- * it still does the unlocking
+ /*
+ * Tell writepage not to lock the state bits for this range, it still
+ * does the unlocking.
*/
- unsigned int extent_locked:1;
+ bool extent_locked;
- /* tells the submit_bio code to use REQ_SYNC */
- unsigned int sync_io:1;
+ /* Tell the submit_bio code to use REQ_SYNC */
+ bool sync_io;
};
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
struct bio *bio;
struct bio_vec *bv;
- struct inode *inode;
+ struct btrfs_inode *inode;
int mirror_num;
if (!bio_ctrl->bio)
@@ -121,7 +125,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
bio = bio_ctrl->bio;
bv = bio_first_bvec_all(bio);
- inode = bv->bv_page->mapping->host;
+ inode = BTRFS_I(bv->bv_page->mapping->host);
mirror_num = bio_ctrl->mirror_num;
/* Caller should ensure the bio has at least some range added */
@@ -129,7 +133,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
- if (!is_data_inode(inode))
+ if (!is_data_inode(&inode->vfs_inode))
btrfs_submit_metadata_bio(inode, bio, mirror_num);
else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
btrfs_submit_data_write_bio(inode, bio, mirror_num);
@@ -142,11 +146,11 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
}
/*
- * Submit or fail the current bio in an extent_page_data structure.
+ * Submit or fail the current bio in the bio_ctrl structure.
*/
-static void submit_write_bio(struct extent_page_data *epd, int ret)
+static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{
- struct bio *bio = epd->bio_ctrl.bio;
+ struct bio *bio = bio_ctrl->bio;
if (!bio)
return;
@@ -155,9 +159,9 @@ static void submit_write_bio(struct extent_page_data *epd, int ret)
ASSERT(ret < 0);
btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
/* The bio is owned by the end_io handler now */
- epd->bio_ctrl.bio = NULL;
+ bio_ctrl->bio = NULL;
} else {
- submit_one_bio(&epd->bio_ctrl);
+ submit_one_bio(bio_ctrl);
}
}
@@ -527,120 +531,6 @@ static void free_io_failure(struct btrfs_inode *inode,
kfree(rec);
}
-/*
- * this bypasses the standard btrfs submit functions deliberately, as
- * the standard behavior is to write all copies in a raid setup. here we only
- * want to write the one bad copy. so we do the mapping for ourselves and issue
- * submit_bio directly.
- * to avoid any synchronization issues, wait for the data after writing, which
- * actually prevents the read that triggered the error from finishing.
- * currently, there can be no more than two copies of every data bit. thus,
- * exactly one rewrite is required.
- */
-static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
-{
- struct btrfs_device *dev;
- struct bio_vec bvec;
- struct bio bio;
- u64 map_length = 0;
- u64 sector;
- struct btrfs_io_context *bioc = NULL;
- int ret = 0;
-
- ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
- BUG_ON(!mirror_num);
-
- if (btrfs_repair_one_zone(fs_info, logical))
- return 0;
-
- map_length = length;
-
- /*
- * Avoid races with device replace and make sure our bioc has devices
- * associated to its stripes that don't go away while we are doing the
- * read repair operation.
- */
- btrfs_bio_counter_inc_blocked(fs_info);
- if (btrfs_is_parity_mirror(fs_info, logical, length)) {
- /*
- * Note that we don't use BTRFS_MAP_WRITE because it's supposed
- * to update all raid stripes, but here we just want to correct
- * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
- * stripe's dev and sector.
- */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &map_length, &bioc, 0);
- if (ret)
- goto out_counter_dec;
- ASSERT(bioc->mirror_num == 1);
- } else {
- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bioc, mirror_num);
- if (ret)
- goto out_counter_dec;
- BUG_ON(mirror_num != bioc->mirror_num);
- }
-
- sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
- dev = bioc->stripes[bioc->mirror_num - 1].dev;
- btrfs_put_bioc(bioc);
-
- if (!dev || !dev->bdev ||
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
- ret = -EIO;
- goto out_counter_dec;
- }
-
- bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = sector;
- __bio_add_page(&bio, page, length, pg_offset);
-
- btrfsic_check_bio(&bio);
- ret = submit_bio_wait(&bio);
- if (ret) {
- /* try to remap that extent elsewhere? */
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
- goto out_bio_uninit;
- }
-
- btrfs_info_rl_in_rcu(fs_info,
- "read error corrected: ino %llu off %llu (dev %s sector %llu)",
- ino, start,
- rcu_str_deref(dev->name), sector);
- ret = 0;
-
-out_bio_uninit:
- bio_uninit(&bio);
-out_counter_dec:
- btrfs_bio_counter_dec(fs_info);
- return ret;
-}
-
-int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- u64 start = eb->start;
- int i, num_pages = num_extent_pages(eb);
- int ret = 0;
-
- if (sb_rdonly(fs_info->sb))
- return -EROFS;
-
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
- start - page_offset(p), mirror_num);
- if (ret)
- break;
- start += PAGE_SIZE;
- }
-
- return ret;
-}
-
static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
{
if (cur_mirror == failrec->num_copies)
@@ -688,7 +578,7 @@ int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
mirror = failrec->this_mirror;
do {
mirror = prev_mirror(failrec, mirror);
- repair_io_failure(fs_info, ino, start, failrec->len,
+ btrfs_repair_io_failure(fs_info, ino, start, failrec->len,
failrec->logical, page, pg_offset, mirror);
} while (mirror != failrec->failed_mirror);
@@ -791,13 +681,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
return failrec;
}
-int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio,
u32 bio_offset, struct page *page, unsigned int pgoff,
- submit_bio_hook_t *submit_bio_hook)
+ bool submit_buffered)
{
u64 start = failed_bbio->file_offset + bio_offset;
struct io_failure_record *failrec;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *failed_bio = &failed_bbio->bio;
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
@@ -808,7 +698,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
- failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
+ failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset);
if (IS_ERR(failrec))
return PTR_ERR(failrec);
@@ -819,14 +709,14 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
*
* Since we're only doing repair for one sector, we only need to get
* a good copy of the failed sector and if we succeed, we have setup
- * everything for repair_io_failure to do the rest for us.
+ * everything for btrfs_repair_io_failure to do the rest for us.
*/
failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
if (failrec->this_mirror == failrec->failed_mirror) {
btrfs_debug(fs_info,
"failed to repair num_copies %d this_mirror %d failed_mirror %d",
failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
- free_io_failure(BTRFS_I(inode), failrec);
+ free_io_failure(inode, failrec);
return -EIO;
}
@@ -847,16 +737,21 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
bio_add_page(repair_bio, page, failrec->len, pgoff);
repair_bbio->iter = repair_bio->bi_iter;
- btrfs_debug(btrfs_sb(inode->i_sb),
+ btrfs_debug(fs_info,
"repair read error: submitting new read to mirror %d",
failrec->this_mirror);
/*
- * At this point we have a bio, so any errors from submit_bio_hook()
- * will be handled by the endio on the repair_bio, so we can't return an
+ * At this point we have a bio, so any errors from bio submission will
+ * be handled by the endio on the repair_bio, so we can't return an
* error here.
*/
- submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
+ if (submit_buffered)
+ btrfs_submit_data_read_bio(inode, repair_bio,
+ failrec->this_mirror, 0);
+ else
+ btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror);
+
return BLK_STS_OK;
}
@@ -892,14 +787,9 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate)
{
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
const u32 sectorsize = inode->root->fs_info->sectorsize;
- struct extent_state *cached = NULL;
end_page_read(page, uptodate, offset, sectorsize);
- if (uptodate)
- set_extent_uptodate(&inode->io_tree, offset,
- offset + sectorsize - 1, &cached, GFP_ATOMIC);
- unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1,
- &cached);
+ unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL);
}
static void submit_data_read_repair(struct inode *inode,
@@ -945,9 +835,9 @@ static void submit_data_read_repair(struct inode *inode,
goto next;
}
- ret = btrfs_repair_one_sector(inode, failed_bbio,
+ ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio,
bio_offset + offset, page, pgoff + offset,
- btrfs_submit_data_read_bio);
+ true);
if (!ret) {
/*
* We have submitted the read repair, the page release
@@ -1103,7 +993,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed,
* Now we don't have range contiguous to the processed range, release
* the processed range now.
*/
- unlock_extent_atomic(tree, processed->start, processed->end, &cached);
+ unlock_extent(tree, processed->start, processed->end, &cached);
update:
/* Update processed to current range */
@@ -1296,7 +1186,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
bio_put(bio);
}
-/**
+/*
* Populate every free slot in a provided array with pages.
*
* @nr_pages: number of pages to allocate
@@ -1332,16 +1222,16 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
return 0;
}
-/**
- * Attempt to add a page to bio
+/*
+ * Attempt to add a page to bio.
*
- * @bio_ctrl: record both the bio, and its bio_flags
- * @page: page to add to the bio
- * @disk_bytenr: offset of the new bio or to check whether we are adding
- * a contiguous page to the previous one
- * @size: portion of page that we want to write
- * @pg_offset: starting offset in the page
- * @compress_type: compression type of the current bio to see if we can merge them
+ * @bio_ctrl: record both the bio, and its bio_flags
+ * @page: page to add to the bio
+ * @disk_bytenr: offset of the new bio or to check whether we are adding
+ * a contiguous page to the previous one
+ * @size: portion of page that we want to write
+ * @pg_offset: starting offset in the page
+ * @compress_type: compression type of the current bio to see if we can merge them
*
* Attempt to add a page to bio considering stripe alignment etc.
*
@@ -1772,13 +1662,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
- struct extent_state *cached = NULL;
-
iosize = PAGE_SIZE - pg_offset;
memzero_page(page, pg_offset, iosize);
- set_extent_uptodate(tree, cur, cur + iosize - 1,
- &cached, GFP_NOFS);
- unlock_extent(tree, cur, cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, NULL);
end_page_read(page, true, cur, iosize);
break;
}
@@ -1854,13 +1740,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- struct extent_state *cached = NULL;
-
memzero_page(page, pg_offset, iosize);
- set_extent_uptodate(tree, cur, cur + iosize - 1,
- &cached, GFP_NOFS);
- unlock_extent(tree, cur, cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, NULL);
end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
@@ -2065,7 +1947,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
struct writeback_control *wbc,
- struct extent_page_data *epd,
+ struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size,
int *nr_ret)
{
@@ -2097,7 +1979,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
*/
wbc->nr_to_write--;
- epd->bio_ctrl.end_io_func = end_bio_extent_writepage;
+ bio_ctrl->end_io_func = end_bio_extent_writepage;
while (cur <= end) {
u64 disk_bytenr;
u64 em_end;
@@ -2191,7 +2073,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
btrfs_page_clear_dirty(fs_info, page, cur, iosize);
ret = submit_extent_page(op | write_flags, wbc,
- &epd->bio_ctrl, disk_bytenr,
+ bio_ctrl, disk_bytenr,
page, iosize,
cur - page_offset(page),
0, false);
@@ -2231,7 +2113,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* Return <0 for error.
*/
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
@@ -2268,7 +2150,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done;
}
- if (!epd->extent_locked) {
+ if (!bio_ctrl->extent_locked) {
ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
if (ret == 1)
return 0;
@@ -2276,7 +2158,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done;
}
- ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
+ ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size,
&nr);
if (ret == 1)
return 0;
@@ -2320,9 +2202,9 @@ done:
*/
if (PageError(page))
end_extent_writepage(page, ret, page_start, page_end);
- if (epd->extent_locked) {
+ if (bio_ctrl->extent_locked) {
/*
- * If epd->extent_locked, it's from extent_write_locked_range(),
+ * If bio_ctrl->extent_locked, it's from extent_write_locked_range(),
* the page can either be locked by lock_page() or
* process_one_page().
* Let btrfs_page_unlock_writer() handle both cases.
@@ -2361,7 +2243,7 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
* Return <0 if something went wrong, no page is locked.
*/
static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int i, num_pages;
@@ -2369,17 +2251,17 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
int ret = 0;
if (!btrfs_try_tree_write_lock(eb)) {
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
flush = 1;
btrfs_tree_lock(eb);
}
if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
btrfs_tree_unlock(eb);
- if (!epd->sync_io)
+ if (!bio_ctrl->sync_io)
return 0;
if (!flush) {
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
flush = 1;
}
while (1) {
@@ -2426,7 +2308,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!trylock_page(p)) {
if (!flush) {
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
flush = 1;
}
lock_page(p);
@@ -2647,15 +2529,19 @@ static void prepare_eb_write(struct extent_buffer *eb)
/* Set btree blocks beyond nritems with 0 to avoid stale content */
nritems = btrfs_header_nritems(eb);
if (btrfs_header_level(eb) > 0) {
- end = btrfs_node_key_ptr_offset(nritems);
+ end = btrfs_node_key_ptr_offset(eb, nritems);
memzero_extent_buffer(eb, end, eb->len - end);
} else {
/*
* Leaf:
* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
*/
- start = btrfs_item_nr_offset(nritems);
- end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
+ start = btrfs_item_nr_offset(eb, nritems);
+ end = btrfs_item_nr_offset(eb, 0);
+ if (nritems == 0)
+ end += BTRFS_LEAF_DATA_SIZE(eb->fs_info);
+ else
+ end += btrfs_item_offset(eb, nritems - 1);
memzero_extent_buffer(eb, start, end - start);
}
}
@@ -2666,7 +2552,7 @@ static void prepare_eb_write(struct extent_buffer *eb)
*/
static int write_one_subpage_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
@@ -2686,10 +2572,10 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
if (no_dirty_ebs)
clear_page_dirty_for_io(page);
- epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage;
+ bio_ctrl->end_io_func = end_bio_subpage_eb_writepage;
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- &epd->bio_ctrl, eb->start, page, eb->len,
+ bio_ctrl, eb->start, page, eb->len,
eb->start - page_offset(page), 0, false);
if (ret) {
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
@@ -2712,7 +2598,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
u64 disk_bytenr = eb->start;
int i, num_pages;
@@ -2721,7 +2607,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
prepare_eb_write(eb);
- epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage;
+ bio_ctrl->end_io_func = end_bio_extent_buffer_writepage;
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
@@ -2730,7 +2616,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- &epd->bio_ctrl, disk_bytenr, p,
+ bio_ctrl, disk_bytenr, p,
PAGE_SIZE, 0, 0, false);
if (ret) {
set_btree_ioerr(p, eb);
@@ -2773,7 +2659,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
*/
static int submit_eb_subpage(struct page *page,
struct writeback_control *wbc,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
int submitted = 0;
@@ -2826,7 +2712,7 @@ static int submit_eb_subpage(struct page *page,
if (!eb)
continue;
- ret = lock_extent_buffer_for_io(eb, epd);
+ ret = lock_extent_buffer_for_io(eb, bio_ctrl);
if (ret == 0) {
free_extent_buffer(eb);
continue;
@@ -2835,7 +2721,7 @@ static int submit_eb_subpage(struct page *page,
free_extent_buffer(eb);
goto cleanup;
}
- ret = write_one_subpage_eb(eb, wbc, epd);
+ ret = write_one_subpage_eb(eb, wbc, bio_ctrl);
free_extent_buffer(eb);
if (ret < 0)
goto cleanup;
@@ -2845,7 +2731,7 @@ static int submit_eb_subpage(struct page *page,
cleanup:
/* We hit error, end bio for the submitted extent buffers */
- submit_write_bio(epd, ret);
+ submit_write_bio(bio_ctrl, ret);
return ret;
}
@@ -2870,7 +2756,7 @@ cleanup:
* Return <0 for fatal error.
*/
static int submit_eb_page(struct page *page, struct writeback_control *wbc,
- struct extent_page_data *epd,
+ struct btrfs_bio_ctrl *bio_ctrl,
struct extent_buffer **eb_context)
{
struct address_space *mapping = page->mapping;
@@ -2882,7 +2768,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
return 0;
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return submit_eb_subpage(page, wbc, epd);
+ return submit_eb_subpage(page, wbc, bio_ctrl);
spin_lock(&mapping->private_lock);
if (!PagePrivate(page)) {
@@ -2925,7 +2811,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
*eb_context = eb;
- ret = lock_extent_buffer_for_io(eb, epd);
+ ret = lock_extent_buffer_for_io(eb, bio_ctrl);
if (ret <= 0) {
btrfs_revert_meta_write_pointer(cache, eb);
if (cache)
@@ -2940,7 +2826,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
btrfs_schedule_zone_finish_bg(cache, eb);
btrfs_put_block_group(cache);
}
- ret = write_one_eb(eb, wbc, epd);
+ ret = write_one_eb(eb, wbc, bio_ctrl);
free_extent_buffer(eb);
if (ret < 0)
return ret;
@@ -2951,10 +2837,9 @@ int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct extent_buffer *eb_context = NULL;
- struct extent_page_data epd = {
- .bio_ctrl = { 0 },
+ struct btrfs_bio_ctrl bio_ctrl = {
.extent_locked = 0,
- .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .sync_io = (wbc->sync_mode == WB_SYNC_ALL),
};
struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
@@ -2997,7 +2882,7 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- ret = submit_eb_page(page, wbc, &epd, &eb_context);
+ ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context);
if (ret == 0)
continue;
if (ret < 0) {
@@ -3058,18 +2943,18 @@ retry:
ret = 0;
if (!ret && BTRFS_FS_ERROR(fs_info))
ret = -EROFS;
- submit_write_bio(&epd, ret);
+ submit_write_bio(&bio_ctrl, ret);
btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
-/**
+/*
* Walk the list of dirty pages of the given address space and write all of them.
*
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @epd: holds context for the write, namely the bio
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @bio_ctrl: holds context for the write, namely the bio
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
@@ -3081,7 +2966,7 @@ retry:
*/
static int extent_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = mapping->host;
int ret = 0;
@@ -3162,7 +3047,7 @@ retry:
* tmpfs file mapping
*/
if (!trylock_page(page)) {
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
lock_page(page);
}
@@ -3173,7 +3058,7 @@ retry:
if (wbc->sync_mode != WB_SYNC_NONE) {
if (PageWriteback(page))
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
wait_on_page_writeback(page);
}
@@ -3183,7 +3068,7 @@ retry:
continue;
}
- ret = __extent_writepage(page, wbc, epd);
+ ret = __extent_writepage(page, wbc, bio_ctrl);
if (ret < 0) {
done = 1;
break;
@@ -3213,14 +3098,14 @@ retry:
* page in our current bio, and thus deadlock, so flush the
* write bio here.
*/
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
mapping->writeback_index = done_index;
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
return ret;
}
@@ -3239,8 +3124,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
u64 cur = start;
unsigned long nr_pages;
const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
- struct extent_page_data epd = {
- .bio_ctrl = { 0 },
+ struct btrfs_bio_ctrl bio_ctrl = {
.extent_locked = 1,
.sync_io = 1,
};
@@ -3271,7 +3155,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
ASSERT(PageLocked(page));
ASSERT(PageDirty(page));
clear_page_dirty_for_io(page);
- ret = __extent_writepage(page, &wbc_writepages, &epd);
+ ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl);
ASSERT(ret <= 0);
if (ret < 0) {
found_error = true;
@@ -3281,7 +3165,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
cur = cur_end + 1;
}
- submit_write_bio(&epd, found_error ? ret : 0);
+ submit_write_bio(&bio_ctrl, found_error ? ret : 0);
wbc_detach_inode(&wbc_writepages);
if (found_error)
@@ -3294,10 +3178,9 @@ int extent_writepages(struct address_space *mapping,
{
struct inode *inode = mapping->host;
int ret = 0;
- struct extent_page_data epd = {
- .bio_ctrl = { 0 },
+ struct btrfs_bio_ctrl bio_ctrl = {
.extent_locked = 0,
- .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .sync_io = (wbc->sync_mode == WB_SYNC_ALL),
};
/*
@@ -3305,8 +3188,8 @@ int extent_writepages(struct address_space *mapping,
* protect the write pointer updates.
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
- ret = extent_write_cache_pages(mapping, wbc, &epd);
- submit_write_bio(&epd, ret);
+ ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl);
+ submit_write_bio(&bio_ctrl, ret);
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
return ret;
}
@@ -3705,14 +3588,13 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path
static int fiemap_process_hole(struct btrfs_inode *inode,
struct fiemap_extent_info *fieinfo,
struct fiemap_cache *cache,
- struct btrfs_backref_shared_cache *backref_cache,
+ struct extent_state **delalloc_cached_state,
+ struct btrfs_backref_share_check_ctx *backref_ctx,
u64 disk_bytenr, u64 extent_offset,
u64 extent_gen,
- struct ulist *roots, struct ulist *tmp_ulist,
u64 start, u64 end)
{
const u64 i_size = i_size_read(&inode->vfs_inode);
- const u64 ino = btrfs_ino(inode);
u64 cur_offset = start;
u64 last_delalloc_end = 0;
u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
@@ -3731,6 +3613,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
bool delalloc;
delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+ delalloc_cached_state,
&delalloc_start,
&delalloc_end);
if (!delalloc)
@@ -3752,11 +3635,10 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
if (prealloc_len > 0) {
if (!checked_extent_shared && fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode->root,
- ino, disk_bytenr,
- extent_gen, roots,
- tmp_ulist,
- backref_cache);
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
if (ret < 0)
return ret;
else if (ret > 0)
@@ -3802,11 +3684,10 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
}
if (!checked_extent_shared && fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode->root,
- ino, disk_bytenr,
- extent_gen, roots,
- tmp_ulist,
- backref_cache);
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
if (ret < 0)
return ret;
else if (ret > 0)
@@ -3903,12 +3784,10 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
{
const u64 ino = btrfs_ino(inode);
struct extent_state *cached_state = NULL;
+ struct extent_state *delalloc_cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_root *root = inode->root;
struct fiemap_cache cache = { 0 };
- struct btrfs_backref_shared_cache *backref_cache;
- struct ulist *roots;
- struct ulist *tmp_ulist;
+ struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end;
u64 prev_extent_end;
u64 lockstart;
@@ -3916,17 +3795,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
bool stopped = false;
int ret;
- backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL);
+ backref_ctx = btrfs_alloc_backref_share_check_ctx();
path = btrfs_alloc_path();
- roots = ulist_alloc(GFP_KERNEL);
- tmp_ulist = ulist_alloc(GFP_KERNEL);
- if (!backref_cache || !path || !roots || !tmp_ulist) {
+ if (!backref_ctx || !path) {
ret = -ENOMEM;
goto out;
}
- lockstart = round_down(start, root->fs_info->sectorsize);
- lockend = round_up(start + len, root->fs_info->sectorsize);
+ lockstart = round_down(start, inode->root->fs_info->sectorsize);
+ lockend = round_up(start + len, inode->root->fs_info->sectorsize);
prev_extent_end = lockstart;
lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
@@ -3975,13 +3852,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
if (extent_end <= lockstart)
goto next_item;
+ backref_ctx->curr_leaf_bytenr = leaf->start;
+
/* We have in implicit hole (NO_HOLES feature enabled). */
if (prev_extent_end < key.offset) {
const u64 range_end = min(key.offset, lockend) - 1;
ret = fiemap_process_hole(inode, fieinfo, &cache,
- backref_cache, 0, 0, 0,
- roots, tmp_ulist,
+ &delalloc_cached_state,
+ backref_ctx, 0, 0, 0,
prev_extent_end, range_end);
if (ret < 0) {
goto out_unlock;
@@ -4021,25 +3900,24 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
extent_len, flags);
} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
ret = fiemap_process_hole(inode, fieinfo, &cache,
- backref_cache,
+ &delalloc_cached_state,
+ backref_ctx,
disk_bytenr, extent_offset,
- extent_gen, roots, tmp_ulist,
- key.offset, extent_end - 1);
+ extent_gen, key.offset,
+ extent_end - 1);
} else if (disk_bytenr == 0) {
/* We have an explicit hole. */
ret = fiemap_process_hole(inode, fieinfo, &cache,
- backref_cache, 0, 0, 0,
- roots, tmp_ulist,
+ &delalloc_cached_state,
+ backref_ctx, 0, 0, 0,
key.offset, extent_end - 1);
} else {
/* We have a regular extent. */
if (fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(root, ino,
+ ret = btrfs_is_data_extent_shared(inode,
disk_bytenr,
extent_gen,
- roots,
- tmp_ulist,
- backref_cache);
+ backref_ctx);
if (ret < 0)
goto out_unlock;
else if (ret > 0)
@@ -4088,9 +3966,9 @@ check_eof_delalloc:
path = NULL;
if (!stopped && prev_extent_end < lockend) {
- ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache,
- 0, 0, 0, roots, tmp_ulist,
- prev_extent_end, lockend - 1);
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state, backref_ctx,
+ 0, 0, 0, prev_extent_end, lockend - 1);
if (ret < 0)
goto out_unlock;
prev_extent_end = lockend;
@@ -4107,6 +3985,7 @@ check_eof_delalloc:
delalloc = btrfs_find_delalloc_in_range(inode,
prev_extent_end,
i_size - 1,
+ &delalloc_cached_state,
&delalloc_start,
&delalloc_end);
if (!delalloc)
@@ -4121,10 +4000,9 @@ check_eof_delalloc:
out_unlock:
unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
out:
- kfree(backref_cache);
+ free_extent_state(delalloc_cached_state);
+ btrfs_free_backref_share_ctx(backref_ctx);
btrfs_free_path(path);
- ulist_free(roots);
- ulist_free(tmp_ulist);
return ret;
}
@@ -4266,7 +4144,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->start = start;
eb->len = len;
eb->fs_info = fs_info;
- eb->bflags = 0;
init_rwsem(&eb->lock);
btrfs_leak_debug_add_eb(eb);
@@ -4299,7 +4176,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
- memset(new->pages, 0, sizeof(*new->pages) * num_pages);
ret = btrfs_alloc_page_array(num_pages, new->pages);
if (ret) {
btrfs_release_extent_buffer(new);
@@ -4944,11 +4820,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
}
static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
- int mirror_num)
+ int mirror_num,
+ struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
struct page *page = eb->pages[0];
+ struct extent_state *cached_state = NULL;
struct btrfs_bio_ctrl bio_ctrl = {
.mirror_num = mirror_num,
};
@@ -4956,13 +4834,16 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
ASSERT(PagePrivate(page));
+ ASSERT(check);
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
if (wait == WAIT_NONE) {
- if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
+ if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state))
return -EAGAIN;
} else {
- ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
+ ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
if (ret < 0)
return ret;
}
@@ -4972,7 +4853,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
PageUptodate(page) ||
btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
return ret;
}
@@ -4996,17 +4878,22 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
*/
atomic_dec(&eb->io_pages);
}
+ memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check));
submit_one_bio(&bio_ctrl);
- if (ret || wait != WAIT_COMPLETE)
+ if (ret || wait != WAIT_COMPLETE) {
+ free_extent_state(cached_state);
return ret;
+ }
- wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
+ wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
+ EXTENT_LOCKED, &cached_state);
if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
ret = -EIO;
return ret;
}
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
+ struct btrfs_tree_parent_check *check)
{
int i;
struct page *page;
@@ -5032,7 +4919,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
return -EIO;
if (eb->fs_info->nodesize < PAGE_SIZE)
- return read_extent_buffer_subpage(eb, wait, mirror_num);
+ return read_extent_buffer_subpage(eb, wait, mirror_num, check);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
@@ -5109,6 +4996,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
}
+ memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check));
submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE)
@@ -5467,11 +5355,12 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
*page_offset = offset_in_page(offset);
}
-/**
- * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @nr: bit number to test
+/*
+ * Determine whether a bit in a bitmap item is set.
+ *
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number to test
*/
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
@@ -5488,12 +5377,13 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
-/**
- * extent_buffer_bitmap_set - set an area of a bitmap
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @pos: bit number of the first bit
- * @len: number of bits to set
+/*
+ * Set an area of a bitmap to 1.
+ *
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to set
*/
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len)
@@ -5530,12 +5420,13 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
}
-/**
- * extent_buffer_bitmap_clear - clear an area of a bitmap
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @pos: bit number of the first bit
- * @len: number of bits to clear
+/*
+ * Clear an area of a bitmap.
+ *
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to clear
*/
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
unsigned long start, unsigned long pos,
@@ -5841,6 +5732,11 @@ int try_release_extent_buffer(struct page *page)
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level)
{
+ struct btrfs_tree_parent_check check = {
+ .has_first_key = 0,
+ .level = level,
+ .transid = gen
+ };
struct extent_buffer *eb;
int ret;
@@ -5853,7 +5749,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
return;
}
- ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
+ ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check);
if (ret < 0)
free_extent_buffer_stale(eb);
else
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7929f054dda3..a2c82448b2e0 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -9,6 +9,7 @@
#include <linux/btrfs_tree.h>
#include "compression.h"
#include "ulist.h"
+#include "misc.h"
enum {
EXTENT_BUFFER_UPTODATE,
@@ -29,13 +30,15 @@ enum {
};
/* these are flags for __process_pages_contig */
-#define PAGE_UNLOCK (1 << 0)
-/* Page starts writeback, clear dirty bit and set writeback bit */
-#define PAGE_START_WRITEBACK (1 << 1)
-#define PAGE_END_WRITEBACK (1 << 2)
-#define PAGE_SET_ORDERED (1 << 3)
-#define PAGE_SET_ERROR (1 << 4)
-#define PAGE_LOCK (1 << 5)
+enum {
+ ENUM_BIT(PAGE_UNLOCK),
+ /* Page starts writeback, clear dirty bit and set writeback bit */
+ ENUM_BIT(PAGE_START_WRITEBACK),
+ ENUM_BIT(PAGE_END_WRITEBACK),
+ ENUM_BIT(PAGE_SET_ORDERED),
+ ENUM_BIT(PAGE_SET_ERROR),
+ ENUM_BIT(PAGE_LOCK),
+};
/*
* page->private values. Every page that is controlled by the extent
@@ -63,17 +66,11 @@ struct btrfs_inode;
struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
+struct btrfs_tree_parent_check;
int __init extent_buffer_init_cachep(void);
void __cold extent_buffer_free_cachep(void);
-typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
- int mirror_num,
- enum btrfs_compression_type compress_type);
-
-typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
- struct bio *bio, u64 dio_file_offset);
-
#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
struct extent_buffer {
u64 start;
@@ -99,6 +96,39 @@ struct extent_buffer {
};
/*
+ * Get the correct offset inside the page of extent buffer.
+ *
+ * @eb: target extent buffer
+ * @start: offset inside the extent buffer
+ *
+ * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
+ */
+static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
+ unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
+ * to PAGE_SIZE, thus adding it won't cause any difference.
+ *
+ * For sectorsize < PAGE_SIZE, we must only read the data that belongs
+ * to the eb, thus we have to take the eb->start into consideration.
+ */
+ return offset_in_page(offset + eb->start);
+}
+
+static inline unsigned long get_eb_page_index(unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ *
+ * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
+ * and have ensured that all tree blocks are contained in one page,
+ * thus we always get index == 0.
+ */
+ return offset >> PAGE_SHIFT;
+}
+
+/*
* Structure to record how many bytes and which ranges are set/cleared
*/
struct extent_changeset {
@@ -174,8 +204,8 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_NONE 0
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
- int mirror_num);
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
+ struct btrfs_tree_parent_check *parent_check);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
@@ -248,7 +278,6 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
-int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
/*
* When IO fails, either with EIO or csum verification fails, we
@@ -272,9 +301,9 @@ struct io_failure_record {
int num_copies;
};
-int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio,
u32 bio_offset, struct page *page, unsigned int pgoff,
- submit_bio_hook_t *submit_bio_hook);
+ bool submit_buffered);
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
struct page *page, unsigned int pg_offset);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6092a4eedc92..be94030e1dfb 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include "messages.h"
#include "ctree.h"
#include "volumes.h"
#include "extent_map.h"
@@ -27,12 +28,9 @@ void __cold extent_map_exit(void)
kmem_cache_destroy(extent_map_cache);
}
-/**
- * extent_map_tree_init - initialize extent map tree
- * @tree: tree to initialize
- *
- * Initialize the extent tree @tree. Should be called for each new inode
- * or other user of the extent_map interface.
+/*
+ * Initialize the extent tree @tree. Should be called for each new inode or
+ * other user of the extent_map interface.
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
@@ -41,12 +39,9 @@ void extent_map_tree_init(struct extent_map_tree *tree)
rwlock_init(&tree->lock);
}
-/**
- * alloc_extent_map - allocate new extent map structure
- *
- * Allocate a new extent_map structure. The new structure is
- * returned with a reference count of one and needs to be
- * freed using free_extent_map()
+/*
+ * Allocate a new extent_map structure. The new structure is returned with a
+ * reference count of one and needs to be freed using free_extent_map()
*/
struct extent_map *alloc_extent_map(void)
{
@@ -61,12 +56,9 @@ struct extent_map *alloc_extent_map(void)
return em;
}
-/**
- * free_extent_map - drop reference count of an extent_map
- * @em: extent map being released
- *
- * Drops the reference out on @em by one and free the structure
- * if the reference count hits zero.
+/*
+ * Drop the reference out on @em by one and free the structure if the reference
+ * count hits zero.
*/
void free_extent_map(struct extent_map *em)
{
@@ -81,7 +73,7 @@ void free_extent_map(struct extent_map *em)
}
}
-/* simple helper to do math around the end of an extent, handling wrap */
+/* Do the math around the end of an extent, handling wrapping. */
static u64 range_end(u64 start, u64 len)
{
if (start + len < start)
@@ -137,8 +129,8 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
}
/*
- * search through the tree for an extent_map with a given offset. If
- * it can't be found, try to find some neighboring extents
+ * Search through the tree for an extent_map with a given offset. If it can't
+ * be found, try to find some neighboring extents
*/
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
struct rb_node **prev_or_next_ret)
@@ -190,7 +182,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return NULL;
}
-/* check to see if two extent_map structs are adjacent and safe to merge */
+/* Check to see if two extent_map structs are adjacent and safe to merge. */
static int mergable_maps(struct extent_map *prev, struct extent_map *next)
{
if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
@@ -288,8 +280,9 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
}
}
-/**
- * unpin_extent_cache - unpin an extent from the cache
+/*
+ * Unpin an extent from the cache.
+ *
* @tree: tree to unpin the extent in
* @start: logical offset in the file
* @len: length of the extent
@@ -392,7 +385,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
}
}
-/**
+/*
* Add new extent map to the extent tree
*
* @tree: tree to insert new map in
@@ -451,8 +444,9 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
return em;
}
-/**
- * lookup_extent_mapping - lookup extent_map
+/*
+ * Lookup extent_map that intersects @start + @len range.
+ *
* @tree: tree to lookup in
* @start: byte offset to start the search
* @len: length of the lookup range
@@ -468,8 +462,9 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
return __lookup_extent_mapping(tree, start, len, 1);
}
-/**
- * search_extent_mapping - find a nearby extent map
+/*
+ * Find a nearby extent map intersecting @start + @len (not an exact search).
+ *
* @tree: tree to lookup in
* @start: byte offset to start the search
* @len: length of the lookup range
@@ -485,13 +480,14 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
return __lookup_extent_mapping(tree, start, len, 0);
}
-/**
- * remove_extent_mapping - removes an extent_map from the extent tree
+/*
+ * Remove an extent_map from the extent tree.
+ *
* @tree: extent tree to remove from
* @em: extent map being removed
*
- * Removes @em from @tree. No reference counts are dropped, and no checks
- * are done to see if the range is in use
+ * Remove @em from @tree. No reference counts are dropped, and no checks
+ * are done to see if the range is in use.
*/
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
@@ -523,7 +519,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
setup_extent_mapping(tree, new, modified);
}
-static struct extent_map *next_extent_map(struct extent_map *em)
+static struct extent_map *next_extent_map(const struct extent_map *em)
{
struct rb_node *next;
@@ -585,8 +581,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
return add_extent_mapping(em_tree, em, 0);
}
-/**
- * Add extent mapping into em_tree
+/*
+ * Add extent mapping into em_tree.
*
* @fs_info: the filesystem
* @em_tree: extent tree into which we want to insert the extent mapping
@@ -613,6 +609,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
int ret;
struct extent_map *em = *em_in;
+ /*
+ * Tree-checker should have rejected any inline extent with non-zero
+ * file offset. Here just do a sanity check.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ ASSERT(em->start == 0);
+
ret = add_extent_mapping(em_tree, em, 0);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6bb9fa961a6a..5de73466b2ca 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -9,13 +9,18 @@
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
+#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
-#include "volumes.h"
+#include "bio.h"
#include "print-tree.h"
#include "compression.h"
+#include "fs.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "super.h"
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
@@ -24,8 +29,8 @@
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
-/**
- * Set inode's size according to filesystem options
+/*
+ * Set inode's size according to filesystem options.
*
* @inode: inode we want to update the disk_i_size for
* @new_i_size: i_size we want to set to, 0 if we use i_size
@@ -64,8 +69,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
spin_unlock(&inode->lock);
}
-/**
- * Mark range within a file as having a new extent inserted
+/*
+ * Mark range within a file as having a new extent inserted.
*
* @inode: inode being modified
* @start: start file offset of the file extent we've inserted
@@ -92,8 +97,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
EXTENT_DIRTY);
}
-/**
- * Marks an inode range as not having a backing extent
+/*
+ * Mark an inode range as not having a backing extent.
*
* @inode: inode being modified
* @start: start file offset of the file extent we've inserted
@@ -121,12 +126,26 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
start + len - 1, EXTENT_DIRTY, NULL);
}
-static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
- u16 csum_size)
+static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes)
+{
+ ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize));
+
+ return (bytes >> fs_info->sectorsize_bits) * fs_info->csum_size;
+}
+
+static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info, u32 csum_size)
{
- u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size;
+ ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size));
- return ncsums * fs_info->sectorsize;
+ return (csum_size / fs_info->csum_size) << fs_info->sectorsize_bits;
+}
+
+static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info)
+{
+ u32 max_csum_size = round_down(PAGE_SIZE - sizeof(struct btrfs_ordered_sum),
+ fs_info->csum_size);
+
+ return csum_size_to_bytes(fs_info, max_csum_size);
}
/*
@@ -135,9 +154,7 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
*/
static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
{
- int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
-
- return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
+ return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes);
}
int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
@@ -254,7 +271,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
/*
* Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and
- * estore the result to @dst.
+ * store the result to @dst.
*
* Return >0 for the number of sectors we found.
* Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum
@@ -360,15 +377,15 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
return ret;
}
-/**
+/*
* Lookup the checksum for the read bio in csum tree.
*
- * @inode: inode that the bio is for.
- * @bio: bio to look up.
- * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
- * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
- * NULL, the checksum buffer is allocated and returned in
- * btrfs_bio(bio)->csum instead.
+ * @inode: inode that the bio is for.
+ * @bio: bio to look up.
+ * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
+ * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
+ * NULL, the checksum buffer is allocated and returned in
+ * btrfs_bio(bio)->csum instead.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
@@ -510,9 +527,9 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
return ret;
}
-int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit,
- bool nowait)
+int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit,
+ bool nowait)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
@@ -521,11 +538,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_ordered_sum *sums;
struct btrfs_csum_item *item;
LIST_HEAD(tmplist);
- unsigned long offset;
int ret;
- size_t size;
- u64 csum_end;
- const u32 csum_size = fs_info->csum_size;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
@@ -551,16 +564,33 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (ret > 0 && path->slots[0] > 0) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+
+ /*
+ * There are two cases we can hit here for the previous csum
+ * item:
+ *
+ * |<- search range ->|
+ * |<- csum item ->|
+ *
+ * Or
+ * |<- search range ->|
+ * |<- csum item ->|
+ *
+ * Check if the previous csum item covers the leading part of
+ * the search range. If so we have to start from previous csum
+ * item.
+ */
if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
key.type == BTRFS_EXTENT_CSUM_KEY) {
- offset = (start - key.offset) >> fs_info->sectorsize_bits;
- if (offset * csum_size <
+ if (bytes_to_csum_size(fs_info, start - key.offset) <
btrfs_item_size(leaf, path->slots[0] - 1))
path->slots[0]--;
}
}
while (start <= end) {
+ u64 csum_end;
+
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
@@ -580,8 +610,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (key.offset > start)
start = key.offset;
- size = btrfs_item_size(leaf, path->slots[0]);
- csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
+ csum_end = key.offset + csum_size_to_bytes(fs_info,
+ btrfs_item_size(leaf, path->slots[0]));
if (csum_end <= start) {
path->slots[0]++;
continue;
@@ -591,8 +621,11 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
while (start < csum_end) {
+ unsigned long offset;
+ size_t size;
+
size = min_t(size_t, csum_end - start,
- max_ordered_sum_bytes(fs_info, csum_size));
+ max_ordered_sum_bytes(fs_info));
sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
GFP_NOFS);
if (!sums) {
@@ -603,16 +636,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
sums->bytenr = start;
sums->len = (int)size;
- offset = (start - key.offset) >> fs_info->sectorsize_bits;
- offset *= csum_size;
- size >>= fs_info->sectorsize_bits;
+ offset = bytes_to_csum_size(fs_info, start - key.offset);
read_extent_buffer(path->nodes[0],
sums->sums,
((unsigned long)item) + offset,
- csum_size * size);
+ bytes_to_csum_size(fs_info, size));
- start += fs_info->sectorsize * size;
+ start += size;
list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
@@ -630,8 +661,129 @@ fail:
return ret;
}
-/**
- * Calculate checksums of the data contained inside a bio
+/*
+ * Do the same work as btrfs_lookup_csums_list(), the difference is in how
+ * we return the result.
+ *
+ * This version will set the corresponding bits in @csum_bitmap to represent
+ * that there is a csum found.
+ * Each bit represents a sector. Thus caller should ensure @csum_buf passed
+ * in is large enough to contain all csums.
+ */
+int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
+ u8 *csum_buf, unsigned long *csum_bitmap)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_csum_item *item;
+ const u64 orig_start = start;
+ int ret;
+
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(end + 1, fs_info->sectorsize));
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = start;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0 && path->slots[0] > 0) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+
+ /*
+ * There are two cases we can hit here for the previous csum
+ * item:
+ *
+ * |<- search range ->|
+ * |<- csum item ->|
+ *
+ * Or
+ * |<- search range ->|
+ * |<- csum item ->|
+ *
+ * Check if the previous csum item covers the leading part of
+ * the search range. If so we have to start from previous csum
+ * item.
+ */
+ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+ key.type == BTRFS_EXTENT_CSUM_KEY) {
+ if (bytes_to_csum_size(fs_info, start - key.offset) <
+ btrfs_item_size(leaf, path->slots[0] - 1))
+ path->slots[0]--;
+ }
+ }
+
+ while (start <= end) {
+ u64 csum_end;
+
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset > end)
+ break;
+
+ if (key.offset > start)
+ start = key.offset;
+
+ csum_end = key.offset + csum_size_to_bytes(fs_info,
+ btrfs_item_size(leaf, path->slots[0]));
+ if (csum_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ csum_end = min(csum_end, end + 1);
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ while (start < csum_end) {
+ unsigned long offset;
+ size_t size;
+ u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info,
+ start - orig_start);
+
+ size = min_t(size_t, csum_end - start, end + 1 - start);
+
+ offset = bytes_to_csum_size(fs_info, start - key.offset);
+
+ read_extent_buffer(path->nodes[0], csum_dest,
+ ((unsigned long)item) + offset,
+ bytes_to_csum_size(fs_info, size));
+
+ bitmap_set(csum_bitmap,
+ (start - orig_start) >> fs_info->sectorsize_bits,
+ size >> fs_info->sectorsize_bits);
+
+ start += size;
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+fail:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Calculate checksums of the data contained inside a bio.
*
* @inode: Owner of the data inside the bio
* @bio: Contains the data to be checksummed
@@ -746,15 +898,16 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
}
/*
- * helper function for csum removal, this expects the
- * key to describe the csum pointed to by the path, and it expects
- * the csum to overlap the range [bytenr, len]
+ * Remove one checksum overlapping a range.
*
- * The csum should not be entirely contained in the range and the
- * range should not be entirely contained in the csum.
+ * This expects the key to describe the csum pointed to by the path, and it
+ * expects the csum to overlap the range [bytenr, len]
*
- * This calls btrfs_truncate_item with the correct args based on the
- * overlap, and fixes up the key as required.
+ * The csum should not be entirely contained in the range and the range should
+ * not be entirely contained in the csum.
+ *
+ * This calls btrfs_truncate_item with the correct args based on the overlap,
+ * and fixes up the key as required.
*/
static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
@@ -803,8 +956,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
}
/*
- * deletes the csum items from the csum tree for a given
- * range of bytes.
+ * Delete the csum items from the csum tree for a given range of bytes.
*/
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len)
@@ -1209,7 +1361,6 @@ out:
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
- const bool new_inline,
struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1261,10 +1412,9 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
*/
em->orig_start = EXTENT_MAP_HOLE;
em->block_len = (u64)-1;
- if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
+ em->compress_type = compress_type;
+ if (compress_type != BTRFS_COMPRESS_NONE)
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
new file mode 100644
index 000000000000..031225668434
--- /dev/null
+++ b/fs/btrfs/file-item.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_FILE_ITEM_H
+#define BTRFS_FILE_ITEM_H
+
+#include "accessors.h"
+
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
+ (offsetof(struct btrfs_file_extent_item, disk_bytenr))
+
+static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info)
+{
+ return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+/*
+ * Return the number of bytes used by the item on disk, minus the size of any
+ * extent headers. If a file is compressed on disk, this is the compressed
+ * size.
+ */
+static inline u32 btrfs_file_extent_inline_item_len(
+ const struct extent_buffer *eb,
+ int nr)
+{
+ return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+static inline unsigned long btrfs_file_extent_inline_start(
+ const struct btrfs_file_extent_item *e)
+{
+ return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
+{
+ return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
+}
+
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 len);
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid, u64 pos,
+ u64 num_bytes);
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ u64 bytenr, int mod);
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums);
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
+ u64 offset, bool one_ordered);
+int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit,
+ bool nowait);
+int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
+ u8 *csum_buf, unsigned long *csum_bitmap);
+void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ struct extent_map *em);
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len);
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len);
+void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size);
+u64 btrfs_file_extent_end(const struct btrfs_path *path);
+
+#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d01631d47806..91b00eb2440e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -30,329 +30,13 @@
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"
-
-static struct kmem_cache *btrfs_inode_defrag_cachep;
-/*
- * when auto defrag is enabled we
- * queue up these defrag structs to remember which
- * inodes need defragging passes
- */
-struct inode_defrag {
- struct rb_node rb_node;
- /* objectid */
- u64 ino;
- /*
- * transid where the defrag was added, we search for
- * extents newer than this
- */
- u64 transid;
-
- /* root objectid */
- u64 root;
-
- /*
- * The extent size threshold for autodefrag.
- *
- * This value is different for compressed/non-compressed extents,
- * thus needs to be passed from higher layer.
- * (aka, inode_should_defrag())
- */
- u32 extent_thresh;
-};
-
-static int __compare_inode_defrag(struct inode_defrag *defrag1,
- struct inode_defrag *defrag2)
-{
- if (defrag1->root > defrag2->root)
- return 1;
- else if (defrag1->root < defrag2->root)
- return -1;
- else if (defrag1->ino > defrag2->ino)
- return 1;
- else if (defrag1->ino < defrag2->ino)
- return -1;
- else
- return 0;
-}
-
-/* pop a record for an inode into the defrag tree. The lock
- * must be held already
- *
- * If you're inserting a record for an older transid than an
- * existing record, the transid already in the tree is lowered
- *
- * If an existing record is found the defrag item you
- * pass in is freed
- */
-static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
- struct inode_defrag *defrag)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct inode_defrag *entry;
- struct rb_node **p;
- struct rb_node *parent = NULL;
- int ret;
-
- p = &fs_info->defrag_inodes.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct inode_defrag, rb_node);
-
- ret = __compare_inode_defrag(defrag, entry);
- if (ret < 0)
- p = &parent->rb_left;
- else if (ret > 0)
- p = &parent->rb_right;
- else {
- /* if we're reinserting an entry for
- * an old defrag run, make sure to
- * lower the transid of our existing record
- */
- if (defrag->transid < entry->transid)
- entry->transid = defrag->transid;
- entry->extent_thresh = min(defrag->extent_thresh,
- entry->extent_thresh);
- return -EEXIST;
- }
- }
- set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
- rb_link_node(&defrag->rb_node, parent, p);
- rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
- return 0;
-}
-
-static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
-{
- if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
- return 0;
-
- if (btrfs_fs_closing(fs_info))
- return 0;
-
- return 1;
-}
-
-/*
- * insert a defrag record for this inode if auto defrag is
- * enabled
- */
-int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u32 extent_thresh)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode_defrag *defrag;
- u64 transid;
- int ret;
-
- if (!__need_auto_defrag(fs_info))
- return 0;
-
- if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
- return 0;
-
- if (trans)
- transid = trans->transid;
- else
- transid = inode->root->last_trans;
-
- defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
- if (!defrag)
- return -ENOMEM;
-
- defrag->ino = btrfs_ino(inode);
- defrag->transid = transid;
- defrag->root = root->root_key.objectid;
- defrag->extent_thresh = extent_thresh;
-
- spin_lock(&fs_info->defrag_inodes_lock);
- if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
- /*
- * If we set IN_DEFRAG flag and evict the inode from memory,
- * and then re-read this inode, this new inode doesn't have
- * IN_DEFRAG flag. At the case, we may find the existed defrag.
- */
- ret = __btrfs_add_inode_defrag(inode, defrag);
- if (ret)
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- } else {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- }
- spin_unlock(&fs_info->defrag_inodes_lock);
- return 0;
-}
-
-/*
- * pick the defragable inode that we want, if it doesn't exist, we will get
- * the next one.
- */
-static struct inode_defrag *
-btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
-{
- struct inode_defrag *entry = NULL;
- struct inode_defrag tmp;
- struct rb_node *p;
- struct rb_node *parent = NULL;
- int ret;
-
- tmp.ino = ino;
- tmp.root = root;
-
- spin_lock(&fs_info->defrag_inodes_lock);
- p = fs_info->defrag_inodes.rb_node;
- while (p) {
- parent = p;
- entry = rb_entry(parent, struct inode_defrag, rb_node);
-
- ret = __compare_inode_defrag(&tmp, entry);
- if (ret < 0)
- p = parent->rb_left;
- else if (ret > 0)
- p = parent->rb_right;
- else
- goto out;
- }
-
- if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
- parent = rb_next(parent);
- if (parent)
- entry = rb_entry(parent, struct inode_defrag, rb_node);
- else
- entry = NULL;
- }
-out:
- if (entry)
- rb_erase(parent, &fs_info->defrag_inodes);
- spin_unlock(&fs_info->defrag_inodes_lock);
- return entry;
-}
-
-void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
-{
- struct inode_defrag *defrag;
- struct rb_node *node;
-
- spin_lock(&fs_info->defrag_inodes_lock);
- node = rb_first(&fs_info->defrag_inodes);
- while (node) {
- rb_erase(node, &fs_info->defrag_inodes);
- defrag = rb_entry(node, struct inode_defrag, rb_node);
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-
- cond_resched_lock(&fs_info->defrag_inodes_lock);
-
- node = rb_first(&fs_info->defrag_inodes);
- }
- spin_unlock(&fs_info->defrag_inodes_lock);
-}
-
-#define BTRFS_DEFRAG_BATCH 1024
-
-static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
- struct inode_defrag *defrag)
-{
- struct btrfs_root *inode_root;
- struct inode *inode;
- struct btrfs_ioctl_defrag_range_args range;
- int ret = 0;
- u64 cur = 0;
-
-again:
- if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
- goto cleanup;
- if (!__need_auto_defrag(fs_info))
- goto cleanup;
-
- /* get the inode */
- inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
- if (IS_ERR(inode_root)) {
- ret = PTR_ERR(inode_root);
- goto cleanup;
- }
-
- inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
- btrfs_put_root(inode_root);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- goto cleanup;
- }
-
- if (cur >= i_size_read(inode)) {
- iput(inode);
- goto cleanup;
- }
-
- /* do a chunk of defrag */
- clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
- memset(&range, 0, sizeof(range));
- range.len = (u64)-1;
- range.start = cur;
- range.extent_thresh = defrag->extent_thresh;
-
- sb_start_write(fs_info->sb);
- ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
- BTRFS_DEFRAG_BATCH);
- sb_end_write(fs_info->sb);
- iput(inode);
-
- if (ret < 0)
- goto cleanup;
-
- cur = max(cur + fs_info->sectorsize, range.start);
- goto again;
-
-cleanup:
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- return ret;
-}
-
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
-{
- struct inode_defrag *defrag;
- u64 first_ino = 0;
- u64 root_objectid = 0;
-
- atomic_inc(&fs_info->defrag_running);
- while (1) {
- /* Pause the auto defragger. */
- if (test_bit(BTRFS_FS_STATE_REMOUNTING,
- &fs_info->fs_state))
- break;
-
- if (!__need_auto_defrag(fs_info))
- break;
-
- /* find an inode to defrag */
- defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
- first_ino);
- if (!defrag) {
- if (root_objectid || first_ino) {
- root_objectid = 0;
- first_ino = 0;
- continue;
- } else {
- break;
- }
- }
-
- first_ino = defrag->ino + 1;
- root_objectid = defrag->root;
-
- __btrfs_run_defrag_inode(fs_info, defrag);
- }
- atomic_dec(&fs_info->defrag_running);
-
- /*
- * during unmount, we use the transaction_wait queue to
- * wait for the defragger to stop
- */
- wake_up(&fs_info->transaction_wait);
- return 0;
-}
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "file-item.h"
+#include "ioctl.h"
+#include "file.h"
+#include "super.h"
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
@@ -696,7 +380,10 @@ next_slot:
args->start - extent_offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
}
key.offset = args->start;
}
@@ -783,7 +470,10 @@ delete_extent_item:
key.offset - extent_offset, 0,
false);
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
args->bytes_found += extent_end - key.offset;
}
@@ -1302,7 +992,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
struct btrfs_ordered_extent *ordered;
if (nowait) {
- if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) {
+ if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state)) {
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
@@ -1372,6 +1063,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
+ struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
u64 num_bytes;
int ret;
@@ -1388,12 +1080,14 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
num_bytes = lockend - lockstart + 1;
if (nowait) {
- if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) {
+ if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
+ &cached_state)) {
btrfs_drew_write_unlock(&root->snapshot_lock);
return -EAGAIN;
}
} else {
- btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
+ &cached_state);
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL, nowait, false);
@@ -1402,7 +1096,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
else
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
- unlock_extent(&inode->io_tree, lockstart, lockend, NULL);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
return ret;
}
@@ -1505,7 +1199,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
if (nowait)
ilock_flags |= BTRFS_ILOCK_TRY;
- ret = btrfs_inode_lock(inode, ilock_flags);
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
if (ret < 0)
return ret;
@@ -1740,7 +1434,7 @@ again:
iocb->ki_pos += num_written;
}
out:
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
return num_written ? num_written : ret;
}
@@ -1780,19 +1474,19 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
- err = btrfs_inode_lock(inode, ilock_flags);
+ err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
if (err < 0)
return err;
err = generic_write_checks(iocb, from);
if (err <= 0) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
return err;
}
err = btrfs_write_check(iocb, from, err);
if (err < 0) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto out;
}
@@ -1803,13 +1497,13 @@ relock:
*/
if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
pos + iov_iter_count(from) > i_size_read(inode)) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
ilock_flags &= ~BTRFS_ILOCK_SHARED;
goto relock;
}
if (check_direct_IO(fs_info, from, pos)) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto buffered;
}
@@ -1840,7 +1534,7 @@ relock:
* iocb, and that needs to lock the inode. So unlock it before calling
* iomap_dio_complete() to avoid a deadlock.
*/
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
if (IS_ERR_OR_NULL(dio))
err = PTR_ERR_OR_ZERO(dio);
@@ -1887,8 +1581,8 @@ buffered:
/*
* If we are in a NOWAIT context, then return -EAGAIN to signal the caller
* it must retry the operation in a context where blocking is acceptable,
- * since we currently don't have NOWAIT semantics support for buffered IO
- * and may block there for many reasons (reserving space for example).
+ * because even if we end up not blocking during the buffered IO attempt
+ * below, we will block when flushing and waiting for the IO.
*/
if (iocb->ki_flags & IOCB_NOWAIT) {
err = -EAGAIN;
@@ -1928,7 +1622,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
loff_t count;
ssize_t ret;
- btrfs_inode_lock(inode, 0);
+ btrfs_inode_lock(BTRFS_I(inode), 0);
count = encoded->len;
ret = generic_write_checks_count(iocb, &count);
if (ret == 0 && count != encoded->len) {
@@ -1947,7 +1641,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = btrfs_do_encoded_write(iocb, from, encoded);
out:
- btrfs_inode_unlock(inode, 0);
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
return ret;
}
@@ -2008,10 +1702,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
{
struct btrfs_file_private *private = filp->private_data;
- if (private && private->filldir_buf)
+ if (private) {
kfree(private->filldir_buf);
- kfree(private);
- filp->private_data = NULL;
+ free_extent_state(private->llseek_cached_state);
+ kfree(private);
+ filp->private_data = NULL;
+ }
/*
* Set by setattr when we are about to truncate a file from a non-zero
@@ -2118,7 +1814,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@@ -2142,7 +1838,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2245,7 +1941,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
@@ -2313,7 +2009,7 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2908,7 +2604,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
bool truncated_block = false;
bool updated_inode = false;
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
@@ -2956,7 +2652,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret) {
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
}
@@ -3055,7 +2751,7 @@ out_only_mutex:
ret = ret2;
}
}
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
@@ -3366,7 +3062,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_PUNCH_HOLE)
return btrfs_punch_hole(file, offset, len);
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
@@ -3416,7 +3112,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = btrfs_zero_range(inode, offset, len, mode);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
@@ -3514,7 +3210,7 @@ out_unlock:
unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state);
out:
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
extent_changeset_free(data_reserved);
return ret;
}
@@ -3526,117 +3222,106 @@ out:
* looping while it gets adjacent subranges, and merging them together.
*/
static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state,
+ bool *search_io_tree,
u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
- const u64 len = end + 1 - start;
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- u64 em_end;
- u64 delalloc_len;
+ u64 len = end + 1 - start;
+ u64 delalloc_len = 0;
+ struct btrfs_ordered_extent *oe;
+ u64 oe_start;
+ u64 oe_end;
/*
* Search the io tree first for EXTENT_DELALLOC. If we find any, it
* means we have delalloc (dirty pages) for which writeback has not
* started yet.
*/
- *delalloc_start_ret = start;
- delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end,
- len, EXTENT_DELALLOC, 1);
- /*
- * If delalloc was found then *delalloc_start_ret has a sector size
- * aligned value (rounded down).
- */
- if (delalloc_len > 0)
+ if (*search_io_tree) {
+ spin_lock(&inode->lock);
+ if (inode->delalloc_bytes > 0) {
+ spin_unlock(&inode->lock);
+ *delalloc_start_ret = start;
+ delalloc_len = count_range_bits(&inode->io_tree,
+ delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1,
+ cached_state);
+ } else {
+ spin_unlock(&inode->lock);
+ }
+ }
+
+ if (delalloc_len > 0) {
+ /*
+ * If delalloc was found then *delalloc_start_ret has a sector size
+ * aligned value (rounded down).
+ */
*delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
+ if (*delalloc_start_ret == start) {
+ /* Delalloc for the whole range, nothing more to do. */
+ if (*delalloc_end_ret == end)
+ return true;
+ /* Else trim our search range for ordered extents. */
+ start = *delalloc_end_ret + 1;
+ len = end + 1 - start;
+ }
+ } else {
+ /* No delalloc, future calls don't need to search again. */
+ *search_io_tree = false;
+ }
+
/*
- * Now also check if there's any extent map in the range that does not
- * map to a hole or prealloc extent. We do this because:
+ * Now also check if there's any ordered extent in the range.
+ * We do this because:
*
* 1) When delalloc is flushed, the file range is locked, we clear the
- * EXTENT_DELALLOC bit from the io tree and create an extent map for
- * an allocated extent. So we might just have been called after
- * delalloc is flushed and before the ordered extent completes and
- * inserts the new file extent item in the subvolume's btree;
+ * EXTENT_DELALLOC bit from the io tree and create an extent map and
+ * an ordered extent for the write. So we might just have been called
+ * after delalloc is flushed and before the ordered extent completes
+ * and inserts the new file extent item in the subvolume's btree;
*
- * 2) We may have an extent map created by flushing delalloc for a
+ * 2) We may have an ordered extent created by flushing delalloc for a
* subrange that starts before the subrange we found marked with
* EXTENT_DELALLOC in the io tree.
+ *
+ * We could also use the extent map tree to find such delalloc that is
+ * being flushed, but using the ordered extents tree is more efficient
+ * because it's usually much smaller as ordered extents are removed from
+ * the tree once they complete. With the extent maps, we mau have them
+ * in the extent map tree for a very long time, and they were either
+ * created by previous writes or loaded by read operations.
*/
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- read_unlock(&em_tree->lock);
-
- /* extent_map_end() returns a non-inclusive end offset. */
- em_end = em ? extent_map_end(em) : 0;
-
- /*
- * If we have a hole/prealloc extent map, check the next one if this one
- * ends before our range's end.
- */
- if (em && (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
- struct extent_map *next_em;
-
- read_lock(&em_tree->lock);
- next_em = lookup_extent_mapping(em_tree, em_end, len - em_end);
- read_unlock(&em_tree->lock);
-
- free_extent_map(em);
- em_end = next_em ? extent_map_end(next_em) : 0;
- em = next_em;
- }
+ oe = btrfs_lookup_first_ordered_range(inode, start, len);
+ if (!oe)
+ return (delalloc_len > 0);
- if (em && (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- free_extent_map(em);
- em = NULL;
- }
+ /* The ordered extent may span beyond our search range. */
+ oe_start = max(oe->file_offset, start);
+ oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
- /*
- * No extent map or one for a hole or prealloc extent. Use the delalloc
- * range we found in the io tree if we have one.
- */
- if (!em)
- return (delalloc_len > 0);
+ btrfs_put_ordered_extent(oe);
- /*
- * We don't have any range as EXTENT_DELALLOC in the io tree, so the
- * extent map is the only subrange representing delalloc.
- */
+ /* Don't have unflushed delalloc, return the ordered extent range. */
if (delalloc_len == 0) {
- *delalloc_start_ret = em->start;
- *delalloc_end_ret = min(end, em_end - 1);
- free_extent_map(em);
+ *delalloc_start_ret = oe_start;
+ *delalloc_end_ret = oe_end;
return true;
}
/*
- * The extent map represents a delalloc range that starts before the
- * delalloc range we found in the io tree.
+ * We have both unflushed delalloc (io_tree) and an ordered extent.
+ * If the ranges are adjacent returned a combined range, otherwise
+ * return the leftmost range.
*/
- if (em->start < *delalloc_start_ret) {
- *delalloc_start_ret = em->start;
- /*
- * If the ranges are adjacent, return a combined range.
- * Otherwise return the extent map's range.
- */
- if (em_end < *delalloc_start_ret)
- *delalloc_end_ret = min(end, em_end - 1);
-
- free_extent_map(em);
- return true;
+ if (oe_start < *delalloc_start_ret) {
+ if (oe_end < *delalloc_start_ret)
+ *delalloc_end_ret = oe_end;
+ *delalloc_start_ret = oe_start;
+ } else if (*delalloc_end_ret + 1 == oe_start) {
+ *delalloc_end_ret = oe_end;
}
- /*
- * The extent map starts after the delalloc range we found in the io
- * tree. If it's adjacent, return a combined range, otherwise return
- * the range found in the io tree.
- */
- if (*delalloc_end_ret + 1 == em->start)
- *delalloc_end_ret = min(end, em_end - 1);
-
- free_extent_map(em);
return true;
}
@@ -3648,6 +3333,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
* sector size aligned.
* @end: The end offset (inclusive value) of the search range.
* It does not need to be sector size aligned.
+ * @cached_state: Extent state record used for speeding up delalloc
+ * searches in the inode's io_tree. Can be NULL.
* @delalloc_start_ret: Output argument, set to the start offset of the
* subrange found with delalloc (may not be sector size
* aligned).
@@ -3659,10 +3346,12 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
* end offsets of the subrange.
*/
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
u64 prev_delalloc_end = 0;
+ bool search_io_tree = true;
bool ret = false;
while (cur_offset < end) {
@@ -3671,6 +3360,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
bool delalloc;
delalloc = find_delalloc_subrange(inode, cur_offset, end,
+ cached_state, &search_io_tree,
&delalloc_start,
&delalloc_end);
if (!delalloc)
@@ -3716,13 +3406,14 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
* is found, it updates @start_ret with the start of the subrange.
*/
static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
+ struct extent_state **cached_state,
u64 start, u64 end, u64 *start_ret)
{
u64 delalloc_start;
u64 delalloc_end;
bool delalloc;
- delalloc = btrfs_find_delalloc_in_range(inode, start, end,
+ delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
&delalloc_start, &delalloc_end);
if (delalloc && whence == SEEK_DATA) {
*start_ret = delalloc_start;
@@ -3765,11 +3456,13 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
return false;
}
-static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
- int whence)
+static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
{
+ struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
+ struct btrfs_file_private *private = file->private_data;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_state *cached_state = NULL;
+ struct extent_state **delalloc_cached_state;
const loff_t i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
struct btrfs_root *root = inode->root;
@@ -3794,6 +3487,22 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
inode_get_bytes(&inode->vfs_inode) == i_size)
return i_size;
+ if (!private) {
+ private = kzalloc(sizeof(*private), GFP_KERNEL);
+ /*
+ * No worries if memory allocation failed.
+ * The private structure is used only for speeding up multiple
+ * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
+ * so everything will still be correct.
+ */
+ file->private_data = private;
+ }
+
+ if (private)
+ delalloc_cached_state = &private->llseek_cached_state;
+ else
+ delalloc_cached_state = NULL;
+
/*
* offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
@@ -3871,6 +3580,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
search_start = offset;
found = find_desired_extent_in_hole(inode, whence,
+ delalloc_cached_state,
search_start,
key.offset - 1,
&found_start);
@@ -3905,6 +3615,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
search_start = offset;
found = find_desired_extent_in_hole(inode, whence,
+ delalloc_cached_state,
search_start,
extent_end - 1,
&found_start);
@@ -3946,7 +3657,8 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
/* We have an implicit hole from the last extent found up to i_size. */
if (!found && start < i_size) {
- found = find_desired_extent_in_hole(inode, whence, start,
+ found = find_desired_extent_in_hole(inode, whence,
+ delalloc_cached_state, start,
i_size - 1, &start);
if (!found)
start = i_size;
@@ -3974,9 +3686,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- offset = find_desired_extent(BTRFS_I(inode), offset, whence);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+ offset = find_desired_extent(file, offset, whence);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
break;
}
@@ -4031,7 +3743,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
return 0;
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
/*
* This is similar to what we do for direct IO writes, see the comment
@@ -4080,7 +3792,7 @@ again:
goto again;
}
}
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
return ret < 0 ? ret : read;
}
@@ -4117,23 +3829,6 @@ const struct file_operations btrfs_file_operations = {
.remap_file_range = btrfs_remap_file_range,
};
-void __cold btrfs_auto_defrag_exit(void)
-{
- kmem_cache_destroy(btrfs_inode_defrag_cachep);
-}
-
-int __init btrfs_auto_defrag_init(void)
-{
- btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
- sizeof(struct inode_defrag), 0,
- SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_inode_defrag_cachep)
- return -ENOMEM;
-
- return 0;
-}
-
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
{
int ret;
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
new file mode 100644
index 000000000000..82b34fbb295f
--- /dev/null
+++ b/fs/btrfs/file.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_FILE_H
+#define BTRFS_FILE_H
+
+extern const struct file_operations btrfs_file_operations;
+
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_drop_extents_args *args);
+int btrfs_replace_file_extents(struct btrfs_inode *inode,
+ struct btrfs_path *path, const u64 start,
+ const u64 end,
+ struct btrfs_replace_extent_info *extent_info,
+ struct btrfs_trans_handle **trans_out);
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u64 start, u64 end);
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+int btrfs_release_file(struct inode *inode, struct file *file);
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos, size_t write_bytes,
+ struct extent_state **cached, bool noreserve);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes, bool nowait);
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret);
+
+#endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f4023651dd68..0d250d052487 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -11,8 +11,10 @@
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
-#include "misc.h"
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
+#include "misc.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
@@ -24,11 +26,18 @@
#include "discard.h"
#include "subpage.h"
#include "inode-item.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "file.h"
+#include "super.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
#define FORCE_EXTENT_THRESHOLD SZ_1M
+static struct kmem_cache *btrfs_free_space_cachep;
+static struct kmem_cache *btrfs_free_space_bitmap_cachep;
+
struct btrfs_trim_range {
u64 start;
u64 bytes;
@@ -251,7 +260,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
}
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
goto out;
}
clear_nlink(inode);
@@ -265,7 +274,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->lock);
}
/* One for the lookup ref */
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.type = 0;
@@ -1363,8 +1372,8 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
path, block_group->start);
}
-/**
- * Write out cached info to an inode
+/*
+ * Write out cached info to an inode.
*
* @root: root the inode belongs to
* @inode: freespace inode we are writing out
@@ -2717,8 +2726,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
reclaimable_unusable >=
- div_factor_fine(block_group->zone_capacity,
- bg_reclaim_threshold)) {
+ mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
@@ -3028,10 +3036,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
}
-/**
- * btrfs_is_free_space_trimmed - see if everything is trimmed
- * @block_group: block_group of interest
- *
+/*
* Walk @block_group's free space rb_tree to determine if everything is trimmed.
*/
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
@@ -4132,6 +4137,31 @@ out:
return ret;
}
+int __init btrfs_free_space_init(void)
+{
+ btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
+ sizeof(struct btrfs_free_space), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_free_space_cachep)
+ return -ENOMEM;
+
+ btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
+ PAGE_SIZE, PAGE_SIZE,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_free_space_bitmap_cachep) {
+ kmem_cache_destroy(btrfs_free_space_cachep);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void __cold btrfs_free_space_exit(void)
+{
+ kmem_cache_destroy(btrfs_free_space_cachep);
+ kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
+}
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
* Use this if you need to make a bitmap or extent entry specifically, it
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 6d419ba53e95..a855e0483e03 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -43,6 +43,17 @@ static inline bool btrfs_free_space_trimming_bitmap(
return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING);
}
+/*
+ * Deltas are an effective way to populate global statistics. Give macro names
+ * to make it clear what we're doing. An example is discard_extents in
+ * btrfs_free_space_ctl.
+ */
+enum {
+ BTRFS_STAT_CURR,
+ BTRFS_STAT_PREV,
+ BTRFS_STAT_NR_ENTRIES,
+};
+
struct btrfs_free_space_ctl {
spinlock_t tree_lock;
struct rb_root free_space_offset;
@@ -79,6 +90,8 @@ struct btrfs_io_ctl {
int bitmaps;
};
+int __init btrfs_free_space_init(void);
+void __cold btrfs_free_space_exit(void);
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
struct btrfs_path *path);
int create_free_space_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 367bcfcf68f5..c667e878ef1a 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -5,12 +5,17 @@
#include <linux/kernel.h>
#include <linux/sched/mm.h>
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
#include "free-space-tree.h"
#include "transaction.h"
#include "block-group.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
@@ -803,7 +808,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
- if (block_group->needs_free_space) {
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
@@ -996,7 +1001,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
- if (block_group->needs_free_space) {
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
@@ -1299,7 +1304,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
{
int ret;
- block_group->needs_free_space = 0;
+ clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);
ret = add_new_free_space_info(trans, block_group, path);
if (ret)
@@ -1321,7 +1326,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans,
return 0;
mutex_lock(&block_group->free_space_lock);
- if (!block_group->needs_free_space)
+ if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
goto out;
path = btrfs_alloc_path();
@@ -1354,7 +1359,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
- if (block_group->needs_free_space) {
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
/* We never added this block group to the free space tree. */
return 0;
}
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
new file mode 100644
index 000000000000..5553e1f8afe8
--- /dev/null
+++ b/fs/btrfs/fs.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "messages.h"
+#include "ctree.h"
+#include "fs.h"
+#include "accessors.h"
+
+void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "setting incompat feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "clearing incompat feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "setting compat-ro feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "clearing compat-ro feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
new file mode 100644
index 000000000000..a749367e5ae2
--- /dev/null
+++ b/fs/btrfs/fs.h
@@ -0,0 +1,976 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_FS_H
+#define BTRFS_FS_H
+
+#include <linux/fs.h>
+#include <linux/btrfs_tree.h>
+#include <linux/sizes.h>
+#include "extent-io-tree.h"
+#include "extent_map.h"
+#include "async-thread.h"
+#include "block-rsv.h"
+
+#define BTRFS_MAX_EXTENT_SIZE SZ_128M
+
+#define BTRFS_OLDEST_GENERATION 0ULL
+
+#define BTRFS_EMPTY_DIR_SIZE 0
+
+#define BTRFS_DIRTY_METADATA_THRESH SZ_32M
+
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
+#define BTRFS_SUPER_INFO_SIZE 4096
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
+
+/*
+ * The reserved space at the beginning of each device. It covers the primary
+ * super block and leaves space for potential use by other tools like
+ * bootloaders or to lower potential damage of accidental overwrite.
+ */
+#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
+/*
+ * Runtime (in-memory) states of filesystem
+ */
+enum {
+ /* Global indicator of serious filesystem errors */
+ BTRFS_FS_STATE_ERROR,
+ /*
+ * Filesystem is being remounted, allow to skip some operations, like
+ * defrag
+ */
+ BTRFS_FS_STATE_REMOUNTING,
+ /* Filesystem in RO mode */
+ BTRFS_FS_STATE_RO,
+ /* Track if a transaction abort has been reported on this filesystem */
+ BTRFS_FS_STATE_TRANS_ABORTED,
+ /*
+ * Bio operations should be blocked on this filesystem because a source
+ * or target device is being destroyed as part of a device replace
+ */
+ BTRFS_FS_STATE_DEV_REPLACING,
+ /* The btrfs_fs_info created for self-tests */
+ BTRFS_FS_STATE_DUMMY_FS_INFO,
+
+ BTRFS_FS_STATE_NO_CSUMS,
+
+ /* Indicates there was an error cleaning up a log tree. */
+ BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+
+ BTRFS_FS_STATE_COUNT
+};
+
+enum {
+ BTRFS_FS_CLOSING_START,
+ BTRFS_FS_CLOSING_DONE,
+ BTRFS_FS_LOG_RECOVERING,
+ BTRFS_FS_OPEN,
+ BTRFS_FS_QUOTA_ENABLED,
+ BTRFS_FS_UPDATE_UUID_TREE_GEN,
+ BTRFS_FS_CREATING_FREE_SPACE_TREE,
+ BTRFS_FS_BTREE_ERR,
+ BTRFS_FS_LOG1_ERR,
+ BTRFS_FS_LOG2_ERR,
+ BTRFS_FS_QUOTA_OVERRIDE,
+ /* Used to record internally whether fs has been frozen */
+ BTRFS_FS_FROZEN,
+ /*
+ * Indicate that balance has been set up from the ioctl and is in the
+ * main phase. The fs_info::balance_ctl is initialized.
+ */
+ BTRFS_FS_BALANCE_RUNNING,
+
+ /*
+ * Indicate that relocation of a chunk has started, it's set per chunk
+ * and is toggled between chunks.
+ */
+ BTRFS_FS_RELOC_RUNNING,
+
+ /* Indicate that the cleaner thread is awake and doing something. */
+ BTRFS_FS_CLEANER_RUNNING,
+
+ /*
+ * The checksumming has an optimized version and is considered fast,
+ * so we don't need to offload checksums to workqueues.
+ */
+ BTRFS_FS_CSUM_IMPL_FAST,
+
+ /* Indicate that the discard workqueue can service discards. */
+ BTRFS_FS_DISCARD_RUNNING,
+
+ /* Indicate that we need to cleanup space cache v1 */
+ BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
+
+ /* Indicate that we can't trust the free space tree for caching yet */
+ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
+
+ /* Indicate whether there are any tree modification log users */
+ BTRFS_FS_TREE_MOD_LOG_USERS,
+
+ /* Indicate that we want the transaction kthread to commit right now. */
+ BTRFS_FS_COMMIT_TRANS,
+
+ /* Indicate we have half completed snapshot deletions pending. */
+ BTRFS_FS_UNFINISHED_DROPS,
+
+ /* Indicate we have to finish a zone to do next allocation. */
+ BTRFS_FS_NEED_ZONE_FINISH,
+
+ /* Indicate that we want to commit the transaction. */
+ BTRFS_FS_NEED_TRANS_COMMIT,
+
+#if BITS_PER_LONG == 32
+ /* Indicate if we have error/warn message printed on 32bit systems */
+ BTRFS_FS_32BIT_ERROR,
+ BTRFS_FS_32BIT_WARN,
+#endif
+};
+
+/*
+ * Flags for mount options.
+ *
+ * Note: don't forget to add new options to btrfs_show_options()
+ */
+enum {
+ BTRFS_MOUNT_NODATASUM = (1UL << 0),
+ BTRFS_MOUNT_NODATACOW = (1UL << 1),
+ BTRFS_MOUNT_NOBARRIER = (1UL << 2),
+ BTRFS_MOUNT_SSD = (1UL << 3),
+ BTRFS_MOUNT_DEGRADED = (1UL << 4),
+ BTRFS_MOUNT_COMPRESS = (1UL << 5),
+ BTRFS_MOUNT_NOTREELOG = (1UL << 6),
+ BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
+ BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
+ BTRFS_MOUNT_NOSSD = (1UL << 9),
+ BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
+ BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
+ BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
+ BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
+ BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
+ BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
+ BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
+ BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
+ BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
+ BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
+ BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
+ BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
+ BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
+ BTRFS_MOUNT_NODISCARD = (1UL << 31),
+};
+
+/*
+ * Compat flags that we support. If any incompat flags are set other than the
+ * ones specified below then we will fail to mount
+ */
+#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
+
+#define BTRFS_FEATURE_COMPAT_RO_SUPP \
+ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
+ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
+ BTRFS_FEATURE_COMPAT_RO_VERITY | \
+ BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)
+
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
+ */
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+#else
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED)
+#endif
+
+#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
+ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
+
+#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_DEFAULT_MAX_INLINE (2048)
+
+struct btrfs_dev_replace {
+ /* See #define above */
+ u64 replace_state;
+ /* Seconds since 1-Jan-1970 */
+ time64_t time_started;
+ /* Seconds since 1-Jan-1970 */
+ time64_t time_stopped;
+ atomic64_t num_write_errors;
+ atomic64_t num_uncorrectable_read_errors;
+
+ u64 cursor_left;
+ u64 committed_cursor_left;
+ u64 cursor_left_last_write_of_item;
+ u64 cursor_right;
+
+ /* See #define above */
+ u64 cont_reading_from_srcdev_mode;
+
+ int is_valid;
+ int item_needs_writeback;
+ struct btrfs_device *srcdev;
+ struct btrfs_device *tgtdev;
+
+ struct mutex lock_finishing_cancel_unmount;
+ struct rw_semaphore rwsem;
+
+ struct btrfs_scrub_progress scrub_progress;
+
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
+};
+
+/*
+ * Free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes. They are used for all metadata
+ * allocations. In ssd_spread mode they are also used for data allocations.
+ */
+struct btrfs_free_cluster {
+ spinlock_t lock;
+ spinlock_t refill_lock;
+ struct rb_root root;
+
+ /* Largest extent in this cluster */
+ u64 max_size;
+
+ /* First extent starting offset */
+ u64 window_start;
+
+ /* We did a full search and couldn't create a cluster */
+ bool fragmented;
+
+ struct btrfs_block_group *block_group;
+ /*
+ * When a cluster is allocated from a block group, we put the cluster
+ * onto a list in the block group so that it can be freed before the
+ * block group is freed.
+ */
+ struct list_head block_group_list;
+};
+
+/* Discard control. */
+/*
+ * Async discard uses multiple lists to differentiate the discard filter
+ * parameters. Index 0 is for completely free block groups where we need to
+ * ensure the entire block group is trimmed without being lossy. Indices
+ * afterwards represent monotonically decreasing discard filter sizes to
+ * prioritize what should be discarded next.
+ */
+#define BTRFS_NR_DISCARD_LISTS 3
+#define BTRFS_DISCARD_INDEX_UNUSED 0
+#define BTRFS_DISCARD_INDEX_START 1
+
+struct btrfs_discard_ctl {
+ struct workqueue_struct *discard_workers;
+ struct delayed_work work;
+ spinlock_t lock;
+ struct btrfs_block_group *block_group;
+ struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
+ u64 prev_discard;
+ u64 prev_discard_time;
+ atomic_t discardable_extents;
+ atomic64_t discardable_bytes;
+ u64 max_discard_size;
+ u64 delay_ms;
+ u32 iops_limit;
+ u32 kbps_limit;
+ u64 discard_extent_bytes;
+ u64 discard_bitmap_bytes;
+ atomic64_t discard_bytes_saved;
+};
+
+/*
+ * Exclusive operations (device replace, resize, device add/remove, balance)
+ */
+enum btrfs_exclusive_operation {
+ BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE_PAUSED,
+ BTRFS_EXCLOP_BALANCE,
+ BTRFS_EXCLOP_DEV_ADD,
+ BTRFS_EXCLOP_DEV_REMOVE,
+ BTRFS_EXCLOP_DEV_REPLACE,
+ BTRFS_EXCLOP_RESIZE,
+ BTRFS_EXCLOP_SWAP_ACTIVATE,
+};
+
+/* Store data about transaction commits, exported via sysfs. */
+struct btrfs_commit_stats {
+ /* Total number of commits */
+ u64 commit_count;
+ /* The maximum commit duration so far in ns */
+ u64 max_commit_dur;
+ /* The last commit duration in ns */
+ u64 last_commit_dur;
+ /* The total commit duration in ns */
+ u64 total_commit_dur;
+};
+
+struct btrfs_fs_info {
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ unsigned long flags;
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ struct btrfs_root *dev_root;
+ struct btrfs_root *fs_root;
+ struct btrfs_root *quota_root;
+ struct btrfs_root *uuid_root;
+ struct btrfs_root *data_reloc_root;
+ struct btrfs_root *block_group_root;
+
+ /* The log root tree is a directory of all the other log roots */
+ struct btrfs_root *log_root_tree;
+
+ /* The tree that holds the global roots (csum, extent, etc) */
+ rwlock_t global_root_lock;
+ struct rb_root global_root_tree;
+
+ spinlock_t fs_roots_radix_lock;
+ struct radix_tree_root fs_roots_radix;
+
+ /* Block group cache stuff */
+ rwlock_t block_group_cache_lock;
+ struct rb_root_cached block_group_cache_tree;
+
+ /* Keep track of unallocated space */
+ atomic64_t free_chunk_space;
+
+ /* Track ranges which are used by log trees blocks/logged data extents */
+ struct extent_io_tree excluded_extents;
+
+ /* logical->physical extent mapping */
+ struct extent_map_tree mapping_tree;
+
+ /*
+ * Block reservation for extent, checksum, root tree and delayed dir
+ * index item.
+ */
+ struct btrfs_block_rsv global_block_rsv;
+ /* Block reservation for metadata operations */
+ struct btrfs_block_rsv trans_block_rsv;
+ /* Block reservation for chunk tree */
+ struct btrfs_block_rsv chunk_block_rsv;
+ /* Block reservation for delayed operations */
+ struct btrfs_block_rsv delayed_block_rsv;
+ /* Block reservation for delayed refs */
+ struct btrfs_block_rsv delayed_refs_rsv;
+
+ struct btrfs_block_rsv empty_block_rsv;
+
+ u64 generation;
+ u64 last_trans_committed;
+ /*
+ * Generation of the last transaction used for block group relocation
+ * since the filesystem was last mounted (or 0 if none happened yet).
+ * Must be written and read while holding btrfs_fs_info::commit_root_sem.
+ */
+ u64 last_reloc_trans;
+ u64 avg_delayed_ref_runtime;
+
+ /*
+ * This is updated to the current trans every time a full commit is
+ * required instead of the faster short fsync log commits
+ */
+ u64 last_trans_log_full_commit;
+ unsigned long mount_opt;
+
+ unsigned long compress_type:4;
+ unsigned int compress_level;
+ u32 commit_interval;
+ /*
+ * It is a suggestive number, the read side is safe even it gets a
+ * wrong number because we will write out the data into a regular
+ * extent. The write side(mount/remount) is under ->s_umount lock,
+ * so it is also safe.
+ */
+ u64 max_inline;
+
+ struct btrfs_transaction *running_transaction;
+ wait_queue_head_t transaction_throttle;
+ wait_queue_head_t transaction_wait;
+ wait_queue_head_t transaction_blocked_wait;
+ wait_queue_head_t async_submit_wait;
+
+ /*
+ * Used to protect the incompat_flags, compat_flags, compat_ro_flags
+ * when they are updated.
+ *
+ * Because we do not clear the flags for ever, so we needn't use
+ * the lock on the read side.
+ *
+ * We also needn't use the lock when we mount the fs, because
+ * there is no other task which will update the flag.
+ */
+ spinlock_t super_lock;
+ struct btrfs_super_block *super_copy;
+ struct btrfs_super_block *super_for_commit;
+ struct super_block *sb;
+ struct inode *btree_inode;
+ struct mutex tree_log_mutex;
+ struct mutex transaction_kthread_mutex;
+ struct mutex cleaner_mutex;
+ struct mutex chunk_mutex;
+
+ /*
+ * This is taken to make sure we don't set block groups ro after the
+ * free space cache has been allocated on them.
+ */
+ struct mutex ro_block_group_mutex;
+
+ /*
+ * This is used during read/modify/write to make sure no two ios are
+ * trying to mod the same stripe at the same time.
+ */
+ struct btrfs_stripe_hash_table *stripe_hash_table;
+
+ /*
+ * This protects the ordered operations list only while we are
+ * processing all of the entries on it. This way we make sure the
+ * commit code doesn't find the list temporarily empty because another
+ * function happens to be doing non-waiting preflush before jumping
+ * into the main commit.
+ */
+ struct mutex ordered_operations_mutex;
+
+ struct rw_semaphore commit_root_sem;
+
+ struct rw_semaphore cleanup_work_sem;
+
+ struct rw_semaphore subvol_sem;
+
+ spinlock_t trans_lock;
+ /*
+ * The reloc mutex goes with the trans lock, it is taken during commit
+ * to protect us from the relocation code.
+ */
+ struct mutex reloc_mutex;
+
+ struct list_head trans_list;
+ struct list_head dead_roots;
+ struct list_head caching_block_groups;
+
+ spinlock_t delayed_iput_lock;
+ struct list_head delayed_iputs;
+ atomic_t nr_delayed_iputs;
+ wait_queue_head_t delayed_iputs_wait;
+
+ atomic64_t tree_mod_seq;
+
+ /* This protects tree_mod_log and tree_mod_seq_list */
+ rwlock_t tree_mod_log_lock;
+ struct rb_root tree_mod_log;
+ struct list_head tree_mod_seq_list;
+
+ atomic_t async_delalloc_pages;
+
+ /* This is used to protect the following list -- ordered_roots. */
+ spinlock_t ordered_root_lock;
+
+ /*
+ * All fs/file tree roots in which there are data=ordered extents
+ * pending writeback are added into this list.
+ *
+ * These can span multiple transactions and basically include every
+ * dirty data page that isn't from nodatacow.
+ */
+ struct list_head ordered_roots;
+
+ struct mutex delalloc_root_mutex;
+ spinlock_t delalloc_root_lock;
+ /* All fs/file tree roots that have delalloc inodes. */
+ struct list_head delalloc_roots;
+
+ /*
+ * There is a pool of worker threads for checksumming during writes and
+ * a pool for checksumming after reads. This is because readers can
+ * run with FS locks held, and the writers may be waiting for those
+ * locks. We don't want ordering in the pending list to cause
+ * deadlocks, and so the two are serviced separately.
+ *
+ * A third pool does submit_bio to avoid deadlocking with the other two.
+ */
+ struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *hipri_workers;
+ struct btrfs_workqueue *delalloc_workers;
+ struct btrfs_workqueue *flush_workers;
+ struct workqueue_struct *endio_workers;
+ struct workqueue_struct *endio_meta_workers;
+ struct workqueue_struct *rmw_workers;
+ struct workqueue_struct *compressed_write_workers;
+ struct btrfs_workqueue *endio_write_workers;
+ struct btrfs_workqueue *endio_freespace_worker;
+ struct btrfs_workqueue *caching_workers;
+
+ /*
+ * Fixup workers take dirty pages that didn't properly go through the
+ * cow mechanism and make them safe to write. It happens for the
+ * sys_munmap function call path.
+ */
+ struct btrfs_workqueue *fixup_workers;
+ struct btrfs_workqueue *delayed_workers;
+
+ struct task_struct *transaction_kthread;
+ struct task_struct *cleaner_kthread;
+ u32 thread_pool_size;
+
+ struct kobject *space_info_kobj;
+ struct kobject *qgroups_kobj;
+ struct kobject *discard_kobj;
+
+ /* Used to keep from writing metadata until there is a nice batch */
+ struct percpu_counter dirty_metadata_bytes;
+ struct percpu_counter delalloc_bytes;
+ struct percpu_counter ordered_bytes;
+ s32 dirty_metadata_batch;
+ s32 delalloc_batch;
+
+ struct list_head dirty_cowonly_roots;
+
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * The space_info list is effectively read only after initial setup.
+ * It is populated at mount time and cleaned up after all block groups
+ * are removed. RCU is used to protect it.
+ */
+ struct list_head space_info;
+
+ struct btrfs_space_info *data_sinfo;
+
+ struct reloc_control *reloc_ctl;
+
+ /* data_alloc_cluster is only used in ssd_spread mode */
+ struct btrfs_free_cluster data_alloc_cluster;
+
+ /* All metadata allocations go through this cluster. */
+ struct btrfs_free_cluster meta_alloc_cluster;
+
+ /* Auto defrag inodes go here. */
+ spinlock_t defrag_inodes_lock;
+ struct rb_root defrag_inodes;
+ atomic_t defrag_running;
+
+ /* Used to protect avail_{data, metadata, system}_alloc_bits */
+ seqlock_t profiles_lock;
+ /*
+ * These three are in extended format (availability of single chunks is
+ * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted
+ * by corresponding BTRFS_BLOCK_GROUP_* bits)
+ */
+ u64 avail_data_alloc_bits;
+ u64 avail_metadata_alloc_bits;
+ u64 avail_system_alloc_bits;
+
+ /* Balance state */
+ spinlock_t balance_lock;
+ struct mutex balance_mutex;
+ atomic_t balance_pause_req;
+ atomic_t balance_cancel_req;
+ struct btrfs_balance_control *balance_ctl;
+ wait_queue_head_t balance_wait_q;
+
+ /* Cancellation requests for chunk relocation */
+ atomic_t reloc_cancel_req;
+
+ u32 data_chunk_allocations;
+ u32 metadata_ratio;
+
+ void *bdev_holder;
+
+ /* Private scrub information */
+ struct mutex scrub_lock;
+ atomic_t scrubs_running;
+ atomic_t scrub_pause_req;
+ atomic_t scrubs_paused;
+ atomic_t scrub_cancel_req;
+ wait_queue_head_t scrub_pause_wait;
+ /*
+ * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
+ * running.
+ */
+ refcount_t scrub_workers_refcnt;
+ struct workqueue_struct *scrub_workers;
+ struct workqueue_struct *scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity_workers;
+ struct btrfs_subpage_info *subpage_info;
+
+ struct btrfs_discard_ctl discard_ctl;
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ u32 check_integrity_print_mask;
+#endif
+ /* Is qgroup tracking in a consistent state? */
+ u64 qgroup_flags;
+
+ /* Holds configuration and tracking. Protected by qgroup_lock. */
+ struct rb_root qgroup_tree;
+ spinlock_t qgroup_lock;
+
+ /*
+ * Used to avoid frequently calling ulist_alloc()/ulist_free()
+ * when doing qgroup accounting, it must be protected by qgroup_lock.
+ */
+ struct ulist *qgroup_ulist;
+
+ /*
+ * Protect user change for quota operations. If a transaction is needed,
+ * it must be started before locking this lock.
+ */
+ struct mutex qgroup_ioctl_lock;
+
+ /* List of dirty qgroups to be written at next commit. */
+ struct list_head dirty_qgroups;
+
+ /* Used by qgroup for an efficient tree traversal. */
+ u64 qgroup_seq;
+
+ /* Qgroup rescan items. */
+ /* Protects the progress item */
+ struct mutex qgroup_rescan_lock;
+ struct btrfs_key qgroup_rescan_progress;
+ struct btrfs_workqueue *qgroup_rescan_workers;
+ struct completion qgroup_rescan_completion;
+ struct btrfs_work qgroup_rescan_work;
+ /* Protected by qgroup_rescan_lock */
+ bool qgroup_rescan_running;
+ u8 qgroup_drop_subtree_thres;
+
+ /* Filesystem state */
+ unsigned long fs_state;
+
+ struct btrfs_delayed_root *delayed_root;
+
+ /* Extent buffer radix tree */
+ spinlock_t buffer_lock;
+ /* Entries are eb->start / sectorsize */
+ struct radix_tree_root buffer_radix;
+
+ /* Next backup root to be overwritten */
+ int backup_root_index;
+
+ /* Device replace state */
+ struct btrfs_dev_replace dev_replace;
+
+ struct semaphore uuid_tree_rescan_sem;
+
+ /* Used to reclaim the metadata space in the background. */
+ struct work_struct async_reclaim_work;
+ struct work_struct async_data_reclaim_work;
+ struct work_struct preempt_reclaim_work;
+
+ /* Reclaim partially filled block groups in the background */
+ struct work_struct reclaim_bgs_work;
+ struct list_head reclaim_bgs;
+ int bg_reclaim_threshold;
+
+ spinlock_t unused_bgs_lock;
+ struct list_head unused_bgs;
+ struct mutex unused_bg_unpin_mutex;
+ /* Protect block groups that are going to be deleted */
+ struct mutex reclaim_bgs_lock;
+
+ /* Cached block sizes */
+ u32 nodesize;
+ u32 sectorsize;
+ /* ilog2 of sectorsize, use to avoid 64bit division */
+ u32 sectorsize_bits;
+ u32 csum_size;
+ u32 csums_per_leaf;
+ u32 stripesize;
+
+ /*
+ * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
+ * filesystem, on zoned it depends on the device constraints.
+ */
+ u64 max_extent_size;
+
+ /* Block groups and devices containing active swapfiles. */
+ spinlock_t swapfile_pins_lock;
+ struct rb_root swapfile_pins;
+
+ struct crypto_shash *csum_shash;
+
+ /* Type of exclusive operation running, protected by super_lock */
+ enum btrfs_exclusive_operation exclusive_operation;
+
+ /*
+ * Zone size > 0 when in ZONED mode, otherwise it's used for a check
+ * if the mode is enabled
+ */
+ u64 zone_size;
+
+ /* Max size to emit ZONE_APPEND write command */
+ u64 max_zone_append_size;
+ struct mutex zoned_meta_io_lock;
+ spinlock_t treelog_bg_lock;
+ u64 treelog_bg;
+
+ /*
+ * Start of the dedicated data relocation block group, protected by
+ * relocation_bg_lock.
+ */
+ spinlock_t relocation_bg_lock;
+ u64 data_reloc_bg;
+ struct mutex zoned_data_reloc_io_lock;
+
+ u64 nr_global_roots;
+
+ spinlock_t zone_active_bgs_lock;
+ struct list_head zone_active_bgs;
+
+ /* Updates are not protected by any lock */
+ struct btrfs_commit_stats commit_stats;
+
+ /*
+ * Last generation where we dropped a non-relocation root.
+ * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
+ * to change it and to read it, respectively.
+ */
+ u64 last_root_drop_gen;
+
+ /*
+ * Annotations for transaction events (structures are empty when
+ * compiled without lockdep).
+ */
+ struct lockdep_map btrfs_trans_num_writers_map;
+ struct lockdep_map btrfs_trans_num_extwriters_map;
+ struct lockdep_map btrfs_state_change_map[4];
+ struct lockdep_map btrfs_trans_pending_ordered_map;
+ struct lockdep_map btrfs_ordered_extent_map;
+
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ spinlock_t ref_verify_lock;
+ struct rb_root block_tree;
+#endif
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct kobject *debug_kobj;
+ struct list_head allocated_roots;
+
+ spinlock_t eb_leak_lock;
+ struct list_head allocated_ebs;
+#endif
+};
+
+static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
+ u64 gen)
+{
+ WRITE_ONCE(fs_info->last_root_drop_gen, gen);
+}
+
+static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_root_drop_gen);
+}
+
+/*
+ * Take the number of bytes to be checksummed and figure out how many leaves
+ * it would require to store the csums for that many bytes.
+ */
+static inline u64 btrfs_csum_bytes_to_leaves(
+ const struct btrfs_fs_info *fs_info, u64 csum_bytes)
+{
+ const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
+
+ return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
+}
+
+/*
+ * Use this if we would be adding new items, as we could split nodes as we cow
+ * down the tree.
+ */
+static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
+}
+
+/*
+ * Doing a truncate or a modification won't result in new nodes or leaves, just
+ * what we need for COW.
+ */
+static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
+}
+
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
+ sizeof(struct btrfs_item))
+
+static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->zone_size > 0;
+}
+
+/*
+ * Count how many fs_info->max_extent_size cover the @size
+ */
+static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (!fs_info)
+ return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+#endif
+
+ return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
+}
+
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op);
+
+/* Compatibility and incompatibility defines */
+void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name);
+void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name);
+void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name);
+void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name);
+
+#define __btrfs_fs_incompat(fs_info, flags) \
+ (!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags)))
+
+#define __btrfs_fs_compat_ro(fs_info, flags) \
+ (!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags)))
+
+#define btrfs_set_fs_incompat(__fs_info, opt) \
+ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)
+
+#define btrfs_clear_fs_incompat(__fs_info, opt) \
+ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)
+
+#define btrfs_fs_incompat(fs_info, opt) \
+ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+#define btrfs_set_fs_compat_ro(__fs_info, opt) \
+ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)
+
+#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)
+
+#define btrfs_fs_compat_ro(fs_info, opt) \
+ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
+#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
+#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
+ BTRFS_MOUNT_##opt)
+
+#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
+do { \
+ if (!btrfs_test_opt(fs_info, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+ btrfs_set_opt(fs_info->mount_opt, opt); \
+} while (0)
+
+#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
+do { \
+ if (btrfs_test_opt(fs_info, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+ btrfs_clear_opt(fs_info->mount_opt, opt); \
+} while (0)
+
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+ /* Do it this way so we only ever do one test_bit in the normal case. */
+ if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
+ if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags))
+ return 2;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount,
+ * since setting and checking for SB_RDONLY in the superblock's flags is not
+ * atomic.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
+{
+ return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
+ btrfs_fs_closing(fs_info);
+}
+
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
+#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+ &(fs_info)->fs_state)))
+#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
+ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
+ &(fs_info)->fs_state)))
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+
+#define EXPORT_FOR_TESTS
+
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
+ return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+}
+
+void btrfs_test_destroy_inode(struct inode *inode);
+
+#else
+
+#define EXPORT_FOR_TESTS static
+
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
+ return 0;
+}
+#endif
+
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 0eeb5ea87894..b65c45b5d681 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -4,14 +4,20 @@
*/
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
#include "inode-item.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
+#include "space-info.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "file-item.h"
struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
- int slot, const char *name,
- int name_len)
+ int slot,
+ const struct fscrypt_str *name)
{
struct btrfs_inode_ref *ref;
unsigned long ptr;
@@ -27,9 +33,10 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
len = btrfs_inode_ref_name_len(leaf, ref);
name_ptr = (unsigned long)(ref + 1);
cur_offset += len + sizeof(*ref);
- if (len != name_len)
+ if (len != name->len)
continue;
- if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
+ if (memcmp_extent_buffer(leaf, name->name, name_ptr,
+ name->len) == 0)
return ref;
}
return NULL;
@@ -37,7 +44,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
struct extent_buffer *leaf, int slot, u64 ref_objectid,
- const char *name, int name_len)
+ const struct fscrypt_str *name)
{
struct btrfs_inode_extref *extref;
unsigned long ptr;
@@ -60,9 +67,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
name_ptr = (unsigned long)(&extref->name);
ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
- if (ref_name_len == name_len &&
+ if (ref_name_len == name->len &&
btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
- (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0))
+ (memcmp_extent_buffer(leaf, name->name, name_ptr,
+ name->len) == 0))
return extref;
cur_offset += ref_name_len + sizeof(*extref);
@@ -75,7 +83,7 @@ struct btrfs_inode_extref *
btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow)
{
@@ -84,7 +92,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+ key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
@@ -92,13 +100,13 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
if (ret > 0)
return NULL;
return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
- ref_objectid, name, name_len);
+ ref_objectid, name);
}
static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid,
u64 *index)
{
@@ -107,14 +115,14 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_inode_extref *extref;
struct extent_buffer *leaf;
int ret;
- int del_len = name_len + sizeof(*extref);
+ int del_len = name->len + sizeof(*extref);
unsigned long ptr;
unsigned long item_start;
u32 item_size;
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+ key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
path = btrfs_alloc_path();
if (!path)
@@ -132,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
* readonly.
*/
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
- ref_objectid, name, name_len);
+ ref_objectid, name);
if (!extref) {
btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
@@ -168,8 +176,7 @@ out:
}
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
+ struct btrfs_root *root, const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, u64 *index)
{
struct btrfs_path *path;
@@ -182,7 +189,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
u32 sub_item_len;
int ret;
int search_ext_refs = 0;
- int del_len = name_len + sizeof(*ref);
+ int del_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
key.offset = ref_objectid;
@@ -201,8 +208,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
- ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name,
- name_len);
+ ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name);
if (!ref) {
ret = -ENOENT;
search_ext_refs = 1;
@@ -219,7 +225,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
ptr = (unsigned long)ref;
- sub_item_len = name_len + sizeof(*ref);
+ sub_item_len = name->len + sizeof(*ref);
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
@@ -233,7 +239,7 @@ out:
* name in our ref array. Find and remove the extended
* inode ref then.
*/
- return btrfs_del_inode_extref(trans, root, name, name_len,
+ return btrfs_del_inode_extref(trans, root, name,
inode_objectid, ref_objectid, index);
}
@@ -247,12 +253,13 @@ out:
*/
static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 index)
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid,
+ u64 index)
{
struct btrfs_inode_extref *extref;
int ret;
- int ins_len = name_len + sizeof(*extref);
+ int ins_len = name->len + sizeof(*extref);
unsigned long ptr;
struct btrfs_path *path;
struct btrfs_key key;
@@ -260,7 +267,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+ key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
path = btrfs_alloc_path();
if (!path)
@@ -272,7 +279,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
if (btrfs_find_name_in_ext_backref(path->nodes[0],
path->slots[0],
ref_objectid,
- name, name_len))
+ name))
goto out;
btrfs_extend_item(path, ins_len);
@@ -286,12 +293,12 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len;
extref = (struct btrfs_inode_extref *)ptr;
- btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+ btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len);
btrfs_set_inode_extref_index(path->nodes[0], extref, index);
btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
ptr = (unsigned long)&extref->name;
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
@@ -301,8 +308,7 @@ out:
/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
+ struct btrfs_root *root, const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, u64 index)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -311,7 +317,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
unsigned long ptr;
int ret;
- int ins_len = name_len + sizeof(*ref);
+ int ins_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
key.offset = ref_objectid;
@@ -327,7 +333,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ret == -EEXIST) {
u32 old_size;
ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, name_len);
+ name);
if (ref)
goto out;
@@ -336,7 +342,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len);
btrfs_set_inode_ref_index(path->nodes[0], ref, index);
ptr = (unsigned long)(ref + 1);
ret = 0;
@@ -344,7 +350,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ret == -EOVERFLOW) {
if (btrfs_find_name_in_backref(path->nodes[0],
path->slots[0],
- name, name_len))
+ name))
ret = -EEXIST;
else
ret = -EMLINK;
@@ -353,11 +359,11 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len);
btrfs_set_inode_ref_index(path->nodes[0], ref, index);
ptr = (unsigned long)(ref + 1);
}
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
@@ -370,7 +376,6 @@ out:
if (btrfs_super_incompat_flags(disk_super)
& BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
ret = btrfs_insert_inode_extref(trans, root, name,
- name_len,
inode_objectid,
ref_objectid, index);
}
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index a8fc16d0147f..b80aeb715701 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -64,33 +64,31 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_truncate_control *control);
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
+ struct btrfs_root *root, const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, u64 index);
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 *index);
+ struct btrfs_root *root, const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid, u64 *index);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
-int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path,
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
struct btrfs_inode_extref *btrfs_lookup_inode_extref(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow);
struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
- int slot, const char *name,
- int name_len);
+ int slot,
+ const struct fscrypt_str *name);
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
struct extent_buffer *leaf, int slot, u64 ref_objectid,
- const char *name, int name_len);
+ const struct fscrypt_str *name);
#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0e516aefbf51..8bcad9940154 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,7 +43,7 @@
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
-#include "volumes.h"
+#include "bio.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
@@ -55,6 +55,21 @@
#include "zoned.h"
#include "subpage.h"
#include "inode-item.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "defrag.h"
+#include "dir-item.h"
+#include "file-item.h"
+#include "uuid-tree.h"
+#include "ioctl.h"
+#include "file.h"
+#include "acl.h"
+#include "relocation.h"
+#include "verity.h"
+#include "super.h"
+#include "orphan.h"
struct btrfs_iget_args {
u64 ino;
@@ -69,7 +84,7 @@ struct btrfs_dio_data {
};
struct btrfs_dio_private {
- struct inode *inode;
+ struct btrfs_inode *inode;
/*
* Since DIO can use anonymous page, we cannot use page_offset() to
@@ -107,13 +122,9 @@ static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct kmem_cache *btrfs_inode_cachep;
-struct kmem_cache *btrfs_trans_handle_cachep;
-struct kmem_cache *btrfs_path_cachep;
-struct kmem_cache *btrfs_free_space_cachep;
-struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
-static int btrfs_truncate(struct inode *inode, bool skip_writeback);
+static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
@@ -125,6 +136,32 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 ram_bytes, int compress_type,
int type);
+static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
+ u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
+{
+ struct btrfs_root *root = inode->root;
+ const u32 csum_size = root->fs_info->csum_size;
+
+ /* Output without objectid, which is more meaningful */
+ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
+ btrfs_warn_rl(root->fs_info,
+"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+ root->root_key.objectid, btrfs_ino(inode),
+ logical_start,
+ CSUM_FMT_VALUE(csum_size, csum),
+ CSUM_FMT_VALUE(csum_size, csum_expected),
+ mirror_num);
+ } else {
+ btrfs_warn_rl(root->fs_info,
+"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+ root->root_key.objectid, btrfs_ino(inode),
+ logical_start,
+ CSUM_FMT_VALUE(csum_size, csum),
+ CSUM_FMT_VALUE(csum_size, csum_expected),
+ mirror_num);
+ }
+}
+
/*
* btrfs_inode_lock - lock inode i_rwsem based on arguments passed
*
@@ -135,27 +172,27 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
* return -EAGAIN
* BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
*/
-int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
+int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
if (ilock_flags & BTRFS_ILOCK_SHARED) {
if (ilock_flags & BTRFS_ILOCK_TRY) {
- if (!inode_trylock_shared(inode))
+ if (!inode_trylock_shared(&inode->vfs_inode))
return -EAGAIN;
else
return 0;
}
- inode_lock_shared(inode);
+ inode_lock_shared(&inode->vfs_inode);
} else {
if (ilock_flags & BTRFS_ILOCK_TRY) {
- if (!inode_trylock(inode))
+ if (!inode_trylock(&inode->vfs_inode))
return -EAGAIN;
else
return 0;
}
- inode_lock(inode);
+ inode_lock(&inode->vfs_inode);
}
if (ilock_flags & BTRFS_ILOCK_MMAP)
- down_write(&BTRFS_I(inode)->i_mmap_lock);
+ down_write(&inode->i_mmap_lock);
return 0;
}
@@ -165,14 +202,14 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
*/
-void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
+void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
if (ilock_flags & BTRFS_ILOCK_MMAP)
- up_write(&BTRFS_I(inode)->i_mmap_lock);
+ up_write(&inode->i_mmap_lock);
if (ilock_flags & BTRFS_ILOCK_SHARED)
- inode_unlock_shared(inode);
+ inode_unlock_shared(&inode->vfs_inode);
else
- inode_unlock(inode);
+ inode_unlock(&inode->vfs_inode);
}
/*
@@ -249,7 +286,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}
-static int btrfs_dirty_inode(struct inode *inode);
+static int btrfs_dirty_inode(struct btrfs_inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args)
@@ -483,7 +520,7 @@ struct async_extent {
};
struct async_chunk {
- struct inode *inode;
+ struct btrfs_inode *inode;
struct page *locked_page;
u64 start;
u64 end;
@@ -611,8 +648,8 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
*/
static noinline int compress_file_range(struct async_chunk *async_chunk)
{
- struct inode *inode = async_chunk->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = async_chunk->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
@@ -629,8 +666,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
int compressed_extents = 0;
int redirty = 0;
- inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
- SZ_16K);
+ inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
/*
* We need to save i_size before now because it could change in between
@@ -642,7 +678,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
* does that for us.
*/
barrier();
- i_size = i_size_read(inode);
+ i_size = i_size_read(&inode->vfs_inode);
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
@@ -671,7 +707,7 @@ again:
* isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
- (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ (start > 0 || end + 1 < inode->disk_i_size))
goto cleanup_and_bail_uncompressed;
/*
@@ -695,7 +731,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (inode_need_compress(inode, start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -704,10 +740,10 @@ again:
goto cont;
}
- if (BTRFS_I(inode)->defrag_compress)
- compress_type = BTRFS_I(inode)->defrag_compress;
- else if (BTRFS_I(inode)->prop_compress)
- compress_type = BTRFS_I(inode)->prop_compress;
+ if (inode->defrag_compress)
+ compress_type = inode->defrag_compress;
+ else if (inode->prop_compress)
+ compress_type = inode->prop_compress;
/*
* we need to call clear_page_dirty_for_io on each
@@ -722,14 +758,14 @@ again:
* has moved, the end is the original one.
*/
if (!redirty) {
- extent_range_clear_dirty_for_io(inode, start, end);
+ extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
redirty = 1;
}
/* Compression level is applied here and only here */
ret = btrfs_compress_pages(
compress_type | (fs_info->compress_level << 4),
- inode->i_mapping, start,
+ inode->vfs_inode.i_mapping, start,
pages,
&nr_pages,
&total_in,
@@ -758,12 +794,12 @@ cont:
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
+ ret = cow_file_range_inline(inode, actual_end,
0, BTRFS_COMPRESS_NONE,
NULL, false);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
+ ret = cow_file_range_inline(inode, actual_end,
total_compressed,
compress_type, pages,
false);
@@ -786,7 +822,7 @@ cont:
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
+ extent_clear_unlock_delalloc(inode, start, end,
NULL,
clear_flags,
PAGE_UNLOCK |
@@ -861,8 +897,8 @@ cont:
/* flag the file so we don't compress in the future */
if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
- !(BTRFS_I(inode)->prop_compress)) {
- BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ !(inode->prop_compress)) {
+ inode->flags |= BTRFS_INODE_NOCOMPRESS;
}
}
cleanup_and_bail_uncompressed:
@@ -880,7 +916,7 @@ cleanup_and_bail_uncompressed:
}
if (redirty)
- extent_range_redirty_for_io(inode, start, end);
+ extent_range_redirty_for_io(&inode->vfs_inode, start, end);
add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
BTRFS_COMPRESS_NONE);
compressed_extents++;
@@ -1076,7 +1112,7 @@ out_free:
*/
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
- struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct async_extent *async_extent;
u64 alloc_hint = 0;
@@ -1565,7 +1601,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
*/
ihold(&inode->vfs_inode);
async_chunk[i].async_cow = ctx;
- async_chunk[i].inode = &inode->vfs_inode;
+ async_chunk[i].inode = inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
async_chunk[i].write_flags = write_flags;
@@ -1673,9 +1709,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
int ret;
LIST_HEAD(list);
- ret = btrfs_lookup_csums_range(csum_root, bytenr,
- bytenr + num_bytes - 1, &list, 0,
- nowait);
+ ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
+ &list, 0, nowait);
if (ret == 0 && list_empty(&list))
return 0;
@@ -1733,7 +1768,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
* when starting writeback.
*/
count = count_range_bits(io_tree, &range_start, end, range_bytes,
- EXTENT_NORESERVE, 0);
+ EXTENT_NORESERVE, 0, NULL);
if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -2240,10 +2275,10 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
return ret;
}
-void btrfs_split_delalloc_extent(struct inode *inode,
+void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *orig, u64 split)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 size;
/* not delalloc, ignore it */
@@ -2267,9 +2302,9 @@ void btrfs_split_delalloc_extent(struct inode *inode,
return;
}
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, 1);
+ spin_unlock(&inode->lock);
}
/*
@@ -2277,10 +2312,10 @@ void btrfs_split_delalloc_extent(struct inode *inode,
* that are just merged onto old extents, such as when we are doing sequential
* writes, so we can properly account for the metadata space we'll need.
*/
-void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
struct extent_state *other)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 new_size, old_size;
u32 num_extents;
@@ -2295,9 +2330,9 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
/* we're not bigger than the max, unreserve the space and go */
if (new_size <= fs_info->max_extent_size) {
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, -1);
+ spin_unlock(&inode->lock);
return;
}
@@ -2326,22 +2361,20 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
if (count_max_extents(fs_info, new_size) >= num_extents)
return;
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, -1);
+ spin_unlock(&inode->lock);
}
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
spin_lock(&root->delalloc_lock);
- if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
- &root->delalloc_inodes);
- set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags);
+ if (list_empty(&inode->delalloc_inodes)) {
+ list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
+ set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
root->nr_delalloc_inodes++;
if (root->nr_delalloc_inodes == 1) {
spin_lock(&fs_info->delalloc_root_lock);
@@ -2354,7 +2387,6 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
spin_unlock(&root->delalloc_lock);
}
-
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode)
{
@@ -2387,10 +2419,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* Properly track delayed allocation bytes in the inode and to maintain the
* list of inodes that have pending delalloc work to be done.
*/
-void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
u32 bits)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
@@ -2400,14 +2432,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
* bit, which is only set or cleared with irqs on
*/
if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(fs_info, len);
- bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
+ bool do_list = !btrfs_is_free_space_inode(inode);
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, num_extents);
+ spin_unlock(&inode->lock);
/* For sanity tests */
if (btrfs_is_testing(fs_info))
@@ -2415,22 +2447,21 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
fs_info->delalloc_batch);
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->delalloc_bytes += len;
+ spin_lock(&inode->lock);
+ inode->delalloc_bytes += len;
if (bits & EXTENT_DEFRAG)
- BTRFS_I(inode)->defrag_bytes += len;
+ inode->defrag_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags))
+ &inode->runtime_flags))
btrfs_add_delalloc_inodes(root, inode);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_unlock(&inode->lock);
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
(bits & EXTENT_DELALLOC_NEW)) {
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
- state->start;
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ inode->new_delalloc_bytes += state->end + 1 - state->start;
+ spin_unlock(&inode->lock);
}
}
@@ -2438,11 +2469,10 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
* Once a range is no longer delalloc this function ensures that proper
* accounting happens.
*/
-void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
+void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *state, u32 bits)
{
- struct btrfs_inode *inode = BTRFS_I(vfs_inode);
- struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(fs_info, len);
@@ -2513,10 +2543,9 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* At IO completion time the cums attached on the ordered extent record
* are inserted into the btree
*/
-static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
- u64 dio_file_offset)
+blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
+ return btrfs_csum_one_bio(inode, bio, (u64)-1, false);
}
/*
@@ -2694,14 +2723,13 @@ out:
return errno_to_blk_status(ret);
}
-void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num)
+void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_inode *bi = BTRFS_I(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
blk_status_t ret;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- ret = extract_ordered_extent(bi, bio,
+ ret = extract_ordered_extent(inode, bio,
page_offset(bio_first_bvec_all(bio)->bv_page));
if (ret) {
btrfs_bio_end_io(btrfs_bio(bio), ret);
@@ -2717,15 +2745,14 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro
* Csum items for reloc roots have already been cloned at this point,
* so they are handled as part of the no-checksum case.
*/
- if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+ if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
- !btrfs_is_data_reloc_root(bi->root)) {
- if (!atomic_read(&bi->sync_writers) &&
- btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
- btrfs_submit_bio_start))
+ !btrfs_is_data_reloc_root(inode->root)) {
+ if (!atomic_read(&inode->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA))
return;
- ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
+ ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false);
if (ret) {
btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
@@ -2734,10 +2761,10 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro
btrfs_submit_bio(fs_info, bio, mirror_num);
}
-void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio,
int mirror_num, enum btrfs_compression_type compress_type)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
blk_status_t ret;
if (compress_type != BTRFS_COMPRESS_NONE) {
@@ -2745,7 +2772,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
* btrfs_submit_compressed_read will handle completing the bio
* if there were any errors, so just return here.
*/
- btrfs_submit_compressed_read(inode, bio, mirror_num);
+ btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num);
return;
}
@@ -2756,7 +2783,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
* Lookup bio sums does extra checks around whether we need to csum or
* not, which is why we ignore skip_sum here.
*/
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+ ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
if (ret) {
btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
@@ -2859,7 +2886,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
struct page *page;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_work work;
};
@@ -2878,7 +2905,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
- inode = BTRFS_I(fixup->inode);
+ inode = fixup->inode;
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_SIZE - 1;
@@ -2988,7 +3015,7 @@ out_page:
* that could need flushing space. Recursing back to fixup worker would
* deadlock.
*/
- btrfs_add_delayed_iput(&inode->vfs_inode);
+ btrfs_add_delayed_iput(inode);
}
/*
@@ -3037,7 +3064,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
- fixup->inode = inode;
+ fixup->inode = BTRFS_I(inode);
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
return -EAGAIN;
@@ -3459,10 +3486,10 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of
* When csum mismatch is detected, we will also report the error and fill the
* corrupted range with zero. (Thus it needs the extra parameters)
*/
-int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio,
u32 bio_offset, struct page *page, u32 pgoff)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u32 len = fs_info->sectorsize;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
@@ -3476,8 +3503,7 @@ int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
return 0;
zeroit:
- btrfs_print_data_csum_error(BTRFS_I(inode),
- bbio->file_offset + bio_offset,
+ btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset,
csum, csum_expected, bbio->mirror_num);
if (bbio->device)
btrfs_dev_stat_inc_and_print(bbio->device,
@@ -3502,10 +3528,10 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
unsigned int result = 0;
@@ -3518,7 +3544,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
if (bbio->csum == NULL)
return 0;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ if (inode->flags & BTRFS_INODE_NODATASUM)
return 0;
if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
@@ -3563,18 +3589,17 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
* the inode to the delayed iput machinery. Delayed iputs are processed at
* transaction commit time/superblock commit/cleaner kthread.
*/
-void btrfs_add_delayed_iput(struct inode *inode)
+void btrfs_add_delayed_iput(struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_inode *binode = BTRFS_I(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (atomic_add_unless(&inode->i_count, -1, 1))
+ if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
return;
atomic_inc(&fs_info->nr_delayed_iputs);
spin_lock(&fs_info->delayed_iput_lock);
- ASSERT(list_empty(&binode->delayed_iput));
- list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
+ ASSERT(list_empty(&inode->delayed_iput));
+ list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
spin_unlock(&fs_info->delayed_iput_lock);
if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
@@ -3617,7 +3642,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->delayed_iput_lock);
}
-/**
+/*
* Wait for flushing all delayed iputs
*
* @fs_info: the filesystem
@@ -4262,7 +4287,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_rename_ctx *rename_ctx)
{
struct btrfs_root *root = dir->root;
@@ -4280,8 +4305,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto out;
}
- di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
- name, name_len, -1);
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto err;
@@ -4309,12 +4333,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
}
- ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
- dir_ino, &index);
+ ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
if (ret) {
btrfs_info(fs_info,
"failed to delete reference to %.*s, inode %llu parent %llu",
- name_len, name, ino, dir_ino);
+ name->len, name->name, ino, dir_ino);
btrfs_abort_transaction(trans, ret);
goto err;
}
@@ -4335,10 +4358,8 @@ skip_backref:
* operations on the log tree, increasing latency for applications.
*/
if (!rename_ctx) {
- btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
- index);
+ btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
}
/*
@@ -4356,7 +4377,7 @@ err:
if (ret)
goto out;
- btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
@@ -4369,10 +4390,11 @@ out:
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
- const char *name, int name_len)
+ const struct fscrypt_str *name)
{
int ret;
- ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
+
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
ret = btrfs_update_inode(trans, inode->root, inode);
@@ -4388,9 +4410,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
* plenty of slack room in the global reserve to migrate, otherwise we cannot
* allow the unlink to occur.
*/
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
+static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
/*
* 1 for the possible orphan item
@@ -4408,47 +4430,62 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
struct btrfs_trans_handle *trans;
struct inode *inode = d_inode(dentry);
int ret;
+ struct fscrypt_name fname;
- trans = __unlink_start_trans(dir);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
+
+ /* This needs to handle no-key deletions later on */
+
+ trans = __unlink_start_trans(BTRFS_I(dir));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto fscrypt_free;
+ }
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
0);
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
- BTRFS_I(d_inode(dentry)), dentry->d_name.name,
- dentry->d_name.len);
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ &fname.disk_name);
if (ret)
- goto out;
+ goto end_trans;
if (inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret)
- goto out;
+ goto end_trans;
}
-out:
+end_trans:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
+fscrypt_free:
+ fscrypt_free_filename(&fname);
return ret;
}
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
- struct inode *dir, struct dentry *dentry)
+ struct btrfs_inode *dir, struct dentry *dentry)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
- const char *name = dentry->d_name.name;
- int name_len = dentry->d_name.len;
u64 index;
int ret;
u64 objectid;
- u64 dir_ino = btrfs_ino(BTRFS_I(dir));
+ u64 dir_ino = btrfs_ino(dir);
+ struct fscrypt_name fname;
+
+ ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
+
+ /* This needs to handle no-key deletions later on */
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
objectid = inode->root->root_key.objectid;
@@ -4456,15 +4493,18 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
objectid = inode->location.objectid;
} else {
WARN_ON(1);
+ fscrypt_free_filename(&fname);
return -EINVAL;
}
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
- name, name_len, -1);
+ &fname.disk_name, -1);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
@@ -4490,8 +4530,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
* call btrfs_del_root_ref, and it _shouldn't_ fail.
*/
if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
- di = btrfs_search_dir_index_item(root, path, dir_ino,
- name, name_len);
+ di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
if (IS_ERR_OR_NULL(di)) {
if (!di)
ret = -ENOENT;
@@ -4508,28 +4547,29 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_del_root_ref(trans, objectid,
root->root_key.objectid, dir_ino,
- &index, name, name_len);
+ &index, &fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
- ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
+ ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
- inode_inc_iversion(dir);
- dir->i_mtime = current_time(dir);
- dir->i_ctime = dir->i_mtime;
- ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
+ inode_inc_iversion(&dir->vfs_inode);
+ dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode);
+ dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime;
+ ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
btrfs_abort_transaction(trans, ret);
out:
btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
return ret;
}
@@ -4543,6 +4583,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
struct btrfs_path *path;
struct btrfs_dir_item *di;
struct btrfs_key key;
+ struct fscrypt_str name = FSTR_INIT("default", 7);
u64 dir_id;
int ret;
@@ -4553,7 +4594,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
/* Make sure this root isn't set as the default subvol */
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
- dir_id, "default", 7, 0);
+ dir_id, &name, 0);
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
if (key.objectid == root->root_key.objectid) {
@@ -4652,10 +4693,10 @@ again:
spin_unlock(&root->inode_lock);
}
-int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
+int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
struct inode *inode = d_inode(dentry);
struct btrfs_root *dest = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -4712,7 +4753,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
- btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+ btrfs_record_snapshot_destroy(trans, dir);
ret = btrfs_unlink_subvol(trans, dir, dentry);
if (ret) {
@@ -4792,6 +4833,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
int err = 0;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
+ struct fscrypt_name fname;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
@@ -4801,15 +4843,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
"extent tree v2 doesn't support snapshot deletion yet");
return -EOPNOTSUPP;
}
- return btrfs_delete_subvolume(dir, dentry);
+ return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
}
- trans = __unlink_start_trans(dir);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (err)
+ return err;
+
+ /* This needs to handle no-key deletions later on */
+
+ trans = __unlink_start_trans(BTRFS_I(dir));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_notrans;
+ }
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- err = btrfs_unlink_subvol(trans, dir, dentry);
+ err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out;
}
@@ -4820,9 +4870,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, BTRFS_I(dir),
- BTRFS_I(d_inode(dentry)), dentry->d_name.name,
- dentry->d_name.len);
+ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ &fname.disk_name);
if (!err) {
btrfs_i_size_write(BTRFS_I(inode), 0);
/*
@@ -4841,7 +4890,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
}
out:
btrfs_end_transaction(trans);
+out_notrans:
btrfs_btree_balance_dirty(fs_info);
+ fscrypt_free_filename(&fname);
return err;
}
@@ -5210,7 +5261,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
inode_dio_wait(inode);
- ret = btrfs_truncate(inode, newsize == oldsize);
+ ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
if (ret && inode->i_nlink) {
int err;
@@ -5253,10 +5304,10 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
if (attr->ia_valid) {
setattr_copy(mnt_userns, inode, attr);
inode_inc_iversion(inode);
- err = btrfs_dirty_inode(inode);
+ err = btrfs_dirty_inode(BTRFS_I(inode));
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
}
return err;
@@ -5511,22 +5562,27 @@ no_delete:
* If no dir entries were found, returns -ENOENT.
* If found a corrupted location in dir entry, returns -EUCLEAN.
*/
-static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
struct btrfs_key *location, u8 *type)
{
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
struct btrfs_dir_item *di;
struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
int ret = 0;
+ struct fscrypt_name fname;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
- name, namelen, 0);
+ ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
+ if (ret)
+ goto out;
+
+ /* This needs to handle no-key deletions later on */
+
+ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
+ &fname.disk_name, 0);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
@@ -5538,12 +5594,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
ret = -EUCLEAN;
btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
- __func__, name, btrfs_ino(BTRFS_I(dir)),
+ __func__, fname.disk_name.name, btrfs_ino(dir),
location->objectid, location->type, location->offset);
}
if (!ret)
- *type = btrfs_dir_type(path->nodes[0], di);
+ *type = btrfs_dir_ftype(path->nodes[0], di);
out:
+ fscrypt_free_filename(&fname);
btrfs_free_path(path);
return ret;
}
@@ -5554,7 +5611,7 @@ out:
* is kind of like crossing a mount point.
*/
static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
- struct inode *dir,
+ struct btrfs_inode *dir,
struct dentry *dentry,
struct btrfs_key *location,
struct btrfs_root **sub_root)
@@ -5566,6 +5623,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
int ret;
int err = 0;
+ struct fscrypt_name fname;
+
+ ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
+ if (ret)
+ return ret;
path = btrfs_alloc_path();
if (!path) {
@@ -5574,7 +5636,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
}
err = -ENOENT;
- key.objectid = BTRFS_I(dir)->root->root_key.objectid;
+ key.objectid = dir->root->root_key.objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = location->objectid;
@@ -5587,13 +5649,12 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
- if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
- btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+ if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
+ btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
goto out;
- ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
- (unsigned long)(ref + 1),
- dentry->d_name.len);
+ ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
+ (unsigned long)(ref + 1), fname.disk_name.len);
if (ret)
goto out;
@@ -5612,19 +5673,20 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
err = 0;
out:
btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
return err;
}
-static void inode_tree_add(struct inode *inode)
+static void inode_tree_add(struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_inode *entry;
struct rb_node **p;
struct rb_node *parent;
- struct rb_node *new = &BTRFS_I(inode)->rb_node;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ struct rb_node *new = &inode->rb_node;
+ u64 ino = btrfs_ino(inode);
- if (inode_unhashed(inode))
+ if (inode_unhashed(&inode->vfs_inode))
return;
parent = NULL;
spin_lock(&root->inode_lock);
@@ -5736,7 +5798,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
ret = btrfs_read_locked_inode(inode, path);
if (!ret) {
- inode_tree_add(inode);
+ inode_tree_add(BTRFS_I(inode));
unlock_new_inode(inode);
} else {
iget_failed(inode);
@@ -5816,7 +5878,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
+ ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
if (ret < 0)
return ERR_PTR(ret);
@@ -5837,7 +5899,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return inode;
}
- ret = fixup_tree_root_location(fs_info, dir, dentry,
+ ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
&location, &sub_root);
if (ret < 0) {
if (ret != -ENOENT)
@@ -5985,6 +6047,7 @@ again:
btrfs_for_each_slot(root, &key, &found_key, path, ret) {
struct dir_entry *entry;
struct extent_buffer *leaf = path->nodes[0];
+ u8 ftype;
if (found_key.objectid != key.objectid)
break;
@@ -6008,13 +6071,13 @@ again:
goto again;
}
+ ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
entry = addr;
- put_unaligned(name_len, &entry->name_len);
name_ptr = (char *)(entry + 1);
- read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
- name_len);
- put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
- &entry->type);
+ read_extent_buffer(leaf, name_ptr,
+ (unsigned long)(di + 1), name_len);
+ put_unaligned(name_len, &entry->name_len);
+ put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
btrfs_dir_item_key_to_cpu(leaf, di, &location);
put_unaligned(location.objectid, &entry->ino);
put_unaligned(found_key.offset, &entry->offset);
@@ -6072,21 +6135,21 @@ err:
* FIXME, needs more benchmarking...there are no reasons other than performance
* to keep or drop this code.
*/
-static int btrfs_dirty_inode(struct inode *inode)
+static int btrfs_dirty_inode(struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
int ret;
- if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
+ if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
return 0;
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, inode);
if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
@@ -6094,10 +6157,10 @@ static int btrfs_dirty_inode(struct inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, inode);
}
btrfs_end_transaction(trans);
- if (BTRFS_I(inode)->delayed_node)
+ if (inode->delayed_node)
btrfs_balance_delayed_items(fs_info);
return ret;
@@ -6124,7 +6187,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
inode->i_mtime = *now;
if (flags & S_ATIME)
inode->i_atime = *now;
- return dirty ? btrfs_dirty_inode(inode) : 0;
+ return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
}
/*
@@ -6220,9 +6283,18 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
struct inode *inode = args->inode;
int ret;
+ if (!args->orphan) {
+ ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
+ &args->fname);
+ if (ret)
+ return ret;
+ }
+
ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
- if (ret)
+ if (ret) {
+ fscrypt_free_filename(&args->fname);
return ret;
+ }
/* 1 to add inode item */
*trans_num_items = 1;
@@ -6262,6 +6334,7 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
{
posix_acl_release(args->acl);
posix_acl_release(args->default_acl);
+ fscrypt_free_filename(&args->fname);
}
/*
@@ -6269,27 +6342,27 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
*
* Currently only the compression flags and the cow flags are inherited.
*/
-static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
+static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
{
unsigned int flags;
- flags = BTRFS_I(dir)->flags;
+ flags = dir->flags;
if (flags & BTRFS_INODE_NOCOMPRESS) {
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ inode->flags &= ~BTRFS_INODE_COMPRESS;
+ inode->flags |= BTRFS_INODE_NOCOMPRESS;
} else if (flags & BTRFS_INODE_COMPRESS) {
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode->flags |= BTRFS_INODE_COMPRESS;
}
if (flags & BTRFS_INODE_NODATACOW) {
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
- if (S_ISREG(inode->i_mode))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ inode->flags |= BTRFS_INODE_NODATACOW;
+ if (S_ISREG(inode->vfs_inode.i_mode))
+ inode->flags |= BTRFS_INODE_NODATASUM;
}
- btrfs_sync_inode_flags_to_i_flags(inode);
+ btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
}
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
@@ -6297,8 +6370,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
{
struct inode *dir = args->dir;
struct inode *inode = args->inode;
- const char *name = args->orphan ? NULL : args->dentry->d_name.name;
- int name_len = args->orphan ? 0 : args->dentry->d_name.len;
+ const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
@@ -6349,7 +6421,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
* change it now without compatibility issues.
*/
if (!args->subvol)
- btrfs_inherit_iflags(inode, dir);
+ btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
if (S_ISREG(inode->i_mode)) {
if (btrfs_test_opt(fs_info, NODATASUM))
@@ -6399,7 +6471,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
sizes[1] = 2 + sizeof(*ref);
} else {
key[1].offset = btrfs_ino(BTRFS_I(dir));
- sizes[1] = name_len + sizeof(*ref);
+ sizes[1] = name->len + sizeof(*ref);
}
}
@@ -6438,10 +6510,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
write_extent_buffer(path->nodes[0], "..", ptr, 2);
} else {
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref,
+ name->len);
btrfs_set_inode_ref_index(path->nodes[0], ref,
BTRFS_I(inode)->dir_index);
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ write_extent_buffer(path->nodes[0], name->name, ptr,
+ name->len);
}
}
@@ -6491,7 +6565,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- inode_tree_add(inode);
+ inode_tree_add(BTRFS_I(inode));
trace_btrfs_inode_new(inode);
btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
@@ -6502,7 +6576,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
} else {
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- name_len, 0, BTRFS_I(inode)->dir_index);
+ 0, BTRFS_I(inode)->dir_index);
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -6531,7 +6605,7 @@ out:
*/
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const char *name, int name_len, int add_backref, u64 index)
+ const struct fscrypt_str *name, int add_backref, u64 index)
{
int ret = 0;
struct btrfs_key key;
@@ -6550,17 +6624,17 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, key.objectid,
root->root_key.objectid, parent_ino,
- index, name, name_len);
+ index, name);
} else if (add_backref) {
- ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
- parent_ino, index);
+ ret = btrfs_insert_inode_ref(trans, root, name,
+ ino, parent_ino, index);
}
/* Nothing to clean up yet */
if (ret)
return ret;
- ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
+ ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
btrfs_inode_type(&inode->vfs_inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
@@ -6570,7 +6644,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
}
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
- name_len * 2);
+ name->len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
/*
* If we are replaying a log tree, we do not want to update the mtime
@@ -6595,15 +6669,15 @@ fail_dir_item:
int err;
err = btrfs_del_root_ref(trans, key.objectid,
root->root_key.objectid, parent_ino,
- &local_index, name, name_len);
+ &local_index, name);
if (err)
btrfs_abort_transaction(trans, err);
} else if (add_backref) {
u64 local_index;
int err;
- err = btrfs_del_inode_ref(trans, root, name, name_len,
- ino, parent_ino, &local_index);
+ err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
+ &local_index);
if (err)
btrfs_abort_transaction(trans, err);
}
@@ -6686,6 +6760,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct fscrypt_name fname;
u64 index;
int err;
int drop_inode = 0;
@@ -6697,6 +6772,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink >= BTRFS_LINK_MAX)
return -EMLINK;
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (err)
+ goto fail;
+
err = btrfs_set_inode_index(BTRFS_I(dir), &index);
if (err)
goto fail;
@@ -6723,7 +6802,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- dentry->d_name.name, dentry->d_name.len, 1, index);
+ &fname.disk_name, 1, index);
if (err) {
drop_inode = 1;
@@ -6747,6 +6826,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
}
fail:
+ fscrypt_free_filename(&fname);
if (trans)
btrfs_end_transaction(trans);
if (drop_inode) {
@@ -6773,7 +6853,6 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
static noinline int uncompress_inline(struct btrfs_path *path,
struct page *page,
- size_t pg_offset, u64 extent_offset,
struct btrfs_file_extent_item *item)
{
int ret;
@@ -6784,7 +6863,6 @@ static noinline int uncompress_inline(struct btrfs_path *path,
unsigned long ptr;
int compress_type;
- WARN_ON(pg_offset != 0);
compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
@@ -6796,8 +6874,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
max_size = min_t(unsigned long, PAGE_SIZE, max_size);
- ret = btrfs_decompress(compress_type, tmp, page,
- extent_offset, inline_size, max_size);
+ ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
/*
* decompression code contains a memset to fill in any space between the end
@@ -6807,25 +6884,52 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size + pg_offset < PAGE_SIZE)
- memzero_page(page, pg_offset + max_size,
- PAGE_SIZE - max_size - pg_offset);
+ if (max_size < PAGE_SIZE)
+ memzero_page(page, max_size, PAGE_SIZE - max_size);
kfree(tmp);
return ret;
}
-/**
- * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
+static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
+ struct page *page)
+{
+ struct btrfs_file_extent_item *fi;
+ void *kaddr;
+ size_t copy_size;
+
+ if (!page || PageUptodate(page))
+ return 0;
+
+ ASSERT(page_offset(page) == 0);
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
+ return uncompress_inline(path, page, fi);
+
+ copy_size = min_t(u64, PAGE_SIZE,
+ btrfs_file_extent_ram_bytes(path->nodes[0], fi));
+ kaddr = kmap_local_page(page);
+ read_extent_buffer(path->nodes[0], kaddr,
+ btrfs_file_extent_inline_start(fi), copy_size);
+ kunmap_local(kaddr);
+ if (copy_size < PAGE_SIZE)
+ memzero_page(page, copy_size, PAGE_SIZE - copy_size);
+ return 0;
+}
+
+/*
+ * Lookup the first extent overlapping a range in a file.
+ *
* @inode: file to search in
* @page: page to read extent data into if the extent is inline
* @pg_offset: offset into @page to copy to
* @start: file offset
* @len: length of range starting at @start
*
- * This returns the first &struct extent_map which overlaps with the given
- * range, reading it from the B-tree and caching it if necessary. Note that
- * there may be more extents which overlap the given range after the returned
- * extent_map.
+ * Return the first &struct extent_map which overlaps the given range, reading
+ * it from the B-tree and caching it if necessary. Note that there may be more
+ * extents which overlap the given range after the returned extent_map.
*
* If @page is not NULL and the extent is inline, this also reads the extent
* data directly into the page and marks the extent up to date in the io_tree.
@@ -6966,51 +7070,33 @@ next:
goto insert;
}
- btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
+ btrfs_extent_item_to_extent_map(inode, path, item, em);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
goto insert;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- unsigned long ptr;
- char *map;
- size_t size;
- size_t extent_offset;
- size_t copy_size;
-
- if (!page)
- goto out;
+ /*
+ * Inline extent can only exist at file offset 0. This is
+ * ensured by tree-checker and inline extent creation path.
+ * Thus all members representing file offsets should be zero.
+ */
+ ASSERT(pg_offset == 0);
+ ASSERT(extent_start == 0);
+ ASSERT(em->start == 0);
- size = btrfs_file_extent_ram_bytes(leaf, item);
- extent_offset = page_offset(page) + pg_offset - extent_start;
- copy_size = min_t(u64, PAGE_SIZE - pg_offset,
- size - extent_offset);
- em->start = extent_start + extent_offset;
- em->len = ALIGN(copy_size, fs_info->sectorsize);
- em->orig_block_len = em->len;
- em->orig_start = em->start;
- ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+ /*
+ * btrfs_extent_item_to_extent_map() should have properly
+ * initialized em members already.
+ *
+ * Other members are not utilized for inline extents.
+ */
+ ASSERT(em->block_start == EXTENT_MAP_INLINE);
+ ASSERT(em->len = fs_info->sectorsize);
- if (!PageUptodate(page)) {
- if (btrfs_file_extent_compression(leaf, item) !=
- BTRFS_COMPRESS_NONE) {
- ret = uncompress_inline(path, page, pg_offset,
- extent_offset, item);
- if (ret)
- goto out;
- } else {
- map = kmap_local_page(page);
- read_extent_buffer(leaf, map + pg_offset, ptr,
- copy_size);
- if (pg_offset + copy_size < PAGE_SIZE) {
- memset(map + pg_offset + copy_size, 0,
- PAGE_SIZE - pg_offset -
- copy_size);
- }
- kunmap_local(map);
- }
- flush_dcache_page(page);
- }
+ ret = read_inline_extent(inode, path, page);
+ if (ret < 0)
+ goto out;
goto insert;
}
not_found:
@@ -7255,7 +7341,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
while (1) {
if (nowait) {
- if (!try_lock_extent(io_tree, lockstart, lockend))
+ if (!try_lock_extent(io_tree, lockstart, lockend,
+ cached_state))
return -EAGAIN;
} else {
lock_extent(io_tree, lockstart, lockend, cached_state);
@@ -7811,11 +7898,11 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
return;
if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
- btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL,
+ btrfs_mark_ordered_io_finished(dip->inode, NULL,
dip->file_offset, dip->bytes,
!dip->bio.bi_status);
} else {
- unlock_extent(&BTRFS_I(dip->inode)->io_tree,
+ unlock_extent(&dip->inode->io_tree,
dip->file_offset,
dip->file_offset + dip->bytes - 1, NULL);
}
@@ -7824,24 +7911,21 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
bio_endio(&dip->bio);
}
-static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
- int mirror_num,
- enum btrfs_compression_type compress_type)
+void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
refcount_inc(&dip->refs);
- btrfs_submit_bio(fs_info, bio, mirror_num);
+ btrfs_submit_bio(inode->root->fs_info, bio, mirror_num);
}
static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
struct btrfs_bio *bbio,
const bool uptodate)
{
- struct inode *inode = dip->inode;
+ struct inode *inode = &dip->inode->vfs_inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
blk_status_t err = BLK_STS_OK;
@@ -7853,16 +7937,15 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
u64 start = bbio->file_offset + offset;
if (uptodate &&
- (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page,
- bv.bv_offset))) {
+ (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset,
+ bv.bv_page, bv.bv_offset))) {
btrfs_clean_io_failure(BTRFS_I(inode), start,
bv.bv_page, bv.bv_offset);
} else {
int ret;
- ret = btrfs_repair_one_sector(inode, bbio, offset,
- bv.bv_page, bv.bv_offset,
- submit_dio_repair_bio);
+ ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset,
+ bv.bv_page, bv.bv_offset, false);
if (ret)
err = errno_to_blk_status(ret);
}
@@ -7871,11 +7954,11 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
return err;
}
-static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
- struct bio *bio,
- u64 dio_file_offset)
+blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode,
+ struct bio *bio,
+ u64 dio_file_offset)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
+ return btrfs_csum_one_bio(inode, bio, dio_file_offset, false);
}
static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
@@ -7885,9 +7968,9 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
blk_status_t err = bio->bi_status;
if (err)
- btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+ btrfs_warn(dip->inode->root->fs_info,
"direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
- btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
+ btrfs_ino(dip->inode), bio_op(bio),
bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
@@ -7897,16 +7980,16 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
if (err)
dip->bio.bi_status = err;
- btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
+ btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
}
-static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode,
u64 file_offset, int async_submit)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
blk_status_t ret;
@@ -7914,21 +7997,21 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
if (btrfs_op(bio) == BTRFS_MAP_READ)
btrfs_bio(bio)->iter = bio->bi_iter;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ if (inode->flags & BTRFS_INODE_NODATASUM)
goto map;
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
/* Check btrfs_submit_data_write_bio() for async submit rules */
- if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) &&
+ if (async_submit && !atomic_read(&inode->sync_writers) &&
btrfs_wq_submit_bio(inode, bio, 0, file_offset,
- btrfs_submit_bio_start_direct_io))
+ WQ_SUBMIT_DATA_DIO))
return;
/*
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
+ ret = btrfs_csum_one_bio(inode, bio, file_offset, false);
if (ret) {
btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
@@ -7964,7 +8047,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
struct btrfs_dio_data *dio_data = iter->private;
struct extent_map *em = NULL;
- dip->inode = inode;
+ dip->inode = BTRFS_I(inode);
dip->file_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
refcount_set(&dip->refs, 1);
@@ -8050,7 +8133,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
async_submit = 1;
}
- btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
+ btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit);
dio_data->submitted += clone_len;
clone_offset += clone_len;
@@ -8540,16 +8623,16 @@ out_noreserve:
return ret;
}
-static int btrfs_truncate(struct inode *inode, bool skip_writeback)
+static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
{
struct btrfs_truncate_control control = {
- .inode = BTRFS_I(inode),
- .ino = btrfs_ino(BTRFS_I(inode)),
+ .inode = inode,
+ .ino = btrfs_ino(inode),
.min_type = BTRFS_EXTENT_DATA_KEY,
.clear_extent_range = true,
};
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *rsv;
int ret;
struct btrfs_trans_handle *trans;
@@ -8557,7 +8640,8 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
if (!skip_writeback) {
- ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode,
+ inode->vfs_inode.i_size & (~mask),
(u64)-1);
if (ret)
return ret;
@@ -8616,34 +8700,32 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
while (1) {
struct extent_state *cached_state = NULL;
- const u64 new_size = inode->i_size;
+ const u64 new_size = inode->vfs_inode.i_size;
const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
control.new_size = new_size;
- lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
- &cached_state);
+ lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
/*
* We want to drop from the next block forward in case this new
* size is not block aligned since we will be keeping the last
* block of the extent just the way it is.
*/
- btrfs_drop_extent_map_range(BTRFS_I(inode),
+ btrfs_drop_extent_map_range(inode,
ALIGN(new_size, fs_info->sectorsize),
(u64)-1, false);
ret = btrfs_truncate_inode_items(trans, root, &control);
- inode_sub_bytes(inode, control.sub_bytes);
- btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
+ inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
+ btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
- &cached_state);
+ unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
@@ -8674,7 +8756,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 1);
@@ -8682,14 +8764,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
ret = PTR_ERR(trans);
goto out;
}
- btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
}
if (trans) {
int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
- ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret2 = btrfs_update_inode(trans, root, inode);
if (ret2 && !ret)
ret = ret2;
@@ -8715,7 +8797,7 @@ out:
* extents beyond i_size to drop.
*/
if (control.extents_found > 0)
- btrfs_set_inode_full_sync(BTRFS_I(inode));
+ btrfs_set_inode_full_sync(inode);
return ret;
}
@@ -8784,9 +8866,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
- extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
+ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
+ ei->io_tree.inode = ei;
extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT, NULL);
+ IO_TREE_INODE_FILE_EXTENT);
ei->io_failure_tree = RB_ROOT;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
@@ -8899,10 +8982,6 @@ void __cold btrfs_destroy_cachep(void)
rcu_barrier();
bioset_exit(&btrfs_dio_bioset);
kmem_cache_destroy(btrfs_inode_cachep);
- kmem_cache_destroy(btrfs_trans_handle_cachep);
- kmem_cache_destroy(btrfs_path_cachep);
- kmem_cache_destroy(btrfs_free_space_cachep);
- kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}
int __init btrfs_init_cachep(void)
@@ -8914,30 +8993,6 @@ int __init btrfs_init_cachep(void)
if (!btrfs_inode_cachep)
goto fail;
- btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
- sizeof(struct btrfs_trans_handle), 0,
- SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
- if (!btrfs_trans_handle_cachep)
- goto fail;
-
- btrfs_path_cachep = kmem_cache_create("btrfs_path",
- sizeof(struct btrfs_path), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_path_cachep)
- goto fail;
-
- btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
- sizeof(struct btrfs_free_space), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_free_space_cachep)
- goto fail;
-
- btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
- PAGE_SIZE, PAGE_SIZE,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_free_space_bitmap_cachep)
- goto fail;
-
if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_dio_private, bio),
BIOSET_NEED_BVECS))
@@ -9013,6 +9068,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ struct fscrypt_name old_fname, new_fname;
+ struct fscrypt_str *old_name, *new_name;
/*
* For non-subvolumes allow exchange only within one subvolume, in the
@@ -9024,6 +9081,19 @@ static int btrfs_rename_exchange(struct inode *old_dir,
new_ino != BTRFS_FIRST_FREE_OBJECTID))
return -EXDEV;
+ ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
+ if (ret)
+ return ret;
+
+ ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
+ if (ret) {
+ fscrypt_free_filename(&old_fname);
+ return ret;
+ }
+
+ old_name = &old_fname.disk_name;
+ new_name = &new_fname.disk_name;
+
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9091,10 +9161,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- ret = btrfs_insert_inode_ref(trans, dest,
- new_dentry->d_name.name,
- new_dentry->d_name.len,
- old_ino,
+ ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
btrfs_ino(BTRFS_I(new_dir)),
old_idx);
if (ret)
@@ -9107,10 +9174,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- ret = btrfs_insert_inode_ref(trans, root,
- old_dentry->d_name.name,
- old_dentry->d_name.len,
- new_ino,
+ ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
if (ret) {
@@ -9141,13 +9205,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
} else { /* src is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
- old_dentry->d_name.name,
- old_dentry->d_name.len,
- &old_rename_ctx);
+ old_name, &old_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9158,13 +9220,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
} else { /* dest is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
- new_dentry->d_name.name,
- new_dentry->d_name.len,
- &new_rename_ctx);
+ new_name, &new_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
@@ -9174,16 +9234,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
- new_dentry->d_name.name,
- new_dentry->d_name.len, 0, old_idx);
+ new_name, 0, old_idx);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
- old_dentry->d_name.name,
- old_dentry->d_name.len, 0, new_idx);
+ old_name, 0, new_idx);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
@@ -9226,6 +9284,8 @@ out_notrans:
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
+ fscrypt_free_filename(&new_fname);
+ fscrypt_free_filename(&old_fname);
return ret;
}
@@ -9265,6 +9325,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
int ret;
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
+ struct fscrypt_name old_fname, new_fname;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9281,22 +9342,28 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
+ ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
+ if (ret)
+ return ret;
- /* check for collisions, even if the name isn't there */
- ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
+ ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
+ if (ret) {
+ fscrypt_free_filename(&old_fname);
+ return ret;
+ }
+ /* check for collisions, even if the name isn't there */
+ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
if (ret) {
if (ret == -EEXIST) {
/* we shouldn't get
* eexist without a new_inode */
if (WARN_ON(!new_inode)) {
- return ret;
+ goto out_fscrypt_names;
}
} else {
/* maybe -EOVERFLOW */
- return ret;
+ goto out_fscrypt_names;
}
}
ret = 0;
@@ -9379,11 +9446,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- ret = btrfs_insert_inode_ref(trans, dest,
- new_dentry->d_name.name,
- new_dentry->d_name.len,
- old_ino,
- btrfs_ino(BTRFS_I(new_dir)), index);
+ ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
+ old_ino, btrfs_ino(BTRFS_I(new_dir)),
+ index);
if (ret)
goto out_fail;
}
@@ -9402,13 +9467,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
BTRFS_I(old_inode), 1);
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
- ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
} else {
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
- BTRFS_I(d_inode(old_dentry)),
- old_dentry->d_name.name,
- old_dentry->d_name.len,
- &rename_ctx);
+ BTRFS_I(d_inode(old_dentry)),
+ &old_fname.disk_name, &rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9422,13 +9485,12 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
new_inode->i_ctime = current_time(new_inode);
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
- new_dentry->d_name.name,
- new_dentry->d_name.len);
+ &new_fname.disk_name);
}
if (!ret && new_inode->i_nlink == 0)
ret = btrfs_orphan_add(trans,
@@ -9440,8 +9502,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
- new_dentry->d_name.name,
- new_dentry->d_name.len, 0, index);
+ &new_fname.disk_name, 0, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
@@ -9476,6 +9537,9 @@ out_notrans:
out_whiteout_inode:
if (flags & RENAME_WHITEOUT)
iput(whiteout_args.inode);
+out_fscrypt_names:
+ fscrypt_free_filename(&old_fname);
+ fscrypt_free_filename(&new_fname);
return ret;
}
@@ -9595,7 +9659,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,
&work->work);
} else {
ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
if (ret || wbc->nr_to_write <= 0)
goto out;
}
@@ -10205,7 +10269,7 @@ static ssize_t btrfs_encoded_read_inline(
read_extent_buffer(leaf, tmp, ptr, count);
btrfs_release_path(path);
unlock_extent(io_tree, start, lockend, cached_state);
- btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
ret = copy_to_iter(tmp, count, iter);
@@ -10265,7 +10329,7 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
pgoff = bvec->bv_offset;
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
- if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+ if (btrfs_check_data_csum(inode, bbio, bio_offset,
bvec->bv_page, pgoff))
return BLK_STS_IOERR;
bio_offset += sectorsize;
@@ -10408,7 +10472,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
goto out;
unlock_extent(io_tree, start, lockend, cached_state);
- btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
if (compressed) {
@@ -10457,10 +10521,10 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
file_accessed(iocb->ki_filp);
- btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
if (iocb->ki_pos >= inode->vfs_inode.i_size) {
- btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return 0;
}
start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
@@ -10558,7 +10622,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
if (disk_bytenr == EXTENT_MAP_HOLE) {
unlock_extent(io_tree, start, lockend, &cached_state);
- btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
if (ret != count)
@@ -10581,7 +10645,7 @@ out_unlock_extent:
unlock_extent(io_tree, start, lockend, &cached_state);
out_unlock_inode:
if (!unlocked)
- btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
@@ -11247,7 +11311,7 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode,
spin_unlock(&inode->lock);
}
-/**
+/*
* Verify that there are no ordered extents for a given file range.
*
* @inode: The target inode.
@@ -11296,7 +11360,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.mknod = btrfs_mknod,
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
@@ -11349,7 +11413,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.fileattr_get = btrfs_fileattr_get,
@@ -11360,7 +11424,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.listxattr = btrfs_listxattr,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d5dd8bed1488..7e348bd2ccde 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -50,6 +50,17 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "subpage.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "defrag.h"
+#include "dir-item.h"
+#include "uuid-tree.h"
+#include "ioctl.h"
+#include "file.h"
+#include "scrub.h"
+#include "super.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -949,6 +960,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
struct inode *dir = d_inode(parent->dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct dentry *dentry;
+ struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
int error;
error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
@@ -969,8 +981,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
* check for them now when we can safely fail
*/
error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
- dir->i_ino, name,
- namelen);
+ dir->i_ino, &name_str);
if (error)
goto out_dput;
@@ -991,7 +1002,7 @@ out_up_read:
out_dput:
dput(dentry);
out_unlock:
- btrfs_inode_unlock(dir, 0);
+ btrfs_inode_unlock(BTRFS_I(dir), 0);
return error;
}
@@ -1036,908 +1047,6 @@ out:
}
/*
- * Defrag specific helper to get an extent map.
- *
- * Differences between this and btrfs_get_extent() are:
- *
- * - No extent_map will be added to inode->extent_tree
- * To reduce memory usage in the long run.
- *
- * - Extra optimization to skip file extents older than @newer_than
- * By using btrfs_search_forward() we can skip entire file ranges that
- * have extents created in past transactions, because btrfs_search_forward()
- * will not visit leaves and nodes with a generation smaller than given
- * minimal generation threshold (@newer_than).
- *
- * Return valid em if we find a file extent matching the requirement.
- * Return NULL if we can not find a file extent matching the requirement.
- *
- * Return ERR_PTR() for error.
- */
-static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
- u64 start, u64 newer_than)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_file_extent_item *fi;
- struct btrfs_path path = { 0 };
- struct extent_map *em;
- struct btrfs_key key;
- u64 ino = btrfs_ino(inode);
- int ret;
-
- em = alloc_extent_map();
- if (!em) {
- ret = -ENOMEM;
- goto err;
- }
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
-
- if (newer_than) {
- ret = btrfs_search_forward(root, &key, &path, newer_than);
- if (ret < 0)
- goto err;
- /* Can't find anything newer */
- if (ret > 0)
- goto not_found;
- } else {
- ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
- if (ret < 0)
- goto err;
- }
- if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
- /*
- * If btrfs_search_slot() makes path to point beyond nritems,
- * we should not have an empty leaf, as this inode must at
- * least have its INODE_ITEM.
- */
- ASSERT(btrfs_header_nritems(path.nodes[0]));
- path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
- }
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- /* Perfect match, no need to go one slot back */
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
- key.offset == start)
- goto iterate;
-
- /* We didn't find a perfect match, needs to go one slot back */
- if (path.slots[0] > 0) {
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
- path.slots[0]--;
- }
-
-iterate:
- /* Iterate through the path to find a file extent covering @start */
- while (true) {
- u64 extent_end;
-
- if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
- goto next;
-
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
-
- /*
- * We may go one slot back to INODE_REF/XATTR item, then
- * need to go forward until we reach an EXTENT_DATA.
- * But we should still has the correct ino as key.objectid.
- */
- if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
- goto next;
-
- /* It's beyond our target range, definitely not extent found */
- if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
- goto not_found;
-
- /*
- * | |<- File extent ->|
- * \- start
- *
- * This means there is a hole between start and key.offset.
- */
- if (key.offset > start) {
- em->start = start;
- em->orig_start = start;
- em->block_start = EXTENT_MAP_HOLE;
- em->len = key.offset - start;
- break;
- }
-
- fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
- struct btrfs_file_extent_item);
- extent_end = btrfs_file_extent_end(&path);
-
- /*
- * |<- file extent ->| |
- * \- start
- *
- * We haven't reached start, search next slot.
- */
- if (extent_end <= start)
- goto next;
-
- /* Now this extent covers @start, convert it to em */
- btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
- break;
-next:
- ret = btrfs_next_item(root, &path);
- if (ret < 0)
- goto err;
- if (ret > 0)
- goto not_found;
- }
- btrfs_release_path(&path);
- return em;
-
-not_found:
- btrfs_release_path(&path);
- free_extent_map(em);
- return NULL;
-
-err:
- btrfs_release_path(&path);
- free_extent_map(em);
- return ERR_PTR(ret);
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
- u64 newer_than, bool locked)
-{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em;
- const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
-
- /*
- * hopefully we have this extent in the tree already, try without
- * the full extent lock
- */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, sectorsize);
- read_unlock(&em_tree->lock);
-
- /*
- * We can get a merged extent, in that case, we need to re-search
- * tree to get the original em for defrag.
- *
- * If @newer_than is 0 or em::generation < newer_than, we can trust
- * this em, as either we don't care about the generation, or the
- * merged extent map will be rejected anyway.
- */
- if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
- newer_than && em->generation >= newer_than) {
- free_extent_map(em);
- em = NULL;
- }
-
- if (!em) {
- struct extent_state *cached = NULL;
- u64 end = start + sectorsize - 1;
-
- /* get the big lock and read metadata off disk */
- if (!locked)
- lock_extent(io_tree, start, end, &cached);
- em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
- if (!locked)
- unlock_extent(io_tree, start, end, &cached);
-
- if (IS_ERR(em))
- return NULL;
- }
-
- return em;
-}
-
-static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
- const struct extent_map *em)
-{
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- return BTRFS_MAX_COMPRESSED;
- return fs_info->max_extent_size;
-}
-
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
- u32 extent_thresh, u64 newer_than, bool locked)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map *next;
- bool ret = false;
-
- /* this is the last extent */
- if (em->start + em->len >= i_size_read(inode))
- return false;
-
- /*
- * Here we need to pass @newer_then when checking the next extent, or
- * we will hit a case we mark current extent for defrag, but the next
- * one will not be a target.
- * This will just cause extra IO without really reducing the fragments.
- */
- next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
- /* No more em or hole */
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
- goto out;
- if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
- goto out;
- /*
- * If the next extent is at its max capacity, defragging current extent
- * makes no sense, as the total number of extents won't change.
- */
- if (next->len >= get_extent_max_capacity(fs_info, em))
- goto out;
- /* Skip older extent */
- if (next->generation < newer_than)
- goto out;
- /* Also check extent size */
- if (next->len >= extent_thresh)
- goto out;
-
- ret = true;
-out:
- free_extent_map(next);
- return ret;
-}
-
-/*
- * Prepare one page to be defragged.
- *
- * This will ensure:
- *
- * - Returned page is locked and has been set up properly.
- * - No ordered extent exists in the page.
- * - The page is uptodate.
- *
- * NOTE: Caller should also wait for page writeback after the cluster is
- * prepared, here we don't do writeback wait for each page.
- */
-static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
- pgoff_t index)
-{
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- gfp_t mask = btrfs_alloc_write_mask(mapping);
- u64 page_start = (u64)index << PAGE_SHIFT;
- u64 page_end = page_start + PAGE_SIZE - 1;
- struct extent_state *cached_state = NULL;
- struct page *page;
- int ret;
-
-again:
- page = find_or_create_page(mapping, index, mask);
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Since we can defragment files opened read-only, we can encounter
- * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
- * can't do I/O using huge pages yet, so return an error for now.
- * Filesystem transparent huge pages are typically only used for
- * executables that explicitly enable them, so this isn't very
- * restrictive.
- */
- if (PageCompound(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-ETXTBSY);
- }
-
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(ret);
- }
-
- /* Wait for any existing ordered extent in the range */
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
-
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * We unlocked the page above, so we need check if it was
- * released or not.
- */
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
- }
-
- /*
- * Now the page range has no ordered extent any more. Read the page to
- * make it uptodate.
- */
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EIO);
- }
- }
- return page;
-}
-
-struct defrag_target_range {
- struct list_head list;
- u64 start;
- u64 len;
-};
-
-/*
- * Collect all valid target extents.
- *
- * @start: file offset to lookup
- * @len: length to lookup
- * @extent_thresh: file extent size threshold, any extent size >= this value
- * will be ignored
- * @newer_than: only defrag extents newer than this value
- * @do_compress: whether the defrag is doing compression
- * if true, @extent_thresh will be ignored and all regular
- * file extents meeting @newer_than will be targets.
- * @locked: if the range has already held extent lock
- * @target_list: list of targets file extents
- */
-static int defrag_collect_targets(struct btrfs_inode *inode,
- u64 start, u64 len, u32 extent_thresh,
- u64 newer_than, bool do_compress,
- bool locked, struct list_head *target_list,
- u64 *last_scanned_ret)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- bool last_is_target = false;
- u64 cur = start;
- int ret = 0;
-
- while (cur < start + len) {
- struct extent_map *em;
- struct defrag_target_range *new;
- bool next_mergeable = true;
- u64 range_len;
-
- last_is_target = false;
- em = defrag_lookup_extent(&inode->vfs_inode, cur,
- newer_than, locked);
- if (!em)
- break;
-
- /*
- * If the file extent is an inlined one, we may still want to
- * defrag it (fallthrough) if it will cause a regular extent.
- * This is for users who want to convert inline extents to
- * regular ones through max_inline= mount option.
- */
- if (em->block_start == EXTENT_MAP_INLINE &&
- em->len <= inode->root->fs_info->max_inline)
- goto next;
-
- /* Skip hole/delalloc/preallocated extents */
- if (em->block_start == EXTENT_MAP_HOLE ||
- em->block_start == EXTENT_MAP_DELALLOC ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- goto next;
-
- /* Skip older extent */
- if (em->generation < newer_than)
- goto next;
-
- /* This em is under writeback, no need to defrag */
- if (em->generation == (u64)-1)
- goto next;
-
- /*
- * Our start offset might be in the middle of an existing extent
- * map, so take that into account.
- */
- range_len = em->len - (cur - em->start);
- /*
- * If this range of the extent map is already flagged for delalloc,
- * skip it, because:
- *
- * 1) We could deadlock later, when trying to reserve space for
- * delalloc, because in case we can't immediately reserve space
- * the flusher can start delalloc and wait for the respective
- * ordered extents to complete. The deadlock would happen
- * because we do the space reservation while holding the range
- * locked, and starting writeback, or finishing an ordered
- * extent, requires locking the range;
- *
- * 2) If there's delalloc there, it means there's dirty pages for
- * which writeback has not started yet (we clean the delalloc
- * flag when starting writeback and after creating an ordered
- * extent). If we mark pages in an adjacent range for defrag,
- * then we will have a larger contiguous range for delalloc,
- * very likely resulting in a larger extent after writeback is
- * triggered (except in a case of free space fragmentation).
- */
- if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC, 0, NULL))
- goto next;
-
- /*
- * For do_compress case, we want to compress all valid file
- * extents, thus no @extent_thresh or mergeable check.
- */
- if (do_compress)
- goto add;
-
- /* Skip too large extent */
- if (range_len >= extent_thresh)
- goto next;
-
- /*
- * Skip extents already at its max capacity, this is mostly for
- * compressed extents, which max cap is only 128K.
- */
- if (em->len >= get_extent_max_capacity(fs_info, em))
- goto next;
-
- /*
- * Normally there are no more extents after an inline one, thus
- * @next_mergeable will normally be false and not defragged.
- * So if an inline extent passed all above checks, just add it
- * for defrag, and be converted to regular extents.
- */
- if (em->block_start == EXTENT_MAP_INLINE)
- goto add;
-
- next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
- extent_thresh, newer_than, locked);
- if (!next_mergeable) {
- struct defrag_target_range *last;
-
- /* Empty target list, no way to merge with last entry */
- if (list_empty(target_list))
- goto next;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
- /* Not mergeable with last entry */
- if (last->start + last->len != cur)
- goto next;
-
- /* Mergeable, fall through to add it to @target_list. */
- }
-
-add:
- last_is_target = true;
- range_len = min(extent_map_end(em), start + len) - cur;
- /*
- * This one is a good target, check if it can be merged into
- * last range of the target list.
- */
- if (!list_empty(target_list)) {
- struct defrag_target_range *last;
-
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
- ASSERT(last->start + last->len <= cur);
- if (last->start + last->len == cur) {
- /* Mergeable, enlarge the last entry */
- last->len += range_len;
- goto next;
- }
- /* Fall through to allocate a new entry */
- }
-
- /* Allocate new defrag_target_range */
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new) {
- free_extent_map(em);
- ret = -ENOMEM;
- break;
- }
- new->start = cur;
- new->len = range_len;
- list_add_tail(&new->list, target_list);
-
-next:
- cur = extent_map_end(em);
- free_extent_map(em);
- }
- if (ret < 0) {
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
-
- list_for_each_entry_safe(entry, tmp, target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
- }
- if (!ret && last_scanned_ret) {
- /*
- * If the last extent is not a target, the caller can skip to
- * the end of that extent.
- * Otherwise, we can only go the end of the specified range.
- */
- if (!last_is_target)
- *last_scanned_ret = max(cur, *last_scanned_ret);
- else
- *last_scanned_ret = max(start + len, *last_scanned_ret);
- }
- return ret;
-}
-
-#define CLUSTER_SIZE (SZ_256K)
-static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
-
-/*
- * Defrag one contiguous target range.
- *
- * @inode: target inode
- * @target: target range to defrag
- * @pages: locked pages covering the defrag range
- * @nr_pages: number of locked pages
- *
- * Caller should ensure:
- *
- * - Pages are prepared
- * Pages should be locked, no ordered extent in the pages range,
- * no writeback.
- *
- * - Extent bits are locked
- */
-static int defrag_one_locked_target(struct btrfs_inode *inode,
- struct defrag_target_range *target,
- struct page **pages, int nr_pages,
- struct extent_state **cached_state)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_changeset *data_reserved = NULL;
- const u64 start = target->start;
- const u64 len = target->len;
- unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
- unsigned long start_index = start >> PAGE_SHIFT;
- unsigned long first_index = page_index(pages[0]);
- int ret = 0;
- int i;
-
- ASSERT(last_index - first_index + 1 <= nr_pages);
-
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
- if (ret < 0)
- return ret;
- clear_extent_bit(&inode->io_tree, start, start + len - 1,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, cached_state);
- set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
-
- /* Update the page status */
- for (i = start_index - first_index; i <= last_index - first_index; i++) {
- ClearPageChecked(pages[i]);
- btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
- }
- btrfs_delalloc_release_extents(inode, len);
- extent_changeset_free(data_reserved);
-
- return ret;
-}
-
-static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
- u32 extent_thresh, u64 newer_than, bool do_compress,
- u64 *last_scanned_ret)
-{
- struct extent_state *cached_state = NULL;
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
- LIST_HEAD(target_list);
- struct page **pages;
- const u32 sectorsize = inode->root->fs_info->sectorsize;
- u64 last_index = (start + len - 1) >> PAGE_SHIFT;
- u64 start_index = start >> PAGE_SHIFT;
- unsigned int nr_pages = last_index - start_index + 1;
- int ret = 0;
- int i;
-
- ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
- ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
-
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages)
- return -ENOMEM;
-
- /* Prepare all pages */
- for (i = 0; i < nr_pages; i++) {
- pages[i] = defrag_prepare_one_page(inode, start_index + i);
- if (IS_ERR(pages[i])) {
- ret = PTR_ERR(pages[i]);
- pages[i] = NULL;
- goto free_pages;
- }
- }
- for (i = 0; i < nr_pages; i++)
- wait_on_page_writeback(pages[i]);
-
- /* Lock the pages range */
- lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
- /*
- * Now we have a consistent view about the extent map, re-check
- * which range really needs to be defragged.
- *
- * And this time we have extent locked already, pass @locked = true
- * so that we won't relock the extent range and cause deadlock.
- */
- ret = defrag_collect_targets(inode, start, len, extent_thresh,
- newer_than, do_compress, true,
- &target_list, last_scanned_ret);
- if (ret < 0)
- goto unlock_extent;
-
- list_for_each_entry(entry, &target_list, list) {
- ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
- &cached_state);
- if (ret < 0)
- break;
- }
-
- list_for_each_entry_safe(entry, tmp, &target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
-unlock_extent:
- unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
-free_pages:
- for (i = 0; i < nr_pages; i++) {
- if (pages[i]) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
- }
- kfree(pages);
- return ret;
-}
-
-static int defrag_one_cluster(struct btrfs_inode *inode,
- struct file_ra_state *ra,
- u64 start, u32 len, u32 extent_thresh,
- u64 newer_than, bool do_compress,
- unsigned long *sectors_defragged,
- unsigned long max_sectors,
- u64 *last_scanned_ret)
-{
- const u32 sectorsize = inode->root->fs_info->sectorsize;
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
- LIST_HEAD(target_list);
- int ret;
-
- ret = defrag_collect_targets(inode, start, len, extent_thresh,
- newer_than, do_compress, false,
- &target_list, NULL);
- if (ret < 0)
- goto out;
-
- list_for_each_entry(entry, &target_list, list) {
- u32 range_len = entry->len;
-
- /* Reached or beyond the limit */
- if (max_sectors && *sectors_defragged >= max_sectors) {
- ret = 1;
- break;
- }
-
- if (max_sectors)
- range_len = min_t(u32, range_len,
- (max_sectors - *sectors_defragged) * sectorsize);
-
- /*
- * If defrag_one_range() has updated last_scanned_ret,
- * our range may already be invalid (e.g. hole punched).
- * Skip if our range is before last_scanned_ret, as there is
- * no need to defrag the range anymore.
- */
- if (entry->start + range_len <= *last_scanned_ret)
- continue;
-
- if (ra)
- page_cache_sync_readahead(inode->vfs_inode.i_mapping,
- ra, NULL, entry->start >> PAGE_SHIFT,
- ((entry->start + range_len - 1) >> PAGE_SHIFT) -
- (entry->start >> PAGE_SHIFT) + 1);
- /*
- * Here we may not defrag any range if holes are punched before
- * we locked the pages.
- * But that's fine, it only affects the @sectors_defragged
- * accounting.
- */
- ret = defrag_one_range(inode, entry->start, range_len,
- extent_thresh, newer_than, do_compress,
- last_scanned_ret);
- if (ret < 0)
- break;
- *sectors_defragged += range_len >>
- inode->root->fs_info->sectorsize_bits;
- }
-out:
- list_for_each_entry_safe(entry, tmp, &target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
- if (ret >= 0)
- *last_scanned_ret = max(*last_scanned_ret, start + len);
- return ret;
-}
-
-/*
- * Entry point to file defragmentation.
- *
- * @inode: inode to be defragged
- * @ra: readahead state (can be NUL)
- * @range: defrag options including range and flags
- * @newer_than: minimum transid to defrag
- * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
- * will be defragged.
- *
- * Return <0 for error.
- * Return >=0 for the number of sectors defragged, and range->start will be updated
- * to indicate the file offset where next defrag should be started at.
- * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
- * defragging all the range).
- */
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
- struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_to_defrag)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- unsigned long sectors_defragged = 0;
- u64 isize = i_size_read(inode);
- u64 cur;
- u64 last_byte;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
- bool ra_allocated = false;
- int compress_type = BTRFS_COMPRESS_ZLIB;
- int ret = 0;
- u32 extent_thresh = range->extent_thresh;
- pgoff_t start_index;
-
- if (isize == 0)
- return 0;
-
- if (range->start >= isize)
- return -EINVAL;
-
- if (do_compress) {
- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
- return -EINVAL;
- if (range->compress_type)
- compress_type = range->compress_type;
- }
-
- if (extent_thresh == 0)
- extent_thresh = SZ_256K;
-
- if (range->start + range->len > range->start) {
- /* Got a specific range */
- last_byte = min(isize, range->start + range->len);
- } else {
- /* Defrag until file end */
- last_byte = isize;
- }
-
- /* Align the range */
- cur = round_down(range->start, fs_info->sectorsize);
- last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
-
- /*
- * If we were not given a ra, allocate a readahead context. As
- * readahead is just an optimization, defrag will work without it so
- * we don't error out.
- */
- if (!ra) {
- ra_allocated = true;
- ra = kzalloc(sizeof(*ra), GFP_KERNEL);
- if (ra)
- file_ra_state_init(ra, inode->i_mapping);
- }
-
- /*
- * Make writeback start from the beginning of the range, so that the
- * defrag range can be written sequentially.
- */
- start_index = cur >> PAGE_SHIFT;
- if (start_index < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = start_index;
-
- while (cur < last_byte) {
- const unsigned long prev_sectors_defragged = sectors_defragged;
- u64 last_scanned = cur;
- u64 cluster_end;
-
- if (btrfs_defrag_cancelled(fs_info)) {
- ret = -EAGAIN;
- break;
- }
-
- /* We want the cluster end at page boundary when possible */
- cluster_end = (((cur >> PAGE_SHIFT) +
- (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
- cluster_end = min(cluster_end, last_byte);
-
- btrfs_inode_lock(inode, 0);
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- btrfs_inode_unlock(inode, 0);
- break;
- }
- if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
- btrfs_inode_unlock(inode, 0);
- break;
- }
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
- cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress, &sectors_defragged,
- max_to_defrag, &last_scanned);
-
- if (sectors_defragged > prev_sectors_defragged)
- balance_dirty_pages_ratelimited(inode->i_mapping);
-
- btrfs_inode_unlock(inode, 0);
- if (ret < 0)
- break;
- cur = max(cluster_end + 1, last_scanned);
- if (ret > 0) {
- ret = 0;
- break;
- }
- cond_resched();
- }
-
- if (ra_allocated)
- kfree(ra);
- /*
- * Update range.start for autodefrag, this will indicate where to start
- * in next run.
- */
- range->start = cur;
- if (sectors_defragged) {
- /*
- * We have defragged some sectors, for compression case they
- * need to be written back immediately.
- */
- if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
- filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
- }
- if (range->compress_type == BTRFS_COMPRESS_LZO)
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
- ret = sectors_defragged;
- }
- if (do_compress) {
- btrfs_inode_lock(inode, 0);
- BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
- btrfs_inode_unlock(inode, 0);
- }
- return ret;
-}
-
-/*
* Try to start exclusive operation @type or cancel it if it's running.
*
* Return:
@@ -2119,7 +1228,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret == 0 && new_size != old_size)
btrfs_info_in_rcu(fs_info,
"resize device %s (devid %llu) from %llu to %llu",
- rcu_str_deref(device->name), device->devid,
+ btrfs_dev_name(device), device->devid,
old_size, new_size);
out_finish:
btrfs_exclop_finish(fs_info);
@@ -3105,6 +2214,8 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
}
}
+ btrfs_free_path(path);
+ path = NULL;
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
ret = -EFAULT;
@@ -3194,6 +2305,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
}
out:
+ btrfs_free_path(path);
+
if (!ret || ret == -EOVERFLOW) {
rootrefs->num_items = found;
/* update min_treeid for next search */
@@ -3205,7 +2318,6 @@ out:
}
kfree(rootrefs);
- btrfs_free_path(path);
return ret;
}
@@ -3271,7 +2383,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dentry = btrfs_get_dentry(fs_info->sb,
BTRFS_FIRST_FREE_OBJECTID,
- vol_args2->subvolid, 0, 0);
+ vol_args2->subvolid, 0);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_drop_write;
@@ -3416,16 +2528,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- btrfs_inode_lock(inode, 0);
- err = btrfs_delete_subvolume(dir, dentry);
- btrfs_inode_unlock(inode, 0);
+ btrfs_inode_lock(BTRFS_I(inode), 0);
+ err = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
if (!err)
d_delete_notify(dir, dentry);
out_dput:
dput(dentry);
out_unlock_dir:
- btrfs_inode_unlock(dir, 0);
+ btrfs_inode_unlock(BTRFS_I(dir), 0);
free_subvol_name:
kfree(subvol_name_ptr);
free_parent:
@@ -3747,13 +2859,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
di_args->bytes_used = btrfs_device_get_bytes_used(dev);
di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
- if (dev->name) {
- strncpy(di_args->path, rcu_str_deref(dev->name),
- sizeof(di_args->path) - 1);
- di_args->path[sizeof(di_args->path) - 1] = 0;
- } else {
+ if (dev->name)
+ strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path));
+ else
di_args->path[0] = '\0';
- }
out:
rcu_read_unlock();
@@ -3774,6 +2883,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
struct btrfs_trans_handle *trans;
struct btrfs_path *path = NULL;
struct btrfs_disk_key disk_key;
+ struct fscrypt_str name = FSTR_INIT("default", 7);
u64 objectid = 0;
u64 dir_id;
int ret;
@@ -3817,7 +2927,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
- dir_id, "default", 7, 1);
+ dir_id, &name, 1);
if (IS_ERR_OR_NULL(di)) {
btrfs_release_path(path);
btrfs_end_transaction(trans);
@@ -4231,6 +3341,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
ipath->fspath->val[i] = rel_ptr;
}
+ btrfs_free_path(path);
+ path = NULL;
ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
ipath->fspath, size);
if (ret) {
@@ -4281,21 +3393,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
size = min_t(u32, loi->size, SZ_16M);
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
- inodes = NULL;
- goto out;
+ goto out_loi;
}
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
inodes, ignore_offset);
+ btrfs_free_path(path);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -4307,7 +3418,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
ret = -EFAULT;
out:
- btrfs_free_path(path);
kvfree(inodes);
out_loi:
kfree(loi);
@@ -4338,7 +3448,7 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->balance_lock);
}
-/**
+/*
* Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as
* required.
*
@@ -5283,7 +4393,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
goto out_acct;
}
- ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
&iov, &iter);
if (ret < 0)
goto out_acct;
@@ -5382,7 +4492,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
if (args.len > args.unencoded_len - args.unencoded_offset)
goto out_acct;
- ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
&iov, &iter);
if (ret < 0)
goto out_acct;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..8a855d5ac2fa
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_IOCTL_H
+#define BTRFS_IOCTL_H
+
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
+int btrfs_ioctl_get_supported_features(void __user *arg);
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
+int __pure btrfs_is_empty_uuid(u8 *uuid);
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_balance_args *bargs);
+
+#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 0eab3cb274a1..870528d87526 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -12,6 +12,7 @@
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
+#include "accessors.h"
/*
* Lockdep class keys for extent_buffer->lock's in this root. For a given
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 490c7a79e995..11c2269b4b6f 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -78,6 +78,82 @@ enum btrfs_lock_nesting {
BTRFS_NESTING_MAX,
};
+enum btrfs_lockdep_trans_states {
+ BTRFS_LOCKDEP_TRANS_COMMIT_START,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
+ BTRFS_LOCKDEP_TRANS_COMPLETED,
+};
+
+/*
+ * Lockdep annotation for wait events.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * This macro is used to annotate a wait event. In this case a thread acquires
+ * the lockdep map as writer (exclusive lock) because it has to block until all
+ * the threads that hold the lock as readers signal the condition for the wait
+ * event and release their locks.
+ */
+#define btrfs_might_wait_for_event(owner, lock) \
+ do { \
+ rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->lock##_map, _THIS_IP_); \
+ } while (0)
+
+/*
+ * Protection for the resource/condition of a wait event.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * Many threads can modify the condition for the wait event at the same time
+ * and signal the threads that block on the wait event. The threads that modify
+ * the condition and do the signaling acquire the lock as readers (shared
+ * lock).
+ */
+#define btrfs_lockdep_acquire(owner, lock) \
+ rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_)
+
+/*
+ * Used after signaling the condition for a wait event to release the lockdep
+ * map held by a reader thread.
+ */
+#define btrfs_lockdep_release(owner, lock) \
+ rwsem_release(&owner->lock##_map, _THIS_IP_)
+
+/*
+ * Macros for the transaction states wait events, similar to the generic wait
+ * event macros.
+ */
+#define btrfs_might_wait_for_state(owner, i) \
+ do { \
+ rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \
+ } while (0)
+
+#define btrfs_trans_state_lockdep_acquire(owner, i) \
+ rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_)
+
+#define btrfs_trans_state_lockdep_release(owner, i) \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_)
+
+/* Initialization of the lockdep map */
+#define btrfs_lockdep_init_map(owner, lock) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \
+ } while (0)
+
+/* Initialization of the transaction states lockdep maps. */
+#define btrfs_state_lockdep_init_map(owner, lock, state) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \
+ &lock##_key, 0); \
+ } while (0)
+
static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
"too many lock subclasses defined");
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 89bc5f825e0a..d5e78cbc8fbc 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -13,8 +13,10 @@
#include <linux/bio.h>
#include <linux/lzo.h>
#include <linux/refcount.h>
+#include "messages.h"
#include "compression.h"
#include "ctree.h"
+#include "super.h"
#define LZO_LEN 4
@@ -425,7 +427,7 @@ out:
return ret;
}
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen)
{
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
new file mode 100644
index 000000000000..625bbbbb2608
--- /dev/null
+++ b/fs/btrfs/messages.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "fs.h"
+#include "messages.h"
+#include "discard.h"
+#include "transaction.h"
+#include "space-info.h"
+#include "super.h"
+
+#ifdef CONFIG_PRINTK
+
+#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
+
+/*
+ * Characters to print to indicate error conditions or uncommon filesystem state.
+ * RO is not an error.
+ */
+static const char fs_state_chars[] = {
+ [BTRFS_FS_STATE_ERROR] = 'E',
+ [BTRFS_FS_STATE_REMOUNTING] = 'M',
+ [BTRFS_FS_STATE_RO] = 0,
+ [BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_DEV_REPLACING] = 'R',
+ [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
+ [BTRFS_FS_STATE_NO_CSUMS] = 'C',
+ [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
+};
+
+static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
+{
+ unsigned int bit;
+ bool states_printed = false;
+ unsigned long fs_state = READ_ONCE(info->fs_state);
+ char *curr = buf;
+
+ memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
+ curr += sizeof(STATE_STRING_PREFACE) - 1;
+
+ for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
+ WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
+ if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
+ *curr++ = fs_state_chars[bit];
+ states_printed = true;
+ }
+ }
+
+ /* If no states were printed, reset the buffer */
+ if (!states_printed)
+ curr = buf;
+
+ *curr++ = 0;
+}
+#endif
+
+/*
+ * Generally the error codes correspond to their respective errors, but there
+ * are a few special cases.
+ *
+ * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
+ * instance will return EUCLEAN if any of the blocks are corrupted in
+ * a way that is problematic. We want to reserve EUCLEAN for these
+ * sort of corruptions.
+ *
+ * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
+ * need to use EROFS for this case. We will have no idea of the
+ * original failure, that will have been reported at the time we tripped
+ * over the error. Each subsequent error that doesn't have any context
+ * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
+ */
+const char * __attribute_const__ btrfs_decode_error(int errno)
+{
+ char *errstr = "unknown";
+
+ switch (errno) {
+ case -ENOENT: /* -2 */
+ errstr = "No such entry";
+ break;
+ case -EIO: /* -5 */
+ errstr = "IO failure";
+ break;
+ case -ENOMEM: /* -12*/
+ errstr = "Out of memory";
+ break;
+ case -EEXIST: /* -17 */
+ errstr = "Object already exists";
+ break;
+ case -ENOSPC: /* -28 */
+ errstr = "No space left";
+ break;
+ case -EROFS: /* -30 */
+ errstr = "Readonly filesystem";
+ break;
+ case -EOPNOTSUPP: /* -95 */
+ errstr = "Operation not supported";
+ break;
+ case -EUCLEAN: /* -117 */
+ errstr = "Filesystem corrupted";
+ break;
+ case -EDQUOT: /* -122 */
+ errstr = "Quota exceeded";
+ break;
+ }
+
+ return errstr;
+}
+
+/*
+ * __btrfs_handle_fs_error decodes expected errors from the caller and
+ * invokes the appropriate error response.
+ */
+__cold
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ struct super_block *sb = fs_info->sb;
+#ifdef CONFIG_PRINTK
+ char statestr[STATE_STRING_BUF_LEN];
+ const char *errstr;
+#endif
+
+#ifdef CONFIG_PRINTK_INDEX
+ printk_index_subsys_emit(
+ "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", KERN_CRIT, fmt);
+#endif
+
+ /*
+ * Special case: if the error is EROFS, and we're already under
+ * SB_RDONLY, then it is safe here.
+ */
+ if (errno == -EROFS && sb_rdonly(sb))
+ return;
+
+#ifdef CONFIG_PRINTK
+ errstr = btrfs_decode_error(errno);
+ btrfs_state_to_string(fs_info, statestr);
+ if (fmt) {
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
+ sb->s_id, statestr, function, line, errno, errstr, &vaf);
+ va_end(args);
+ } else {
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
+ sb->s_id, statestr, function, line, errno, errstr);
+ }
+#endif
+
+ /*
+ * Today we only save the error info to memory. Long term we'll also
+ * send it down to the disk.
+ */
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+
+ /* Don't go through full error handling during mount. */
+ if (!(sb->s_flags & SB_BORN))
+ return;
+
+ if (sb_rdonly(sb))
+ return;
+
+ btrfs_discard_stop(fs_info);
+
+ /* Handle error by forcing the filesystem readonly. */
+ btrfs_set_sb_rdonly(sb);
+ btrfs_info(fs_info, "forced readonly");
+ /*
+ * Note that a running device replace operation is not canceled here
+ * although there is no way to update the progress. It would add the
+ * risk of a deadlock, therefore the canceling is omitted. The only
+ * penalty is that some I/O remains active until the procedure
+ * completes. The next time when the filesystem is mounted writable
+ * again, the device replace operation continues.
+ */
+}
+
+#ifdef CONFIG_PRINTK
+static const char * const logtypes[] = {
+ "emergency",
+ "alert",
+ "critical",
+ "error",
+ "warning",
+ "notice",
+ "info",
+ "debug",
+};
+
+/*
+ * Use one ratelimit state per log level so that a flood of less important
+ * messages doesn't cause more important ones to be dropped.
+ */
+static struct ratelimit_state printk_limits[] = {
+ RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
+};
+
+void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+ char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
+ struct va_format vaf;
+ va_list args;
+ int kern_level;
+ const char *type = logtypes[4];
+ struct ratelimit_state *ratelimit = &printk_limits[4];
+
+#ifdef CONFIG_PRINTK_INDEX
+ printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt);
+#endif
+
+ va_start(args, fmt);
+
+ while ((kern_level = printk_get_level(fmt)) != 0) {
+ size_t size = printk_skip_level(fmt) - fmt;
+
+ if (kern_level >= '0' && kern_level <= '7') {
+ memcpy(lvl, fmt, size);
+ lvl[size] = '\0';
+ type = logtypes[kern_level - '0'];
+ ratelimit = &printk_limits[kern_level - '0'];
+ }
+ fmt += size;
+ }
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (__ratelimit(ratelimit)) {
+ if (fs_info) {
+ char statestr[STATE_STRING_BUF_LEN];
+
+ btrfs_state_to_string(fs_info, statestr);
+ _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ fs_info->sb->s_id, statestr, &vaf);
+ } else {
+ _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ }
+ }
+
+ va_end(args);
+}
+#endif
+
+#ifdef CONFIG_BTRFS_ASSERT
+void __cold btrfs_assertfail(const char *expr, const char *file, int line)
+{
+ pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
+ BUG();
+}
+#endif
+
+void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info)
+{
+ btrfs_err(fs_info,
+"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel");
+}
+
+#if BITS_PER_LONG == 32
+void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
+ btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
+ btrfs_warn(fs_info,
+"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_warn(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+
+void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
+ btrfs_err(fs_info, "reached 32bit limit for logical addresses");
+ btrfs_err(fs_info,
+"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_err(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+#endif
+
+/*
+ * We only mark the transaction aborted and then set the file system read-only.
+ * This will prevent new transactions from starting or trying to join this
+ * one.
+ *
+ * This means that error recovery at the call site is limited to freeing
+ * any local memory allocations and passing the error code up without
+ * further cleanup. The transaction should complete as it normally would
+ * in the call path but will return -EIO.
+ *
+ * We'll complete the cleanup in btrfs_end_transaction and
+ * btrfs_commit_transaction.
+ */
+__cold
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ const char *function,
+ unsigned int line, int errno, bool first_hit)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ WRITE_ONCE(trans->aborted, errno);
+ WRITE_ONCE(trans->transaction->aborted, errno);
+ if (first_hit && errno == -ENOSPC)
+ btrfs_dump_space_info_for_trans_abort(fs_info);
+ /* Wake up anybody who may be waiting on this transaction */
+ wake_up(&fs_info->transaction_wait);
+ wake_up(&fs_info->transaction_blocked_wait);
+ __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
+}
+
+/*
+ * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an
+ * alert, and either panics or BUGs, depending on mount options.
+ */
+__cold
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ char *s_id = "<unknown>";
+ const char *errstr;
+ struct va_format vaf = { .fmt = fmt };
+ va_list args;
+
+ if (fs_info)
+ s_id = fs_info->sb->s_id;
+
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ errstr = btrfs_decode_error(errno);
+ if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
+ panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+ s_id, function, line, &vaf, errno, errstr);
+
+ btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
+ function, line, &vaf, errno, errstr);
+ va_end(args);
+ /* Caller calls BUG() */
+}
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
new file mode 100644
index 000000000000..190af1f698d9
--- /dev/null
+++ b/fs/btrfs/messages.h
@@ -0,0 +1,245 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_MESSAGES_H
+#define BTRFS_MESSAGES_H
+
+#include <linux/types.h>
+
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+
+static inline __printf(2, 3) __cold
+void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+}
+
+#ifdef CONFIG_PRINTK
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ _btrfs_printk(fs_info, fmt, ##args)
+
+__printf(2, 3)
+__cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
+#else
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ btrfs_no_printk(fs_info, fmt, ##args)
+#endif
+
+#define btrfs_emerg(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use printk_in_rcu
+ */
+#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk_in_rcu
+ */
+#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk
+ */
+#define btrfs_emerg_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
+
+#if defined(CONFIG_DYNAMIC_DEBUG)
+#define btrfs_debug(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#elif defined(DEBUG)
+#define btrfs_debug(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
+#else
+#define btrfs_debug(fs_info, fmt, args...) \
+ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+#endif
+
+#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_no_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk_ratelimited(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#ifdef CONFIG_BTRFS_ASSERT
+void __cold btrfs_assertfail(const char *expr, const char *file, int line);
+
+#define ASSERT(expr) \
+ (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))
+#else
+#define ASSERT(expr) (void)(expr)
+#endif
+
+void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info);
+
+__printf(5, 6)
+__cold
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+
+const char * __attribute_const__ btrfs_decode_error(int errno);
+
+__cold
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ const char *function,
+ unsigned int line, int errno, bool first_hit);
+
+bool __cold abort_should_print_stack(int errno);
+
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact stack trace is reported for some errors.
+ */
+#define btrfs_abort_transaction(trans, errno) \
+do { \
+ bool first = false; \
+ /* Report first abort since mount */ \
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
+ &((trans)->fs_info->fs_state))) { \
+ first = true; \
+ if (WARN(abort_should_print_stack(errno), \
+ KERN_ERR \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno))) { \
+ /* Stack trace printed. */ \
+ } else { \
+ btrfs_err((trans)->fs_info, \
+ "Transaction aborted (error %d)", \
+ (errno)); \
+ } \
+ } \
+ __btrfs_abort_transaction((trans), __func__, \
+ __LINE__, (errno), first); \
+} while (0)
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args)
+
+__printf(5, 6)
+__cold
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
+#define btrfs_panic(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ BUG(); \
+} while (0)
+
+#if BITS_PER_LONG == 32
+#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
+/*
+ * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
+ * addresses of extents.
+ *
+ * For 4K page size it's about 10T, for 64K it's 160T.
+ */
+#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
+void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
+void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
+#endif
+
+#endif
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index f9850edfd726..768583a440e1 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -10,6 +10,14 @@
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+/*
+ * Enumerate bits using enum autoincrement. Define the @name as the n-th bit.
+ */
+#define ENUM_BIT(name) \
+ __ ## name ## _BIT, \
+ name = (1U << __ ## name ## _BIT), \
+ __ ## name ## _SEQ = __ ## name ## _BIT
+
static inline void cond_wake_up(struct wait_queue_head *wq)
{
/*
@@ -32,22 +40,10 @@ static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
wake_up(wq);
}
-static inline u64 div_factor(u64 num, int factor)
+static inline u64 mult_perc(u64 num, u32 percent)
{
- if (factor == 10)
- return num;
- num *= factor;
- return div_u64(num, 10);
+ return div_u64(num * percent, 100);
}
-
-static inline u64 div_factor_fine(u64 num, int factor)
-{
- if (factor == 100)
- return num;
- num *= factor;
- return div_u64(num, 100);
-}
-
/* Copy of is_power_of_two that is 64bit safe */
static inline bool is_power_of_two_u64(u64 n)
{
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e54f8280031f..57d8c72737e1 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -7,6 +7,7 @@
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
+#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
@@ -17,6 +18,8 @@
#include "delalloc-space.h"
#include "qgroup.h"
#include "subpage.h"
+#include "file.h"
+#include "super.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -143,7 +146,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
-/**
+/*
* Add an ordered extent to the per-inode tree.
*
* @inode: Inode that this extent is for.
@@ -501,7 +504,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
- btrfs_add_delayed_iput(entry->inode);
+ btrfs_add_delayed_iput(BTRFS_I(entry->inode));
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -761,11 +764,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
struct btrfs_ordered_extent *ordered;
if (start + len < start) {
- orig_end = INT_LIMIT(loff_t);
+ orig_end = OFFSET_MAX;
} else {
orig_end = start + len - 1;
- if (orig_end > INT_LIMIT(loff_t))
- orig_end = INT_LIMIT(loff_t);
+ if (orig_end > OFFSET_MAX)
+ orig_end = OFFSET_MAX;
}
/* start IO across the range first to instantiate any delalloc
@@ -1019,17 +1022,18 @@ out:
}
/*
- * btrfs_flush_ordered_range - Lock the passed range and ensures all pending
- * ordered extents in it are run to completion.
+ * Lock the passed range and ensures all pending ordered extents in it are run
+ * to completion.
*
* @inode: Inode whose ordered tree is to be searched
* @start: Beginning of range to flush
* @end: Last byte of range to lock
* @cached_state: If passed, will return the extent state responsible for the
- * locked range. It's the caller's responsibility to free the cached state.
+ * locked range. It's the caller's responsibility to free the
+ * cached state.
*
- * This function always returns with the given range locked, ensuring after it's
- * called no order extent can be pending.
+ * Always return with the given range locked, ensuring after it's called no
+ * order extent can be pending.
*/
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
@@ -1069,11 +1073,12 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
* Return true if btrfs_lock_ordered_range does not return any extents,
* otherwise false.
*/
-bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state)
{
struct btrfs_ordered_extent *ordered;
- if (!try_lock_extent(&inode->io_tree, start, end))
+ if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
return false;
ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
@@ -1081,7 +1086,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
return true;
btrfs_put_ordered_extent(ordered);
- unlock_extent(&inode->io_tree, start, end, NULL);
+ unlock_extent(&inode->io_tree, start, end, cached_state);
return false;
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f59f2dbdb25e..89f82b78f590 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -206,7 +206,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
-bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end);
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state);
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
u64 post);
int __init ordered_data_init(void);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index aa534108c1e2..7a1b021b5669 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -5,6 +5,7 @@
#include "ctree.h"
#include "disk-io.h"
+#include "orphan.h"
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h
new file mode 100644
index 000000000000..3faab5cbb59a
--- /dev/null
+++ b/fs/btrfs/orphan.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ORPHAN_H
+#define BTRFS_ORPHAN_H
+
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+
+#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index dd8777872143..b93c96213304 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -3,9 +3,12 @@
* Copyright (C) 2007 Oracle. All rights reserved.
*/
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
+#include "accessors.h"
+#include "tree-checker.h"
struct root_name_map {
u64 id;
@@ -240,9 +243,9 @@ void btrfs_print_leaf(struct extent_buffer *l)
case BTRFS_DIR_ITEM_KEY:
di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(l, di, &found_key);
- pr_info("\t\tdir oid %llu type %u\n",
+ pr_info("\t\tdir oid %llu flags %u\n",
found_key.objectid,
- btrfs_dir_type(l, di));
+ btrfs_dir_flags(l, di));
break;
case BTRFS_ROOT_ITEM_KEY:
ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
@@ -384,14 +387,16 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
if (!follow)
return;
for (i = 0; i < nr; i++) {
- struct btrfs_key first_key;
+ struct btrfs_tree_parent_check check = {
+ .level = level - 1,
+ .transid = btrfs_node_ptr_generation(c, i),
+ .owner_root = btrfs_header_owner(c),
+ .has_first_key = true
+ };
struct extent_buffer *next;
- btrfs_node_key_to_cpu(c, &first_key, i);
- next = read_tree_block(fs_info, btrfs_node_blockptr(c, i),
- btrfs_header_owner(c),
- btrfs_node_ptr_generation(c, i),
- level - 1, &first_key);
+ btrfs_node_key_to_cpu(c, &check.first_key, i);
+ next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check);
if (IS_ERR(next))
continue;
if (!extent_buffer_uptodate(next)) {
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 055a631276ce..0755af0e53e3 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -4,12 +4,17 @@
*/
#include <linux/hashtable.h>
+#include "messages.h"
#include "props.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "ctree.h"
#include "xattr.h"
#include "compression.h"
+#include "space-info.h"
+#include "fs.h"
+#include "accessors.h"
+#include "super.h"
#define BTRFS_PROP_HANDLERS_HT_BITS 8
static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
@@ -453,7 +458,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
return 0;
}
-void __init btrfs_props_init(void)
+int __init btrfs_props_init(void)
{
int i;
@@ -463,5 +468,6 @@ void __init btrfs_props_init(void)
hash_add(prop_handlers_ht, &p->node, h);
}
+ return 0;
}
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index ca9dd3df129b..6e283196e38a 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -8,7 +8,7 @@
#include "ctree.h"
-void __init btrfs_props_init(void);
+int __init btrfs_props_init(void);
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const char *value, size_t value_len,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9334c3157c22..5c636e00d77d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -24,6 +24,11 @@
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "tree-checker.h"
/*
* Helpers to access qgroup reservation
@@ -1790,8 +1795,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
- struct ulist *old_root;
- u64 bytenr = qrecord->bytenr;
+ struct btrfs_backref_walk_ctx ctx = { 0 };
int ret;
/*
@@ -1818,8 +1822,10 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
return 0;
- ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
- true);
+ ctx.bytenr = qrecord->bytenr;
+ ctx.fs_info = trans->fs_info;
+
+ ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) {
qgroup_mark_inconsistent(trans->fs_info);
btrfs_warn(trans->fs_info,
@@ -1835,12 +1841,12 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
*
* So modifying qrecord->old_roots is safe here
*/
- qrecord->old_roots = old_root;
+ qrecord->old_roots = ctx.roots;
return 0;
}
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
- u64 num_bytes, gfp_t gfp_flag)
+ u64 num_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
@@ -1850,7 +1856,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
|| bytenr == 0 || num_bytes == 0)
return 0;
- record = kzalloc(sizeof(*record), gfp_flag);
+ record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record)
return -ENOMEM;
@@ -1902,8 +1908,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
- ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
- GFP_NOFS);
+ ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes);
if (ret)
return ret;
}
@@ -2102,12 +2107,11 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
* blocks for qgroup accounting.
*/
ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
- nodesize, GFP_NOFS);
+ nodesize);
if (ret < 0)
goto out;
- ret = btrfs_qgroup_trace_extent(trans,
- dst_path->nodes[dst_level]->start,
- nodesize, GFP_NOFS);
+ ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
+ nodesize);
if (ret < 0)
goto out;
@@ -2336,7 +2340,13 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
}
if (!extent_buffer_uptodate(root_eb)) {
- ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
+ struct btrfs_tree_parent_check check = {
+ .has_first_key = false,
+ .transid = root_gen,
+ .level = root_level
+ };
+
+ ret = btrfs_read_extent_buffer(root_eb, &check);
if (ret)
goto out;
}
@@ -2391,8 +2401,7 @@ walk_down:
path->locks[level] = BTRFS_READ_LOCK;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
- fs_info->nodesize,
- GFP_NOFS);
+ fs_info->nodesize);
if (ret)
goto out;
}
@@ -2749,17 +2758,22 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
if (!ret && !(fs_info->qgroup_flags &
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
+ struct btrfs_backref_walk_ctx ctx = { 0 };
+
+ ctx.bytenr = record->bytenr;
+ ctx.fs_info = fs_info;
+
/*
* Old roots should be searched when inserting qgroup
* extent record
*/
if (WARN_ON(!record->old_roots)) {
/* Search commit root to find old_roots */
- ret = btrfs_find_all_roots(NULL, fs_info,
- record->bytenr, 0,
- &record->old_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
goto cleanup;
+ record->old_roots = ctx.roots;
+ ctx.roots = NULL;
}
/* Free the reserved data space */
@@ -2772,10 +2786,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
* which doesn't lock tree or delayed_refs and search
* current root. It's safe inside commit_transaction().
*/
- ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
+ ctx.trans = trans;
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
goto cleanup;
+ new_roots = ctx.roots;
if (qgroup_to_skip) {
ulist_del(new_roots, qgroup_to_skip, 0);
ulist_del(record->old_roots, qgroup_to_skip,
@@ -2951,14 +2966,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
dstgroup->rsv_excl = inherit->lim.rsv_excl;
- ret = update_qgroup_limit_item(trans, dstgroup);
- if (ret) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_info(fs_info,
- "unable to update quota limit for %llu",
- dstgroup->qgroupid);
- goto unlock;
- }
+ qgroup_dirty(fs_info, dstgroup);
}
if (srcid) {
@@ -3248,7 +3256,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root;
struct btrfs_key found;
struct extent_buffer *scratch_leaf = NULL;
- struct ulist *roots = NULL;
u64 num_bytes;
bool done;
int slot;
@@ -3298,6 +3305,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
mutex_unlock(&fs_info->qgroup_rescan_lock);
for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
+ struct btrfs_backref_walk_ctx ctx = { 0 };
+
btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
if (found.type != BTRFS_EXTENT_ITEM_KEY &&
found.type != BTRFS_METADATA_ITEM_KEY)
@@ -3307,13 +3316,15 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
else
num_bytes = found.offset;
- ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
- &roots, false);
+ ctx.bytenr = found.objectid;
+ ctx.fs_info = fs_info;
+
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
goto out;
/* For rescan, just pass old_roots as NULL */
ret = btrfs_qgroup_account_extent(trans, found.objectid,
- num_bytes, NULL, roots);
+ num_bytes, NULL, ctx.roots);
if (ret < 0)
goto out;
}
@@ -4299,6 +4310,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct extent_buffer *subvol_eb)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_tree_parent_check check = { 0 };
struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
struct extent_buffer *reloc_eb = NULL;
@@ -4347,10 +4359,13 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
blocks->swapped = swapped;
spin_unlock(&blocks->lock);
+ check.level = block->level;
+ check.transid = block->reloc_generation;
+ check.has_first_key = true;
+ memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));
+
/* Read out reloc subtree root */
- reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
- block->reloc_generation, block->level,
- &block->first_key);
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check);
if (IS_ERR(reloc_eb)) {
ret = PTR_ERR(reloc_eb);
reloc_eb = NULL;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 578c77e94200..7bffa10589d6 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -11,6 +11,7 @@
#include <linux/kobject.h>
#include "ulist.h"
#include "delayed-ref.h"
+#include "misc.h"
/*
* Btrfs qgroup overview
@@ -242,9 +243,11 @@ static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
/*
* For qgroup event trace points only
*/
-#define QGROUP_RESERVE (1<<0)
-#define QGROUP_RELEASE (1<<1)
-#define QGROUP_FREE (1<<2)
+enum {
+ ENUM_BIT(QGROUP_RESERVE),
+ ENUM_BIT(QGROUP_RELEASE),
+ ENUM_BIT(QGROUP_FREE),
+};
int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
@@ -318,7 +321,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
* (NULL trans)
*/
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
- u64 num_bytes, gfp_t gfp_flag);
+ u64 num_bytes);
/*
* Inform qgroup to trace all leaf items of data
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 82c8e991300e..2d90a6b5eb00 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -13,12 +13,15 @@
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
+#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
+#include "file-item.h"
+#include "btrfs_inode.h"
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT 1
@@ -63,19 +66,45 @@ struct sector_ptr {
unsigned int uptodate:8;
};
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
-static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
-static void rmw_work(struct work_struct *work);
-static void read_rebuild_work(struct work_struct *work);
-static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
-static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
-static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void rmw_rbio_work(struct work_struct *work);
+static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
-static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
- int need_check);
-static void scrub_parity_work(struct work_struct *work);
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check);
+static void scrub_rbio_work_locked(struct work_struct *work);
+
+static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
+{
+ bitmap_free(rbio->error_bitmap);
+ kfree(rbio->stripe_pages);
+ kfree(rbio->bio_sectors);
+ kfree(rbio->stripe_sectors);
+ kfree(rbio->finish_pointers);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ if (!refcount_dec_and_test(&rbio->refs))
+ return;
+
+ WARN_ON(!list_empty(&rbio->stripe_cache));
+ WARN_ON(!list_empty(&rbio->hash_list));
+ WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i]) {
+ __free_page(rbio->stripe_pages[i]);
+ rbio->stripe_pages[i] = NULL;
+ }
+ }
+
+ btrfs_put_bioc(rbio->bioc);
+ free_raid_bio_pointers(rbio);
+ kfree(rbio);
+}
static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
@@ -146,8 +175,16 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
for (i = 0; i < rbio->nr_sectors; i++) {
/* Some range not covered by bio (partial write), skip it */
- if (!rbio->bio_sectors[i].page)
+ if (!rbio->bio_sectors[i].page) {
+ /*
+ * Even if the sector is not covered by bio, if it is
+ * a data sector it should still be uptodate as it is
+ * read from disk.
+ */
+ if (i < rbio->nr_data * rbio->stripe_nsectors)
+ ASSERT(rbio->stripe_sectors[i].uptodate);
continue;
+ }
ASSERT(rbio->stripe_sectors[i].page);
memcpy_page(rbio->stripe_sectors[i].page,
@@ -234,6 +271,21 @@ static void steal_rbio_page(struct btrfs_raid_bio *src,
dest->stripe_sectors[i].uptodate = true;
}
+static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
+{
+ const int sector_nr = (page_nr << PAGE_SHIFT) >>
+ rbio->bioc->fs_info->sectorsize_bits;
+
+ /*
+ * We have ensured PAGE_SIZE is aligned with sectorsize, thus
+ * we won't have a page which is half data half parity.
+ *
+ * Thus if the first sector of the page belongs to data stripes, then
+ * the full page belongs to data stripes.
+ */
+ return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
+}
+
/*
* Stealing an rbio means taking all the uptodate pages from the stripe array
* in the source rbio and putting them into the destination rbio.
@@ -244,16 +296,26 @@ static void steal_rbio_page(struct btrfs_raid_bio *src,
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
int i;
- struct page *s;
if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
return;
for (i = 0; i < dest->nr_pages; i++) {
- s = src->stripe_pages[i];
- if (!s || !full_page_sectors_uptodate(src, i))
+ struct page *p = src->stripe_pages[i];
+
+ /*
+ * We don't need to steal P/Q pages as they will always be
+ * regenerated for RMW or full write anyway.
+ */
+ if (!is_data_stripe_page(src, i))
continue;
+ /*
+ * If @src already has RBIO_CACHE_READY_BIT, it should have
+ * all data stripe pages present and uptodate.
+ */
+ ASSERT(p);
+ ASSERT(full_page_sectors_uptodate(src, i));
steal_rbio_page(src, dest, i);
}
index_stripe_sectors(dest);
@@ -336,7 +398,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
spin_unlock(&h->lock);
if (freeit)
- __free_raid_bio(rbio);
+ free_raid_bio(rbio);
}
/*
@@ -526,28 +588,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
- if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
+ if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
+ last->operation == BTRFS_RBIO_READ_REBUILD)
return 0;
- if (last->operation == BTRFS_RBIO_READ_REBUILD) {
- int fa = last->faila;
- int fb = last->failb;
- int cur_fa = cur->faila;
- int cur_fb = cur->failb;
-
- if (last->faila >= last->failb) {
- fa = last->failb;
- fb = last->faila;
- }
-
- if (cur->faila >= cur->failb) {
- cur_fa = cur->failb;
- cur_fb = cur->faila;
- }
-
- if (fa != cur_fa || fb != cur_fb)
- return 0;
- }
return 1;
}
@@ -684,10 +728,12 @@ out:
if (cache_drop)
remove_rbio_from_cache(cache_drop);
if (freeit)
- __free_raid_bio(freeit);
+ free_raid_bio(freeit);
return ret;
}
+static void recover_rbio_work_locked(struct work_struct *work);
+
/*
* called as rmw or parity rebuild is completed. If the plug list has more
* rbios waiting for this stripe, the next one on the list will be started
@@ -745,16 +791,16 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
spin_unlock_irqrestore(&h->lock, flags);
if (next->operation == BTRFS_RBIO_READ_REBUILD)
- start_async_work(next, read_rebuild_work);
+ start_async_work(next, recover_rbio_work_locked);
else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
steal_rbio(rbio, next);
- start_async_work(next, read_rebuild_work);
+ start_async_work(next, recover_rbio_work_locked);
} else if (next->operation == BTRFS_RBIO_WRITE) {
steal_rbio(rbio, next);
- start_async_work(next, rmw_work);
+ start_async_work(next, rmw_rbio_work_locked);
} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
steal_rbio(rbio, next);
- start_async_work(next, scrub_parity_work);
+ start_async_work(next, scrub_rbio_work_locked);
}
goto done_nolock;
@@ -769,28 +815,6 @@ done_nolock:
remove_rbio_from_cache(rbio);
}
-static void __free_raid_bio(struct btrfs_raid_bio *rbio)
-{
- int i;
-
- if (!refcount_dec_and_test(&rbio->refs))
- return;
-
- WARN_ON(!list_empty(&rbio->stripe_cache));
- WARN_ON(!list_empty(&rbio->hash_list));
- WARN_ON(!bio_list_empty(&rbio->bio_list));
-
- for (i = 0; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i]) {
- __free_page(rbio->stripe_pages[i]);
- rbio->stripe_pages[i] = NULL;
- }
- }
-
- btrfs_put_bioc(rbio->bioc);
- kfree(rbio);
-}
-
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
struct bio *next;
@@ -813,6 +837,11 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *extra;
+ kfree(rbio->csum_buf);
+ bitmap_free(rbio->csum_bitmap);
+ rbio->csum_buf = NULL;
+ rbio->csum_bitmap = NULL;
+
/*
* Clear the data bitmap, as the rbio may be cached for later usage.
* do this before before unlock_stripe() so there will be no new bio
@@ -830,7 +859,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
*/
unlock_stripe(rbio);
extra = bio_list_get(&rbio->bio_list);
- __free_raid_bio(rbio);
+ free_raid_bio(rbio);
rbio_endio_bio_list(cur, err);
if (extra)
@@ -838,36 +867,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
}
/*
- * end io function used by finish_rmw. When we finally
- * get here, we've written a full stripe
- */
-static void raid_write_end_io(struct bio *bio)
-{
- struct btrfs_raid_bio *rbio = bio->bi_private;
- blk_status_t err = bio->bi_status;
- int max_errors;
-
- if (err)
- fail_bio_stripe(rbio, bio);
-
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
-
- err = BLK_STS_OK;
-
- /* OK, we have read all the stripes we need to. */
- max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
- 0 : rbio->bioc->max_errors;
- if (atomic_read(&rbio->error) > max_errors)
- err = BLK_STS_IOERR;
-
- rbio_orig_end_io(rbio, err);
-}
-
-/**
- * Get a sector pointer specified by its @stripe_nr and @sector_nr
+ * Get a sector pointer specified by its @stripe_nr and @sector_nr.
*
* @rbio: The raid bio
* @stripe_nr: Stripe number, valid range [0, real_stripe)
@@ -919,7 +919,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
const unsigned int num_sectors = stripe_nsectors * real_stripes;
struct btrfs_raid_bio *rbio;
- void *p;
/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
@@ -929,16 +928,27 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
*/
ASSERT(stripe_nsectors <= BITS_PER_LONG);
- rbio = kzalloc(sizeof(*rbio) +
- sizeof(*rbio->stripe_pages) * num_pages +
- sizeof(*rbio->bio_sectors) * num_sectors +
- sizeof(*rbio->stripe_sectors) * num_sectors +
- sizeof(*rbio->finish_pointers) * real_stripes,
- GFP_NOFS);
+ rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
+ rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
+ GFP_NOFS);
+ rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
+ GFP_NOFS);
+ rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
+ GFP_NOFS);
+ rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
+ rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
+
+ if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
+ !rbio->finish_pointers || !rbio->error_bitmap) {
+ free_raid_bio_pointers(rbio);
+ kfree(rbio);
+ return ERR_PTR(-ENOMEM);
+ }
bio_list_init(&rbio->bio_list);
+ init_waitqueue_head(&rbio->io_wait);
INIT_LIST_HEAD(&rbio->plug_list);
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
@@ -950,27 +960,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
rbio->real_stripes = real_stripes;
rbio->stripe_npages = stripe_npages;
rbio->stripe_nsectors = stripe_nsectors;
- rbio->faila = -1;
- rbio->failb = -1;
refcount_set(&rbio->refs, 1);
- atomic_set(&rbio->error, 0);
atomic_set(&rbio->stripes_pending, 0);
- /*
- * The stripe_pages, bio_sectors, etc arrays point to the extra memory
- * we allocated past the end of the rbio.
- */
- p = rbio + 1;
-#define CONSUME_ALLOC(ptr, count) do { \
- ptr = p; \
- p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
- } while (0)
- CONSUME_ALLOC(rbio->stripe_pages, num_pages);
- CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
- CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
- CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
-#undef CONSUME_ALLOC
-
ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
@@ -1006,6 +998,45 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
}
/*
+ * Return the total numer of errors found in the vertical stripe of @sector_nr.
+ *
+ * @faila and @failb will also be updated to the first and second stripe
+ * number of the errors.
+ */
+static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
+ int *faila, int *failb)
+{
+ int stripe_nr;
+ int found_errors = 0;
+
+ if (faila || failb) {
+ /*
+ * Both @faila and @failb should be valid pointers if any of
+ * them is specified.
+ */
+ ASSERT(faila && failb);
+ *faila = -1;
+ *failb = -1;
+ }
+
+ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+ int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
+
+ if (test_bit(total_sector_nr, rbio->error_bitmap)) {
+ found_errors++;
+ if (faila) {
+ /* Update faila and failb. */
+ if (*faila < 0)
+ *faila = stripe_nr;
+ else if (*failb < 0)
+ *failb = stripe_nr;
+ }
+ }
+ }
+ return found_errors;
+}
+
+/*
* Add a single sector @sector into our list of bios for IO.
*
* Return 0 if everything went well.
@@ -1038,8 +1069,19 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
disk_start = stripe->physical + sector_nr * sectorsize;
/* if the device is missing, just fail this stripe */
- if (!stripe->dev->bdev)
- return fail_rbio_index(rbio, stripe_nr);
+ if (!stripe->dev->bdev) {
+ int found_errors;
+
+ set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
+ rbio->error_bitmap);
+
+ /* Check if we have reached tolerance early. */
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ NULL, NULL);
+ if (found_errors > rbio->bioc->max_errors)
+ return -EIO;
+ return 0;
+ }
/* see if we can add this page onto our existing bio */
if (last) {
@@ -1071,23 +1113,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
return 0;
}
-/*
- * while we're doing the read/modify/write cycle, we could
- * have errors in reading pages off the disk. This checks
- * for errors and if we're not able to read the page it'll
- * trigger parity reconstruction. The rmw will be finished
- * after we've reconstructed the failed stripes
- */
-static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
-{
- if (rbio->faila >= 0 || rbio->failb >= 0) {
- BUG_ON(rbio->faila == rbio->real_stripes - 1);
- __raid56_parity_recover(rbio);
- } else {
- finish_rmw(rbio);
- }
-}
-
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
@@ -1158,109 +1183,71 @@ not_found:
trace_info->stripe_nr = -1;
}
-/*
- * this is called from one of two situations. We either
- * have a full stripe from the higher layers, or we've read all
- * the missing bits off disk.
- *
- * This will calculate the parity and then send down any
- * changed blocks.
- */
-static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+/* Generate PQ for one veritical stripe. */
+static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
- struct btrfs_io_context *bioc = rbio->bioc;
- const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
- int nr_data = rbio->nr_data;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ struct sector_ptr *sector;
+ int stripe;
+ const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
+
+ /* First collect one sector from each data stripe */
+ for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
+ }
+
+ /* Then add the parity stripe */
+ sector = rbio_pstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
+
+ if (has_qstripe) {
+ /*
+ * RAID6, add the qstripe and call the library function
+ * to fill in our p/q
+ */
+ sector = rbio_qstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) +
+ sector->pgoff;
+
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
+ pointers);
+ } else {
+ /* raid5 */
+ memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
+ }
+ for (stripe = stripe - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
+}
+
+static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
/* The total sector number inside the full stripe. */
int total_sector_nr;
- int stripe;
- /* Sector number inside a stripe. */
int sectornr;
- bool has_qstripe;
- struct bio_list bio_list;
- struct bio *bio;
+ int stripe;
int ret;
- bio_list_init(&bio_list);
-
- if (rbio->real_stripes - rbio->nr_data == 1)
- has_qstripe = false;
- else if (rbio->real_stripes - rbio->nr_data == 2)
- has_qstripe = true;
- else
- BUG();
+ ASSERT(bio_list_size(bio_list) == 0);
/* We should have at least one data sector. */
ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
- /* at this point we either have a full stripe,
- * or we've read the full stripe from the drive.
- * recalculate the parity and write the new results.
- *
- * We're not allowed to add any new bios to the
- * bio list here, anyone else that wants to
- * change this stripe needs to do their own rmw.
- */
- spin_lock_irq(&rbio->bio_list_lock);
- set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
- spin_unlock_irq(&rbio->bio_list_lock);
-
- atomic_set(&rbio->error, 0);
-
/*
- * now that we've set rmw_locked, run through the
- * bio list one last time and map the page pointers
- *
- * We don't cache full rbios because we're assuming
- * the higher layers are unlikely to use this area of
- * the disk again soon. If they do use it again,
- * hopefully they will send another full bio.
+ * Reset errors, as we may have errors inherited from from degraded
+ * write.
*/
- index_rbio_pages(rbio);
- if (!rbio_is_full(rbio))
- cache_rbio_pages(rbio);
- else
- clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
-
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
-
- /* First collect one sector from each data stripe */
- for (stripe = 0; stripe < nr_data; stripe++) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
- }
-
- /* Then add the parity stripe */
- sector = rbio_pstripe_sector(rbio, sectornr);
- sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
-
- if (has_qstripe) {
- /*
- * RAID6, add the qstripe and call the library function
- * to fill in our p/q
- */
- sector = rbio_qstripe_sector(rbio, sectornr);
- sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) +
- sector->pgoff;
-
- raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
- pointers);
- } else {
- /* raid5 */
- memcpy(pointers[nr_data], pointers[0], sectorsize);
- run_xor(pointers + 1, nr_data - 1, sectorsize);
- }
- for (stripe = stripe - 1; stripe >= 0; stripe--)
- kunmap_local(pointers[stripe]);
- }
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
/*
- * Start writing. Make bios for everything from the higher layers (the
+ * Start assembly. Make bios for everything from the higher layers (the
* bio_list in our rbio) and our P/Q. Ignore everything else.
*/
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
@@ -1282,15 +1269,16 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_WRITE);
if (ret)
- goto cleanup;
+ goto error;
}
- if (likely(!bioc->num_tgtdevs))
- goto write_data;
+ if (likely(!rbio->bioc->num_tgtdevs))
+ return 0;
+ /* Make a copy for the replace target device. */
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
struct sector_ptr *sector;
@@ -1298,7 +1286,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
stripe = total_sector_nr / rbio->stripe_nsectors;
sectornr = total_sector_nr % rbio->stripe_nsectors;
- if (!bioc->tgtdev_map[stripe]) {
+ if (!rbio->bioc->tgtdev_map[stripe]) {
/*
* We can skip the whole stripe completely, note
* total_sector_nr will be increased by one anyway.
@@ -1320,125 +1308,52 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ ret = rbio_add_io_sector(rbio, bio_list, sector,
rbio->bioc->tgtdev_map[stripe],
sectornr, REQ_OP_WRITE);
if (ret)
- goto cleanup;
- }
-
-write_data:
- atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
- BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
-
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_write_end_io;
-
- if (trace_raid56_write_stripe_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
-
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_write_stripe(rbio, bio, &trace_info);
- }
- submit_bio(bio);
+ goto error;
}
- return;
-cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
- while ((bio = bio_list_pop(&bio_list)))
+ return 0;
+error:
+ while ((bio = bio_list_pop(bio_list)))
bio_put(bio);
+ return -EIO;
}
-/*
- * helper to find the stripe number for a given bio. Used to figure out which
- * stripe has failed. This expects the bio to correspond to a physical disk,
- * so it looks up based on physical sector numbers.
- */
-static int find_bio_stripe(struct btrfs_raid_bio *rbio,
- struct bio *bio)
-{
- u64 physical = bio->bi_iter.bi_sector;
- int i;
- struct btrfs_io_stripe *stripe;
-
- physical <<= 9;
-
- for (i = 0; i < rbio->bioc->num_stripes; i++) {
- stripe = &rbio->bioc->stripes[i];
- if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
- stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
- return i;
- }
- }
- return -1;
-}
-
-/*
- * helper to find the stripe number for a given
- * bio (before mapping). Used to figure out which stripe has
- * failed. This looks up based on logical block numbers.
- */
-static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
- struct bio *bio)
-{
- u64 logical = bio->bi_iter.bi_sector << 9;
- int i;
-
- for (i = 0; i < rbio->nr_data; i++) {
- u64 stripe_start = rbio->bioc->raid_map[i];
-
- if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
- return i;
- }
- return -1;
-}
-
-/*
- * returns -EIO if we had too many failures
- */
-static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- unsigned long flags;
- int ret = 0;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ rbio->bioc->raid_map[0];
+ int total_nr_sector = offset >> fs_info->sectorsize_bits;
- spin_lock_irqsave(&rbio->bio_list_lock, flags);
+ ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
- /* we already know this stripe is bad, move on */
- if (rbio->faila == failed || rbio->failb == failed)
- goto out;
+ bitmap_set(rbio->error_bitmap, total_nr_sector,
+ bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
- if (rbio->faila == -1) {
- /* first failure on this rbio */
- rbio->faila = failed;
- atomic_inc(&rbio->error);
- } else if (rbio->failb == -1) {
- /* second failure on this rbio */
- rbio->failb = failed;
- atomic_inc(&rbio->error);
- } else {
- ret = -EIO;
+ /*
+ * Special handling for raid56_alloc_missing_rbio() used by
+ * scrub/replace. Unlike call path in raid56_parity_recover(), they
+ * pass an empty bio here. Thus we have to find out the missing device
+ * and mark the stripe error instead.
+ */
+ if (bio->bi_iter.bi_size == 0) {
+ bool found_missing = false;
+ int stripe_nr;
+
+ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+ if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
+ found_missing = true;
+ bitmap_set(rbio->error_bitmap,
+ stripe_nr * rbio->stripe_nsectors,
+ rbio->stripe_nsectors);
+ }
+ }
+ ASSERT(found_missing);
}
-out:
- spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
-
- return ret;
-}
-
-/*
- * helper to fail a stripe based on a physical disk
- * bio.
- */
-static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
- struct bio *bio)
-{
- int failed = find_bio_stripe(rbio, bio);
-
- if (failed < 0)
- return -EIO;
-
- return fail_rbio_index(rbio, failed);
}
/*
@@ -1486,191 +1401,163 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
}
}
-static void raid56_bio_end_io(struct bio *bio)
+static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(rbio, bio);
+ struct bio_vec *bv = bio_first_bvec_all(bio);
+ int i;
- bio_put(bio);
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ struct sector_ptr *sector;
- if (atomic_dec_and_test(&rbio->stripes_pending))
- queue_work(rbio->bioc->fs_info->endio_raid56_workers,
- &rbio->end_io_work);
+ sector = &rbio->stripe_sectors[i];
+ if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ break;
+ sector = &rbio->bio_sectors[i];
+ if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ break;
+ }
+ ASSERT(i < rbio->nr_sectors);
+ return i;
}
-/*
- * End io handler for the read phase of the RMW cycle. All the bios here are
- * physical stripe bios we've read from the disk so we can recalculate the
- * parity of the stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid56_rmw_end_io_work(struct work_struct *work)
+static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- struct btrfs_raid_bio *rbio =
- container_of(work, struct btrfs_raid_bio, end_io_work);
+ int total_sector_nr = get_bio_sector_nr(rbio, bio);
+ u32 bio_size = 0;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
- return;
- }
+ bio_for_each_segment_all(bvec, bio, iter_all)
+ bio_size += bvec->bv_len;
- /*
- * This will normally call finish_rmw to start our write but if there
- * are any failed stripes we'll reconstruct from parity first.
- */
- validate_rbio_for_rmw(rbio);
+ bitmap_set(rbio->error_bitmap, total_sector_nr,
+ bio_size >> rbio->bioc->fs_info->sectorsize_bits);
}
-/*
- * the stripe must be locked by the caller. It will
- * unlock after all the writes are done
- */
-static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+/* Verify the data sectors at read time. */
+static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
{
- int bios_to_read = 0;
- struct bio_list bio_list;
- const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
- int ret;
- int total_sector_nr;
- struct bio *bio;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ int total_sector_nr = get_bio_sector_nr(rbio, bio);
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_list_init(&bio_list);
+ /* No data csum for the whole stripe, no need to verify. */
+ if (!rbio->csum_bitmap || !rbio->csum_buf)
+ return;
- ret = alloc_rbio_pages(rbio);
- if (ret)
- goto cleanup;
+ /* P/Q stripes, they have no data csum to verify against. */
+ if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
+ return;
- index_rbio_pages(rbio);
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ int bv_offset;
+
+ for (bv_offset = bvec->bv_offset;
+ bv_offset < bvec->bv_offset + bvec->bv_len;
+ bv_offset += fs_info->sectorsize, total_sector_nr++) {
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ u8 *expected_csum = rbio->csum_buf +
+ total_sector_nr * fs_info->csum_size;
+ int ret;
- atomic_set(&rbio->error, 0);
- /* Build a list of bios to read all the missing data sectors. */
- for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
- total_sector_nr++) {
- struct sector_ptr *sector;
- int stripe = total_sector_nr / rbio->stripe_nsectors;
- int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ /* No csum for this sector, skip to the next sector. */
+ if (!test_bit(total_sector_nr, rbio->csum_bitmap))
+ continue;
- /*
- * We want to find all the sectors missing from the rbio and
- * read them from the disk. If sector_in_rbio() finds a page
- * in the bio list we don't need to read it off the stripe.
- */
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (sector)
- continue;
+ ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
+ bv_offset, csum_buf, expected_csum);
+ if (ret < 0)
+ set_bit(total_sector_nr, rbio->error_bitmap);
+ }
+ }
+}
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- /*
- * The bio cache may have handed us an uptodate page. If so,
- * use it.
- */
- if (sector->uptodate)
- continue;
+static void raid_wait_read_end_io(struct bio *bio)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, REQ_OP_READ);
- if (ret)
- goto cleanup;
+ if (bio->bi_status) {
+ rbio_update_error_bitmap(rbio, bio);
+ } else {
+ set_bio_pages_uptodate(rbio, bio);
+ verify_bio_data_sectors(rbio, bio);
}
- bios_to_read = bio_list_size(&bio_list);
- if (!bios_to_read) {
- /*
- * this can happen if others have merged with
- * us, it means there is nothing left to read.
- * But if there are missing devices it may not be
- * safe to do the full stripe write yet.
- */
- goto finish;
- }
+ bio_put(bio);
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ wake_up(&rbio->io_wait);
+}
- /*
- * The bioc may be freed once we submit the last bio. Make sure not to
- * touch it after that.
- */
- atomic_set(&rbio->stripes_pending, bios_to_read);
- INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_bio_end_io;
+static void submit_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
+
+ atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
+ while ((bio = bio_list_pop(bio_list))) {
+ bio->bi_end_io = raid_wait_read_end_io;
- if (trace_raid56_read_partial_enabled()) {
+ if (trace_raid56_scrub_read_recover_enabled()) {
struct raid56_bio_trace_info trace_info = { 0 };
bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_read_partial(rbio, bio, &trace_info);
+ trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
}
submit_bio(bio);
}
- /* the actual write will happen once the reads are done */
- return 0;
+}
-cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
+static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
+ int total_sector_nr;
+ int ret = 0;
- while ((bio = bio_list_pop(&bio_list)))
- bio_put(bio);
+ ASSERT(bio_list_size(bio_list) == 0);
- return -EIO;
+ /*
+ * Build a list of bios to read all sectors (including data and P/Q).
+ *
+ * This behaviro is to compensate the later csum verification and
+ * recovery.
+ */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
-finish:
- validate_rbio_for_rmw(rbio);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ ret = rbio_add_io_sector(rbio, bio_list, sector,
+ stripe, sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
+ }
return 0;
+
+cleanup:
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);
+ return ret;
}
-/*
- * if the upper layers pass in a full stripe, we thank them by only allocating
- * enough pages to hold the parity, and sending it all down quickly.
- */
-static int full_stripe_write(struct btrfs_raid_bio *rbio)
+static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
{
+ const int data_pages = rbio->nr_data * rbio->stripe_npages;
int ret;
- ret = alloc_rbio_parity_pages(rbio);
- if (ret)
+ ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+ if (ret < 0)
return ret;
- ret = lock_stripe_add(rbio);
- if (ret == 0)
- finish_rmw(rbio);
- return 0;
-}
-
-/*
- * partial stripe writes get handed over to async helpers.
- * We're really hoping to merge a few more writes into this
- * rbio before calculating new parity
- */
-static int partial_stripe_write(struct btrfs_raid_bio *rbio)
-{
- int ret;
-
- ret = lock_stripe_add(rbio);
- if (ret == 0)
- start_async_work(rbio, rmw_work);
+ index_stripe_sectors(rbio);
return 0;
}
/*
- * sometimes while we were reading from the drive to
- * recalculate parity, enough new bios come into create
- * a full stripe. So we do a check here to see if we can
- * go directly to finish_rmw
- */
-static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
-{
- /* head off into rmw land if we don't have a full stripe */
- if (!rbio_is_full(rbio))
- return partial_stripe_write(rbio);
- return full_stripe_write(rbio);
-}
-
-/*
* We use plugging call backs to collect full stripes.
* Any time we get a partial stripe write while plugged
* we collect it into a list. When the unplug comes down,
@@ -1704,71 +1591,39 @@ static int plug_cmp(void *priv, const struct list_head *a,
return 0;
}
-static void run_plug(struct btrfs_plug_cb *plug)
+static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
+ struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *last = NULL;
- /*
- * sort our plug list then try to merge
- * everything we can in hopes of creating full
- * stripes.
- */
list_sort(NULL, &plug->rbio_list, plug_cmp);
+
while (!list_empty(&plug->rbio_list)) {
cur = list_entry(plug->rbio_list.next,
struct btrfs_raid_bio, plug_list);
list_del_init(&cur->plug_list);
if (rbio_is_full(cur)) {
- int ret;
-
- /* we have a full stripe, send it down */
- ret = full_stripe_write(cur);
- BUG_ON(ret);
+ /* We have a full stripe, queue it down. */
+ start_async_work(cur, rmw_rbio_work);
continue;
}
if (last) {
if (rbio_can_merge(last, cur)) {
merge_rbio(last, cur);
- __free_raid_bio(cur);
+ free_raid_bio(cur);
continue;
-
}
- __raid56_parity_write(last);
+ start_async_work(last, rmw_rbio_work);
}
last = cur;
}
- if (last) {
- __raid56_parity_write(last);
- }
+ if (last)
+ start_async_work(last, rmw_rbio_work);
kfree(plug);
}
-/*
- * if the unplug comes from schedule, we have to push the
- * work off to a helper thread
- */
-static void unplug_work(struct work_struct *work)
-{
- struct btrfs_plug_cb *plug;
- plug = container_of(work, struct btrfs_plug_cb, work);
- run_plug(plug);
-}
-
-static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
-{
- struct btrfs_plug_cb *plug;
- plug = container_of(cb, struct btrfs_plug_cb, cb);
-
- if (from_schedule) {
- INIT_WORK(&plug->work, unplug_work);
- queue_work(plug->info->rmw_workers, &plug->work);
- return;
- }
- run_plug(plug);
-}
-
/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
@@ -1816,19 +1671,13 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
rbio_add_bio(rbio, bio);
/*
- * don't plug on full rbios, just get them out the door
+ * Don't plug on full rbios, just get them out the door
* as quickly as we can
*/
- if (rbio_is_full(rbio)) {
- ret = full_stripe_write(rbio);
- if (ret) {
- __free_raid_bio(rbio);
- goto fail;
- }
- return;
- }
+ if (rbio_is_full(rbio))
+ goto queue_rbio;
- cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
+ cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
if (cb) {
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (!plug->info) {
@@ -1836,13 +1685,14 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
INIT_LIST_HEAD(&plug->rbio_list);
}
list_add_tail(&rbio->plug_list, &plug->rbio_list);
- } else {
- ret = __raid56_parity_write(rbio);
- if (ret) {
- __free_raid_bio(rbio);
- goto fail;
- }
+ return;
}
+queue_rbio:
+ /*
+ * Either we don't have any existing plug, or we're doing a full stripe,
+ * can queue the rmw work now.
+ */
+ start_async_work(rbio, rmw_rbio_work);
return;
@@ -1851,268 +1701,254 @@ fail:
bio_endio(bio);
}
-/*
- * all parity reconstruction happens here. We've read in everything
- * we can find from the drives and this does the heavy lifting of
- * sorting the good from the bad.
- */
-static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+static int verify_one_sector(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- int sectornr, stripe;
- void **pointers;
- void **unmap_array;
- int faila = -1, failb = -1;
- blk_status_t err;
- int i;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ struct sector_ptr *sector;
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ u8 *csum_expected;
+ int ret;
- /*
- * This array stores the pointer for each sector, thus it has the extra
- * pgoff value added from each sector
- */
- pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
- if (!pointers) {
- err = BLK_STS_RESOURCE;
- goto cleanup_io;
- }
+ if (!rbio->csum_bitmap || !rbio->csum_buf)
+ return 0;
+ /* No way to verify P/Q as they are not covered by data csum. */
+ if (stripe_nr >= rbio->nr_data)
+ return 0;
/*
- * Store copy of pointers that does not get reordered during
- * reconstruction so that kunmap_local works.
+ * If we're rebuilding a read, we have to use pages from the
+ * bio list if possible.
*/
- unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
- if (!unmap_array) {
- err = BLK_STS_RESOURCE;
- goto cleanup_pointers;
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+ sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
}
- faila = rbio->faila;
- failb = rbio->failb;
+ ASSERT(sector->page);
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
- spin_lock_irq(&rbio->bio_list_lock);
- set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
- spin_unlock_irq(&rbio->bio_list_lock);
- }
+ csum_expected = rbio->csum_buf +
+ (stripe_nr * rbio->stripe_nsectors + sector_nr) *
+ fs_info->csum_size;
+ ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
+ csum_buf, csum_expected);
+ return ret;
+}
- index_rbio_pages(rbio);
+/*
+ * Recover a vertical stripe specified by @sector_nr.
+ * @*pointers are the pre-allocated pointers by the caller, so we don't
+ * need to allocate/free the pointers again and again.
+ */
+static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
+ void **pointers, void **unmap_array)
+{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ struct sector_ptr *sector;
+ const u32 sectorsize = fs_info->sectorsize;
+ int found_errors;
+ int faila;
+ int failb;
+ int stripe_nr;
+ int ret = 0;
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
+ /*
+ * Now we just use bitmap to mark the horizontal stripes in
+ * which we have data when doing parity scrub.
+ */
+ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+ !test_bit(sector_nr, &rbio->dbitmap))
+ return 0;
- /*
- * Now we just use bitmap to mark the horizontal stripes in
- * which we have data when doing parity scrub.
- */
- if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(sectornr, &rbio->dbitmap))
- continue;
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
+ &failb);
+ /*
+ * No errors in the veritical stripe, skip it. Can happen for recovery
+ * which only part of a stripe failed csum check.
+ */
+ if (!found_errors)
+ return 0;
+ if (found_errors > rbio->bioc->max_errors)
+ return -EIO;
+
+ /*
+ * Setup our array of pointers with sectors from each stripe
+ *
+ * NOTE: store a duplicate array of pointers to preserve the
+ * pointer order.
+ */
+ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
/*
- * Setup our array of pointers with sectors from each stripe
- *
- * NOTE: store a duplicate array of pointers to preserve the
- * pointer order
+ * If we're rebuilding a read, we have to use pages from the
+ * bio list if possible.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- /*
- * If we're rebuilding a read, we have to use
- * pages from the bio list
- */
- if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
- (stripe == faila || stripe == failb)) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- } else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- }
- ASSERT(sector->page);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
- unmap_array[stripe] = pointers[stripe];
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+ sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
}
+ ASSERT(sector->page);
+ pointers[stripe_nr] = kmap_local_page(sector->page) +
+ sector->pgoff;
+ unmap_array[stripe_nr] = pointers[stripe_nr];
+ }
- /* All raid6 handling here */
- if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
- /* Single failure, rebuild from parity raid5 style */
- if (failb < 0) {
- if (faila == rbio->nr_data) {
- /*
- * Just the P stripe has failed, without
- * a bad data or Q stripe.
- * TODO, we should redo the xor here.
- */
- err = BLK_STS_IOERR;
- goto cleanup;
- }
+ /* All raid6 handling here */
+ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ /* Single failure, rebuild from parity raid5 style */
+ if (failb < 0) {
+ if (faila == rbio->nr_data)
/*
- * a single failure in raid6 is rebuilt
- * in the pstripe code below
+ * Just the P stripe has failed, without
+ * a bad data or Q stripe.
+ * We have nothing to do, just skip the
+ * recovery for this stripe.
*/
- goto pstripe;
- }
-
- /* make sure our ps and qs are in order */
- if (faila > failb)
- swap(faila, failb);
-
- /* if the q stripe is failed, do a pstripe reconstruction
- * from the xors.
- * If both the q stripe and the P stripe are failed, we're
- * here due to a crc mismatch and we can't give them the
- * data they want
+ goto cleanup;
+ /*
+ * a single failure in raid6 is rebuilt
+ * in the pstripe code below
*/
- if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->bioc->raid_map[faila] ==
- RAID5_P_STRIPE) {
- err = BLK_STS_IOERR;
- goto cleanup;
- }
+ goto pstripe;
+ }
+
+ /*
+ * If the q stripe is failed, do a pstripe reconstruction from
+ * the xors.
+ * If both the q stripe and the P stripe are failed, we're
+ * here due to a crc mismatch and we can't give them the
+ * data they want.
+ */
+ if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bioc->raid_map[faila] ==
+ RAID5_P_STRIPE)
/*
- * otherwise we have one bad data stripe and
- * a good P stripe. raid5!
+ * Only P and Q are corrupted.
+ * We only care about data stripes recovery,
+ * can skip this vertical stripe.
*/
- goto pstripe;
- }
+ goto cleanup;
+ /*
+ * Otherwise we have one bad data stripe and
+ * a good P stripe. raid5!
+ */
+ goto pstripe;
+ }
- if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
- raid6_datap_recov(rbio->real_stripes,
- sectorsize, faila, pointers);
- } else {
- raid6_2data_recov(rbio->real_stripes,
- sectorsize, faila, failb,
- pointers);
- }
+ if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
+ raid6_datap_recov(rbio->real_stripes, sectorsize,
+ faila, pointers);
} else {
- void *p;
+ raid6_2data_recov(rbio->real_stripes, sectorsize,
+ faila, failb, pointers);
+ }
+ } else {
+ void *p;
- /* rebuild from P stripe here (raid5 or raid6) */
- BUG_ON(failb != -1);
+ /* Rebuild from P stripe here (raid5 or raid6). */
+ ASSERT(failb == -1);
pstripe:
- /* Copy parity block into failed block to start with */
- memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
+ /* Copy parity block into failed block to start with */
+ memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
- /* rearrange the pointer array */
- p = pointers[faila];
- for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
- pointers[stripe] = pointers[stripe + 1];
- pointers[rbio->nr_data - 1] = p;
+ /* Rearrange the pointer array */
+ p = pointers[faila];
+ for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
+ stripe_nr++)
+ pointers[stripe_nr] = pointers[stripe_nr + 1];
+ pointers[rbio->nr_data - 1] = p;
- /* xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, sectorsize);
- }
- /* if we're doing this rebuild as part of an rmw, go through
- * and set all of our private rbio pages in the
- * failed stripes as uptodate. This way finish_rmw will
- * know they can be trusted. If this was a read reconstruction,
- * other endio functions will fiddle the uptodate bits
- */
- if (rbio->operation == BTRFS_RBIO_WRITE) {
- for (i = 0; i < rbio->stripe_nsectors; i++) {
- if (faila != -1) {
- sector = rbio_stripe_sector(rbio, faila, i);
- sector->uptodate = 1;
- }
- if (failb != -1) {
- sector = rbio_stripe_sector(rbio, failb, i);
- sector->uptodate = 1;
- }
- }
- }
- for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
- kunmap_local(unmap_array[stripe]);
+ /* Xor in the rest */
+ run_xor(pointers, rbio->nr_data - 1, sectorsize);
+
+ }
+
+ /*
+ * No matter if this is a RMW or recovery, we should have all
+ * failed sectors repaired in the vertical stripe, thus they are now
+ * uptodate.
+ * Especially if we determine to cache the rbio, we need to
+ * have at least all data sectors uptodate.
+ *
+ * If possible, also check if the repaired sector matches its data
+ * checksum.
+ */
+ if (faila >= 0) {
+ ret = verify_one_sector(rbio, faila, sector_nr);
+ if (ret < 0)
+ goto cleanup;
+
+ sector = rbio_stripe_sector(rbio, faila, sector_nr);
+ sector->uptodate = 1;
+ }
+ if (failb >= 0) {
+ ret = verify_one_sector(rbio, faila, sector_nr);
+ if (ret < 0)
+ goto cleanup;
+
+ sector = rbio_stripe_sector(rbio, failb, sector_nr);
+ sector->uptodate = 1;
}
- err = BLK_STS_OK;
cleanup:
- kfree(unmap_array);
-cleanup_pointers:
- kfree(pointers);
+ for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
+ kunmap_local(unmap_array[stripe_nr]);
+ return ret;
+}
+
+static int recover_sectors(struct btrfs_raid_bio *rbio)
+{
+ void **pointers = NULL;
+ void **unmap_array = NULL;
+ int sectornr;
+ int ret = 0;
-cleanup_io:
/*
- * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
- * valid rbio which is consistent with ondisk content, thus such a
- * valid rbio can be cached to avoid further disk reads.
+ * @pointers array stores the pointer for each sector.
+ *
+ * @unmap_array stores copy of pointers that does not get reordered
+ * during reconstruction so that kunmap_local works.
*/
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers || !unmap_array) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
- /*
- * - In case of two failures, where rbio->failb != -1:
- *
- * Do not cache this rbio since the above read reconstruction
- * (raid6_datap_recov() or raid6_2data_recov()) may have
- * changed some content of stripes which are not identical to
- * on-disk content any more, otherwise, a later write/recover
- * may steal stripe_pages from this rbio and end up with
- * corruptions or rebuild failures.
- *
- * - In case of single failure, where rbio->failb == -1:
- *
- * Cache this rbio iff the above read reconstruction is
- * executed without problems.
- */
- if (err == BLK_STS_OK && rbio->failb < 0)
- cache_rbio_pages(rbio);
- else
- clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+ }
- rbio_orig_end_io(rbio, err);
- } else if (err == BLK_STS_OK) {
- rbio->faila = -1;
- rbio->failb = -1;
+ index_rbio_pages(rbio);
- if (rbio->operation == BTRFS_RBIO_WRITE)
- finish_rmw(rbio);
- else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
- finish_parity_scrub(rbio, 0);
- else
- BUG();
- } else {
- rbio_orig_end_io(rbio, err);
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
+ if (ret < 0)
+ break;
}
-}
-
-/*
- * This is called only for stripes we've read from disk to reconstruct the
- * parity.
- */
-static void raid_recover_end_io_work(struct work_struct *work)
-{
- struct btrfs_raid_bio *rbio =
- container_of(work, struct btrfs_raid_bio, end_io_work);
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
- else
- __raid_recover_end_io(rbio);
+out:
+ kfree(pointers);
+ kfree(unmap_array);
+ return ret;
}
-/*
- * reads everything we need off the disk to reconstruct
- * the parity. endio handlers trigger final reconstruction
- * when the IO is done.
- *
- * This is used both for reads from the higher layers and for
- * parity construction required to finish a rmw cycle.
- */
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
{
- int bios_to_read = 0;
- struct bio_list bio_list;
- int ret;
- int total_sector_nr;
struct bio *bio;
+ int total_sector_nr;
+ int ret = 0;
- bio_list_init(&bio_list);
-
- ret = alloc_rbio_pages(rbio);
- if (ret)
- goto cleanup;
-
- atomic_set(&rbio->error, 0);
-
+ ASSERT(bio_list_size(bio_list) == 0);
/*
* Read everything that hasn't failed. However this time we will
* not trust any cached sector.
@@ -2127,64 +1963,139 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int sectornr = total_sector_nr % rbio->stripe_nsectors;
struct sector_ptr *sector;
- if (rbio->faila == stripe || rbio->failb == stripe) {
- atomic_inc(&rbio->error);
- /* Skip the current stripe. */
- ASSERT(sectornr == 0);
- total_sector_nr += rbio->stripe_nsectors - 1;
+ /*
+ * Skip the range which has error. It can be a range which is
+ * marked error (for csum mismatch), or it can be a missing
+ * device.
+ */
+ if (!rbio->bioc->stripes[stripe].dev->bdev ||
+ test_bit(total_sector_nr, rbio->error_bitmap)) {
+ /*
+ * Also set the error bit for missing device, which
+ * may not yet have its error bit set.
+ */
+ set_bit(total_sector_nr, rbio->error_bitmap);
continue;
}
+
sector = rbio_stripe_sector(rbio, stripe, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_READ);
if (ret < 0)
- goto cleanup;
+ goto error;
}
+ return 0;
+error:
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);
- bios_to_read = bio_list_size(&bio_list);
- if (!bios_to_read) {
- /*
- * we might have no bios to read just because the pages
- * were up to date, or we might have no bios to read because
- * the devices were gone.
- */
- if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
- __raid_recover_end_io(rbio);
- return 0;
- } else {
- goto cleanup;
- }
- }
+ return -EIO;
+}
+
+static int recover_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ struct bio *bio;
+ int ret;
/*
- * The bioc may be freed once we submit the last bio. Make sure not to
- * touch it after that.
+ * Either we're doing recover for a read failure or degraded write,
+ * caller should have set error bitmap correctly.
*/
- atomic_set(&rbio->stripes_pending, bios_to_read);
- INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_bio_end_io;
+ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
+ bio_list_init(&bio_list);
- if (trace_raid56_scrub_read_recover_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
+ /* For recovery, we need to read all sectors including P/Q. */
+ ret = alloc_rbio_pages(rbio);
+ if (ret < 0)
+ goto out;
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
- }
- submit_bio(bio);
- }
+ index_rbio_pages(rbio);
- return 0;
+ ret = recover_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto out;
-cleanup:
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ ret = recover_sectors(rbio);
+out:
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
- return -EIO;
+ return ret;
+}
+
+static void recover_rbio_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0) {
+ ret = recover_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+ }
+}
+
+static void recover_rbio_work_locked(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+
+ ret = recover_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+}
+
+static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
+{
+ bool found = false;
+ int sector_nr;
+
+ /*
+ * This is for RAID6 extra recovery tries, thus mirror number should
+ * be large than 2.
+ * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
+ * RAID5 methods.
+ */
+ ASSERT(mirror_num > 2);
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+ int found_errors;
+ int faila;
+ int failb;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ &faila, &failb);
+ /* This vertical stripe doesn't have errors. */
+ if (!found_errors)
+ continue;
+
+ /*
+ * If we found errors, there should be only one error marked
+ * by previous set_rbio_range_error().
+ */
+ ASSERT(found_errors == 1);
+ found = true;
+
+ /* Now select another stripe to mark as error. */
+ failb = rbio->real_stripes - (mirror_num - 1);
+ if (failb <= faila)
+ failb--;
+
+ /* Set the extra bit in error bitmap. */
+ if (failb >= 0)
+ set_bit(failb * rbio->stripe_nsectors + sector_nr,
+ rbio->error_bitmap);
+ }
+
+ /* We should found at least one vertical stripe with error.*/
+ ASSERT(found);
}
/*
@@ -2202,68 +2113,284 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
- goto out_end_bio;
+ bio_endio(bio);
+ return;
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
rbio_add_bio(rbio, bio);
- rbio->faila = find_logical_bio_stripe(rbio, bio);
- if (rbio->faila == -1) {
- btrfs_warn(fs_info,
-"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
- __func__, bio->bi_iter.bi_sector << 9,
- (u64)bio->bi_iter.bi_size, bioc->map_type);
- __free_raid_bio(rbio);
- bio->bi_status = BLK_STS_IOERR;
- goto out_end_bio;
- }
+ set_rbio_range_error(rbio, bio);
/*
* Loop retry:
* for 'mirror == 2', reconstruct from all other stripes.
* for 'mirror_num > 2', select a stripe to fail on every retry.
*/
- if (mirror_num > 2) {
+ if (mirror_num > 2)
+ set_rbio_raid6_extra_error(rbio, mirror_num);
+
+ start_async_work(rbio, recover_rbio_work);
+}
+
+static void fill_data_csums(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
+ rbio->bioc->raid_map[0]);
+ const u64 start = rbio->bioc->raid_map[0];
+ const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
+ fs_info->sectorsize_bits;
+ int ret;
+
+ /* The rbio should not have its csum buffer initialized. */
+ ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
+
+ /*
+ * Skip the csum search if:
+ *
+ * - The rbio doesn't belong to data block groups
+ * Then we are doing IO for tree blocks, no need to search csums.
+ *
+ * - The rbio belongs to mixed block groups
+ * This is to avoid deadlock, as we're already holding the full
+ * stripe lock, if we trigger a metadata read, and it needs to do
+ * raid56 recovery, we will deadlock.
+ */
+ if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
+ rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
+ return;
+
+ rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
+ fs_info->csum_size, GFP_NOFS);
+ rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
+ GFP_NOFS);
+ if (!rbio->csum_buf || !rbio->csum_bitmap) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
+ rbio->csum_buf, rbio->csum_bitmap);
+ if (ret < 0)
+ goto error;
+ if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
+ goto no_csum;
+ return;
+
+error:
+ /*
+ * We failed to allocate memory or grab the csum, but it's not fatal,
+ * we can still continue. But better to warn users that RMW is no
+ * longer safe for this particular sub-stripe write.
+ */
+ btrfs_warn_rl(fs_info,
+"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
+ rbio->bioc->raid_map[0], ret);
+no_csum:
+ kfree(rbio->csum_buf);
+ bitmap_free(rbio->csum_bitmap);
+ rbio->csum_buf = NULL;
+ rbio->csum_bitmap = NULL;
+}
+
+static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ struct bio *bio;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ /*
+ * Fill the data csums we need for data verification. We need to fill
+ * the csum_bitmap/csum_buf first, as our endio function will try to
+ * verify the data sectors.
+ */
+ fill_data_csums(rbio);
+
+ ret = rmw_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto out;
+
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /*
+ * We may or may not have any corrupted sectors (including missing dev
+ * and csum mismatch), just let recover_sectors() to handle them all.
+ */
+ ret = recover_sectors(rbio);
+ return ret;
+out:
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+
+ return ret;
+}
+
+static void raid_wait_write_end_io(struct bio *bio)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+ blk_status_t err = bio->bi_status;
+
+ if (err)
+ rbio_update_error_bitmap(rbio, bio);
+ bio_put(bio);
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ wake_up(&rbio->io_wait);
+}
+
+static void submit_write_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
+
+ atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
+ while ((bio = bio_list_pop(bio_list))) {
+ bio->bi_end_io = raid_wait_write_end_io;
+
+ if (trace_raid56_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_write_stripe(rbio, bio, &trace_info);
+ }
+ submit_bio(bio);
+ }
+}
+
+/*
+ * To determine if we need to read any sector from the disk.
+ * Should only be utilized in RMW path, to skip cached rbio.
+ */
+static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
+ struct sector_ptr *sector = &rbio->stripe_sectors[i];
+
/*
- * 'mirror == 3' is to fail the p stripe and
- * reconstruct from the q stripe. 'mirror > 3' is to
- * fail a data stripe and reconstruct from p+q stripe.
+ * We have a sector which doesn't have page nor uptodate,
+ * thus this rbio can not be cached one, as cached one must
+ * have all its data sectors present and uptodate.
*/
- rbio->failb = rbio->real_stripes - (mirror_num - 1);
- ASSERT(rbio->failb > 0);
- if (rbio->failb <= rbio->faila)
- rbio->failb--;
+ if (!sector->page || !sector->uptodate)
+ return true;
}
+ return false;
+}
- if (lock_stripe_add(rbio))
- return;
+static int rmw_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ int sectornr;
+ int ret = 0;
/*
- * This adds our rbio to the list of rbios that will be handled after
- * the current lock owner is done.
+ * Allocate the pages for parity first, as P/Q pages will always be
+ * needed for both full-stripe and sub-stripe writes.
*/
- __raid56_parity_recover(rbio);
- return;
+ ret = alloc_rbio_parity_pages(rbio);
+ if (ret < 0)
+ return ret;
-out_end_bio:
- bio_endio(bio);
+ /*
+ * Either full stripe write, or we have every data sector already
+ * cached, can go to write path immediately.
+ */
+ if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio))
+ goto write;
+
+ /*
+ * Now we're doing sub-stripe write, also need all data stripes to do
+ * the full RMW.
+ */
+ ret = alloc_rbio_data_pages(rbio);
+ if (ret < 0)
+ return ret;
+
+ index_rbio_pages(rbio);
+
+ ret = rmw_read_wait_recover(rbio);
+ if (ret < 0)
+ return ret;
+
+write:
+ /*
+ * At this stage we're not allowed to add any new bios to the
+ * bio list any more, anyone else that wants to change this stripe
+ * needs to do their own rmw.
+ */
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
+
+ index_rbio_pages(rbio);
+
+ /*
+ * We don't cache full rbios because we're assuming
+ * the higher layers are unlikely to use this area of
+ * the disk again soon. If they do use it again,
+ * hopefully they will send another full bio.
+ */
+ if (!rbio_is_full(rbio))
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
+ generate_pq_vertical(rbio, sectornr);
+
+ bio_list_init(&bio_list);
+ ret = rmw_assemble_write_bios(rbio, &bio_list);
+ if (ret < 0)
+ return ret;
+
+ /* We should have at least one bio assembled. */
+ ASSERT(bio_list_size(&bio_list));
+ submit_write_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /* We may have more errors than our tolerance during the read. */
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ break;
+ }
+ }
+ return ret;
}
-static void rmw_work(struct work_struct *work)
+static void rmw_rbio_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
+ int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
- raid56_rmw_stripe(rbio);
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0) {
+ ret = rmw_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+ }
}
-static void read_rebuild_work(struct work_struct *work)
+static void rmw_rbio_work_locked(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
+ int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
- __raid56_parity_recover(rbio);
+
+ ret = rmw_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
/*
@@ -2358,8 +2485,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
return 0;
}
-static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
- int need_check)
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
{
struct btrfs_io_context *bioc = rbio->bioc;
const u32 sectorsize = bioc->fs_info->sectorsize;
@@ -2402,7 +2528,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
p_sector.page = alloc_page(GFP_NOFS);
if (!p_sector.page)
- goto cleanup;
+ return -ENOMEM;
p_sector.pgoff = 0;
p_sector.uptodate = 1;
@@ -2412,14 +2538,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (!q_sector.page) {
__free_page(p_sector.page);
p_sector.page = NULL;
- goto cleanup;
+ return -ENOMEM;
}
q_sector.pgoff = 0;
q_sector.uptodate = 1;
pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
}
- atomic_set(&rbio->error, 0);
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
/* Map the parity stripe just once */
pointers[nr_data] = kmap_local_page(p_sector.page);
@@ -2499,33 +2625,13 @@ writeback:
}
submit_write:
- nr_data = bio_list_size(&bio_list);
- if (!nr_data) {
- /* Every parity is right */
- rbio_orig_end_io(rbio, BLK_STS_OK);
- return;
- }
-
- atomic_set(&rbio->stripes_pending, nr_data);
-
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_write_end_io;
-
- if (trace_raid56_scrub_write_stripe_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
-
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
- }
- submit_bio(bio);
- }
- return;
+ submit_write_bios(rbio, &bio_list);
+ return 0;
cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
+ return ret;
}
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@ -2535,102 +2641,99 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
return 0;
}
-/*
- * While we're doing the parity check and repair, we could have errors
- * in reading pages off the disk. This checks for errors and if we're
- * not able to read the page it'll trigger parity reconstruction. The
- * parity scrub will be finished after we've reconstructed the failed
- * stripes
- */
-static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- goto cleanup;
+ void **pointers = NULL;
+ void **unmap_array = NULL;
+ int sector_nr;
+ int ret;
+
+ /*
+ * @pointers array stores the pointer for each sector.
+ *
+ * @unmap_array stores copy of pointers that does not get reordered
+ * during reconstruction so that kunmap_local works.
+ */
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers || !unmap_array) {
+ ret = -ENOMEM;
+ goto out;
+ }
- if (rbio->faila >= 0 || rbio->failb >= 0) {
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
int dfail = 0, failp = -1;
+ int faila;
+ int failb;
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ &faila, &failb);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ goto out;
+ }
+ if (found_errors == 0)
+ continue;
- if (is_data_stripe(rbio, rbio->faila))
- dfail++;
- else if (is_parity_stripe(rbio->faila))
- failp = rbio->faila;
+ /* We should have at least one error here. */
+ ASSERT(faila >= 0 || failb >= 0);
- if (is_data_stripe(rbio, rbio->failb))
+ if (is_data_stripe(rbio, faila))
dfail++;
- else if (is_parity_stripe(rbio->failb))
- failp = rbio->failb;
+ else if (is_parity_stripe(faila))
+ failp = faila;
+ if (is_data_stripe(rbio, failb))
+ dfail++;
+ else if (is_parity_stripe(failb))
+ failp = failb;
/*
- * Because we can not use a scrubbing parity to repair
- * the data, so the capability of the repair is declined.
- * (In the case of RAID5, we can not repair anything)
+ * Because we can not use a scrubbing parity to repair the
+ * data, so the capability of the repair is declined. (In the
+ * case of RAID5, we can not repair anything.)
*/
- if (dfail > rbio->bioc->max_errors - 1)
- goto cleanup;
-
+ if (dfail > rbio->bioc->max_errors - 1) {
+ ret = -EIO;
+ goto out;
+ }
/*
- * If all data is good, only parity is correctly, just
- * repair the parity.
+ * If all data is good, only parity is correctly, just repair
+ * the parity, no need to recover data stripes.
*/
- if (dfail == 0) {
- finish_parity_scrub(rbio, 0);
- return;
- }
+ if (dfail == 0)
+ continue;
/*
* Here means we got one corrupted data stripe and one
- * corrupted parity on RAID6, if the corrupted parity
- * is scrubbing parity, luckily, use the other one to repair
- * the data, or we can not repair the data stripe.
+ * corrupted parity on RAID6, if the corrupted parity is
+ * scrubbing parity, luckily, use the other one to repair the
+ * data, or we can not repair the data stripe.
*/
- if (failp != rbio->scrubp)
- goto cleanup;
+ if (failp != rbio->scrubp) {
+ ret = -EIO;
+ goto out;
+ }
- __raid_recover_end_io(rbio);
- } else {
- finish_parity_scrub(rbio, 1);
+ ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
+ if (ret < 0)
+ goto out;
}
- return;
-
-cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-}
-
-/*
- * end io for the read phase of the rmw cycle. All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid56_parity_scrub_end_io_work(struct work_struct *work)
-{
- struct btrfs_raid_bio *rbio =
- container_of(work, struct btrfs_raid_bio, end_io_work);
-
- /*
- * This will normally call finish_rmw to start our write, but if there
- * are any failed stripes we'll reconstruct from parity first
- */
- validate_rbio_for_parity_scrub(rbio);
+out:
+ kfree(pointers);
+ kfree(unmap_array);
+ return ret;
}
-static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
{
- int bios_to_read = 0;
- struct bio_list bio_list;
- int ret;
- int total_sector_nr;
struct bio *bio;
+ int total_sector_nr;
+ int ret = 0;
- bio_list_init(&bio_list);
-
- ret = alloc_rbio_essential_pages(rbio);
- if (ret)
- goto cleanup;
+ ASSERT(bio_list_size(bio_list) == 0);
- atomic_set(&rbio->error, 0);
/* Build a list of bios to read all the missing parts. */
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
@@ -2659,67 +2762,84 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
if (sector->uptodate)
continue;
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_READ);
if (ret)
- goto cleanup;
+ goto error;
}
+ return 0;
+error:
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);
+ return ret;
+}
- bios_to_read = bio_list_size(&bio_list);
- if (!bios_to_read) {
- /*
- * this can happen if others have merged with
- * us, it means there is nothing left to read.
- * But if there are missing devices it may not be
- * safe to do the full stripe write yet.
- */
- goto finish;
- }
+static int scrub_rbio(struct btrfs_raid_bio *rbio)
+{
+ bool need_check = false;
+ struct bio_list bio_list;
+ int sector_nr;
+ int ret;
+ struct bio *bio;
- /*
- * The bioc may be freed once we submit the last bio. Make sure not to
- * touch it after that.
- */
- atomic_set(&rbio->stripes_pending, bios_to_read);
- INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_bio_end_io;
+ bio_list_init(&bio_list);
- if (trace_raid56_scrub_read_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
+ ret = alloc_rbio_essential_pages(rbio);
+ if (ret)
+ goto cleanup;
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_read(rbio, bio, &trace_info);
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
+
+ ret = scrub_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto cleanup;
+
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /* We may have some failures, recover the failed sectors first. */
+ ret = recover_scrub_rbio(rbio);
+ if (ret < 0)
+ goto cleanup;
+
+ /*
+ * We have every sector properly prepared. Can finish the scrub
+ * and writeback the good content.
+ */
+ ret = finish_parity_scrub(rbio, need_check);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ break;
}
- submit_bio(bio);
}
- /* the actual write will happen once the reads are done */
- return;
+ return ret;
cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
- return;
-
-finish:
- validate_rbio_for_parity_scrub(rbio);
+ return ret;
}
-static void scrub_parity_work(struct work_struct *work)
+static void scrub_rbio_work_locked(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
+ int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
- raid56_parity_scrub_stripe(rbio);
+ ret = scrub_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
if (!lock_stripe_add(rbio))
- start_async_work(rbio, scrub_parity_work);
+ start_async_work(rbio, scrub_rbio_work_locked);
}
/* The following code is used for dev replace of a missing RAID 5/6 device. */
@@ -2742,20 +2862,12 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
*/
ASSERT(!bio->bi_iter.bi_size);
- rbio->faila = find_logical_bio_stripe(rbio, bio);
- if (rbio->faila == -1) {
- btrfs_warn_rl(fs_info,
- "can not determine the failed stripe number for full stripe %llu",
- bioc->raid_map[0]);
- __free_raid_bio(rbio);
- return NULL;
- }
+ set_rbio_range_error(rbio, bio);
return rbio;
}
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
- if (!lock_stripe_add(rbio))
- start_async_work(rbio, read_rebuild_work);
+ start_async_work(rbio, recover_rbio_work);
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 91d5c0adad15..7c73a443939e 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -74,12 +74,6 @@ struct btrfs_raid_bio {
/* How many sectors there are for each stripe */
u8 stripe_nsectors;
- /* First bad stripe, -1 means no corruption */
- s8 faila;
-
- /* Second bad stripe (for RAID6 use) */
- s8 failb;
-
/* Stripe number that we're scrubbing */
u8 scrubp;
@@ -93,9 +87,7 @@ struct btrfs_raid_bio {
atomic_t stripes_pending;
- atomic_t error;
-
- struct work_struct end_io_work;
+ wait_queue_head_t io_wait;
/* Bitmap to record which horizontal stripe has data */
unsigned long dbitmap;
@@ -126,6 +118,29 @@ struct btrfs_raid_bio {
/* Allocated with real_stripes-many pointers for finish_*() calls */
void **finish_pointers;
+
+ /*
+ * The bitmap recording where IO errors happened.
+ * Each bit is corresponding to one sector in either bio_sectors[] or
+ * stripe_sectors[] array.
+ *
+ * The reason we don't use another bit in sector_ptr is, we have two
+ * arrays of sectors, and a lot of IO can use sectors in both arrays.
+ * Thus making it much harder to iterate.
+ */
+ unsigned long *error_bitmap;
+
+ /*
+ * Checksum buffer if the rbio is for data. The buffer should cover
+ * all data sectors (exlcuding P/Q sectors).
+ */
+ u8 *csum_buf;
+
+ /*
+ * Each bit represents if the corresponding sector has data csum found.
+ * Should only cover data sectors (excluding P/Q sectors).
+ */
+ unsigned long *csum_bitmap;
};
/*
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index 5c1a617eb25d..5c2b66d155ef 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
(len * sizeof(char)), mask);
if (!ret)
return ret;
- strncpy(ret->str, src, len);
+ /* Warn if the source got unexpectedly truncated. */
+ if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
+ kfree(ret);
+ return NULL;
+ }
return ret;
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index a248f46cfe72..95d28497de7c 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -5,11 +5,14 @@
#include <linux/sched.h>
#include <linux/stacktrace.h>
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
#include "delayed-ref.h"
#include "ref-verify.h"
+#include "fs.h"
+#include "accessors.h"
/*
* Used to keep track the roots and number of refs each root has for a given
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f50586ff85c8..0474bbe39da7 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -2,13 +2,19 @@
#include <linux/blkdev.h>
#include <linux/iversion.h>
-#include "compression.h"
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
+#include "compression.h"
#include "delalloc-space.h"
#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
#include "subpage.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "file.h"
+#include "super.h"
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
@@ -318,16 +324,16 @@ copy_to_page:
goto out;
}
-/**
- * btrfs_clone() - clone a range from inode file to another
+/*
+ * Clone a range from inode file to another.
*
- * @src: Inode to clone from
- * @inode: Inode to clone to
- * @off: Offset within source to start clone from
- * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen
- * @destoff: Offset within @inode to start clone
- * @no_time_update: Whether to update mtime/ctime on the target inode
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen
+ * @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
@@ -887,7 +893,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
return -EINVAL;
if (same_inode) {
- btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
} else {
lock_two_nondirectories(src_inode, dst_inode);
btrfs_double_mmap_lock(src_inode, dst_inode);
@@ -905,7 +911,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
out_unlock:
if (same_inode) {
- btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
} else {
btrfs_double_mmap_unlock(src_inode, dst_inode);
unlock_two_nondirectories(src_inode, dst_inode);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 666a37a0ee89..31ec4a7658ce 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -27,6 +27,15 @@
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
+#include "space-info.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "file-item.h"
+#include "relocation.h"
+#include "super.h"
+#include "tree-checker.h"
/*
* Relocation overview
@@ -470,7 +479,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
int ret;
int err = 0;
- iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
+ iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
if (!iter)
return ERR_PTR(-ENOMEM);
path = btrfs_alloc_path();
@@ -1109,10 +1118,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
inode = find_next_inode(root, key.objectid);
first = 0;
} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
inode = find_next_inode(root, key.objectid);
}
if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
+ struct extent_state *cached_state = NULL;
+
end = key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
WARN_ON(!IS_ALIGNED(key.offset,
@@ -1120,14 +1131,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
end--;
ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end);
+ key.offset, end,
+ &cached_state);
if (!ret)
continue;
btrfs_drop_extent_map_range(BTRFS_I(inode),
key.offset, end, true);
unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end, NULL);
+ key.offset, end, &cached_state);
}
}
@@ -1170,7 +1182,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (dirty)
btrfs_mark_buffer_dirty(leaf);
if (inode)
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
return ret;
}
@@ -1516,6 +1528,8 @@ static int invalidate_extent_cache(struct btrfs_root *root,
objectid = min_key->objectid;
while (1) {
+ struct extent_state *cached_state = NULL;
+
cond_resched();
iput(inode);
@@ -1566,9 +1580,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for read_folio to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
}
return 0;
}
@@ -2597,10 +2611,14 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
static int get_tree_block_key(struct btrfs_fs_info *fs_info,
struct tree_block *block)
{
+ struct btrfs_tree_parent_check check = {
+ .level = block->level,
+ .owner_root = block->owner,
+ .transid = block->key.offset
+ };
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, block->bytenr, block->owner,
- block->key.offset, block->level, NULL);
+ eb = read_tree_block(fs_info, block->bytenr, &check);
if (IS_ERR(eb))
return PTR_ERR(eb);
if (!extent_buffer_uptodate(eb)) {
@@ -2861,25 +2879,27 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
return ret;
- btrfs_inode_lock(&inode->vfs_inode, 0);
+ btrfs_inode_lock(inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
+ struct extent_state *cached_state = NULL;
+
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
end = cluster->boundary[nr + 1] - 1 - offset;
else
end = cluster->end - offset;
- lock_extent(&inode->io_tree, start, end, NULL);
+ lock_extent(&inode->io_tree, start, end, &cached_state);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&inode->io_tree, start, end, NULL);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
if (ret)
break;
}
- btrfs_inode_unlock(&inode->vfs_inode, 0);
+ btrfs_inode_unlock(inode, 0);
if (cur_offset < prealloc_end)
btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
@@ -2891,6 +2911,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
u64 start, u64 end, u64 block_start)
{
struct extent_map *em;
+ struct extent_state *cached_state = NULL;
int ret = 0;
em = alloc_extent_map();
@@ -2903,9 +2924,9 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->block_start = block_start;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
free_extent_map(em);
return ret;
@@ -2983,6 +3004,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
*/
cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
while (cur <= page_end) {
+ struct extent_state *cached_state = NULL;
u64 extent_start = cluster->boundary[*cluster_nr] - offset;
u64 extent_end = get_cluster_boundary_end(cluster,
*cluster_nr) - offset;
@@ -2998,13 +3020,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
goto release_page;
/* Mark the range delalloc and dirty for later writeback */
- lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
+ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
+ &cached_state);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
- clamped_end, 0, NULL);
+ clamped_end, 0, &cached_state);
if (ret) {
- clear_extent_bits(&BTRFS_I(inode)->io_tree,
- clamped_start, clamped_end,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ clamped_start, clamped_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY,
+ &cached_state);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
clamped_len, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
@@ -3031,7 +3055,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
boundary_start, boundary_end,
EXTENT_BOUNDARY);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
+ unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
+ &cached_state);
btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
cur += clamped_len;
@@ -3388,24 +3413,28 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_path *path,
struct rb_root *blocks)
{
- struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct ulist *leaves = NULL;
+ struct btrfs_backref_walk_ctx ctx = { 0 };
struct ulist_iterator leaf_uiter;
struct ulist_node *ref_node = NULL;
- const u32 blocksize = fs_info->nodesize;
+ const u32 blocksize = rc->extent_root->fs_info->nodesize;
int ret = 0;
btrfs_release_path(path);
- ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
- 0, &leaves, NULL, true);
+
+ ctx.bytenr = extent_key->objectid;
+ ctx.ignore_extent_item_pos = true;
+ ctx.fs_info = rc->extent_root->fs_info;
+
+ ret = btrfs_find_all_leafs(&ctx);
if (ret < 0)
return ret;
ULIST_ITER_INIT(&leaf_uiter);
- while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
+ while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) {
+ struct btrfs_tree_parent_check check = { 0 };
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
+ eb = read_tree_block(ctx.fs_info, ref_node->val, &check);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
@@ -3421,7 +3450,7 @@ int add_data_references(struct reloc_control *rc,
}
if (ret < 0)
free_block_list(blocks);
- ulist_free(leaves);
+ ulist_free(ctx.refs);
return ret;
}
@@ -3905,8 +3934,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(fs_info, &rc->processed_blocks,
- IO_TREE_RELOC_BLOCKS, NULL);
+ extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
return rc;
}
@@ -4330,8 +4358,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + inode->index_cnt;
csum_root = btrfs_csum_root(fs_info, disk_bytenr);
- ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list, 0, false);
+ ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
+ disk_bytenr + len - 1, &list, 0, false);
if (ret)
goto out;
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
new file mode 100644
index 000000000000..2041a86186de
--- /dev/null
+++ b/fs/btrfs/relocation.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_RELOCATION_H
+#define BTRFS_RELOCATION_H
+
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve);
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
+int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
+struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
+
+#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index e1f599d7a916..859874579456 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -6,11 +6,16 @@
#include <linux/err.h>
#include <linux/uuid.h>
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "qgroup.h"
#include "space-info.h"
+#include "accessors.h"
+#include "root-tree.h"
+#include "orphan.h"
/*
* Read a root item from the tree. In case we detect a root item smaller then
@@ -327,9 +332,8 @@ out:
}
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
- u64 ref_id, u64 dirid, u64 *sequence, const char *name,
- int name_len)
-
+ u64 ref_id, u64 dirid, u64 *sequence,
+ const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_path *path;
@@ -356,8 +360,8 @@ again:
struct btrfs_root_ref);
ptr = (unsigned long)(ref + 1);
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
- (btrfs_root_ref_name_len(leaf, ref) != name_len) ||
- memcmp_extent_buffer(leaf, name, ptr, name_len)) {
+ (btrfs_root_ref_name_len(leaf, ref) != name->len) ||
+ memcmp_extent_buffer(leaf, name->name, ptr, name->len)) {
ret = -ENOENT;
goto out;
}
@@ -400,8 +404,8 @@ out:
* Will return 0, -ENOMEM, or anything from the CoW path
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
- u64 ref_id, u64 dirid, u64 sequence, const char *name,
- int name_len)
+ u64 ref_id, u64 dirid, u64 sequence,
+ const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_key key;
@@ -420,7 +424,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
- sizeof(*ref) + name_len);
+ sizeof(*ref) + name->len);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
@@ -431,9 +435,9 @@ again:
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
btrfs_set_root_ref_dirid(leaf, ref, dirid);
btrfs_set_root_ref_sequence(leaf, ref, sequence);
- btrfs_set_root_ref_name_len(leaf, ref, name_len);
+ btrfs_set_root_ref_name_len(leaf, ref, name->len);
ptr = (unsigned long)(ref + 1);
- write_extent_buffer(leaf, name, ptr, name_len);
+ write_extent_buffer(leaf, name->name, ptr, name->len);
btrfs_mark_buffer_dirty(leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
new file mode 100644
index 000000000000..cbbaca32126e
--- /dev/null
+++ b/fs/btrfs/root-tree.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ROOT_TREE_H
+#define BTRFS_ROOT_TREE_H
+
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int nitems, bool use_global_rsv);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+ u64 ref_id, u64 dirid, u64 sequence,
+ const struct fscrypt_str *name);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+ u64 ref_id, u64 dirid, u64 *sequence,
+ const struct fscrypt_str *name);
+int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key);
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_root_item *item);
+int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_root_item *item);
+int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key);
+int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info);
+void btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node);
+void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+
+#endif
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f260c53829e5..52b346795f66 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,13 @@
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
-#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "scrub.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -56,6 +59,17 @@ struct scrub_ctx;
#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
+/*
+ * Maximum number of mirrors that can be available for all profiles counting
+ * the target device of dev-replace as one. During an active device replace
+ * procedure, the target device of the copy operation is a mirror for the
+ * filesystem data as well that can be used to read data in order to repair
+ * read errors on other disks.
+ *
+ * Current value is derived from RAID1C4 with 4 copies.
+ */
+#define BTRFS_MAX_MIRRORS (4 + 1)
+
struct scrub_recover {
refcount_t refs;
struct btrfs_io_context *bioc;
@@ -284,7 +298,7 @@ static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
* Will also allocate new pages for @sblock if needed.
*/
static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
- u64 logical, gfp_t gfp)
+ u64 logical)
{
const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
struct scrub_sector *ssector;
@@ -292,7 +306,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
/* We must never have scrub_block exceed U32_MAX in size. */
ASSERT(logical - sblock->logical < U32_MAX);
- ssector = kzalloc(sizeof(*ssector), gfp);
+ ssector = kzalloc(sizeof(*ssector), GFP_KERNEL);
if (!ssector)
return NULL;
@@ -300,7 +314,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
if (!sblock->pages[page_index]) {
int ret;
- sblock->pages[page_index] = alloc_page(gfp);
+ sblock->pages[page_index] = alloc_page(GFP_KERNEL);
if (!sblock->pages[page_index]) {
kfree(ssector);
return NULL;
@@ -794,8 +808,8 @@ nomem:
return ERR_PTR(-ENOMEM);
}
-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
- void *warn_ctx)
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
+ u64 root, void *warn_ctx)
{
u32 nlink;
int ret;
@@ -862,7 +876,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
swarn->errstr, swarn->logical,
- rcu_str_deref(swarn->dev->name),
+ btrfs_dev_name(swarn->dev),
swarn->physical,
root, inum, offset,
fs_info->sectorsize, nlink,
@@ -876,7 +890,7 @@ err:
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
- rcu_str_deref(swarn->dev->name),
+ btrfs_dev_name(swarn->dev),
swarn->physical,
root, inum, offset, ret);
@@ -894,7 +908,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
struct btrfs_extent_item *ei;
struct scrub_warning swarn;
unsigned long ptr = 0;
- u64 extent_item_pos;
u64 flags = 0;
u64 ref_root;
u32 item_size;
@@ -908,8 +921,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
/* Super block error, no need to search extent tree. */
if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
- errstr, rcu_str_deref(dev->name),
- sblock->physical);
+ errstr, btrfs_dev_name(dev), sblock->physical);
return;
}
path = btrfs_alloc_path();
@@ -926,7 +938,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
if (ret < 0)
goto out;
- extent_item_pos = swarn.logical - found_key.objectid;
swarn.extent_item_size = found_key.offset;
eb = path->nodes[0];
@@ -941,7 +952,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical,
- rcu_str_deref(dev->name),
+ btrfs_dev_name(dev),
swarn.physical,
ref_level ? "node" : "leaf",
ret < 0 ? -1 : ref_level,
@@ -949,12 +960,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
} while (ret != 1);
btrfs_release_path(path);
} else {
+ struct btrfs_backref_walk_ctx ctx = { 0 };
+
btrfs_release_path(path);
+
+ ctx.bytenr = found_key.objectid;
+ ctx.extent_item_pos = swarn.logical - found_key.objectid;
+ ctx.fs_info = fs_info;
+
swarn.path = path;
swarn.dev = dev;
- iterate_extent_inodes(fs_info, found_key.objectid,
- extent_item_pos, 1,
- scrub_print_warning_inode, &swarn, false);
+
+ iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
}
out:
@@ -1358,7 +1375,7 @@ corrected_error:
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"fixed up error at logical %llu on dev %s",
- logical, rcu_str_deref(dev->name));
+ logical, btrfs_dev_name(dev));
}
} else {
did_not_correct_error:
@@ -1367,7 +1384,7 @@ did_not_correct_error:
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"unable to fixup (regular) error at logical %llu on dev %s",
- logical, rcu_str_deref(dev->name));
+ logical, btrfs_dev_name(dev));
}
out:
@@ -1480,7 +1497,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
return -EIO;
}
- recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+ recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL);
if (!recover) {
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
@@ -1503,7 +1520,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
sblock = sblocks_for_recheck[mirror_index];
sblock->sctx = sctx;
- sector = alloc_scrub_sector(sblock, logical, GFP_NOFS);
+ sector = alloc_scrub_sector(sblock, logical);
if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2313,14 +2330,14 @@ static void scrub_missing_raid56_worker(struct work_struct *work)
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"IO error rebuilding logical %llu for dev %s",
- logical, rcu_str_deref(dev->name));
+ logical, btrfs_dev_name(dev));
} else if (sblock->header_error || sblock->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"failed to rebuild valid logical %llu for dev %s",
- logical, rcu_str_deref(dev->name));
+ logical, btrfs_dev_name(dev));
} else {
scrub_write_block_to_dev_replace(sblock);
}
@@ -2425,7 +2442,7 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
*/
u32 l = min(sectorsize, len);
- sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
+ sector = alloc_scrub_sector(sblock, logical);
if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2672,17 +2689,11 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
- /*
- * Block size determines how many scrub_block will be allocated. Here
- * we use BTRFS_STRIPE_LEN (64KiB) as default limit, so we won't
- * allocate too many scrub_block, while still won't cause too large
- * bios for large extents.
- */
if (flags & BTRFS_EXTENT_FLAG_DATA) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
blocksize = map->stripe_len;
else
- blocksize = BTRFS_STRIPE_LEN;
+ blocksize = sctx->fs_info->sectorsize;
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
@@ -2762,7 +2773,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity,
for (index = 0; len > 0; index++) {
struct scrub_sector *sector;
- sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
+ sector = alloc_scrub_sector(sblock, logical);
if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -3227,9 +3238,9 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
extent_dev = bioc->stripes[0].dev;
btrfs_put_bioc(bioc);
- ret = btrfs_lookup_csums_range(csum_root, extent_start,
- extent_start + extent_size - 1,
- &sctx->csum_list, 1, false);
+ ret = btrfs_lookup_csums_list(csum_root, extent_start,
+ extent_start + extent_size - 1,
+ &sctx->csum_list, 1, false);
if (ret) {
scrub_parity_mark_sectors_error(sparity, extent_start,
extent_size);
@@ -3453,7 +3464,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
cur_logical;
if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = btrfs_lookup_csums_range(csum_root, cur_logical,
+ ret = btrfs_lookup_csums_list(csum_root, cur_logical,
cur_logical + scrub_len - 1,
&sctx->csum_list, 1, false);
if (ret)
@@ -3917,7 +3928,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
- spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
goto skip;
}
@@ -4291,7 +4301,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_err_in_rcu(fs_info,
"scrub on devid %llu: filesystem on %s is not writable",
- devid, rcu_str_deref(dev->name));
+ devid, btrfs_dev_name(dev));
ret = -EROFS;
goto out;
}
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h
new file mode 100644
index 000000000000..7639103ebf9d
--- /dev/null
+++ b/fs/btrfs/scrub.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SCRUB_H
+#define BTRFS_SCRUB_H
+
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace);
+void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
+void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
+int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
+ struct btrfs_scrub_progress *progress);
+
+#endif
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 145c84b44fd0..67f7c698ade3 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -27,6 +27,11 @@
#include "compression.h"
#include "xattr.h"
#include "print-tree.h"
+#include "accessors.h"
+#include "dir-item.h"
+#include "file-item.h"
+#include "ioctl.h"
+#include "verity.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@@ -34,7 +39,7 @@
* avoid hitting limitations of the backreference walking code (taking a lot of
* time and using too much memory for extents with large number of references).
*/
-#define SEND_MAX_EXTENT_REFS 64
+#define SEND_MAX_EXTENT_REFS 1024
/*
* A fs_path is a helper to dynamically build path names with unknown size.
@@ -71,13 +76,46 @@ struct clone_root {
struct btrfs_root *root;
u64 ino;
u64 offset;
-
- u64 found_refs;
+ u64 num_bytes;
+ bool found_ref;
};
#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
+/*
+ * Limit the root_ids array of struct backref_cache_entry to 12 elements.
+ * This makes the size of a cache entry to be exactly 128 bytes on x86_64.
+ * The most common case is to have a single root for cloning, which corresponds
+ * to the send root. Having the user specify more than 11 clone roots is not
+ * common, and in such rare cases we simply don't use caching if the number of
+ * cloning roots that lead down to a leaf is more than 12.
+ */
+#define SEND_MAX_BACKREF_CACHE_ROOTS 12
+
+/*
+ * Max number of entries in the cache.
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding
+ * maple tree's internal nodes, is 16K.
+ */
+#define SEND_MAX_BACKREF_CACHE_SIZE 128
+
+/*
+ * A backref cache entry maps a leaf to a list of IDs of roots from which the
+ * leaf is accessible and we can use for clone operations.
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on
+ * x86_64).
+ */
+struct backref_cache_entry {
+ /* List to link to the cache's lru list. */
+ struct list_head list;
+ /* The key for this entry in the cache. */
+ u64 key;
+ u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
+ /* Number of valid elements in the root_ids array. */
+ int num_roots;
+};
+
struct send_ctx {
struct file *send_filp;
loff_t send_off;
@@ -246,6 +284,14 @@ struct send_ctx {
struct rb_root rbtree_new_refs;
struct rb_root rbtree_deleted_refs;
+
+ struct {
+ u64 last_reloc_trans;
+ struct list_head lru_list;
+ struct maple_tree entries;
+ /* Number of entries stored in the cache. */
+ int size;
+ } backref_cache;
};
struct pending_dir_move {
@@ -1093,7 +1139,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
data_len = btrfs_dir_data_len(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
+ if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) {
if (name_len > XATTR_NAME_MAX) {
ret = -ENAMETOOLONG;
goto out;
@@ -1236,8 +1282,12 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
- /* Just to check for bugs in backref resolving */
- int found_itself;
+ /* The bytenr the file extent item we are processing refers to. */
+ u64 bytenr;
+ /* The owner (root id) of the data backref for the current extent. */
+ u64 backref_owner;
+ /* The offset of the data backref for the current extent. */
+ u64 backref_offset;
};
static int __clone_root_cmp_bsearch(const void *key, const void *elt)
@@ -1266,32 +1316,33 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
/*
* Called for every backref that is found for the current extent.
- * Results are collected in sctx->clone_roots->ino/offset/found_refs
+ * Results are collected in sctx->clone_roots->ino/offset.
*/
-static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
+static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
+ void *ctx_)
{
struct backref_ctx *bctx = ctx_;
- struct clone_root *found;
+ struct clone_root *clone_root;
/* First check if the root is in the list of accepted clone sources */
- found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
- bctx->sctx->clone_roots_cnt,
- sizeof(struct clone_root),
- __clone_root_cmp_bsearch);
- if (!found)
+ clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots,
+ bctx->sctx->clone_roots_cnt,
+ sizeof(struct clone_root),
+ __clone_root_cmp_bsearch);
+ if (!clone_root)
return 0;
- if (found->root == bctx->sctx->send_root &&
+ /* This is our own reference, bail out as we can't clone from it. */
+ if (clone_root->root == bctx->sctx->send_root &&
ino == bctx->cur_objectid &&
- offset == bctx->cur_offset) {
- bctx->found_itself = 1;
- }
+ offset == bctx->cur_offset)
+ return 0;
/*
* Make sure we don't consider clones from send_root that are
* behind the current inode/offset.
*/
- if (found->root == bctx->sctx->send_root) {
+ if (clone_root->root == bctx->sctx->send_root) {
/*
* If the source inode was not yet processed we can't issue a
* clone operation, as the source extent does not exist yet at
@@ -1312,21 +1363,217 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
}
bctx->found++;
- found->found_refs++;
- if (ino < found->ino) {
- found->ino = ino;
- found->offset = offset;
- } else if (found->ino == ino) {
+ clone_root->found_ref = true;
+
+ /*
+ * If the given backref refers to a file extent item with a larger
+ * number of bytes than what we found before, use the new one so that
+ * we clone more optimally and end up doing less writes and getting
+ * less exclusive, non-shared extents at the destination.
+ */
+ if (num_bytes > clone_root->num_bytes) {
+ clone_root->ino = ino;
+ clone_root->offset = offset;
+ clone_root->num_bytes = num_bytes;
+
/*
- * same extent found more then once in the same file.
+ * Found a perfect candidate, so there's no need to continue
+ * backref walking.
*/
- if (found->offset > offset + bctx->extent_len)
- found->offset = offset;
+ if (num_bytes >= bctx->extent_len)
+ return BTRFS_ITERATE_EXTENT_INODES_STOP;
}
return 0;
}
+static void empty_backref_cache(struct send_ctx *sctx)
+{
+ struct backref_cache_entry *entry;
+ struct backref_cache_entry *tmp;
+
+ list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list)
+ kfree(entry);
+
+ INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
+ mtree_destroy(&sctx->backref_cache.entries);
+ sctx->backref_cache.size = 0;
+}
+
+static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
+ const u64 **root_ids_ret, int *root_count_ret)
+{
+ struct backref_ctx *bctx = ctx;
+ struct send_ctx *sctx = bctx->sctx;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
+ struct backref_cache_entry *entry;
+
+ if (sctx->backref_cache.size == 0)
+ return false;
+
+ /*
+ * If relocation happened since we first filled the cache, then we must
+ * empty the cache and can not use it, because even though we operate on
+ * read-only roots, their leaves and nodes may have been reallocated and
+ * now be used for different nodes/leaves of the same tree or some other
+ * tree.
+ *
+ * We are called from iterate_extent_inodes() while either holding a
+ * transaction handle or holding fs_info->commit_root_sem, so no need
+ * to take any lock here.
+ */
+ if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) {
+ empty_backref_cache(sctx);
+ return false;
+ }
+
+ entry = mtree_load(&sctx->backref_cache.entries, key);
+ if (!entry)
+ return false;
+
+ *root_ids_ret = entry->root_ids;
+ *root_count_ret = entry->num_roots;
+ list_move_tail(&entry->list, &sctx->backref_cache.lru_list);
+
+ return true;
+}
+
+static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
+ void *ctx)
+{
+ struct backref_ctx *bctx = ctx;
+ struct send_ctx *sctx = bctx->sctx;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ struct backref_cache_entry *new_entry;
+ struct ulist_iterator uiter;
+ struct ulist_node *node;
+ int ret;
+
+ /*
+ * We're called while holding a transaction handle or while holding
+ * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
+ * NOFS allocation.
+ */
+ new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS);
+ /* No worries, cache is optional. */
+ if (!new_entry)
+ return;
+
+ new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->num_roots = 0;
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(root_ids, &uiter)) != NULL) {
+ const u64 root_id = node->val;
+ struct clone_root *root;
+
+ root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots,
+ sctx->clone_roots_cnt, sizeof(struct clone_root),
+ __clone_root_cmp_bsearch);
+ if (!root)
+ continue;
+
+ /* Too many roots, just exit, no worries as caching is optional. */
+ if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
+ kfree(new_entry);
+ return;
+ }
+
+ new_entry->root_ids[new_entry->num_roots] = root_id;
+ new_entry->num_roots++;
+ }
+
+ /*
+ * We may have not added any roots to the new cache entry, which means
+ * none of the roots is part of the list of roots from which we are
+ * allowed to clone. Cache the new entry as it's still useful to avoid
+ * backref walking to determine which roots have a path to the leaf.
+ */
+
+ if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) {
+ struct backref_cache_entry *lru_entry;
+ struct backref_cache_entry *mt_entry;
+
+ lru_entry = list_first_entry(&sctx->backref_cache.lru_list,
+ struct backref_cache_entry, list);
+ mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key);
+ ASSERT(mt_entry == lru_entry);
+ list_del(&mt_entry->list);
+ kfree(mt_entry);
+ sctx->backref_cache.size--;
+ }
+
+ ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key,
+ new_entry, GFP_NOFS);
+ ASSERT(ret == 0 || ret == -ENOMEM);
+ if (ret) {
+ /* Caching is optional, no worries. */
+ kfree(new_entry);
+ return;
+ }
+
+ list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list);
+
+ /*
+ * We are called from iterate_extent_inodes() while either holding a
+ * transaction handle or holding fs_info->commit_root_sem, so no need
+ * to take any lock here.
+ */
+ if (sctx->backref_cache.size == 0)
+ sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans;
+
+ sctx->backref_cache.size++;
+}
+
+static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
+ const struct extent_buffer *leaf, void *ctx)
+{
+ const u64 refs = btrfs_extent_refs(leaf, ei);
+ const struct backref_ctx *bctx = ctx;
+ const struct send_ctx *sctx = bctx->sctx;
+
+ if (bytenr == bctx->bytenr) {
+ const u64 flags = btrfs_extent_flags(leaf, ei);
+
+ if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+ return -EUCLEAN;
+
+ /*
+ * If we have only one reference and only the send root as a
+ * clone source - meaning no clone roots were given in the
+ * struct btrfs_ioctl_send_args passed to the send ioctl - then
+ * it's our reference and there's no point in doing backref
+ * walking which is expensive, so exit early.
+ */
+ if (refs == 1 && sctx->clone_roots_cnt == 1)
+ return -ENOENT;
+ }
+
+ /*
+ * Backreference walking (iterate_extent_inodes() below) is currently
+ * too expensive when an extent has a large number of references, both
+ * in time spent and used memory. So for now just fallback to write
+ * operations instead of clone operations when an extent has more than
+ * a certain amount of references.
+ */
+ if (refs > SEND_MAX_EXTENT_REFS)
+ return -ENOENT;
+
+ return 0;
+}
+
+static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
+{
+ const struct backref_ctx *bctx = ctx;
+
+ if (ino == bctx->cur_objectid &&
+ root == bctx->backref_owner &&
+ offset == bctx->backref_offset)
+ return true;
+
+ return false;
+}
+
/*
* Given an inode, offset and extent item, it finds a good clone for a clone
* instruction. Returns -ENOENT when none could be found. The function makes
@@ -1348,79 +1595,36 @@ static int find_extent_clone(struct send_ctx *sctx,
u64 logical;
u64 disk_byte;
u64 num_bytes;
- u64 extent_item_pos;
- u64 flags = 0;
struct btrfs_file_extent_item *fi;
struct extent_buffer *eb = path->nodes[0];
- struct backref_ctx backref_ctx = {0};
+ struct backref_ctx backref_ctx = { 0 };
+ struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 };
struct clone_root *cur_clone_root;
- struct btrfs_key found_key;
- struct btrfs_path *tmp_path;
- struct btrfs_extent_item *ei;
int compressed;
u32 i;
- tmp_path = alloc_path_for_send();
- if (!tmp_path)
- return -ENOMEM;
+ /*
+ * With fallocate we can get prealloc extents beyond the inode's i_size,
+ * so we don't do anything here because clone operations can not clone
+ * to a range beyond i_size without increasing the i_size of the
+ * destination inode.
+ */
+ if (data_offset >= ino_size)
+ return 0;
- /* We only use this path under the commit sem */
- tmp_path->need_commit_sem = 0;
+ fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ return -ENOENT;
- if (data_offset >= ino_size) {
- /*
- * There may be extents that lie behind the file's size.
- * I at least had this in combination with snapshotting while
- * writing large files.
- */
- ret = 0;
- goto out;
- }
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte == 0)
+ return -ENOENT;
- fi = btrfs_item_ptr(eb, path->slots[0],
- struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(eb, fi);
- if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- ret = -ENOENT;
- goto out;
- }
compressed = btrfs_file_extent_compression(eb, fi);
-
num_bytes = btrfs_file_extent_num_bytes(eb, fi);
- disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte == 0) {
- ret = -ENOENT;
- goto out;
- }
logical = disk_byte + btrfs_file_extent_offset(eb, fi);
- down_read(&fs_info->commit_root_sem);
- ret = extent_from_logical(fs_info, disk_byte, tmp_path,
- &found_key, &flags);
- up_read(&fs_info->commit_root_sem);
-
- if (ret < 0)
- goto out;
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- ret = -EIO;
- goto out;
- }
-
- ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
- struct btrfs_extent_item);
- /*
- * Backreference walking (iterate_extent_inodes() below) is currently
- * too expensive when an extent has a large number of references, both
- * in time spent and used memory. So for now just fallback to write
- * operations instead of clone operations when an extent has more than
- * a certain amount of references.
- */
- if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
- ret = -ENOENT;
- goto out;
- }
- btrfs_release_path(tmp_path);
-
/*
* Setup the clone roots.
*/
@@ -1428,37 +1632,59 @@ static int find_extent_clone(struct send_ctx *sctx,
cur_clone_root = sctx->clone_roots + i;
cur_clone_root->ino = (u64)-1;
cur_clone_root->offset = 0;
- cur_clone_root->found_refs = 0;
+ cur_clone_root->num_bytes = 0;
+ cur_clone_root->found_ref = false;
}
backref_ctx.sctx = sctx;
- backref_ctx.found = 0;
backref_ctx.cur_objectid = ino;
backref_ctx.cur_offset = data_offset;
- backref_ctx.found_itself = 0;
- backref_ctx.extent_len = num_bytes;
+ backref_ctx.bytenr = disk_byte;
+ /*
+ * Use the header owner and not the send root's id, because in case of a
+ * snapshot we can have shared subtrees.
+ */
+ backref_ctx.backref_owner = btrfs_header_owner(eb);
+ backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi);
/*
* The last extent of a file may be too large due to page alignment.
* We need to adjust extent_len in this case so that the checks in
- * __iterate_backrefs work.
+ * iterate_backrefs() work.
*/
if (data_offset + num_bytes >= ino_size)
backref_ctx.extent_len = ino_size - data_offset;
+ else
+ backref_ctx.extent_len = num_bytes;
/*
* Now collect all backrefs.
*/
+ backref_walk_ctx.bytenr = disk_byte;
if (compressed == BTRFS_COMPRESS_NONE)
- extent_item_pos = logical - found_key.objectid;
- else
- extent_item_pos = 0;
- ret = iterate_extent_inodes(fs_info, found_key.objectid,
- extent_item_pos, 1, __iterate_backrefs,
- &backref_ctx, false);
+ backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi);
+ backref_walk_ctx.fs_info = fs_info;
+ backref_walk_ctx.cache_lookup = lookup_backref_cache;
+ backref_walk_ctx.cache_store = store_backref_cache;
+ backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
+ backref_walk_ctx.check_extent_item = check_extent_item;
+ backref_walk_ctx.user_ctx = &backref_ctx;
+
+ /*
+ * If have a single clone root, then it's the send root and we can tell
+ * the backref walking code to skip our own backref and not resolve it,
+ * since we can not use it for cloning - the source and destination
+ * ranges can't overlap and in case the leaf is shared through a subtree
+ * due to snapshots, we can't use those other roots since they are not
+ * in the list of clone roots.
+ */
+ if (sctx->clone_roots_cnt == 1)
+ backref_walk_ctx.skip_data_ref = skip_self_data_ref;
+ ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs,
+ &backref_ctx);
if (ret < 0)
- goto out;
+ return ret;
down_read(&fs_info->commit_root_sem);
if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
@@ -1475,37 +1701,42 @@ static int find_extent_clone(struct send_ctx *sctx,
* was already reallocated after the relocation.
*/
up_read(&fs_info->commit_root_sem);
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
up_read(&fs_info->commit_root_sem);
- if (!backref_ctx.found_itself) {
- /* found a bug in backref code? */
- ret = -EIO;
- btrfs_err(fs_info,
- "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
- ino, data_offset, disk_byte, found_key.objectid);
- goto out;
- }
-
btrfs_debug(fs_info,
"find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
data_offset, ino, num_bytes, logical);
- if (!backref_ctx.found)
+ if (!backref_ctx.found) {
btrfs_debug(fs_info, "no clones found");
+ return -ENOENT;
+ }
cur_clone_root = NULL;
for (i = 0; i < sctx->clone_roots_cnt; i++) {
- if (sctx->clone_roots[i].found_refs) {
- if (!cur_clone_root)
- cur_clone_root = sctx->clone_roots + i;
- else if (sctx->clone_roots[i].root == sctx->send_root)
- /* prefer clones from send_root over others */
- cur_clone_root = sctx->clone_roots + i;
- }
+ struct clone_root *clone_root = &sctx->clone_roots[i];
+ if (!clone_root->found_ref)
+ continue;
+
+ /*
+ * Choose the root from which we can clone more bytes, to
+ * minimize write operations and therefore have more extent
+ * sharing at the destination (the same as in the source).
+ */
+ if (!cur_clone_root ||
+ clone_root->num_bytes > cur_clone_root->num_bytes) {
+ cur_clone_root = clone_root;
+
+ /*
+ * We found an optimal clone candidate (any inode from
+ * any root is fine), so we're done.
+ */
+ if (clone_root->num_bytes >= backref_ctx.extent_len)
+ break;
+ }
}
if (cur_clone_root) {
@@ -1515,8 +1746,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = -ENOENT;
}
-out:
- btrfs_free_path(tmp_path);
return ret;
}
@@ -1596,13 +1825,17 @@ static int gen_unique_name(struct send_ctx *sctx,
return -ENOMEM;
while (1) {
+ struct fscrypt_str tmp_name;
+
len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
ino, gen, idx);
ASSERT(len < sizeof(tmp));
+ tmp_name.name = tmp;
+ tmp_name.len = strlen(tmp);
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
- tmp, strlen(tmp), 0);
+ &tmp_name, 0);
btrfs_release_path(path);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -1622,7 +1855,7 @@ static int gen_unique_name(struct send_ctx *sctx,
di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
path, BTRFS_FIRST_FREE_OBJECTID,
- tmp, strlen(tmp), 0);
+ &tmp_name, 0);
btrfs_release_path(path);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -1752,13 +1985,13 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_path *path;
+ struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
- di = btrfs_lookup_dir_item(NULL, root, path,
- dir, name, name_len, 0);
+ di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
@@ -5702,6 +5935,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
u64 ext_len;
u64 clone_len;
u64 clone_data_offset;
+ bool crossed_src_i_size = false;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
@@ -5759,8 +5993,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
if (key.offset >= clone_src_i_size)
break;
- if (key.offset + ext_len > clone_src_i_size)
+ if (key.offset + ext_len > clone_src_i_size) {
ext_len = clone_src_i_size - key.offset;
+ crossed_src_i_size = true;
+ }
clone_data_offset = btrfs_file_extent_offset(leaf, ei);
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
@@ -5821,6 +6057,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_clone(sctx, offset, clone_len,
clone_root);
}
+ } else if (crossed_src_i_size && clone_len < len) {
+ /*
+ * If we are at i_size of the clone source inode and we
+ * can not clone from it, terminate the loop. This is
+ * to avoid sending two write operations, one with a
+ * length matching clone_len and the final one after
+ * this loop with a length of len - clone_len.
+ *
+ * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+ * was passed to the send ioctl), this helps avoid
+ * sending an encoded write for an offset that is not
+ * sector size aligned, in case the i_size of the source
+ * inode is not sector size aligned. That will make the
+ * receiver fallback to decompression of the data and
+ * writing it using regular buffered IO, therefore while
+ * not incorrect, it's not optimal due decompression and
+ * possible re-compression at the receiver.
+ */
+ break;
} else {
ret = send_extent_data(sctx, dst_path, offset,
clone_len);
@@ -7841,6 +8096,9 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
INIT_LIST_HEAD(&sctx->name_cache_list);
+ INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
+ mt_init(&sctx->backref_cache.entries);
+
sctx->flags = arg->flags;
if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
@@ -7879,7 +8137,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
if (sctx->proto >= 2) {
u32 send_buf_num_pages;
- sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
+ sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
sctx->send_buf = vmalloc(sctx->send_max_size);
if (!sctx->send_buf) {
ret = -ENOMEM;
@@ -8103,6 +8361,8 @@ out:
close_current_inode(sctx);
+ empty_backref_cache(sctx);
+
kfree(sctx);
}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index f7585cfa7e52..4f5509cb1803 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -18,10 +18,12 @@
#endif
/*
- * In send stream v1, no command is larger than 64K. In send stream v2, no limit
- * should be assumed.
+ * In send stream v1, no command is larger than 64K. In send stream v2, no
+ * limit should be assumed, the buffer size is set to be a header with
+ * compressed extent size.
*/
#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
+#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE)
struct inode;
struct btrfs_ioctl_send_args;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index f171bf875633..d28ee4e36f3d 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -10,6 +10,9 @@
#include "transaction.h"
#include "block-group.h"
#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
/*
* HOW DOES SPACE RESERVATION WORK
@@ -856,7 +859,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
u64 thresh;
u64 used;
- thresh = div_factor_fine(total, 90);
+ thresh = mult_perc(total, 90);
lockdep_assert_held(&space_info->lock);
@@ -974,7 +977,7 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
return false;
spin_lock(&global_rsv->lock);
- min_bytes = div_factor(global_rsv->size, 1);
+ min_bytes = mult_perc(global_rsv->size, 10);
if (global_rsv->reserved < min_bytes + ticket->bytes) {
spin_unlock(&global_rsv->lock);
return false;
@@ -1490,8 +1493,8 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
}
-/**
- * Do the appropriate flushing and waiting for a ticket
+/*
+ * Do the appropriate flushing and waiting for a ticket.
*
* @fs_info: the filesystem
* @space_info: space info for the reservation
@@ -1583,8 +1586,18 @@ static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
flush == BTRFS_RESERVE_FLUSH_EVICT);
}
-/**
- * Try to reserve bytes from the block_rsv's space
+/*
+ * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to
+ * fail as quickly as possible.
+ */
+static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
+{
+ return (flush != BTRFS_RESERVE_NO_FLUSH &&
+ flush != BTRFS_RESERVE_FLUSH_EMERGENCY);
+}
+
+/*
+ * Try to reserve bytes from the block_rsv's space.
*
* @fs_info: the filesystem
* @space_info: space info we want to allocate from
@@ -1645,13 +1658,28 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
}
/*
+ * Things are dire, we need to make a reservation so we don't abort. We
+ * will let this reservation go through as long as we have actual space
+ * left to allocate for the block.
+ */
+ if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
+ used = btrfs_space_info_used(space_info, false);
+ if (used + orig_bytes <=
+ writable_total_bytes(fs_info, space_info)) {
+ btrfs_space_info_update_bytes_may_use(fs_info, space_info,
+ orig_bytes);
+ ret = 0;
+ }
+ }
+
+ /*
* If we couldn't make a reservation then setup our reservation ticket
* and kick the async worker if it's not already running.
*
* If we are a priority flusher then we just need to add our ticket to
* the list and we will do our own flushing further down.
*/
- if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ if (ret && can_ticket(flush)) {
ticket.bytes = orig_bytes;
ticket.error = 0;
space_info->reclaim_size += ticket.bytes;
@@ -1701,15 +1729,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
}
}
spin_unlock(&space_info->lock);
- if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
+ if (!ret || !can_ticket(flush))
return ret;
return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
orig_bytes, flush);
}
-/**
- * Trye to reserve metadata bytes from the block_rsv's space
+/*
+ * Try to reserve metadata bytes from the block_rsv's space.
*
* @fs_info: the filesystem
* @block_rsv: block_rsv we're allocating for
@@ -1743,8 +1771,8 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
return ret;
}
-/**
- * Try to reserve data bytes for an allocation
+/*
+ * Try to reserve data bytes for an allocation.
*
* @fs_info: the filesystem
* @bytes: number of bytes we need
@@ -1787,3 +1815,37 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
}
dump_global_block_rsv(fs_info);
}
+
+/*
+ * Account the unused space of all the readonly block group in the space_info.
+ * takes mirrors into account.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+ struct btrfs_block_group *block_group;
+ u64 free_bytes = 0;
+ int factor;
+
+ /* It's df, we don't care if it's racy */
+ if (list_empty(&sinfo->ro_bgs))
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
+ spin_lock(&block_group->lock);
+
+ if (!block_group->ro) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ factor = btrfs_bg_type_to_factor(block_group->flags);
+ free_bytes += (block_group->length -
+ block_group->used) * factor;
+
+ spin_unlock(&block_group->lock);
+ }
+ spin_unlock(&sinfo->lock);
+
+ return free_bytes;
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index ce66023a9eb8..fc99ea2b0c34 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -5,6 +5,83 @@
#include "volumes.h"
+/*
+ * Different levels for to flush space when doing space reservations.
+ *
+ * The higher the level, the more methods we try to reclaim space.
+ */
+enum btrfs_reserve_flush_enum {
+ /* If we are in the transaction, we can't flush anything.*/
+ BTRFS_RESERVE_NO_FLUSH,
+
+ /*
+ * Flush space by:
+ * - Running delayed inode items
+ * - Allocating a new chunk
+ */
+ BTRFS_RESERVE_FLUSH_LIMIT,
+
+ /*
+ * Flush space by:
+ * - Running delayed inode items
+ * - Running delayed refs
+ * - Running delalloc and waiting for ordered extents
+ * - Allocating a new chunk
+ */
+ BTRFS_RESERVE_FLUSH_EVICT,
+
+ /*
+ * Flush space by above mentioned methods and by:
+ * - Running delayed iputs
+ * - Committing transaction
+ *
+ * Can be interrupted by a fatal signal.
+ */
+ BTRFS_RESERVE_FLUSH_DATA,
+ BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
+ BTRFS_RESERVE_FLUSH_ALL,
+
+ /*
+ * Pretty much the same as FLUSH_ALL, but can also steal space from
+ * global rsv.
+ *
+ * Can be interrupted by a fatal signal.
+ */
+ BTRFS_RESERVE_FLUSH_ALL_STEAL,
+
+ /*
+ * This is for btrfs_use_block_rsv only. We have exhausted our block
+ * rsv and our global block rsv. This can happen for things like
+ * delalloc where we are overwriting a lot of extents with a single
+ * extent and didn't reserve enough space. Alternatively it can happen
+ * with delalloc where we reserve 1 extents worth for a large extent but
+ * fragmentation leads to multiple extents being created. This will
+ * give us the reservation in the case of
+ *
+ * if (num_bytes < (space_info->total_bytes -
+ * btrfs_space_info_used(space_info, false))
+ *
+ * Which ignores bytes_may_use. This is potentially dangerous, but our
+ * reservation system is generally pessimistic so is able to absorb this
+ * style of mistake.
+ */
+ BTRFS_RESERVE_FLUSH_EMERGENCY,
+};
+
+enum btrfs_flush_state {
+ FLUSH_DELAYED_ITEMS_NR = 1,
+ FLUSH_DELAYED_ITEMS = 2,
+ FLUSH_DELAYED_REFS_NR = 3,
+ FLUSH_DELAYED_REFS = 4,
+ FLUSH_DELALLOC = 5,
+ FLUSH_DELALLOC_WAIT = 6,
+ FLUSH_DELALLOC_FULL = 7,
+ ALLOC_CHUNK = 8,
+ ALLOC_CHUNK_FORCE = 9,
+ RUN_DELAYED_IPUTS = 10,
+ COMMIT_TRANS = 11,
+};
+
struct btrfs_space_info {
spinlock_t lock;
@@ -159,5 +236,6 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 9a176af847d7..dd46b978ac2c 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
+#include "messages.h"
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5942b9384088..93f52ee85f6f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,6 +26,7 @@
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
+#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
@@ -34,7 +35,7 @@
#include "print-tree.h"
#include "props.h"
#include "xattr.h"
-#include "volumes.h"
+#include "bio.h"
#include "export.h"
#include "compression.h"
#include "rcu-string.h"
@@ -49,6 +50,14 @@
#include "discard.h"
#include "qgroup.h"
#include "raid56.h"
+#include "fs.h"
+#include "accessors.h"
+#include "defrag.h"
+#include "dir-item.h"
+#include "ioctl.h"
+#include "scrub.h"
+#include "verity.h"
+#include "super.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -67,328 +76,6 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
-#ifdef CONFIG_PRINTK
-
-#define STATE_STRING_PREFACE ": state "
-#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
-
-/*
- * Characters to print to indicate error conditions or uncommon filesystem state.
- * RO is not an error.
- */
-static const char fs_state_chars[] = {
- [BTRFS_FS_STATE_ERROR] = 'E',
- [BTRFS_FS_STATE_REMOUNTING] = 'M',
- [BTRFS_FS_STATE_RO] = 0,
- [BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
- [BTRFS_FS_STATE_DEV_REPLACING] = 'R',
- [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
- [BTRFS_FS_STATE_NO_CSUMS] = 'C',
- [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
-};
-
-static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
-{
- unsigned int bit;
- bool states_printed = false;
- unsigned long fs_state = READ_ONCE(info->fs_state);
- char *curr = buf;
-
- memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
- curr += sizeof(STATE_STRING_PREFACE) - 1;
-
- for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
- WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
- if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
- *curr++ = fs_state_chars[bit];
- states_printed = true;
- }
- }
-
- /* If no states were printed, reset the buffer */
- if (!states_printed)
- curr = buf;
-
- *curr++ = 0;
-}
-#endif
-
-/*
- * Generally the error codes correspond to their respective errors, but there
- * are a few special cases.
- *
- * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
- * instance will return EUCLEAN if any of the blocks are corrupted in
- * a way that is problematic. We want to reserve EUCLEAN for these
- * sort of corruptions.
- *
- * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
- * need to use EROFS for this case. We will have no idea of the
- * original failure, that will have been reported at the time we tripped
- * over the error. Each subsequent error that doesn't have any context
- * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
- */
-const char * __attribute_const__ btrfs_decode_error(int errno)
-{
- char *errstr = "unknown";
-
- switch (errno) {
- case -ENOENT: /* -2 */
- errstr = "No such entry";
- break;
- case -EIO: /* -5 */
- errstr = "IO failure";
- break;
- case -ENOMEM: /* -12*/
- errstr = "Out of memory";
- break;
- case -EEXIST: /* -17 */
- errstr = "Object already exists";
- break;
- case -ENOSPC: /* -28 */
- errstr = "No space left";
- break;
- case -EROFS: /* -30 */
- errstr = "Readonly filesystem";
- break;
- case -EOPNOTSUPP: /* -95 */
- errstr = "Operation not supported";
- break;
- case -EUCLEAN: /* -117 */
- errstr = "Filesystem corrupted";
- break;
- case -EDQUOT: /* -122 */
- errstr = "Quota exceeded";
- break;
- }
-
- return errstr;
-}
-
-/*
- * __btrfs_handle_fs_error decodes expected errors from the caller and
- * invokes the appropriate error response.
- */
-__cold
-void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
-{
- struct super_block *sb = fs_info->sb;
-#ifdef CONFIG_PRINTK
- char statestr[STATE_STRING_BUF_LEN];
- const char *errstr;
-#endif
-
- /*
- * Special case: if the error is EROFS, and we're already
- * under SB_RDONLY, then it is safe here.
- */
- if (errno == -EROFS && sb_rdonly(sb))
- return;
-
-#ifdef CONFIG_PRINTK
- errstr = btrfs_decode_error(errno);
- btrfs_state_to_string(fs_info, statestr);
- if (fmt) {
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
-
- pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
- sb->s_id, statestr, function, line, errno, errstr, &vaf);
- va_end(args);
- } else {
- pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
- sb->s_id, statestr, function, line, errno, errstr);
- }
-#endif
-
- /*
- * Today we only save the error info to memory. Long term we'll
- * also send it down to the disk
- */
- set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
-
- /* Don't go through full error handling during mount */
- if (!(sb->s_flags & SB_BORN))
- return;
-
- if (sb_rdonly(sb))
- return;
-
- btrfs_discard_stop(fs_info);
-
- /* btrfs handle error by forcing the filesystem readonly */
- btrfs_set_sb_rdonly(sb);
- btrfs_info(fs_info, "forced readonly");
- /*
- * Note that a running device replace operation is not canceled here
- * although there is no way to update the progress. It would add the
- * risk of a deadlock, therefore the canceling is omitted. The only
- * penalty is that some I/O remains active until the procedure
- * completes. The next time when the filesystem is mounted writable
- * again, the device replace operation continues.
- */
-}
-
-#ifdef CONFIG_PRINTK
-static const char * const logtypes[] = {
- "emergency",
- "alert",
- "critical",
- "error",
- "warning",
- "notice",
- "info",
- "debug",
-};
-
-
-/*
- * Use one ratelimit state per log level so that a flood of less important
- * messages doesn't cause more important ones to be dropped.
- */
-static struct ratelimit_state printk_limits[] = {
- RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
-};
-
-void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
-{
- char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
- struct va_format vaf;
- va_list args;
- int kern_level;
- const char *type = logtypes[4];
- struct ratelimit_state *ratelimit = &printk_limits[4];
-
- va_start(args, fmt);
-
- while ((kern_level = printk_get_level(fmt)) != 0) {
- size_t size = printk_skip_level(fmt) - fmt;
-
- if (kern_level >= '0' && kern_level <= '7') {
- memcpy(lvl, fmt, size);
- lvl[size] = '\0';
- type = logtypes[kern_level - '0'];
- ratelimit = &printk_limits[kern_level - '0'];
- }
- fmt += size;
- }
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (__ratelimit(ratelimit)) {
- if (fs_info) {
- char statestr[STATE_STRING_BUF_LEN];
-
- btrfs_state_to_string(fs_info, statestr);
- _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
- fs_info->sb->s_id, statestr, &vaf);
- } else {
- _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
- }
- }
-
- va_end(args);
-}
-#endif
-
-#if BITS_PER_LONG == 32
-void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
-{
- if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
- btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
- btrfs_warn(fs_info,
-"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
- BTRFS_32BIT_MAX_FILE_SIZE >> 40);
- btrfs_warn(fs_info,
- "please consider upgrading to 64bit kernel/hardware");
- }
-}
-
-void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
-{
- if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
- btrfs_err(fs_info, "reached 32bit limit for logical addresses");
- btrfs_err(fs_info,
-"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
- BTRFS_32BIT_MAX_FILE_SIZE >> 40);
- btrfs_err(fs_info,
- "please consider upgrading to 64bit kernel/hardware");
- }
-}
-#endif
-
-/*
- * We only mark the transaction aborted and then set the file system read-only.
- * This will prevent new transactions from starting or trying to join this
- * one.
- *
- * This means that error recovery at the call site is limited to freeing
- * any local memory allocations and passing the error code up without
- * further cleanup. The transaction should complete as it normally would
- * in the call path but will return -EIO.
- *
- * We'll complete the cleanup in btrfs_end_transaction and
- * btrfs_commit_transaction.
- */
-__cold
-void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
- const char *function,
- unsigned int line, int errno, bool first_hit)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
- WRITE_ONCE(trans->aborted, errno);
- WRITE_ONCE(trans->transaction->aborted, errno);
- if (first_hit && errno == -ENOSPC)
- btrfs_dump_space_info_for_trans_abort(fs_info);
- /* Wake up anybody who may be waiting on this transaction */
- wake_up(&fs_info->transaction_wait);
- wake_up(&fs_info->transaction_blocked_wait);
- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
-}
-/*
- * __btrfs_panic decodes unexpected, fatal errors from the caller,
- * issues an alert, and either panics or BUGs, depending on mount options.
- */
-__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
-{
- char *s_id = "<unknown>";
- const char *errstr;
- struct va_format vaf = { .fmt = fmt };
- va_list args;
-
- if (fs_info)
- s_id = fs_info->sb->s_id;
-
- va_start(args, fmt);
- vaf.va = &args;
-
- errstr = btrfs_decode_error(errno);
- if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
- panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
- s_id, function, line, &vaf, errno, errstr);
-
- btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
- function, line, &vaf, errno, errstr);
- va_end(args);
- /* Caller calls BUG() */
-}
-
static void btrfs_put_super(struct super_block *sb)
{
close_ctree(btrfs_sb(sb));
@@ -918,12 +605,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
ret = -EINVAL;
goto out;
}
+ btrfs_clear_opt(info->mount_opt, NODISCARD);
break;
case Opt_nodiscard:
btrfs_clear_and_info(info, DISCARD_SYNC,
"turning off discard");
btrfs_clear_and_info(info, DISCARD_ASYNC,
"turning off async discard");
+ btrfs_set_opt(info->mount_opt, NODISCARD);
break;
case Opt_space_cache:
case Opt_space_cache_version:
@@ -1394,6 +1083,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
struct btrfs_dir_item *di;
struct btrfs_path *path;
struct btrfs_key location;
+ struct fscrypt_str name = FSTR_INIT("default", 7);
u64 dir_id;
path = btrfs_alloc_path();
@@ -1406,7 +1096,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
* to mount.
*/
dir_id = btrfs_super_root_dir(fs_info->super_copy);
- di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
+ di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
return PTR_ERR(di);
@@ -1507,7 +1197,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
* Exit unless we have some pending changes
* that need to go through commit
*/
- if (fs_info->pending_changes == 0)
+ if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT,
+ &fs_info->flags))
return 0;
/*
* A non-blocking test if the fs is frozen. We must not
@@ -2645,7 +2336,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
* the end of RCU grace period.
*/
rcu_read_lock();
- seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
+ seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\");
rcu_read_unlock();
return 0;
@@ -2694,7 +2385,7 @@ static __cold void btrfs_interface_exit(void)
misc_deregister(&btrfs_misc);
}
-static void __init btrfs_print_mod_info(void)
+static int __init btrfs_print_mod_info(void)
{
static const char options[] = ""
#ifdef CONFIG_BTRFS_DEBUG
@@ -2721,122 +2412,125 @@ static void __init btrfs_print_mod_info(void)
#endif
;
pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
+ return 0;
}
-static int __init init_btrfs_fs(void)
+static int register_btrfs(void)
{
- int err;
-
- btrfs_props_init();
-
- err = btrfs_init_sysfs();
- if (err)
- return err;
-
- btrfs_init_compress();
-
- err = btrfs_init_cachep();
- if (err)
- goto free_compress;
-
- err = extent_state_init_cachep();
- if (err)
- goto free_cachep;
-
- err = extent_buffer_init_cachep();
- if (err)
- goto free_extent_cachep;
-
- err = btrfs_bioset_init();
- if (err)
- goto free_eb_cachep;
-
- err = extent_map_init();
- if (err)
- goto free_bioset;
-
- err = ordered_data_init();
- if (err)
- goto free_extent_map;
-
- err = btrfs_delayed_inode_init();
- if (err)
- goto free_ordered_data;
-
- err = btrfs_auto_defrag_init();
- if (err)
- goto free_delayed_inode;
-
- err = btrfs_delayed_ref_init();
- if (err)
- goto free_auto_defrag;
-
- err = btrfs_prelim_ref_init();
- if (err)
- goto free_delayed_ref;
-
- err = btrfs_interface_init();
- if (err)
- goto free_prelim_ref;
+ return register_filesystem(&btrfs_fs_type);
+}
- btrfs_print_mod_info();
+static void unregister_btrfs(void)
+{
+ unregister_filesystem(&btrfs_fs_type);
+}
- err = btrfs_run_sanity_tests();
- if (err)
- goto unregister_ioctl;
+/* Helper structure for long init/exit functions. */
+struct init_sequence {
+ int (*init_func)(void);
+ /* Can be NULL if the init_func doesn't need cleanup. */
+ void (*exit_func)(void);
+};
- err = register_filesystem(&btrfs_fs_type);
- if (err)
- goto unregister_ioctl;
+static const struct init_sequence mod_init_seq[] = {
+ {
+ .init_func = btrfs_props_init,
+ .exit_func = NULL,
+ }, {
+ .init_func = btrfs_init_sysfs,
+ .exit_func = btrfs_exit_sysfs,
+ }, {
+ .init_func = btrfs_init_compress,
+ .exit_func = btrfs_exit_compress,
+ }, {
+ .init_func = btrfs_init_cachep,
+ .exit_func = btrfs_destroy_cachep,
+ }, {
+ .init_func = btrfs_transaction_init,
+ .exit_func = btrfs_transaction_exit,
+ }, {
+ .init_func = btrfs_ctree_init,
+ .exit_func = btrfs_ctree_exit,
+ }, {
+ .init_func = btrfs_free_space_init,
+ .exit_func = btrfs_free_space_exit,
+ }, {
+ .init_func = extent_state_init_cachep,
+ .exit_func = extent_state_free_cachep,
+ }, {
+ .init_func = extent_buffer_init_cachep,
+ .exit_func = extent_buffer_free_cachep,
+ }, {
+ .init_func = btrfs_bioset_init,
+ .exit_func = btrfs_bioset_exit,
+ }, {
+ .init_func = extent_map_init,
+ .exit_func = extent_map_exit,
+ }, {
+ .init_func = ordered_data_init,
+ .exit_func = ordered_data_exit,
+ }, {
+ .init_func = btrfs_delayed_inode_init,
+ .exit_func = btrfs_delayed_inode_exit,
+ }, {
+ .init_func = btrfs_auto_defrag_init,
+ .exit_func = btrfs_auto_defrag_exit,
+ }, {
+ .init_func = btrfs_delayed_ref_init,
+ .exit_func = btrfs_delayed_ref_exit,
+ }, {
+ .init_func = btrfs_prelim_ref_init,
+ .exit_func = btrfs_prelim_ref_exit,
+ }, {
+ .init_func = btrfs_interface_init,
+ .exit_func = btrfs_interface_exit,
+ }, {
+ .init_func = btrfs_print_mod_info,
+ .exit_func = NULL,
+ }, {
+ .init_func = btrfs_run_sanity_tests,
+ .exit_func = NULL,
+ }, {
+ .init_func = register_btrfs,
+ .exit_func = unregister_btrfs,
+ }
+};
- return 0;
+static bool mod_init_result[ARRAY_SIZE(mod_init_seq)];
-unregister_ioctl:
- btrfs_interface_exit();
-free_prelim_ref:
- btrfs_prelim_ref_exit();
-free_delayed_ref:
- btrfs_delayed_ref_exit();
-free_auto_defrag:
- btrfs_auto_defrag_exit();
-free_delayed_inode:
- btrfs_delayed_inode_exit();
-free_ordered_data:
- ordered_data_exit();
-free_extent_map:
- extent_map_exit();
-free_bioset:
- btrfs_bioset_exit();
-free_eb_cachep:
- extent_buffer_free_cachep();
-free_extent_cachep:
- extent_state_free_cachep();
-free_cachep:
- btrfs_destroy_cachep();
-free_compress:
- btrfs_exit_compress();
- btrfs_exit_sysfs();
+static __always_inline void btrfs_exit_btrfs_fs(void)
+{
+ int i;
- return err;
+ for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) {
+ if (!mod_init_result[i])
+ continue;
+ if (mod_init_seq[i].exit_func)
+ mod_init_seq[i].exit_func();
+ mod_init_result[i] = false;
+ }
}
static void __exit exit_btrfs_fs(void)
{
- btrfs_destroy_cachep();
- btrfs_delayed_ref_exit();
- btrfs_auto_defrag_exit();
- btrfs_delayed_inode_exit();
- btrfs_prelim_ref_exit();
- ordered_data_exit();
- extent_map_exit();
- btrfs_bioset_exit();
- extent_state_free_cachep();
- extent_buffer_free_cachep();
- btrfs_interface_exit();
- unregister_filesystem(&btrfs_fs_type);
- btrfs_exit_sysfs();
- btrfs_cleanup_fs_uuids();
- btrfs_exit_compress();
+ btrfs_exit_btrfs_fs();
+}
+
+static int __init init_btrfs_fs(void)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) {
+ ASSERT(!mod_init_result[i]);
+ ret = mod_init_seq[i].init_func();
+ if (ret < 0) {
+ btrfs_exit_btrfs_fs();
+ return ret;
+ }
+ mod_init_result[i] = true;
+ }
+ return 0;
}
late_initcall(init_btrfs_fs);
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
new file mode 100644
index 000000000000..8dbb909b364f
--- /dev/null
+++ b/fs/btrfs/super.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SUPER_H
+#define BTRFS_SUPER_H
+
+int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
+ unsigned long new_flags);
+int btrfs_sync_fs(struct super_block *sb, int wait);
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid);
+
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline void btrfs_set_sb_rdonly(struct super_block *sb)
+{
+ sb->s_flags |= SB_RDONLY;
+ set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
+}
+
+static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
+{
+ sb->s_flags &= ~SB_RDONLY;
+ clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
+}
+
+#endif
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 699b54b3acaa..45615ce36498 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -10,7 +10,7 @@
#include <linux/completion.h>
#include <linux/bug.h>
#include <crypto/hash.h>
-
+#include "messages.h"
#include "ctree.h"
#include "discard.h"
#include "disk-io.h"
@@ -22,6 +22,8 @@
#include "block-group.h"
#include "qgroup.h"
#include "misc.h"
+#include "fs.h"
+#include "accessors.h"
/*
* Structure name Path
@@ -248,7 +250,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
/*
* We don't want to do full transaction commit from inside sysfs
*/
- btrfs_set_pending(fs_info, COMMIT);
+ set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
wake_up_process(fs_info->transaction_kthread);
return count;
@@ -762,7 +764,7 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE);
/* Limit stripe size to 10% of available space. */
- val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val);
+ val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val);
/* Must be multiple of 256M. */
val &= ~((u64)SZ_256M - 1);
@@ -959,7 +961,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
/*
* We don't want to do full transaction commit from inside sysfs
*/
- btrfs_set_pending(fs_info, COMMIT);
+ set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
wake_up_process(fs_info->transaction_kthread);
return len;
@@ -1160,16 +1162,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
if (fs_devices->read_policy == i)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]",
+ ret += sysfs_emit_at(buf, ret, "%s[%s]",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
else
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ ret += sysfs_emit_at(buf, ret, "%s%s",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
}
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
@@ -2321,8 +2323,11 @@ int __init btrfs_init_sysfs(void)
#ifdef CONFIG_BTRFS_DEBUG
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
- if (ret)
- goto out2;
+ if (ret) {
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ goto out_remove_group;
+ }
#endif
return 0;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9c478fa256f6..181469fc0bb3 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -16,6 +16,7 @@
#include "../disk-io.h"
#include "../qgroup.h"
#include "../block-group.h"
+#include "../fs.h"
static struct vfsmount *test_mnt = NULL;
@@ -101,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ extent_io_tree_init(NULL, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
@@ -200,7 +201,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
void btrfs_free_dummy_root(struct btrfs_root *root)
{
- if (!root)
+ if (IS_ERR_OR_NULL(root))
return;
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index b7d181a08eab..5ef0b90e25c3 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -8,6 +8,7 @@
#include "../ctree.h"
#include "../extent_io.h"
#include "../disk-io.h"
+#include "../accessors.h"
static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
{
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 350da449db08..dfc5c7fa6038 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -132,7 +132,7 @@ static int test_find_delalloc(u32 sectorsize)
* Passing NULL as we don't have fs_info but tracepoints are not used
* at this point
*/
- extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL);
+ extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST);
/*
* First go through and create and mark all of our pages dirty, we pin
@@ -489,7 +489,7 @@ static int test_find_first_clear_extent_bit(void)
test_msg("running find_first_clear_extent_bit test");
- extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
+ extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST);
/* Test correct handling of empty tree */
find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 13734ed43bfc..b61972046feb 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -10,6 +10,7 @@
#include "../free-space-tree.h"
#include "../transaction.h"
#include "../block-group.h"
+#include "../accessors.h"
struct free_space_extent {
u64 start;
@@ -470,7 +471,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
}
cache->bitmap_low_thresh = 0;
cache->bitmap_high_thresh = (u32)-1;
- cache->needs_free_space = 1;
+ set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
cache->fs_info = root->fs_info;
btrfs_init_dummy_trans(&trans, root->fs_info);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 625f7d398368..05b03f5eab83 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -11,6 +11,7 @@
#include "../extent_io.h"
#include "../volumes.h"
#include "../compression.h"
+#include "../accessors.h"
static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
u64 ram_bytes, u64 offset, u64 disk_bytenr,
@@ -72,8 +73,8 @@ static void insert_inode_item_key(struct btrfs_root *root)
* diagram of how the extents will look though this may not be possible we still
* want to make sure everything acts normally (the last number is not inclusive)
*
- * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291]
- * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split]
+ * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291]
+ * [inline][hole but no extent][ hole ][ regular ][regular1 split]
*
* [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ]
* [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written]
@@ -90,19 +91,12 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
u64 disk_bytenr = SZ_1M;
u64 offset = 0;
- /* First we want a hole */
- insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
- slot);
- slot++;
- offset += 5;
-
/*
- * Now we want an inline extent, I don't think this is possible but hey
- * why not? Also keep in mind if we have an inline extent it counts as
- * the whole first page. If we were to expand it we would have to cow
- * and we wouldn't have an inline extent anymore.
+ * Tree-checker has strict limits on inline extents that they can only
+ * exist at file offset 0, thus we can only have one inline file extent
+ * at most.
*/
- insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
+ insert_extent(root, offset, 6, 6, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
slot);
slot++;
offset = sectorsize;
@@ -281,37 +275,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
- goto out;
- }
- if (em->start != 0 || em->len != 5) {
- test_err(
- "unexpected extent wanted start 0 len 5, got start %llu len %llu",
- em->start, em->len);
- goto out;
- }
- if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
- goto out;
- }
- offset = em->start + em->len;
- free_extent_map(em);
-
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
- if (IS_ERR(em)) {
- test_err("got an error when we shouldn't have");
- goto out;
- }
if (em->block_start != EXTENT_MAP_INLINE) {
test_err("expected an inline, got %llu", em->block_start);
goto out;
}
- if (em->start != offset || em->len != (sectorsize - 5)) {
+ /*
+ * For inline extent, we always round up the em to sectorsize, as
+ * they are either:
+ *
+ * a) a hidden hole
+ * The range will be zeroed at inline extent read time.
+ *
+ * b) a file extent with unaligned bytenr
+ * Tree checker will reject it.
+ */
+ if (em->start != 0 || em->len != sectorsize) {
test_err(
- "unexpected extent wanted start %llu len 1, got start %llu len %llu",
- offset, em->start, em->len);
+ "unexpected extent wanted start 0 len %u, got start %llu len %llu",
+ sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 63676ea19f29..3fc8dc3fd980 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -10,6 +10,8 @@
#include "../disk-io.h"
#include "../qgroup.h"
#include "../backref.h"
+#include "../fs.h"
+#include "../accessors.h"
static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u64 num_bytes, u64 parent, u64 root_objectid)
@@ -203,6 +205,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
static int test_no_shared_qgroup(struct btrfs_root *root,
u32 sectorsize, u32 nodesize)
{
+ struct btrfs_backref_walk_ctx ctx = { 0 };
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist *old_roots = NULL;
@@ -218,16 +221,22 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
}
+ ctx.bytenr = nodesize;
+ ctx.trans = &trans;
+ ctx.fs_info = fs_info;
+
/*
* Since the test trans doesn't have the complicated delayed refs,
* we can only call btrfs_qgroup_account_extent() directly to test
* quota.
*/
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ old_roots = ctx.roots;
+ ctx.roots = NULL;
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
@@ -236,12 +245,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ new_roots = ctx.roots;
+ ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@@ -260,11 +271,13 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ old_roots = ctx.roots;
+ ctx.roots = NULL;
ret = remove_extent_item(root, nodesize, nodesize);
if (ret) {
@@ -272,12 +285,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ new_roots = ctx.roots;
+ ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@@ -302,6 +317,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
static int test_multiple_refs(struct btrfs_root *root,
u32 sectorsize, u32 nodesize)
{
+ struct btrfs_backref_walk_ctx ctx = { 0 };
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist *old_roots = NULL;
@@ -322,11 +338,17 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ ctx.bytenr = nodesize;
+ ctx.trans = &trans;
+ ctx.fs_info = fs_info;
+
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ old_roots = ctx.roots;
+ ctx.roots = NULL;
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
@@ -335,12 +357,14 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ new_roots = ctx.roots;
+ ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@@ -355,11 +379,13 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ old_roots = ctx.roots;
+ ctx.roots = NULL;
ret = add_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
@@ -368,12 +394,14 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ new_roots = ctx.roots;
+ ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@@ -394,11 +422,13 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ old_roots = ctx.roots;
+ ctx.roots = NULL;
ret = remove_extent_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
@@ -407,12 +437,14 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ new_roots = ctx.roots;
+ ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d1f1da6820fb..b8c52e89688c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -6,6 +6,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
@@ -23,6 +24,18 @@
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "defrag.h"
+#include "dir-item.h"
+#include "uuid-tree.h"
+#include "ioctl.h"
+#include "relocation.h"
+#include "scrub.h"
+
+static struct kmem_cache *btrfs_trans_handle_cachep;
#define BTRFS_ROOT_TRANS_TAG 0
@@ -365,9 +378,9 @@ loop:
spin_lock_init(&cur_trans->releasing_ebs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
- IO_TREE_TRANS_DIRTY_PAGES, NULL);
+ IO_TREE_TRANS_DIRTY_PAGES);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
- IO_TREE_FS_PINNED_EXTENTS, NULL);
+ IO_TREE_FS_PINNED_EXTENTS);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -936,7 +949,7 @@ static bool should_end_transaction(struct btrfs_trans_handle *trans)
if (btrfs_check_space_for_delayed_refs(fs_info))
return true;
- return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
+ return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50);
}
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
@@ -1607,10 +1620,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
- struct inode *parent_inode;
+ struct inode *parent_inode = pending->dir;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
- struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
struct timespec64 cur_time;
@@ -1619,6 +1631,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 index = 0;
u64 objectid;
u64 root_flags;
+ unsigned int nofs_flags;
+ struct fscrypt_name fname;
ASSERT(pending->path);
path = pending->path;
@@ -1626,9 +1640,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ASSERT(pending->root_item);
new_root_item = pending->root_item;
+ /*
+ * We're inside a transaction and must make sure that any potential
+ * allocations with GFP_KERNEL in fscrypt won't recurse back to
+ * filesystem.
+ */
+ nofs_flags = memalloc_nofs_save();
+ pending->error = fscrypt_setup_filename(parent_inode,
+ &pending->dentry->d_name, 0,
+ &fname);
+ memalloc_nofs_restore(nofs_flags);
+ if (pending->error)
+ goto free_pending;
+
pending->error = btrfs_get_free_objectid(tree_root, &objectid);
if (pending->error)
- goto no_free_objectid;
+ goto free_fname;
/*
* Make qgroup to skip current new snapshot's qgroupid, as it is
@@ -1657,8 +1684,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid,
trans->bytes_reserved, 1);
- dentry = pending->dentry;
- parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
ret = record_root_in_trans(trans, parent_root, 0);
if (ret)
@@ -1674,8 +1699,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* check if there is a file/dir which has the same name. */
dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
btrfs_ino(BTRFS_I(parent_inode)),
- dentry->d_name.name,
- dentry->d_name.len, 0);
+ &fname.disk_name, 0);
if (dir_item != NULL && !IS_ERR(dir_item)) {
pending->error = -EEXIST;
goto dir_item_existed;
@@ -1770,7 +1794,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_add_root_ref(trans, objectid,
parent_root->root_key.objectid,
btrfs_ino(BTRFS_I(parent_inode)), index,
- dentry->d_name.name, dentry->d_name.len);
+ &fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1802,9 +1826,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret < 0)
goto fail;
- ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
- dentry->d_name.len, BTRFS_I(parent_inode),
- &key, BTRFS_FT_DIR, index);
+ ret = btrfs_insert_dir_item(trans, &fname.disk_name,
+ BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
+ index);
/* We have check then name at the beginning, so it is impossible. */
BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
@@ -1813,7 +1837,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
- dentry->d_name.len * 2);
+ fname.disk_name.len * 2);
parent_inode->i_mtime = current_time(parent_inode);
parent_inode->i_ctime = parent_inode->i_mtime;
ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
@@ -1845,7 +1869,9 @@ dir_item_existed:
trans->bytes_reserved = 0;
clear_skip_qgroup:
btrfs_clear_skip_qgroup(trans);
-no_free_objectid:
+free_fname:
+ fscrypt_free_filename(&fname);
+free_pending:
kfree(new_root_item);
pending->root_item = NULL;
btrfs_free_path(path);
@@ -2101,6 +2127,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ASSERT(refcount_read(&trans->use_count) == 1);
btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
+
/* Stop the commit early if ->aborted is set */
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
@@ -2354,12 +2382,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto unlock_reloc;
- /*
- * Since the transaction is done, we can apply the pending changes
- * before the next transaction.
- */
- btrfs_apply_pending_changes(fs_info);
-
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
*/
@@ -2582,21 +2604,17 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
return (ret < 0) ? 0 : 1;
}
-void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
+int __init btrfs_transaction_init(void)
{
- unsigned long prev;
- unsigned long bit;
-
- prev = xchg(&fs_info->pending_changes, 0);
- if (!prev)
- return;
-
- bit = 1 << BTRFS_PENDING_COMMIT;
- if (prev & bit)
- btrfs_debug(fs_info, "pending commit done");
- prev &= ~bit;
+ btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
+ sizeof(struct btrfs_trans_handle), 0,
+ SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_trans_handle_cachep)
+ return -ENOMEM;
+ return 0;
+}
- if (prev)
- btrfs_warn(fs_info,
- "unknown pending changes left 0x%lx, ignoring", prev);
+void __cold btrfs_transaction_exit(void)
+{
+ kmem_cache_destroy(btrfs_trans_handle_cachep);
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 970ff316069d..97f6c39f59c8 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -10,6 +10,7 @@
#include "btrfs_inode.h"
#include "delayed-ref.h"
#include "ctree.h"
+#include "misc.h"
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
@@ -98,14 +99,15 @@ struct btrfs_transaction {
struct list_head releasing_ebs;
};
-#define __TRANS_FREEZABLE (1U << 0)
-
-#define __TRANS_START (1U << 9)
-#define __TRANS_ATTACH (1U << 10)
-#define __TRANS_JOIN (1U << 11)
-#define __TRANS_JOIN_NOLOCK (1U << 12)
-#define __TRANS_DUMMY (1U << 13)
-#define __TRANS_JOIN_NOSTART (1U << 14)
+enum {
+ ENUM_BIT(__TRANS_FREEZABLE),
+ ENUM_BIT(__TRANS_START),
+ ENUM_BIT(__TRANS_ATTACH),
+ ENUM_BIT(__TRANS_JOIN),
+ ENUM_BIT(__TRANS_JOIN_NOLOCK),
+ ENUM_BIT(__TRANS_DUMMY),
+ ENUM_BIT(__TRANS_JOIN_NOSTART),
+};
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
#define TRANS_ATTACH (__TRANS_ATTACH)
@@ -231,9 +233,11 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
-void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
+int __init btrfs_transaction_init(void);
+void __cold btrfs_transaction_exit(void);
+
#endif
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 43f905ab0a18..baad1ed7e111 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -18,6 +18,7 @@
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/error-injection.h>
+#include "messages.h"
#include "ctree.h"
#include "tree-checker.h"
#include "disk-io.h"
@@ -25,6 +26,9 @@
#include "volumes.h"
#include "misc.h"
#include "btrfs_inode.h"
+#include "fs.h"
+#include "accessors.h"
+#include "file-item.h"
/*
* Error message should follow the following format:
@@ -528,7 +532,7 @@ static int check_dir_item(struct extent_buffer *leaf,
}
/* dir type check */
- dir_type = btrfs_dir_type(leaf, di);
+ dir_type = btrfs_dir_ftype(leaf, di);
if (unlikely(dir_type >= BTRFS_FT_MAX)) {
dir_item_err(leaf, slot,
"invalid dir item type, have %u expect [0, %u)",
@@ -1780,10 +1784,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
/* Also check if the item pointer overlaps with btrfs item. */
if (unlikely(btrfs_item_ptr_offset(leaf, slot) <
- btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) {
+ btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item))) {
generic_err(leaf, slot,
"slot overlaps with its data, item end %lu data start %lu",
- btrfs_item_nr_offset(slot) +
+ btrfs_item_nr_offset(leaf, slot) +
sizeof(struct btrfs_item),
btrfs_item_ptr_offset(leaf, slot));
return -EUCLEAN;
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index ece497e26558..bfb5efa4e01f 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -6,8 +6,39 @@
#ifndef BTRFS_TREE_CHECKER_H
#define BTRFS_TREE_CHECKER_H
-#include "ctree.h"
-#include "extent_io.h"
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
+struct btrfs_chunk;
+
+/* All the extra info needed to verify the parentness of a tree block. */
+struct btrfs_tree_parent_check {
+ /*
+ * The owner check against the tree block.
+ *
+ * Can be 0 to skip the owner check.
+ */
+ u64 owner_root;
+
+ /*
+ * Expected transid, can be 0 to skip the check, but such skip
+ * should only be utlized for backref walk related code.
+ */
+ u64 transid;
+
+ /*
+ * The expected first key.
+ *
+ * This check can be skipped if @has_first_key is false, such skip
+ * can happen for case where we don't have the parent node key,
+ * e.g. reading the tree root, doing backref walk.
+ */
+ struct btrfs_key first_key;
+ bool has_first_key;
+
+ /* The expected level. Should always be set. */
+ u8 level;
+};
/*
* Comprehensive leaf checker.
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
deleted file mode 100644
index b6cf39f4e7e4..000000000000
--- a/fs/btrfs/tree-defrag.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- */
-
-#include <linux/sched.h>
-#include "ctree.h"
-#include "disk-io.h"
-#include "print-tree.h"
-#include "transaction.h"
-#include "locking.h"
-
-/*
- * Defrag all the leaves in a given btree.
- * Read all the leaves and try to get key order to
- * better reflect disk order
- */
-
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- struct btrfs_path *path = NULL;
- struct btrfs_key key;
- int ret = 0;
- int wret;
- int level;
- int next_key_ret = 0;
- u64 last_ret = 0;
-
- if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- goto out;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- level = btrfs_header_level(root->node);
-
- if (level == 0)
- goto out;
-
- if (root->defrag_progress.objectid == 0) {
- struct extent_buffer *root_node;
- u32 nritems;
-
- root_node = btrfs_lock_root_node(root);
- nritems = btrfs_header_nritems(root_node);
- root->defrag_max.objectid = 0;
- /* from above we know this is not a leaf */
- btrfs_node_key_to_cpu(root_node, &root->defrag_max,
- nritems - 1);
- btrfs_tree_unlock(root_node);
- free_extent_buffer(root_node);
- memset(&key, 0, sizeof(key));
- } else {
- memcpy(&key, &root->defrag_progress, sizeof(key));
- }
-
- path->keep_locks = 1;
-
- ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = 0;
- goto out;
- }
- btrfs_release_path(path);
- /*
- * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
- * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
- * a deadlock (attempting to write lock an already write locked leaf).
- */
- path->lowest_level = 1;
- wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-
- if (wret < 0) {
- ret = wret;
- goto out;
- }
- if (!path->nodes[1]) {
- ret = 0;
- goto out;
- }
- /*
- * The node at level 1 must always be locked when our path has
- * keep_locks set and lowest_level is 1, regardless of the value of
- * path->slots[1].
- */
- BUG_ON(path->locks[1] == 0);
- ret = btrfs_realloc_node(trans, root,
- path->nodes[1], 0,
- &last_ret,
- &root->defrag_progress);
- if (ret) {
- WARN_ON(ret == -EAGAIN);
- goto out;
- }
- /*
- * Now that we reallocated the node we can find the next key. Note that
- * btrfs_find_next_key() can release our path and do another search
- * without COWing, this is because even with path->keep_locks = 1,
- * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
- * node when path->slots[node_level - 1] does not point to the last
- * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
- * we search for the next key after reallocating our node.
- */
- path->slots[1] = btrfs_header_nritems(path->nodes[1]);
- next_key_ret = btrfs_find_next_key(root, path, &key, 1,
- BTRFS_OLDEST_GENERATION);
- if (next_key_ret == 0) {
- memcpy(&root->defrag_progress, &key, sizeof(key));
- ret = -EAGAIN;
- }
-out:
- btrfs_free_path(path);
- if (ret == -EAGAIN) {
- if (root->defrag_max.objectid > root->defrag_progress.objectid)
- goto done;
- if (root->defrag_max.type > root->defrag_progress.type)
- goto done;
- if (root->defrag_max.offset > root->defrag_progress.offset)
- goto done;
- ret = 0;
- }
-done:
- if (ret != -EAGAIN)
- memset(&root->defrag_progress, 0,
- sizeof(root->defrag_progress));
-
- return ret;
-}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 813986e38258..a3c43f0b1c95 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -21,6 +21,15 @@
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "dir-item.h"
+#include "file-item.h"
+#include "file.h"
+#include "orphan.h"
+#include "tree-checker.h"
#define MAX_CONFLICT_INODES 10
@@ -333,7 +342,12 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
+ struct btrfs_tree_parent_check check = {
+ .level = level,
+ .transid = gen
+ };
+
+ ret = btrfs_read_extent_buffer(eb, &check);
if (ret)
return ret;
}
@@ -351,11 +365,25 @@ static int process_one_buffer(struct btrfs_root *log,
return ret;
}
-static int do_overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
{
int ret;
u32 item_size;
@@ -363,31 +391,24 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans,
int save_old_i_size = 0;
unsigned long src_ptr;
unsigned long dst_ptr;
- int overwrite_root = 0;
bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
- overwrite_root = 1;
+ /*
+ * This is only used during log replay, so the root is always from a
+ * fs/subvolume tree. In case we ever need to support a log root, then
+ * we'll have to clone the leaf in the path, release the path and use
+ * the leaf before writing into the log tree. See the comments at
+ * copy_items() for more details.
+ */
+ ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
item_size = btrfs_item_size(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
- /* Our caller must have done a search for the key for us. */
- ASSERT(path->nodes[0] != NULL);
-
- /*
- * And the slot must point to the exact key or the slot where the key
- * should be at (the first item with a key greater than 'key')
- */
- if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
- struct btrfs_key found_key;
-
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
- ret = btrfs_comp_cpu_keys(&found_key, key);
- ASSERT(ret >= 0);
- } else {
- ret = 1;
- }
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
if (ret == 0) {
char *src_copy;
@@ -532,8 +553,7 @@ insert:
goto no_copy;
}
- if (overwrite_root &&
- S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+ if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
save_old_i_size = 1;
saved_i_size = btrfs_inode_size(path->nodes[0],
@@ -565,34 +585,19 @@ no_copy:
return 0;
}
-/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
- *
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
- *
- * If the key is already in the destination tree the existing item is
- * overwritten. If the existing item isn't big enough, it is extended.
- * If it is too large, it is truncated.
- *
- * If the key isn't in the destination yet, a new item is inserted.
- */
-static int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
+ struct fscrypt_str *name)
{
- int ret;
+ char *buf;
- /* Look for the key in the destination tree. */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
+ buf = kmalloc(len, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
- return do_overwrite_item(trans, root, path, eb, slot, key);
+ read_extent_buffer(eb, buf, (unsigned long)start, len);
+ name->name = buf;
+ name->len = len;
+ return 0;
}
/*
@@ -747,8 +752,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
*/
ret = btrfs_qgroup_trace_extent(trans,
btrfs_file_extent_disk_bytenr(eb, item),
- btrfs_file_extent_disk_num_bytes(eb, item),
- GFP_NOFS);
+ btrfs_file_extent_disk_num_bytes(eb, item));
if (ret < 0)
goto out;
@@ -799,7 +803,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
btrfs_file_extent_num_bytes(eb, item);
}
- ret = btrfs_lookup_csums_range(root->log_root,
+ ret = btrfs_lookup_csums_list(root->log_root,
csum_start, csum_end - 1,
&ordered_sums, 0, false);
if (ret)
@@ -901,12 +905,11 @@ out:
static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- const char *name,
- int name_len)
+ const struct fscrypt_str *name)
{
int ret;
- ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
+ ret = btrfs_unlink_inode(trans, dir, inode, name);
if (ret)
return ret;
/*
@@ -933,8 +936,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = dir->root;
struct inode *inode;
- char *name;
- int name_len;
+ struct fscrypt_str name;
struct extent_buffer *leaf;
struct btrfs_key location;
int ret;
@@ -942,12 +944,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &location);
- name_len = btrfs_dir_name_len(leaf, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name)
+ ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
+ if (ret)
return -ENOMEM;
- read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
btrfs_release_path(path);
inode = read_one_inode(root, location.objectid);
@@ -960,10 +960,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
- name_len);
+ ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
out:
- kfree(name);
+ kfree(name.name);
iput(inode);
return ret;
}
@@ -978,14 +977,14 @@ out:
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
u64 dirid, u64 objectid, u64 index,
- const char *name, int name_len)
+ struct fscrypt_str *name)
{
struct btrfs_dir_item *di;
struct btrfs_key location;
int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
- index, name, name_len, 0);
+ index, name, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
@@ -998,7 +997,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
}
btrfs_release_path(path);
- di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
+ di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
@@ -1025,7 +1024,7 @@ out:
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
u64 ref_objectid,
- const char *name, int namelen)
+ const struct fscrypt_str *name)
{
struct btrfs_path *path;
int ret;
@@ -1045,12 +1044,10 @@ static noinline int backref_in_log(struct btrfs_root *log,
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
path->slots[0],
- ref_objectid,
- name, namelen);
+ ref_objectid, name);
else
ret = !!btrfs_find_name_in_backref(path->nodes[0],
- path->slots[0],
- name, namelen);
+ path->slots[0], name);
out:
btrfs_free_path(path);
return ret;
@@ -1063,11 +1060,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
u64 inode_objectid, u64 parent_objectid,
- u64 ref_index, char *name, int namelen)
+ u64 ref_index, struct fscrypt_str *name)
{
int ret;
- char *victim_name;
- int victim_name_len;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key search_key;
@@ -1099,43 +1094,40 @@ again:
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
while (ptr < ptr_end) {
- victim_ref = (struct btrfs_inode_ref *)ptr;
- victim_name_len = btrfs_inode_ref_name_len(leaf,
- victim_ref);
- victim_name = kmalloc(victim_name_len, GFP_NOFS);
- if (!victim_name)
- return -ENOMEM;
+ struct fscrypt_str victim_name;
- read_extent_buffer(leaf, victim_name,
- (unsigned long)(victim_ref + 1),
- victim_name_len);
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ ret = read_alloc_one_name(leaf, (victim_ref + 1),
+ btrfs_inode_ref_name_len(leaf, victim_ref),
+ &victim_name);
+ if (ret)
+ return ret;
ret = backref_in_log(log_root, &search_key,
- parent_objectid, victim_name,
- victim_name_len);
+ parent_objectid, &victim_name);
if (ret < 0) {
- kfree(victim_name);
+ kfree(victim_name.name);
return ret;
} else if (!ret) {
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
ret = unlink_inode_for_log_replay(trans, dir, inode,
- victim_name, victim_name_len);
- kfree(victim_name);
+ &victim_name);
+ kfree(victim_name.name);
if (ret)
return ret;
goto again;
}
- kfree(victim_name);
+ kfree(victim_name.name);
- ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+ ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
}
}
btrfs_release_path(path);
/* Same search but for extended refs */
- extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
+ extref = btrfs_lookup_inode_extref(NULL, root, path, name,
inode_objectid, parent_objectid, 0,
0);
if (IS_ERR(extref)) {
@@ -1152,29 +1144,28 @@ again:
base = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
- extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ struct fscrypt_str victim_name;
- victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
goto next;
- victim_name = kmalloc(victim_name_len, GFP_NOFS);
- if (!victim_name)
- return -ENOMEM;
- read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
- victim_name_len);
+ ret = read_alloc_one_name(leaf, &extref->name,
+ btrfs_inode_extref_name_len(leaf, extref),
+ &victim_name);
+ if (ret)
+ return ret;
search_key.objectid = inode_objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
search_key.offset = btrfs_extref_hash(parent_objectid,
- victim_name,
- victim_name_len);
+ victim_name.name,
+ victim_name.len);
ret = backref_in_log(log_root, &search_key,
- parent_objectid, victim_name,
- victim_name_len);
+ parent_objectid, &victim_name);
if (ret < 0) {
- kfree(victim_name);
+ kfree(victim_name.name);
return ret;
} else if (!ret) {
ret = -ENOENT;
@@ -1186,26 +1177,24 @@ again:
ret = unlink_inode_for_log_replay(trans,
BTRFS_I(victim_parent),
- inode,
- victim_name,
- victim_name_len);
+ inode, &victim_name);
}
iput(victim_parent);
- kfree(victim_name);
+ kfree(victim_name.name);
if (ret)
return ret;
goto again;
}
- kfree(victim_name);
+ kfree(victim_name.name);
next:
- cur_offset += victim_name_len + sizeof(*extref);
+ cur_offset += victim_name.len + sizeof(*extref);
}
}
btrfs_release_path(path);
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
- ref_index, name, namelen, 0);
+ ref_index, name, 0);
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
@@ -1216,8 +1205,7 @@ next:
btrfs_release_path(path);
/* look for a conflicting name */
- di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
- name, namelen, 0);
+ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
@@ -1231,20 +1219,18 @@ next:
}
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index,
+ struct fscrypt_str *name, u64 *index,
u64 *parent_objectid)
{
struct btrfs_inode_extref *extref;
+ int ret;
extref = (struct btrfs_inode_extref *)ref_ptr;
- *namelen = btrfs_inode_extref_name_len(eb, extref);
- *name = kmalloc(*namelen, GFP_NOFS);
- if (*name == NULL)
- return -ENOMEM;
-
- read_extent_buffer(eb, *name, (unsigned long)&extref->name,
- *namelen);
+ ret = read_alloc_one_name(eb, &extref->name,
+ btrfs_inode_extref_name_len(eb, extref), name);
+ if (ret)
+ return ret;
if (index)
*index = btrfs_inode_extref_index(eb, extref);
@@ -1255,18 +1241,17 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
}
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index)
+ struct fscrypt_str *name, u64 *index)
{
struct btrfs_inode_ref *ref;
+ int ret;
ref = (struct btrfs_inode_ref *)ref_ptr;
- *namelen = btrfs_inode_ref_name_len(eb, ref);
- *name = kmalloc(*namelen, GFP_NOFS);
- if (*name == NULL)
- return -ENOMEM;
-
- read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+ ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
+ name);
+ if (ret)
+ return ret;
if (index)
*index = btrfs_inode_ref_index(eb, ref);
@@ -1308,28 +1293,24 @@ again:
ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
while (ref_ptr < ref_end) {
- char *name = NULL;
- int namelen;
+ struct fscrypt_str name;
u64 parent_id;
if (key->type == BTRFS_INODE_EXTREF_KEY) {
- ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ ret = extref_get_fields(eb, ref_ptr, &name,
NULL, &parent_id);
} else {
parent_id = key->offset;
- ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
- NULL);
+ ret = ref_get_fields(eb, ref_ptr, &name, NULL);
}
if (ret)
goto out;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
- parent_id, name,
- namelen);
+ parent_id, &name);
else
- ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
- name, namelen);
+ ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
if (!ret) {
struct inode *dir;
@@ -1338,20 +1319,20 @@ again:
dir = read_one_inode(root, parent_id);
if (!dir) {
ret = -ENOENT;
- kfree(name);
+ kfree(name.name);
goto out;
}
ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
- inode, name, namelen);
- kfree(name);
+ inode, &name);
+ kfree(name.name);
iput(dir);
if (ret)
goto out;
goto again;
}
- kfree(name);
- ref_ptr += namelen;
+ kfree(name.name);
+ ref_ptr += name.len;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ref_ptr += sizeof(struct btrfs_inode_extref);
else
@@ -1380,8 +1361,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
- char *name = NULL;
- int namelen;
+ struct fscrypt_str name;
int ret;
int log_ref_ver = 0;
u64 parent_objectid;
@@ -1425,7 +1405,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
while (ref_ptr < ref_end) {
if (log_ref_ver) {
- ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ ret = extref_get_fields(eb, ref_ptr, &name,
&ref_index, &parent_objectid);
/*
* parent object can change from one array
@@ -1438,15 +1418,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
} else {
- ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
- &ref_index);
+ ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
}
if (ret)
goto out;
ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index,
- name, namelen);
+ btrfs_ino(BTRFS_I(inode)), ref_index, &name);
if (ret < 0) {
goto out;
} else if (ret == 0) {
@@ -1460,7 +1438,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = __add_inode_ref(trans, root, path, log,
BTRFS_I(dir), BTRFS_I(inode),
inode_objectid, parent_objectid,
- ref_index, name, namelen);
+ ref_index, &name);
if (ret) {
if (ret == 1)
ret = 0;
@@ -1469,7 +1447,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
/* insert our name */
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- name, namelen, 0, ref_index);
+ &name, 0, ref_index);
if (ret)
goto out;
@@ -1479,9 +1457,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
/* Else, ret == 1, we already have a perfect match, we're done. */
- ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
- kfree(name);
- name = NULL;
+ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
+ kfree(name.name);
+ name.name = NULL;
if (log_ref_ver) {
iput(dir);
dir = NULL;
@@ -1505,7 +1483,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = overwrite_item(trans, root, path, eb, slot, key);
out:
btrfs_release_path(path);
- kfree(name);
+ kfree(name.name);
iput(dir);
iput(inode);
return ret;
@@ -1777,7 +1755,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 dirid, u64 index,
- char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_key *location)
{
struct inode *inode;
@@ -1795,7 +1773,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
}
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- name_len, 1, index);
+ 1, index);
/* FIXME, put inode into FIXUP list */
@@ -1809,7 +1787,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_dir_item *dst_di,
const struct btrfs_key *log_key,
- u8 log_type,
+ u8 log_flags,
bool exists)
{
struct btrfs_key found_key;
@@ -1819,7 +1797,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
if (found_key.objectid == log_key->objectid &&
found_key.type == log_key->type &&
found_key.offset == log_key->offset &&
- btrfs_dir_type(path->nodes[0], dst_di) == log_type)
+ btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
return 1;
/*
@@ -1855,8 +1833,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di,
struct btrfs_key *key)
{
- char *name;
- int name_len;
+ struct fscrypt_str name;
struct btrfs_dir_item *dir_dst_di;
struct btrfs_dir_item *index_dst_di;
bool dir_dst_matches = false;
@@ -1864,7 +1841,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_key log_key;
struct btrfs_key search_key;
struct inode *dir;
- u8 log_type;
+ u8 log_flags;
bool exists;
int ret;
bool update_size = true;
@@ -1874,17 +1851,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
if (!dir)
return -EIO;
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name) {
- ret = -ENOMEM;
+ ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
+ if (ret)
goto out;
- }
-
- log_type = btrfs_dir_type(eb, di);
- read_extent_buffer(eb, name, (unsigned long)(di + 1),
- name_len);
+ log_flags = btrfs_dir_flags(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
@@ -1894,14 +1865,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = 0;
dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- name, name_len, 1);
+ &name, 1);
if (IS_ERR(dir_dst_di)) {
ret = PTR_ERR(dir_dst_di);
goto out;
} else if (dir_dst_di) {
ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- dir_dst_di, &log_key, log_type,
- exists);
+ dir_dst_di, &log_key,
+ log_flags, exists);
if (ret < 0)
goto out;
dir_dst_matches = (ret == 1);
@@ -1911,14 +1882,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
key->objectid, key->offset,
- name, name_len, 1);
+ &name, 1);
if (IS_ERR(index_dst_di)) {
ret = PTR_ERR(index_dst_di);
goto out;
} else if (index_dst_di) {
ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
index_dst_di, &log_key,
- log_type, exists);
+ log_flags, exists);
if (ret < 0)
goto out;
index_dst_matches = (ret == 1);
@@ -1939,7 +1910,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
+ ret = backref_in_log(root->log_root, &search_key, 0, &name);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -1952,8 +1923,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
search_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
- name_len);
+ ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -1964,7 +1934,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
ret = insert_one_name(trans, root, key->objectid, key->offset,
- name, name_len, &log_key);
+ &name, &log_key);
if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
if (!ret)
@@ -1974,10 +1944,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
+ btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
}
- kfree(name);
+ kfree(name.name);
iput(dir);
if (!ret && name_added)
ret = 1;
@@ -2028,7 +1998,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
* to ever delete the parent directory has it would result in stale
* dentries that can never be deleted.
*/
- if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+ if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
struct btrfs_path *fixup_path;
struct btrfs_key di_key;
@@ -2143,8 +2113,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
int slot;
struct btrfs_dir_item *di;
- int name_len;
- char *name;
+ struct fscrypt_str name;
struct inode *inode = NULL;
struct btrfs_key location;
@@ -2159,22 +2128,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
eb = path->nodes[0];
slot = path->slots[0];
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name) {
- ret = -ENOMEM;
+ ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
+ if (ret)
goto out;
- }
-
- read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
if (log) {
struct btrfs_dir_item *log_di;
log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
dir_key->objectid,
- dir_key->offset,
- name, name_len, 0);
+ dir_key->offset, &name, 0);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
goto out;
@@ -2200,7 +2163,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
inc_nlink(inode);
ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
- name, name_len);
+ &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
@@ -2209,7 +2172,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
btrfs_release_path(log_path);
- kfree(name);
+ kfree(name.name);
iput(inode);
return ret;
}
@@ -2430,13 +2393,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
int nritems;
+ struct btrfs_tree_parent_check check = {
+ .transid = gen,
+ .level = level
+ };
struct btrfs_path *path;
struct btrfs_root *root = wc->replay_dest;
struct btrfs_key key;
int i;
int ret;
- ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
+ ret = btrfs_read_extent_buffer(eb, &check);
if (ret)
return ret;
@@ -2616,7 +2583,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
int ret = 0;
while (*level > 0) {
- struct btrfs_key first_key;
+ struct btrfs_tree_parent_check check = { 0 };
cur = path->nodes[*level];
@@ -2628,7 +2595,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
+ check.transid = ptr_gen;
+ check.level = *level - 1;
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
next = btrfs_find_create_tree_block(fs_info, bytenr,
@@ -2647,8 +2617,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_extent_buffer(next, ptr_gen,
- *level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2676,7 +2645,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -3448,7 +3417,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
u64 dir_ino,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
u64 index)
{
struct btrfs_dir_item *di;
@@ -3458,7 +3427,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
* for dir item keys.
*/
di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
- index, name, name_len, -1);
+ index, name, -1);
if (IS_ERR(di))
return PTR_ERR(di);
else if (!di)
@@ -3495,7 +3464,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
*/
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_inode *dir, u64 index)
{
struct btrfs_path *path;
@@ -3522,7 +3491,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
}
ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
- name, name_len, index);
+ name, index);
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
@@ -3534,7 +3503,7 @@ out_unlock:
/* see comments for btrfs_del_dir_entries_in_log */
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
@@ -3555,7 +3524,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
log = root->log_root;
mutex_lock(&inode->log_mutex);
- ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
+ ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&inode->log_mutex);
if (ret < 0 && ret != -ENOENT)
@@ -3694,15 +3663,29 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
u64 *last_old_dentry_offset)
{
struct btrfs_root *log = inode->root->log_root;
- struct extent_buffer *src = path->nodes[0];
- const int nritems = btrfs_header_nritems(src);
+ struct extent_buffer *src;
+ const int nritems = btrfs_header_nritems(path->nodes[0]);
const u64 ino = btrfs_ino(inode);
bool last_found = false;
int batch_start = 0;
int batch_size = 0;
int i;
- for (i = path->slots[0]; i < nritems; i++) {
+ /*
+ * We need to clone the leaf, release the read lock on it, and use the
+ * clone before modifying the log tree. See the comment at copy_items()
+ * about why we need to do this.
+ */
+ src = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!src)
+ return -ENOMEM;
+
+ i = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = src;
+ path->slots[0] = i;
+
+ for (; i < nritems; i++) {
struct btrfs_dir_item *di;
struct btrfs_key key;
int ret;
@@ -4303,7 +4286,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
{
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
- struct extent_buffer *src = src_path->nodes[0];
+ struct extent_buffer *src;
int ret = 0;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -4314,6 +4297,43 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
const u64 i_size = i_size_read(&inode->vfs_inode);
+ /*
+ * To keep lockdep happy and avoid deadlocks, clone the source leaf and
+ * use the clone. This is because otherwise we would be changing the log
+ * tree, to insert items from the subvolume tree or insert csum items,
+ * while holding a read lock on a leaf from the subvolume tree, which
+ * creates a nasty lock dependency when COWing log tree nodes/leaves:
+ *
+ * 1) Modifying the log tree triggers an extent buffer allocation while
+ * holding a write lock on a parent extent buffer from the log tree.
+ * Allocating the pages for an extent buffer, or the extent buffer
+ * struct, can trigger inode eviction and finally the inode eviction
+ * will trigger a release/remove of a delayed node, which requires
+ * taking the delayed node's mutex;
+ *
+ * 2) Allocating a metadata extent for a log tree can trigger the async
+ * reclaim thread and make us wait for it to release enough space and
+ * unblock our reservation ticket. The reclaim thread can start
+ * flushing delayed items, and that in turn results in the need to
+ * lock delayed node mutexes and in the need to write lock extent
+ * buffers of a subvolume tree - all this while holding a write lock
+ * on the parent extent buffer in the log tree.
+ *
+ * So one task in scenario 1) running in parallel with another task in
+ * scenario 2) could lead to a deadlock, one wanting to lock a delayed
+ * node mutex while having a read lock on a leaf from the subvolume,
+ * while the other is holding the delayed node's mutex and wants to
+ * write lock the same subvolume leaf for flushing delayed items.
+ */
+ src = btrfs_clone_extent_buffer(src_path->nodes[0]);
+ if (!src)
+ return -ENOMEM;
+
+ i = src_path->slots[0];
+ btrfs_release_path(src_path);
+ src_path->nodes[0] = src;
+ src_path->slots[0] = i;
+
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
if (!ins_data)
@@ -4400,9 +4420,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
disk_bytenr += extent_offset;
- ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
- disk_bytenr + extent_num_bytes - 1,
- &ordered_sums, 0, false);
+ ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
+ disk_bytenr + extent_num_bytes - 1,
+ &ordered_sums, 0, false);
if (ret)
goto out;
@@ -4595,10 +4615,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
/* block start is already adjusted for the file extent offset. */
csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
- ret = btrfs_lookup_csums_range(csum_root,
- em->block_start + csum_offset,
- em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, 0, false);
+ ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
+ em->block_start + csum_offset +
+ csum_len - 1, &ordered_sums, 0, false);
if (ret)
return ret;
@@ -5218,6 +5237,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
u32 this_len;
unsigned long name_ptr;
struct btrfs_dir_item *di;
+ struct fscrypt_str name_str;
if (key->type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
@@ -5251,8 +5271,11 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
}
read_extent_buffer(eb, name, name_ptr, this_name_len);
+
+ name_str.name = name;
+ name_str.len = this_name_len;
di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
- parent, name, this_name_len, 0);
+ parent, &name_str, 0);
if (di && !IS_ERR(di)) {
struct btrfs_key di_key;
@@ -5359,7 +5382,7 @@ struct btrfs_dir_list {
* has a size that doesn't match the sum of the lengths of all the logged
* names - this is ok, not a problem, because at log replay time we set the
* directory's i_size to the correct value (see replay_one_name() and
- * do_overwrite_item()).
+ * overwrite_item()).
*/
static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_inode *start_inode,
@@ -5422,7 +5445,7 @@ again:
}
di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
- type = btrfs_dir_type(leaf, di);
+ type = btrfs_dir_ftype(leaf, di);
if (btrfs_dir_transid(leaf, di) < trans->transid)
continue;
btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
@@ -5437,7 +5460,7 @@ again:
}
if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(di_inode);
+ btrfs_add_delayed_iput(BTRFS_I(di_inode));
break;
}
@@ -5446,7 +5469,7 @@ again:
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
log_mode, ctx);
- btrfs_add_delayed_iput(di_inode);
+ btrfs_add_delayed_iput(BTRFS_I(di_inode));
if (ret)
goto out;
if (ctx->log_new_dentries) {
@@ -5640,11 +5663,11 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
* so that the log ends up with the new name and without the old name.
*/
if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
return 0;
}
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
@@ -5719,7 +5742,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
*/
ret = btrfs_log_inode(trans, BTRFS_I(inode),
LOG_INODE_ALL, ctx);
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
if (ret)
break;
continue;
@@ -5736,7 +5759,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* that, we can avoid doing it again.
*/
if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
continue;
}
@@ -5748,7 +5771,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* log with the new name before we unpin it.
*/
ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
if (ret)
break;
}
@@ -6258,11 +6281,11 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
}
if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(di_inode);
+ btrfs_add_delayed_iput(BTRFS_I(di_inode));
continue;
}
- if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR)
+ if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
ctx->log_new_dentries = false;
@@ -6271,7 +6294,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
if (!ret && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
- btrfs_add_delayed_iput(di_inode);
+ btrfs_add_delayed_iput(BTRFS_I(di_inode));
if (ret)
break;
@@ -6732,7 +6755,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
}
if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
- btrfs_add_delayed_iput(dir_inode);
+ btrfs_add_delayed_iput(BTRFS_I(dir_inode));
continue;
}
@@ -6742,7 +6765,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (!ret && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans,
BTRFS_I(dir_inode), ctx);
- btrfs_add_delayed_iput(dir_inode);
+ btrfs_add_delayed_iput(BTRFS_I(dir_inode));
if (ret)
goto out;
}
@@ -6787,7 +6810,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
need_log_inode(trans, BTRFS_I(inode)))
ret = btrfs_log_inode(trans, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
if (ret)
return ret;
@@ -7349,7 +7372,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
mutex_unlock(&dir->log_mutex);
}
-/**
+/*
* Update the log after adding a new name for an inode.
*
* @trans: Transaction handle.
@@ -7416,9 +7439,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
if (old_dir && old_dir->logged_trans == trans->transid) {
struct btrfs_root *log = old_dir->root->log_root;
struct btrfs_path *path;
+ struct fscrypt_name fname;
ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
+ ret = fscrypt_setup_filename(&old_dir->vfs_inode,
+ &old_dentry->d_name, 0, &fname);
+ if (ret)
+ goto out;
/*
* We have two inodes to update in the log, the old directory and
* the inode that got renamed, so we must pin the log to prevent
@@ -7438,6 +7466,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
+ fscrypt_free_filename(&fname);
goto out;
}
@@ -7453,8 +7482,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
mutex_lock(&old_dir->log_mutex);
ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
- old_dentry->d_name.name,
- old_dentry->d_name.len, old_dir_index);
+ &fname.disk_name, old_dir_index);
if (ret > 0) {
/*
* The dentry does not exist in the log, so record its
@@ -7468,6 +7496,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
mutex_unlock(&old_dir->log_mutex);
btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
if (ret < 0)
goto out;
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index aed1e05e9879..85b43075ac58 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -6,6 +6,7 @@
#ifndef BTRFS_TREE_LOG_H
#define BTRFS_TREE_LOG_H
+#include "messages.h"
#include "ctree.h"
#include "transaction.h"
@@ -86,11 +87,11 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx);
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_inode *dir, u64 index);
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index 8a3a14686d3e..a555baa0143a 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -1,7 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
+#include "fs.h"
+#include "accessors.h"
+#include "tree-checker.h"
struct tree_mod_root {
u64 logical;
@@ -197,12 +201,11 @@ static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
int slot,
- enum btrfs_mod_log_op op,
- gfp_t flags)
+ enum btrfs_mod_log_op op)
{
struct tree_mod_elem *tm;
- tm = kzalloc(sizeof(*tm), flags);
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm)
return NULL;
@@ -220,7 +223,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
}
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
- enum btrfs_mod_log_op op, gfp_t flags)
+ enum btrfs_mod_log_op op)
{
struct tree_mod_elem *tm;
int ret;
@@ -228,7 +231,7 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
- tm = alloc_tree_mod_elem(eb, slot, op, flags);
+ tm = alloc_tree_mod_elem(eb, slot, op);
if (!tm)
return -ENOMEM;
@@ -276,7 +279,7 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
- BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -364,7 +367,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
}
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(old_root, i,
- BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -502,14 +505,14 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
tm_list_rem = tm_list + nr_items;
for (i = 0; i < nr_items; i++) {
tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
- BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REMOVE);
if (!tm_list_rem[i]) {
ret = -ENOMEM;
goto free_tms;
}
tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
- BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_ADD);
if (!tm_list_add[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -564,7 +567,7 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i,
- BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -694,8 +697,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
n--;
break;
case BTRFS_MOD_LOG_MOVE_KEYS:
- o_dst = btrfs_node_key_ptr_offset(tm->slot);
- o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
+ o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
memmove_extent_buffer(eb, o_dst, o_src,
tm->move.nr_items * p_size);
break;
@@ -819,10 +822,15 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
tm = tree_mod_log_search(fs_info, logical, time_seq);
if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ struct btrfs_tree_parent_check check = { 0 };
+
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- old = read_tree_block(fs_info, logical, root->root_key.objectid,
- 0, level, NULL);
+
+ check.level = level;
+ check.owner_root = root->root_key.objectid;
+
+ old = read_tree_block(fs_info, logical, &check);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
if (!IS_ERR(old))
free_extent_buffer(old);
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
index 12605d19621b..94f10afeee97 100644
--- a/fs/btrfs/tree-mod-log.h
+++ b/fs/btrfs/tree-mod-log.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_TREE_MOD_LOG_H
#define BTRFS_TREE_MOD_LOG_H
@@ -32,7 +32,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
struct extent_buffer *new_root,
bool log_removal);
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
- enum btrfs_mod_log_op op, gfp_t flags);
+ enum btrfs_mod_log_op op);
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 3374c9e9be67..33606025513d 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,6 +5,7 @@
*/
#include <linux/slab.h>
+#include "messages.h"
#include "ulist.h"
#include "ctree.h"
@@ -37,8 +38,9 @@
* loop would be similar to the above.
*/
-/**
- * ulist_init - freshly initialize a ulist
+/*
+ * Freshly initialize a ulist.
+ *
* @ulist: the ulist to initialize
*
* Note: don't use this function to init an already used ulist, use
@@ -51,8 +53,9 @@ void ulist_init(struct ulist *ulist)
ulist->nnodes = 0;
}
-/**
- * ulist_release - free up additionally allocated memory for the ulist
+/*
+ * Free up additionally allocated memory for the ulist.
+ *
* @ulist: the ulist from which to free the additional memory
*
* This is useful in cases where the base 'struct ulist' has been statically
@@ -70,8 +73,9 @@ void ulist_release(struct ulist *ulist)
INIT_LIST_HEAD(&ulist->nodes);
}
-/**
- * ulist_reinit - prepare a ulist for reuse
+/*
+ * Prepare a ulist for reuse.
+ *
* @ulist: ulist to be reused
*
* Free up all additional memory allocated for the list elements and reinit
@@ -83,8 +87,9 @@ void ulist_reinit(struct ulist *ulist)
ulist_init(ulist);
}
-/**
- * ulist_alloc - dynamically allocate a ulist
+/*
+ * Dynamically allocate a ulist.
+ *
* @gfp_mask: allocation flags to for base allocation
*
* The allocated ulist will be returned in an initialized state.
@@ -101,8 +106,9 @@ struct ulist *ulist_alloc(gfp_t gfp_mask)
return ulist;
}
-/**
- * ulist_free - free dynamically allocated ulist
+/*
+ * Free dynamically allocated ulist.
+ *
* @ulist: ulist to free
*
* It is not necessary to call ulist_release before.
@@ -163,8 +169,9 @@ static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
return 0;
}
-/**
- * ulist_add - add an element to the ulist
+/*
+ * Add an element to the ulist.
+ *
* @ulist: ulist to add the element to
* @val: value to add to ulist
* @aux: auxiliary value to store along with val
@@ -242,8 +249,9 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux)
return 0;
}
-/**
- * ulist_next - iterate ulist
+/*
+ * Iterate ulist.
+ *
* @ulist: ulist to iterate
* @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
*
@@ -258,7 +266,7 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux)
* It is allowed to call ulist_add during an enumeration. Newly added items
* are guaranteed to show up in the running enumeration.
*/
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
+struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter)
{
struct ulist_node *node;
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 02fda0a2d4ce..b2cef187ea8e 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -66,7 +66,7 @@ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
#endif
}
-struct ulist_node *ulist_next(struct ulist *ulist,
+struct ulist_node *ulist_next(const struct ulist *ulist,
struct ulist_iterator *uiter);
#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL)
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index b458452a1aaf..7c7001f42b14 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -5,11 +5,14 @@
#include <linux/uuid.h>
#include <asm/unaligned.h>
+#include "messages.h"
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
-
+#include "fs.h"
+#include "accessors.h"
+#include "uuid-tree.h"
static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
{
diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h
new file mode 100644
index 000000000000..5350c87fe2ca
--- /dev/null
+++ b/fs/btrfs/uuid-tree.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_UUID_TREE_H
+#define BTRFS_UUID_TREE_H
+
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ u64 subid);
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ u64 subid);
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index ee00e33c309e..bf9eb693a6a7 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -10,11 +10,17 @@
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include <linux/sched/mm.h>
+#include "messages.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
+#include "fs.h"
+#include "accessors.h"
+#include "ioctl.h"
+#include "verity.h"
+#include "orphan.h"
/*
* Implementation of the interface defined in struct fsverity_operations.
diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h
new file mode 100644
index 000000000000..91c10f7d0a46
--- /dev/null
+++ b/fs/btrfs/verity.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_VERITY_H
+#define BTRFS_VERITY_H
+
+#ifdef CONFIG_FS_VERITY
+
+extern const struct fsverity_operations btrfs_verityops;
+
+int btrfs_drop_verity_items(struct btrfs_inode *inode);
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size);
+
+#else
+
+static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+ return 0;
+}
+
+static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ return -EPERM;
+}
+
+#endif
+
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a8d4bc6a1937..aa25fa335d3e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5,12 +5,9 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/bio.h>
#include <linux/slab.h>
-#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
-#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
@@ -23,8 +20,6 @@
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
-#include "async-thread.h"
-#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
@@ -33,8 +28,13 @@
#include "block-group.h"
#include "discard.h"
#include "zoned.h"
-
-static struct bio_set btrfs_bioset;
+#include "fs.h"
+#include "accessors.h"
+#include "uuid-tree.h"
+#include "ioctl.h"
+#include "relocation.h"
+#include "scrub.h"
+#include "super.h"
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
@@ -248,11 +248,6 @@ out_overflow:;
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
-static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op, u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap,
- int *mirror_num_ret, int need_raid_map);
/*
* Device locking
@@ -529,14 +524,14 @@ error:
return ret;
}
-/**
- * Search and remove all stale devices (which are not mounted).
- * When both inputs are NULL, it will search and release all stale devices.
+/*
+ * Search and remove all stale devices (which are not mounted). When both
+ * inputs are NULL, it will search and release all stale devices.
*
- * @devt: Optional. When provided will it release all unmounted devices
- * matching this devt only.
+ * @devt: Optional. When provided will it release all unmounted devices
+ * matching this devt only.
* @skip_device: Optional. Will skip this device when searching for the stale
- * devices.
+ * devices.
*
* Return: 0 for success or if @devt is 0.
* -EBUSY if @devt is a mounted device.
@@ -641,6 +636,9 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (!bdev_nonrot(bdev))
fs_devices->rotating = true;
+ if (bdev_max_discard_sectors(bdev))
+ fs_devices->discardable = true;
+
device->bdev = bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
device->mode = flags;
@@ -835,26 +833,23 @@ static noinline struct btrfs_device *device_list_add(const char *path,
}
if (!device) {
+ unsigned int nofs_flag;
+
if (fs_devices->opened) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EBUSY);
}
+ nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid,
- disk_super->dev_item.uuid);
+ disk_super->dev_item.uuid, path);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device)) {
mutex_unlock(&fs_devices->device_list_mutex);
/* we can safely leave the fs_devices entry around */
return device;
}
- name = rcu_string_strdup(path, GFP_NOFS);
- if (!name) {
- btrfs_free_device(device);
- mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_PTR(-ENOMEM);
- }
- rcu_assign_pointer(device->name, name);
device->devt = path_devt;
list_add_rcu(&device->dev_list, &fs_devices->devices);
@@ -934,7 +929,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
}
btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
- devid, rcu_str_deref(device->name),
+ devid, btrfs_dev_name(device),
path, current->comm,
task_pid_nr(current));
}
@@ -987,28 +982,32 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
fs_devices->total_devices = orig->total_devices;
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
- struct rcu_string *name;
+ const char *dev_path = NULL;
+
+ /*
+ * This is ok to do without RCU read locked because we hold the
+ * uuid mutex so nothing we touch in here is going to disappear.
+ */
+ if (orig_dev->name)
+ dev_path = orig_dev->name->str;
device = btrfs_alloc_device(NULL, &orig_dev->devid,
- orig_dev->uuid);
+ orig_dev->uuid, dev_path);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
- /*
- * This is ok to do without rcu read locked because we hold the
- * uuid mutex so nothing we touch in here is going to disappear.
- */
- if (orig_dev->name) {
- name = rcu_string_strdup(orig_dev->name->str,
- GFP_KERNEL);
- if (!name) {
+ if (orig_dev->zone_info) {
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = btrfs_clone_dev_zone_info(orig_dev);
+ if (!zone_info) {
btrfs_free_device(device);
ret = -ENOMEM;
goto error;
}
- rcu_assign_pointer(device->name, name);
+ device->zone_info = zone_info;
}
list_add(&device->dev_list, &fs_devices->devices);
@@ -1461,8 +1460,9 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
return changed;
}
-/**
- * dev_extent_hole_check - check if specified hole is suitable for allocation
+/*
+ * Check if specified hole is suitable for allocation.
+ *
* @device: the device which we have the hole
* @hole_start: starting position of the hole
* @hole_size: the size of the hole
@@ -1516,7 +1516,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
}
/*
- * find_free_dev_extent_start - find free space in the specified device
+ * Find free space in the specified device.
+ *
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @search_start: the position from which to begin the search
@@ -1524,9 +1525,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
* @len: the size of the free space. that we find, or the size
* of the max free space if we don't find suitable free space
*
- * this uses a pretty simple search, the expectation is that it is
- * called very infrequently and that a given device has a small number
- * of extents
+ * This does a pretty simple search, the expectation is that it is called very
+ * infrequently and that a given device has a small number of extents.
*
* @start is used to store the start of the free space if we find. But if we
* don't find suitable free space, it will be used to store the start position
@@ -2089,7 +2089,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
- rcu_str_deref(device->name), device->devid);
+ btrfs_dev_name(device), device->devid);
return -ETXTBSY;
}
@@ -2305,8 +2305,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
btrfs_free_device(tgtdev);
}
-/**
- * Populate args from device at path
+/*
+ * Populate args from device at path.
*
* @fs_info: the filesystem
* @args: the args to populate
@@ -2581,7 +2581,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_device *device;
struct block_device *bdev;
struct super_block *sb = fs_info->sb;
- struct rcu_string *name;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices;
u64 orig_super_total_bytes;
@@ -2622,20 +2621,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
rcu_read_unlock();
- device = btrfs_alloc_device(fs_info, NULL, NULL);
+ device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
ret = PTR_ERR(device);
goto error;
}
- name = rcu_string_strdup(device_path, GFP_KERNEL);
- if (!name) {
- ret = -ENOMEM;
- goto error_free_device;
- }
- rcu_assign_pointer(device->name, name);
-
device->fs_info = fs_info;
device->bdev = bdev;
ret = lookup_bdev(device_path, &device->devt);
@@ -3591,16 +3583,14 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off
if (bargs->usage_min == 0)
user_thresh_min = 0;
else
- user_thresh_min = div_factor_fine(cache->length,
- bargs->usage_min);
+ user_thresh_min = mult_perc(cache->length, bargs->usage_min);
if (bargs->usage_max == 0)
user_thresh_max = 1;
else if (bargs->usage_max > 100)
user_thresh_max = cache->length;
else
- user_thresh_max = div_factor_fine(cache->length,
- bargs->usage_max);
+ user_thresh_max = mult_perc(cache->length, bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
ret = 0;
@@ -3624,7 +3614,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
else if (bargs->usage > 100)
user_thresh = cache->length;
else
- user_thresh = div_factor_fine(cache->length, bargs->usage);
+ user_thresh = mult_perc(cache->length, bargs->usage);
if (chunk_used < user_thresh)
ret = 0;
@@ -4014,10 +4004,11 @@ error:
return ret;
}
-/**
- * alloc_profile_is_valid - see if a given profile is valid and reduced
- * @flags: profile to validate
- * @extended: if true @flags is treated as an extended profile
+/*
+ * See if a given profile is valid and reduced.
+ *
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
*/
static int alloc_profile_is_valid(u64 flags, int extended)
{
@@ -5089,7 +5080,7 @@ static void init_alloc_chunk_ctl_policy_regular(
ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
/* We don't want a chunk larger than 10% of writable space */
- ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
ctl->max_chunk_size);
ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
}
@@ -5120,7 +5111,7 @@ static void init_alloc_chunk_ctl_policy_zoned(
}
/* We don't want a chunk larger than 10% of writable space */
- limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
+ limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
zone_size),
min_chunk_size);
ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
@@ -5896,7 +5887,10 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
* and the stripes.
*/
sizeof(u64) * (total_stripes),
- GFP_NOFS|__GFP_NOFAIL);
+ GFP_NOFS);
+
+ if (!bioc)
+ return NULL;
refcount_set(&bioc->refs, 1);
@@ -6358,11 +6352,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *
stripe_offset + stripe_nr * map->stripe_len;
}
-static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op, u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap,
- int *mirror_num_ret, int need_raid_map)
+int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap, int *mirror_num_ret,
+ int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
@@ -6645,266 +6639,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
NULL, NULL, 1);
}
-/*
- * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
- * is already initialized by the block layer.
- */
-static inline void btrfs_bio_init(struct btrfs_bio *bbio,
- btrfs_bio_end_io_t end_io, void *private)
-{
- memset(bbio, 0, offsetof(struct btrfs_bio, bio));
- bbio->end_io = end_io;
- bbio->private = private;
-}
-
-/*
- * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
- * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
- *
- * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
- * a mempool.
- */
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
- btrfs_bio_end_io_t end_io, void *private)
-{
- struct bio *bio;
-
- bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
- btrfs_bio_init(btrfs_bio(bio), end_io, private);
- return bio;
-}
-
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
- btrfs_bio_end_io_t end_io, void *private)
-{
- struct bio *bio;
- struct btrfs_bio *bbio;
-
- ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
-
- bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
- bbio = btrfs_bio(bio);
- btrfs_bio_init(bbio, end_io, private);
-
- bio_trim(bio, offset >> 9, size >> 9);
- bbio->iter = bio->bi_iter;
- return bio;
-}
-
-static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
-{
- if (!dev || !dev->bdev)
- return;
- if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
- return;
-
- if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
- if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
-}
-
-static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
- struct bio *bio)
-{
- if (bio->bi_opf & REQ_META)
- return fs_info->endio_meta_workers;
- return fs_info->endio_workers;
-}
-
-static void btrfs_end_bio_work(struct work_struct *work)
-{
- struct btrfs_bio *bbio =
- container_of(work, struct btrfs_bio, end_io_work);
-
- bbio->end_io(bbio);
-}
-
-static void btrfs_simple_end_io(struct bio *bio)
-{
- struct btrfs_fs_info *fs_info = bio->bi_private;
- struct btrfs_bio *bbio = btrfs_bio(bio);
-
- btrfs_bio_counter_dec(fs_info);
-
- if (bio->bi_status)
- btrfs_log_dev_io_error(bio, bbio->device);
-
- if (bio_op(bio) == REQ_OP_READ) {
- INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
- queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
- } else {
- bbio->end_io(bbio);
- }
-}
-
-static void btrfs_raid56_end_io(struct bio *bio)
-{
- struct btrfs_io_context *bioc = bio->bi_private;
- struct btrfs_bio *bbio = btrfs_bio(bio);
-
- btrfs_bio_counter_dec(bioc->fs_info);
- bbio->mirror_num = bioc->mirror_num;
- bbio->end_io(bbio);
-
- btrfs_put_bioc(bioc);
-}
-
-static void btrfs_orig_write_end_io(struct bio *bio)
-{
- struct btrfs_io_stripe *stripe = bio->bi_private;
- struct btrfs_io_context *bioc = stripe->bioc;
- struct btrfs_bio *bbio = btrfs_bio(bio);
-
- btrfs_bio_counter_dec(bioc->fs_info);
-
- if (bio->bi_status) {
- atomic_inc(&bioc->error);
- btrfs_log_dev_io_error(bio, stripe->dev);
- }
-
- /*
- * Only send an error to the higher layers if it is beyond the tolerance
- * threshold.
- */
- if (atomic_read(&bioc->error) > bioc->max_errors)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_status = BLK_STS_OK;
-
- bbio->end_io(bbio);
- btrfs_put_bioc(bioc);
-}
-
-static void btrfs_clone_write_end_io(struct bio *bio)
-{
- struct btrfs_io_stripe *stripe = bio->bi_private;
-
- if (bio->bi_status) {
- atomic_inc(&stripe->bioc->error);
- btrfs_log_dev_io_error(bio, stripe->dev);
- }
-
- /* Pass on control to the original bio this one was cloned from */
- bio_endio(stripe->bioc->orig_bio);
- bio_put(bio);
-}
-
-static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
-{
- if (!dev || !dev->bdev ||
- test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
- (btrfs_op(bio) == BTRFS_MAP_WRITE &&
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bio_io_error(bio);
- return;
- }
-
- bio_set_dev(bio, dev->bdev);
-
- /*
- * For zone append writing, bi_sector must point the beginning of the
- * zone
- */
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
-
- if (btrfs_dev_is_sequential(dev, physical)) {
- u64 zone_start = round_down(physical,
- dev->fs_info->zone_size);
-
- bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
- } else {
- bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
- bio->bi_opf |= REQ_OP_WRITE;
- }
- }
- btrfs_debug_in_rcu(dev->fs_info,
- "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
- (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
- dev->devid, bio->bi_iter.bi_size);
-
- btrfsic_check_bio(bio);
- submit_bio(bio);
-}
-
-static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
-{
- struct bio *orig_bio = bioc->orig_bio, *bio;
-
- ASSERT(bio_op(orig_bio) != REQ_OP_READ);
-
- /* Reuse the bio embedded into the btrfs_bio for the last mirror */
- if (dev_nr == bioc->num_stripes - 1) {
- bio = orig_bio;
- bio->bi_end_io = btrfs_orig_write_end_io;
- } else {
- bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
- bio_inc_remaining(orig_bio);
- bio->bi_end_io = btrfs_clone_write_end_io;
- }
-
- bio->bi_private = &bioc->stripes[dev_nr];
- bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
- bioc->stripes[dev_nr].bioc = bioc;
- btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
-}
-
-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
-{
- u64 logical = bio->bi_iter.bi_sector << 9;
- u64 length = bio->bi_iter.bi_size;
- u64 map_length = length;
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_io_stripe smap;
- int ret;
-
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num, 1);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
- return;
- }
-
- if (map_length < length) {
- btrfs_crit(fs_info,
- "mapping failed logical %llu bio len %llu len %llu",
- logical, length, map_length);
- BUG();
- }
-
- if (!bioc) {
- /* Single mirror read/write fast path */
- btrfs_bio(bio)->mirror_num = mirror_num;
- btrfs_bio(bio)->device = smap.dev;
- bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- bio->bi_private = fs_info;
- bio->bi_end_io = btrfs_simple_end_io;
- btrfs_submit_dev_bio(smap.dev, bio);
- } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- /* Parity RAID write or read recovery */
- bio->bi_private = bioc;
- bio->bi_end_io = btrfs_raid56_end_io;
- if (bio_op(bio) == REQ_OP_READ)
- raid56_parity_recover(bio, bioc, mirror_num);
- else
- raid56_parity_write(bio, bioc);
- } else {
- /* Write to multiple mirrors */
- int total_devs = bioc->num_stripes;
- int dev_nr;
-
- bioc->orig_bio = bio;
- for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
- btrfs_submit_mirrored_bio(bioc, dev_nr);
- }
-}
-
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
const struct btrfs_fs_devices *fs_devices)
{
@@ -6918,18 +6652,18 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
const struct btrfs_device *device)
{
- ASSERT((args->devid != (u64)-1) || args->missing);
+ if (args->missing) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+ }
- if ((args->devid != (u64)-1) && device->devid != args->devid)
+ if (device->devid != args->devid)
return false;
if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
return false;
- if (!args->missing)
- return true;
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
- !device->bdev)
- return true;
- return false;
+ return true;
}
/*
@@ -6976,8 +6710,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
* always do NOFS because we use it in a lot of other GFP_KERNEL safe
* places.
*/
+
nofs_flag = memalloc_nofs_save();
- device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
@@ -6992,22 +6727,24 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
return device;
}
-/**
- * btrfs_alloc_device - allocate struct btrfs_device
+/*
+ * Allocate new device struct, set up devid and UUID.
+ *
* @fs_info: used only for generating a new devid, can be NULL if
* devid is provided (i.e. @devid != NULL).
* @devid: a pointer to devid for this device. If NULL a new devid
* is generated.
* @uuid: a pointer to UUID for this device. If NULL a new UUID
* is generated.
+ * @path: a pointer to device path if available, NULL otherwise.
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
* on error. Returned struct is not linked onto any lists and must be
* destroyed with btrfs_free_device.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
- const u64 *devid,
- const u8 *uuid)
+ const u64 *devid, const u8 *uuid,
+ const char *path)
{
struct btrfs_device *dev;
u64 tmp;
@@ -7025,8 +6762,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
- extent_io_tree_init(fs_info, &dev->alloc_state,
- IO_TREE_DEVICE_ALLOC_STATE, NULL);
+ extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
if (devid)
tmp = *devid;
@@ -7046,6 +6782,17 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
+ if (path) {
+ struct rcu_string *name;
+
+ name = rcu_string_strdup(path, GFP_KERNEL);
+ if (!name) {
+ btrfs_free_device(dev);
+ return ERR_PTR(-ENOMEM);
+ }
+ rcu_assign_pointer(dev->name, name);
+ }
+
return dev;
}
@@ -7229,8 +6976,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->stripes[i].dev = handle_missing_device(fs_info,
devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
+ ret = PTR_ERR(map->stripes[i].dev);
free_extent_map(em);
- return PTR_ERR(map->stripes[i].dev);
+ return ret;
}
}
@@ -7744,10 +7492,11 @@ error:
return ret;
}
-void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
+ int ret = 0;
fs_devices->fs_info = fs_info;
@@ -7756,12 +7505,18 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
device->fs_info = fs_info;
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
- list_for_each_entry(device, &seed_devs->devices, dev_list)
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
device->fs_info = fs_info;
+ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ break;
+ }
seed_devs->fs_info = fs_info;
}
mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
@@ -7885,7 +7640,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
if (ret < 0) {
btrfs_warn_in_rcu(fs_info,
"error %d while searching for dev_stats item for device %s",
- ret, rcu_str_deref(device->name));
+ ret, btrfs_dev_name(device));
goto out;
}
@@ -7896,7 +7651,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
if (ret != 0) {
btrfs_warn_in_rcu(fs_info,
"delete too small dev_stats item for device %s failed %d",
- rcu_str_deref(device->name), ret);
+ btrfs_dev_name(device), ret);
goto out;
}
ret = 1;
@@ -7910,7 +7665,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
if (ret < 0) {
btrfs_warn_in_rcu(fs_info,
"insert dev_stats item for device %s failed %d",
- rcu_str_deref(device->name), ret);
+ btrfs_dev_name(device), ret);
goto out;
}
}
@@ -7975,7 +7730,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
return;
btrfs_err_rl_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
- rcu_str_deref(dev->name),
+ btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
@@ -7995,7 +7750,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
btrfs_info_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
- rcu_str_deref(dev->name),
+ btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
@@ -8414,17 +8169,3 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
return true;
}
-
-int __init btrfs_bioset_init(void)
-{
- if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_bio, bio),
- BIOSET_NEED_BVECS))
- return -ENOMEM;
- return 0;
-}
-
-void __cold btrfs_bioset_exit(void)
-{
- bioset_exit(&btrfs_bioset);
-}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f8b668dc8bf8..6b7a05f6cf82 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -6,10 +6,12 @@
#ifndef BTRFS_VOLUMES_H
#define BTRFS_VOLUMES_H
-#include <linux/bio.h>
#include <linux/sort.h>
#include <linux/btrfs.h>
#include "async-thread.h"
+#include "messages.h"
+#include "tree-checker.h"
+#include "rcu-string.h"
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
@@ -354,6 +356,8 @@ struct btrfs_fs_devices {
* nonrot flag set
*/
bool rotating;
+ /* Devices support TRIM/discard commands */
+ bool discardable;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -368,8 +372,6 @@ struct btrfs_fs_devices {
enum btrfs_read_policy read_policy;
};
-#define BTRFS_BIO_INLINE_CSUM_SIZE 64
-
#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
- sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
@@ -379,89 +381,6 @@ struct btrfs_fs_devices {
- 2 * sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
-/*
- * Maximum number of sectors for a single bio to limit the size of the
- * checksum array. This matches the number of bio_vecs per bio and thus the
- * I/O size for buffered I/O.
- */
-#define BTRFS_MAX_BIO_SECTORS (256)
-
-typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
-
-/*
- * Additional info to pass along bio.
- *
- * Mostly for btrfs specific features like csum and mirror_num.
- */
-struct btrfs_bio {
- unsigned int mirror_num;
- struct bvec_iter iter;
-
- /* for direct I/O */
- u64 file_offset;
-
- /* @device is for stripe IO submission. */
- struct btrfs_device *device;
- u8 *csum;
- u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
-
- /* End I/O information supplied to btrfs_bio_alloc */
- btrfs_bio_end_io_t end_io;
- void *private;
-
- /* For read end I/O handling */
- struct work_struct end_io_work;
-
- /*
- * This member must come last, bio_alloc_bioset will allocate enough
- * bytes for entire btrfs_bio but relies on bio being last.
- */
- struct bio bio;
-};
-
-static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
-{
- return container_of(bio, struct btrfs_bio, bio);
-}
-
-int __init btrfs_bioset_init(void);
-void __cold btrfs_bioset_exit(void);
-
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
- btrfs_bio_end_io_t end_io, void *private);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
- btrfs_bio_end_io_t end_io, void *private);
-
-static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
-{
- bbio->bio.bi_status = status;
- bbio->end_io(bbio);
-}
-
-static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
-{
- if (bbio->csum != bbio->csum_inline) {
- kfree(bbio->csum);
- bbio->csum = NULL;
- }
-}
-
-/*
- * Iterate through a btrfs_bio (@bbio) on a per-sector basis.
- *
- * bvl - struct bio_vec
- * bbio - struct btrfs_bio
- * iters - struct bvec_iter
- * bio_offset - unsigned int
- */
-#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
- for ((iter) = (bbio)->iter, (bio_offset) = 0; \
- (iter).bi_size && \
- (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
- (bio_offset) += fs_info->sectorsize, \
- bio_advance_iter_single(&(bbio)->bio, &(iter), \
- (fs_info)->sectorsize))
-
struct btrfs_io_stripe {
struct btrfs_device *dev;
union {
@@ -603,6 +522,13 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
}
}
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+ ASSERT(num_stripes);
+ return sizeof(struct btrfs_chunk) +
+ sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
void btrfs_get_bioc(struct btrfs_io_context *bioc);
void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
@@ -611,6 +537,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret);
+int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap, int *mirror_num_ret,
+ int need_raid_map);
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
u32 *num_stripes);
@@ -622,7 +553,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
@@ -639,8 +569,8 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
const char *path);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
- const u64 *devid,
- const u8 *uuid);
+ const u64 *devid, const u8 *uuid,
+ const char *path);
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
@@ -671,7 +601,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats);
-void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
@@ -741,6 +671,14 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
atomic_inc(&dev->dev_stats_ccnt);
}
+static inline const char *btrfs_dev_name(const struct btrfs_device *device)
+{
+ if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ return "<missing disk>";
+ else
+ return rcu_str_deref(device->name);
+}
+
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 5bb8d8c86311..0ed4b119a7ca 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -13,12 +13,16 @@
#include <linux/iversion.h>
#include <linux/sched/mm.h>
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "xattr.h"
#include "disk-io.h"
#include "props.h"
#include "locking.h"
+#include "accessors.h"
+#include "dir-item.h"
int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size)
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b4f44662cda7..01a13de11832 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -155,8 +155,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
data_in = kmap_local_page(in_page);
- memcpy(workspace->buf + i * PAGE_SIZE,
- data_in, PAGE_SIZE);
+ copy_page(workspace->buf + i * PAGE_SIZE,
+ data_in);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
@@ -355,7 +355,7 @@ done:
return ret;
}
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen)
{
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index e2d073b08a7d..a759668477bb 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -15,6 +15,8 @@
#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"
+#include "fs.h"
+#include "accessors.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -134,7 +136,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
super[i] = page_address(page[i]);
}
- if (super[0]->generation > super[1]->generation)
+ if (btrfs_super_generation(super[0]) >
+ btrfs_super_generation(super[1]))
sector = zones[1].start;
else
sector = zones[0].start;
@@ -391,8 +394,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_sectors = bdev_zone_sectors(bdev);
}
- /* Check if it's power of 2 (see is_power_of_2) */
- ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
+ ASSERT(is_power_of_two_u64(zone_sectors));
zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
/* We reject devices with a zone size larger than 8GB */
@@ -466,7 +468,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
goto out;
}
- zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
+ zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
if (!zones) {
ret = -ENOMEM;
goto out;
@@ -585,7 +587,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
}
- kfree(zones);
+ kvfree(zones);
switch (bdev_zoned_model(bdev)) {
case BLK_ZONED_HM:
@@ -617,7 +619,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
return 0;
out:
- kfree(zones);
+ kvfree(zones);
out_free_zone_info:
btrfs_destroy_dev_zone_info(device);
@@ -639,6 +641,46 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
device->zone_info = NULL;
}
+struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
+{
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
+ if (!zone_info)
+ return NULL;
+
+ zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->seq_zones)
+ goto out;
+
+ bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
+ zone_info->nr_zones);
+
+ zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->empty_zones)
+ goto out;
+
+ bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
+ zone_info->nr_zones);
+
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones)
+ goto out;
+
+ bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
+ zone_info->nr_zones);
+ zone_info->zone_cache = NULL;
+
+ return zone_info;
+
+out:
+ bitmap_free(zone_info->seq_zones);
+ bitmap_free(zone_info->empty_zones);
+ bitmap_free(zone_info->active_zones);
+ kfree(zone_info);
+ return NULL;
+}
+
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
{
@@ -977,8 +1019,8 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}
-/**
- * btrfs_find_allocatable_zones - find allocatable zones within a given region
+/*
+ * Find allocatable zones within a given region.
*
* @device: the device to allocate a region on
* @hole_start: the position of the hole to allocate the region
@@ -1395,7 +1437,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
if (num_sequential > 0)
- cache->seq_zone = true;
+ set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
if (num_conventional > 0) {
/* Zone capacity is always zone size in emulation */
@@ -1607,7 +1649,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
if (!cache)
return false;
- ret = cache->seq_zone;
+ ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
btrfs_put_block_group(cache);
return ret;
@@ -1821,7 +1863,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
return device;
}
-/**
+/*
* Activate block group and underlying device zones
*
* @block_group: the block group to activate
@@ -2112,7 +2154,8 @@ static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb)
{
- if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+ if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
+ eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
return;
if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index e17462db3a84..f43990985d80 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/blkdev.h>
+#include "messages.h"
#include "volumes.h"
#include "disk-io.h"
#include "block-group.h"
@@ -36,6 +37,7 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
+struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
@@ -103,6 +105,16 @@ static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { }
+/*
+ * In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call
+ * into btrfs_clone_dev_zone_info() so it's safe to return NULL here.
+ */
+static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(
+ struct btrfs_device *orig_dev)
+{
+ return NULL;
+}
+
static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
{
if (!btrfs_is_zoned(fs_info))
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 35a0224d4eb7..e34f1ab99d56 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -94,7 +94,7 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
void zstd_free_workspace(struct list_head *ws);
struct list_head *zstd_alloc_workspace(unsigned int level);
-/**
+/*
* Timer callback to free unused workspaces.
*
* @t: timer
@@ -616,7 +616,7 @@ done:
return ret;
}
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen)
{