diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 05:47:51 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 05:47:51 +0100 |
commit | 149c51f876322d9bfbd5e2d6ffae7aff3d794384 (patch) | |
tree | a61c7dd828356e307fca06fc66dbdbf9b109c18f /fs | |
parent | Merge tag 'dlm-6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/teigland... (diff) | |
parent | btrfs: print transaction aborted messages with an error level (diff) | |
download | linux-149c51f876322d9bfbd5e2d6ffae7aff3d794384.tar.xz linux-149c51f876322d9bfbd5e2d6ffae7aff3d794384.zip |
Merge tag 'for-6.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"This round there are a lot of cleanups and moved code so the diffstat
looks huge, otherwise there are some nice performance improvements and
an update to raid56 reliability.
User visible features:
- raid56 reliability vs performance trade off:
- fix destructive RMW for raid5 data (raid6 still needs work): do
full checksum verification for all data during RMW cycle, this
should prevent rewriting potentially corrupted data without
notice
- stripes are cached in memory which should reduce the performance
impact but still can hurt some workloads
- checksums are verified after repair again
- this is the last option without introducing additional features
(write intent bitmap, journal, another tree), the extra checksum
read/verification was supposed to be avoided by the original
implementation exactly for performance reasons but that caused
all the reliability problems
- discard=async by default for devices that support it
- implement emergency flush reserve to avoid almost all unnecessary
transaction aborts due to ENOSPC in cases where there are too many
delayed refs or delayed allocation
- skip block group synchronization if there's no change in used
bytes, can reduce transaction commit count for some workloads
Performance improvements:
- fiemap and lseek:
- overall speedup due to skipping unnecessary or duplicate
searches (-40% run time)
- cache some data structures and sharedness of extents (-30% run
time)
- send:
- faster backref resolution when finding clones
- cached leaf to root mapping for faster backref walking
- improved clone/sharing detection
- overall run time improvements (-70%)
Core:
- module initialization converted to a table of function pointers run
in a sequence
- preparation for fscrypt, extend passing file names across calls,
dir item can store encryption status
- raid56 updates:
- more accurate error tracking of sectors within stripe
- simplify recovery path and remove dedicated endio worker kthread
- simplify scrub call paths
- refactoring to support the extra data checksum verification
during RMW cycle
- tree block parentness checks consolidated and done at metadata read
time
- improved error handling
- cleanups:
- move a lot of code for better synchronization between kernel and
user space sources, split big files
- enum cleanups
- GFP flag cleanups
- header file cleanups, prototypes, dependencies
- redundant parameter cleanups
- inline extent handling simplifications
- inode parameter conversion
- data structure cleanups, reductions, renames, merges"
* tag 'for-6.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (249 commits)
btrfs: print transaction aborted messages with an error level
btrfs: sync some cleanups from progs into uapi/btrfs.h
btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range
btrfs: fix extent map use-after-free when handling missing device in read_one_chunk
btrfs: remove outdated logic from overwrite_item() and add assertion
btrfs: unify overwrite_item() and do_overwrite_item()
btrfs: replace strncpy() with strscpy()
btrfs: fix uninitialized variable in find_first_clear_extent_bit
btrfs: fix uninitialized parent in insert_state
btrfs: add might_sleep() annotations
btrfs: add stack helpers for a few btrfs items
btrfs: add nr_global_roots to the super block definition
btrfs: remove BTRFS_LEAF_DATA_OFFSET
btrfs: add helpers for manipulating leaf items and data
btrfs: add eb to btrfs_node_key_ptr_offset
btrfs: pass the extent buffer for the btrfs_item_nr helpers
btrfs: move the csum helpers into ctree.h
btrfs: move eb offset helpers into extent_io.h
btrfs: move file_extent_item helpers into file-item.h
btrfs: move leaf_data_end into ctree.c
...
Diffstat (limited to 'fs')
118 files changed, 10924 insertions, 9422 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index fa9ddcc9eb0b..555c962fdad6 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -23,15 +23,15 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o disk-io.o \ - transaction.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + transaction.o inode.o file.o defrag.o \ + extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/accessors.c index 12455b2b41de..206cf1612c1d 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/accessors.c @@ -4,8 +4,9 @@ */ #include <asm/unaligned.h> - +#include "messages.h" #include "ctree.h" +#include "accessors.h" static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, unsigned off, int size) @@ -23,6 +24,13 @@ static bool check_setget_bounds(const struct extent_buffer *eb, return true; } +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) +{ + token->eb = eb; + token->kaddr = page_address(eb->pages[0]); + token->offset = 0; +} + /* * Macro templates that define helpers to read/write extent buffer data of a * given size, that are also used via ctree.h for access to item members by @@ -160,7 +168,7 @@ DEFINE_BTRFS_SETGET_BITS(64) void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - unsigned long ptr = btrfs_node_key_ptr_offset(nr); + unsigned long ptr = btrfs_node_key_ptr_offset(eb, nr); read_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h new file mode 100644 index 000000000000..ceadfc5d6c66 --- /dev/null +++ b/fs/btrfs/accessors.h @@ -0,0 +1,1073 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACCESSORS_H +#define BTRFS_ACCESSORS_H + +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); + +/* + * Some macros to generate set/get functions for the struct fields. This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple one + * for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + +#define read_eb_member(eb, ptr, type, member, result) (\ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) (\ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define DECLARE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off); \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off, \ + u##bits val); \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off); \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val); + +DECLARE_BTRFS_SETGET_BITS(8) +DECLARE_BTRFS_SETGET_BITS(16) +DECLARE_BTRFS_SETGET_BITS(32) +DECLARE_BTRFS_SETGET_BITS(64) + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ + u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ +} \ +static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(token, s, offsetof(type, member));\ +} \ +static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ + type *s, u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ +} + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ +{ \ + const type *p = page_address(eb->pages[0]) + \ + offset_in_page(eb->start); \ + return get_unaligned_le##bits(&p->member); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ + put_unaligned_le##bits(val, &p->member); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const type *s) \ +{ \ + return get_unaligned_le##bits(&s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + put_unaligned_le##bits(val, &s->member); \ +} + +static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s) +{ + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); +} +static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) +{ + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr) +{ + unsigned long offset = (unsigned long)c; + + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + +static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_free_space_info */ +BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, + extent_count, 32); +BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length, struct btrfs_dev_extent, length, 64); + +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + return 0; +} + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, + generation, 64); + +static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + + ptr = btrfs_node_key_ptr_offset(eb, nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(eb, nr); +} + +#define BTRFS_ITEM_SETGET_FUNCS(member) \ +static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \ +{ \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot)); \ +} \ +static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ + int slot, u32 val) \ +{ \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ +} \ +static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ + int slot) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ + return btrfs_token_raw_item_##member(token, item); \ +} \ +static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ + int slot, u32 val) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ + btrfs_set_token_raw_item_##member(token, item, val); \ +} + +BTRFS_ITEM_SETGET_FUNCS(offset) +BTRFS_ITEM_SETGET_FUNCS(size); + +static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); +} + +static inline void btrfs_item_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* struct btrfs_root_ref */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); + +static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb, + const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_dir_flags(eb, item)); +} + +static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_stack_dir_flags(item)); +} + +static inline void btrfs_dir_item_key(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(const struct extent_buffer *eb, + const struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +#ifdef __LITTLE_ENDIAN + +/* + * Optimized helpers for little-endian architectures where CPU and on-disk + * structures have the same endianness and we can skip conversions. + */ + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, + const struct btrfs_disk_key *disk_key) +{ + memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, + const struct btrfs_key *cpu_key) +{ + memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_node_key(eb, disk_key, nr); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_item_key(eb, disk_key, nr); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *cpu_key) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_dir_item_key(eb, item, disk_key); +} + +#else + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + const struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + const struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +#endif + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); +BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); + +static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, flags | flag); +} + +static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, flags & ~flag); +} + +static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev) +{ + u64 flags = btrfs_header_flags(eb); + + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + +static inline int btrfs_is_leaf(const struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, + generation_v2, 64); +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); + +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static inline void btrfs_cpu_balance_args_to_disk( + struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + +/* struct btrfs_super_block */ +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_ro_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); +BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, + uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, + nr_global_roots, 64); + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, + type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, + struct btrfs_file_extent_item, disk_bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, + struct btrfs_file_extent_item, offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, + struct btrfs_file_extent_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, + struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, + struct btrfs_file_extent_item, ram_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); + + +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, + rescan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags, + struct btrfs_qgroup_limit_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer, + struct btrfs_qgroup_limit_item, max_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl, + struct btrfs_qgroup_limit_item, max_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer, + struct btrfs_qgroup_limit_item, rsv_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl, + struct btrfs_qgroup_limit_item, rsv_excl, 64); + +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + +/* btrfs_verity_descriptor_item */ +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + +/* Cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) + +#endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1e47b3ec3989..3da1779e8b79 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -11,10 +11,10 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/slab.h> - #include "ctree.h" #include "btrfs_inode.h" #include "xattr.h" +#include "acl.h" struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) { diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h new file mode 100644 index 000000000000..39bd36e6eeb7 --- /dev/null +++ b/fs/btrfs/acl.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACL_H +#define BTRFS_ACL_H + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); +int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); + +#else + +#define btrfs_get_acl NULL +#define btrfs_set_acl NULL +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) +{ + return -EOPNOTSUPP; +} + +#endif + +#endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 18374a6d05bd..21c92c74bf71 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -15,49 +15,76 @@ #include "locking.h" #include "misc.h" #include "tree-mod-log.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "relocation.h" +#include "tree-checker.h" -/* Just an arbitrary number so we can be sure this happened */ -#define BACKREF_FOUND_SHARED 6 +/* Just arbitrary numbers so we can be sure one of these happened. */ +#define BACKREF_FOUND_SHARED 6 +#define BACKREF_FOUND_NOT_SHARED 7 struct extent_inode_elem { u64 inum; u64 offset; + u64 num_bytes; struct extent_inode_elem *next; }; -static int check_extent_in_eb(const struct btrfs_key *key, +static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, - u64 extent_item_pos, - struct extent_inode_elem **eie, - bool ignore_offset) + struct extent_inode_elem **eie) { - u64 offset = 0; + const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); + u64 offset = key->offset; struct extent_inode_elem *e; + const u64 *root_ids; + int root_count; + bool cached; - if (!ignore_offset && - !btrfs_file_extent_compression(eb, fi) && + if (!btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; - u64 data_len; data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) + if (ctx->extent_item_pos < data_offset || + ctx->extent_item_pos >= data_offset + data_len) return 1; - offset = extent_item_pos - data_offset; + offset += ctx->extent_item_pos - data_offset; } + if (!ctx->indirect_ref_iterator || !ctx->cache_lookup) + goto add_inode_elem; + + cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids, + &root_count); + if (!cached) + goto add_inode_elem; + + for (int i = 0; i < root_count; i++) { + int ret; + + ret = ctx->indirect_ref_iterator(key->objectid, offset, + data_len, root_ids[i], + ctx->user_ctx); + if (ret) + return ret; + } + +add_inode_elem: e = kmalloc(sizeof(*e), GFP_NOFS); if (!e) return -ENOMEM; e->next = *eie; e->inum = key->objectid; - e->offset = key->offset + offset; + e->offset = offset; + e->num_bytes = data_len; *eie = e; return 0; @@ -73,10 +100,9 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) } } -static int find_extent_in_eb(const struct extent_buffer *eb, - u64 wanted_disk_byte, u64 extent_item_pos, - struct extent_inode_elem **eie, - bool ignore_offset) +static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct extent_buffer *eb, + struct extent_inode_elem **eie) { u64 disk_byte; struct btrfs_key key; @@ -102,11 +128,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb, continue; /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != wanted_disk_byte) + if (disk_byte != ctx->bytenr) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset); - if (ret < 0) + ret = check_extent_in_eb(ctx, &key, eb, fi, eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) return ret; } @@ -135,9 +161,29 @@ struct preftrees { * - decremented when a ref->count transitions to <1 */ struct share_check { - u64 root_objectid; + struct btrfs_backref_share_check_ctx *ctx; + struct btrfs_root *root; u64 inum; + u64 data_bytenr; + u64 data_extent_gen; + /* + * Counts number of inodes that refer to an extent (different inodes in + * the same root or different roots) that we could find. The sharedness + * check typically stops once this counter gets greater than 1, so it + * may not reflect the total number of inodes. + */ int share_count; + /* + * The number of times we found our inode refers to the data extent we + * are determining the sharedness. In other words, how many file extent + * items we could find for our inode that point to our target data + * extent. The value we get here after finishing the extent sharedness + * check may be smaller than reality, but if it ends up being greater + * than 1, then we know for sure the inode has multiple file extent + * items that point to our inode, and we can safely assume it's useful + * to cache the sharedness check result. + */ + int self_ref_count; bool have_delayed_delete_refs; }; @@ -207,7 +253,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1, } static void update_share_count(struct share_check *sc, int oldcount, - int newcount) + int newcount, struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; @@ -216,6 +262,11 @@ static void update_share_count(struct share_check *sc, int oldcount, sc->share_count--; else if (oldcount < 1 && newcount > 0) sc->share_count++; + + if (newref->root_id == sc->root->root_key.objectid && + newref->wanted_disk_byte == sc->data_bytenr && + newref->key_for_search.objectid == sc->inum) + sc->self_ref_count += newref->count; } /* @@ -266,14 +317,14 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, * BTRFS_[ADD|DROP]_DELAYED_REF actions. */ update_share_count(sc, ref->count, - ref->count + newref->count); + ref->count + newref->count, newref); ref->count += newref->count; free_pref(newref); return; } } - update_share_count(sc, 0, newref->count); + update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); @@ -416,11 +467,11 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) return 0; } -static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, +static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct preftrees *preftrees, struct prelim_ref *ref, - int level, u64 time_seq, const u64 *extent_item_pos, - bool ignore_offset) + int level) { int ret = 0; int slot; @@ -456,10 +507,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(eb) || is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb)) { - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else - ret = btrfs_next_old_leaf(root, path, time_seq); + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); } while (!ret && count < ref->count) { @@ -480,10 +531,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (slot == 0 && (is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb))) { - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else - ret = btrfs_next_old_leaf(root, path, time_seq); + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); continue; } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); @@ -497,11 +548,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, count++; else goto next; - if (extent_item_pos) { - ret = check_extent_in_eb(&key, eb, fi, - *extent_item_pos, - &eie, ignore_offset); - if (ret < 0) + if (!ctx->ignore_extent_item_pos) { + ret = check_extent_in_eb(ctx, &key, eb, fi, &eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) break; } if (ret > 0) @@ -510,7 +560,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; - if (!ret && extent_item_pos) { + if (!ret && !ctx->ignore_extent_item_pos) { while (old->next) old = old->next; old->next = eie; @@ -518,16 +568,17 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_item(root, path); else - ret = btrfs_next_old_item(root, path, time_seq); + ret = btrfs_next_old_item(root, path, ctx->time_seq); } - if (ret > 0) - ret = 0; - else if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); + else if (ret > 0) + ret = 0; + return ret; } @@ -535,11 +586,10 @@ next: * resolve an indirect backref in the form (root_id, key, level) * to a logical address */ -static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, +static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, struct preftrees *preftrees, - struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, bool ignore_offset) + struct prelim_ref *ref, struct ulist *parents) { struct btrfs_root *root; struct extent_buffer *eb; @@ -557,9 +607,9 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, * here. */ if (path->search_commit_root) - root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id); + root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id); else - root = btrfs_get_fs_root(fs_info, ref->root_id, false); + root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; @@ -571,17 +621,17 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (btrfs_is_testing(fs_info)) { + if (btrfs_is_testing(ctx->fs_info)) { ret = -ENOENT; goto out; } if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == BTRFS_SEQ_LAST) + else if (ctx->time_seq == BTRFS_SEQ_LAST) root_level = btrfs_header_level(root->node); else - root_level = btrfs_old_root_level(root, time_seq); + root_level = btrfs_old_root_level(root, ctx->time_seq); if (root_level + 1 == level) goto out; @@ -609,12 +659,12 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, search_key.offset >= LLONG_MAX) search_key.offset = 0; path->lowest_level = level; - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else - ret = btrfs_search_old_slot(root, &search_key, path, time_seq); + ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq); - btrfs_debug(fs_info, + btrfs_debug(ctx->fs_info, "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", ref->root_id, level, ref->count, ret, ref->key_for_search.objectid, ref->key_for_search.type, @@ -632,8 +682,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, preftrees, ref, level, - time_seq, extent_item_pos, ignore_offset); + ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level); out: btrfs_put_root(root); out_free: @@ -678,11 +727,10 @@ static void free_leaf_list(struct ulist *ulist) * rbtree as they are encountered. The new backrefs are subsequently * resolved as above. */ -static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, +static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, struct preftrees *preftrees, - const u64 *extent_item_pos, - struct share_check *sc, bool ignore_offset) + struct share_check *sc) { int err; int ret = 0; @@ -719,21 +767,18 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; } - if (sc && sc->root_objectid && - ref->root_id != sc->root_objectid) { + if (sc && ref->root_id != sc->root->root_key.objectid) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, - ref, parents, extent_item_pos, - ignore_offset); + err = resolve_indirect_ref(ctx, path, preftrees, ref, parents); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. */ if (err == -ENOENT) { - prelim_ref_insert(fs_info, &preftrees->direct, ref, + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); continue; } else if (err) { @@ -762,7 +807,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; new_ref->inode_list = unode_aux_to_inode_list(node); - prelim_ref_insert(fs_info, &preftrees->direct, + prelim_ref_insert(ctx->fs_info, &preftrees->direct, new_ref, NULL); } @@ -770,7 +815,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, * Now it's a direct ref, put it in the direct tree. We must * do this last because the ref could be merged/freed here. */ - prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); ulist_reinit(parents); cond_resched(); @@ -796,6 +841,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, struct rb_node *node; while ((node = rb_first_cached(&tree->root))) { + struct btrfs_tree_parent_check check = { 0 }; + ref = rb_entry(node, struct prelim_ref, rbnode); rb_erase_cached(node, &tree->root); @@ -803,8 +850,10 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); - eb = read_tree_block(fs_info, ref->wanted_disk_byte, - ref->root_id, 0, ref->level - 1, NULL); + check.level = ref->level - 1; + check.owner_root = ref->root_id; + + eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check); if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); @@ -959,8 +1008,8 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int add_inline_refs(const struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, +static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, int *info_level, struct preftrees *preftrees, struct share_check *sc) { @@ -985,6 +1034,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + + if (ctx->check_extent_item) { + ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx); + if (ret) + return ret; + } + flags = btrfs_extent_flags(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -1020,9 +1076,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = add_direct_ref(fs_info, preftrees, + ret = add_direct_ref(ctx->fs_info, preftrees, *info_level + 1, offset, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; @@ -1031,14 +1087,14 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = add_direct_ref(fs_info, preftrees, 0, offset, - bytenr, count, sc, GFP_NOFS); + ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset, + ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = add_indirect_ref(fs_info, preftrees, offset, + ret = add_indirect_ref(ctx->fs_info, preftrees, offset, NULL, *info_level + 1, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -1052,7 +1108,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum && + if (sc && key.objectid != sc->inum && !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; @@ -1060,10 +1116,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(fs_info, preftrees, root, - &key, 0, bytenr, count, - sc, GFP_NOFS); - + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(ctx->fs_info, preftrees, + root, &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); break; } default: @@ -1082,8 +1140,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int add_keyed_refs(struct btrfs_root *extent_root, - struct btrfs_path *path, u64 bytenr, +static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *extent_root, + struct btrfs_path *path, int info_level, struct preftrees *preftrees, struct share_check *sc) { @@ -1106,7 +1165,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != bytenr) + if (key.objectid != ctx->bytenr) break; if (key.type < BTRFS_TREE_BLOCK_REF_KEY) continue; @@ -1118,7 +1177,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, /* SHARED DIRECT METADATA backref */ ret = add_direct_ref(fs_info, preftrees, info_level + 1, key.offset, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ @@ -1129,14 +1188,14 @@ static int add_keyed_refs(struct btrfs_root *extent_root, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); ret = add_direct_ref(fs_info, preftrees, 0, - key.offset, bytenr, count, + key.offset, ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: /* NORMAL INDIRECT METADATA backref */ ret = add_indirect_ref(fs_info, preftrees, key.offset, - NULL, info_level + 1, bytenr, + NULL, info_level + 1, ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { @@ -1153,16 +1212,20 @@ static int add_keyed_refs(struct btrfs_root *extent_root, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum && + if (sc && key.objectid != sc->inum && !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(fs_info, preftrees, root, - &key, 0, bytenr, count, - sc, GFP_NOFS); + + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); break; } default: @@ -1177,34 +1240,141 @@ static int add_keyed_refs(struct btrfs_root *extent_root, } /* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + + if (!ctx->use_path_cache) + return false; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return false; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + entry = &ctx->path_cache_entries[level]; + + /* Unused cache entry or being used for some other extent buffer. */ + if (entry->bytenr != bytenr) + return false; + + /* + * We cached a false result, but the last snapshot generation of the + * root changed, so we now have a snapshot. Don't trust the result. + */ + if (!entry->is_shared && + entry->gen != btrfs_root_last_snapshot(&root->root_item)) + return false; + + /* + * If we cached a true result and the last generation used for dropping + * a root changed, we can not trust the result, because the dropped root + * could be a snapshot sharing this extent buffer. + */ + if (entry->is_shared && + entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + return false; + + *is_shared = entry->is_shared; + /* + * If the node at this level is shared, than all nodes below are also + * shared. Currently some of the nodes below may be marked as not shared + * because we have just switched from one leaf to another, and switched + * also other nodes above the leaf and below the current level, so mark + * them as shared. + */ + if (*is_shared) { + for (int i = 0; i < level; i++) { + ctx->path_cache_entries[i].is_shared = true; + ctx->path_cache_entries[i].gen = entry->gen; + } + } + + return true; +} + +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + u64 gen; + + if (!ctx->use_path_cache) + return; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + if (is_shared) + gen = btrfs_get_last_root_drop_gen(root->fs_info); + else + gen = btrfs_root_last_snapshot(&root->root_item); + + entry = &ctx->path_cache_entries[level]; + entry->bytenr = bytenr; + entry->is_shared = is_shared; + entry->gen = gen; + + /* + * If we found an extent buffer is shared, set the cache result for all + * extent buffers below it to true. As nodes in the path are COWed, + * their sharedness is moved to their children, and if a leaf is COWed, + * then the sharedness of a data extent becomes direct, the refcount of + * data extent is increased in the extent item at the extent tree. + */ + if (is_shared) { + for (int i = 0; i < level; i++) { + entry = &ctx->path_cache_entries[i]; + entry->is_shared = is_shared; + entry->gen = gen; + } + } +} + +/* * this adds all existing backrefs (inline backrefs, backrefs and delayed * refs) for the given bytenr to the refs list, merges duplicates and resolves * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * - * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and - * behave much like trans == NULL case, the difference only lies in it will not - * commit root. - * The special case is for qgroup to search roots in commit_transaction(). - * - * @sc - if !NULL, then immediately return BACKREF_FOUND_SHARED when a - * shared extent is detected. + * @ctx: Backref walking context object, must be not NULL. + * @sc: If !NULL, then immediately return BACKREF_FOUND_SHARED when a + * shared extent is detected. * * Otherwise this returns 0 for success and <0 for an error. * - * If ignore_offset is set to false, only extent refs whose offsets match - * extent_item_pos are returned. If true, every extent ref is returned - * and extent_item_pos is ignored. - * * FIXME some caching might speed things up */ -static int find_parent_nodes(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist *refs, - struct ulist *roots, const u64 *extent_item_pos, - struct share_check *sc, bool ignore_offset) +static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, + struct share_check *sc) { - struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); + struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr); struct btrfs_key key; struct btrfs_path *path; struct btrfs_delayed_ref_root *delayed_refs = NULL; @@ -1220,9 +1390,13 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, .indirect_missing_keys = PREFTREE_INIT }; - key.objectid = bytenr; + /* Roots ulist is not needed when using a sharedness check context. */ + if (sc) + ASSERT(ctx->roots == NULL); + + key.objectid = ctx->bytenr; key.offset = (u64)-1; - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1230,12 +1404,12 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!trans) { + if (!ctx->trans) { path->search_commit_root = 1; path->skip_locking = 1; } - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; again: @@ -1251,17 +1425,17 @@ again: goto out; } - if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != BTRFS_SEQ_LAST) { + if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) && + ctx->time_seq != BTRFS_SEQ_LAST) { /* * We have a specific time_seq we care about and trans which * means we have the path lock, we need to grab the ref head and * lock it so we have a consistent view of the refs at the given * time. */ - delayed_refs = &trans->transaction->delayed_refs; + delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -1279,7 +1453,7 @@ again: goto again; } spin_unlock(&delayed_refs->lock); - ret = add_delayed_refs(fs_info, head, time_seq, + ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq, &preftrees, sc); mutex_unlock(&head->mutex); if (ret) @@ -1297,30 +1471,96 @@ again: leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid == bytenr && + if (key.objectid == ctx->bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = add_inline_refs(fs_info, path, bytenr, - &info_level, &preftrees, sc); + ret = add_inline_refs(ctx, path, &info_level, + &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(root, path, bytenr, info_level, + ret = add_keyed_refs(ctx, root, path, info_level, &preftrees, sc); if (ret) goto out; } } + /* + * If we have a share context and we reached here, it means the extent + * is not directly shared (no multiple reference items for it), + * otherwise we would have exited earlier with a return value of + * BACKREF_FOUND_SHARED after processing delayed references or while + * processing inline or keyed references from the extent tree. + * The extent may however be indirectly shared through shared subtrees + * as a result from creating snapshots, so we determine below what is + * its parent node, in case we are dealing with a metadata extent, or + * what's the leaf (or leaves), from a fs tree, that has a file extent + * item pointing to it in case we are dealing with a data extent. + */ + ASSERT(extent_is_shared(sc) == 0); + + /* + * If we are here for a data extent and we have a share_check structure + * it means the data extent is not directly shared (does not have + * multiple reference items), so we have to check if a path in the fs + * tree (going from the root node down to the leaf that has the file + * extent item pointing to the data extent) is shared, that is, if any + * of the extent buffers in the path is referenced by other trees. + */ + if (sc && ctx->bytenr == sc->data_bytenr) { + /* + * If our data extent is from a generation more recent than the + * last generation used to snapshot the root, then we know that + * it can not be shared through subtrees, so we can skip + * resolving indirect references, there's no point in + * determining the extent buffers for the path from the fs tree + * root node down to the leaf that has the file extent item that + * points to the data extent. + */ + if (sc->data_extent_gen > + btrfs_root_last_snapshot(&sc->root->root_item)) { + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + + /* + * If we are only determining if a data extent is shared or not + * and the corresponding file extent item is located in the same + * leaf as the previous file extent item, we can skip resolving + * indirect references for a data extent, since the fs tree path + * is the same (same leaf, so same path). We skip as long as the + * cached result for the leaf is valid and only if there's only + * one file extent item pointing to the data extent, because in + * the case of multiple file extent items, they may be located + * in different leaves and therefore we have multiple paths. + */ + if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr && + sc->self_ref_count == 1) { + bool cached; + bool is_shared; + + cached = lookup_backref_shared_cache(sc->ctx, sc->root, + sc->ctx->curr_leaf_bytenr, + 0, &is_shared); + if (cached) { + if (is_shared) + ret = BACKREF_FOUND_SHARED; + else + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + } + } + btrfs_release_path(path); - ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); + ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0); if (ret) goto out; WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); - ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, sc, ignore_offset); + ret = resolve_indirect_refs(ctx, path, &preftrees, sc); if (ret) goto out; @@ -1347,25 +1587,22 @@ again: * e.g. different offsets would not be merged, * and would retain their original ref->count < 0. */ - if (roots && ref->count && ref->root_id && ref->parent == 0) { - if (sc && sc->root_objectid && - ref->root_id != sc->root_objectid) { - ret = BACKREF_FOUND_SHARED; - goto out; - } - + if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ - ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) goto out; } if (ref->count && ref->parent) { - if (extent_item_pos && !ref->inode_list && + if (!ctx->ignore_extent_item_pos && !ref->inode_list && ref->level == 0) { + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref->parent, 0, - 0, ref->level, NULL); + check.level = ref->level; + + eb = read_tree_block(ctx->fs_info, ref->parent, + &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; @@ -1378,12 +1615,12 @@ again: if (!path->skip_locking) btrfs_tree_read_lock(eb); - ret = find_extent_in_eb(eb, bytenr, - *extent_item_pos, &eie, ignore_offset); + ret = find_extent_in_eb(ctx, eb, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); - if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) goto out; ref->inode_list = eie; /* @@ -1393,12 +1630,12 @@ again: */ eie = NULL; } - ret = ulist_add_merge_ptr(refs, ref->parent, + ret = ulist_add_merge_ptr(ctx->refs, ref->parent, ref->inode_list, (void **)&eie, GFP_NOFS); if (ret < 0) goto out; - if (!ret && extent_item_pos) { + if (!ret && !ctx->ignore_extent_item_pos) { /* * We've recorded that parent, so we must extend * its inode list here. @@ -1436,34 +1673,36 @@ out: prelim_release(&preftrees.indirect); prelim_release(&preftrees.indirect_missing_keys); - if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); return ret; } /* - * Finds all leafs with a reference to the specified combination of bytenr and - * offset. key_list_head will point to a list of corresponding keys (caller must - * free each list element). The leafs will be stored in the leafs ulist, which - * must be freed with ulist_free. + * Finds all leaves with a reference to the specified combination of + * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are + * added to the ulist at @ctx->refs, and that ulist is allocated by this + * function. The caller should free the ulist with free_leaf_list() if + * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is + * enough. * - * returns 0 on success, <0 on error + * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated. */ -int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset) +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) { int ret; - *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) + ASSERT(ctx->refs == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) return -ENOMEM; - ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL, ignore_offset); - if (ret < 0 && ret != -ENOENT) { - free_leaf_list(*leafs); + ret = find_parent_nodes(ctx, NULL); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + (ret < 0 && ret != -ENOENT)) { + free_leaf_list(ctx->refs); + ctx->refs = NULL; return ret; } @@ -1471,7 +1710,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, } /* - * walk all backrefs for a given extent to find all roots that reference this + * Walk all backrefs for a given extent to find all roots that reference this * extent. Walking a backref means finding all extents that reference this * extent and in turn walk the backrefs of those, too. Naturally this is a * recursive process, but here it is implemented in an iterative fashion: We @@ -1479,195 +1718,113 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, * list. In turn, we find all referencing extents for those, further appending * to the list. The way we iterate the list allows adding more elements after * the current while iterating. The process stops when we reach the end of the - * list. Found roots are added to the roots list. + * list. + * + * Found roots are added to @ctx->roots, which is allocated by this function if + * it points to NULL, in which case the caller is responsible for freeing it + * after it's not needed anymore. + * This function requires @ctx->refs to be NULL, as it uses it for allocating a + * ulist to do temporary work, and frees it before returning. * - * returns 0 on success, < 0 on error. + * Returns 0 on success, < 0 on error. */ -static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, - bool ignore_offset) +static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { - struct ulist *tmp; - struct ulist_node *node = NULL; + const u64 orig_bytenr = ctx->bytenr; + const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos; + bool roots_ulist_allocated = false; struct ulist_iterator uiter; - int ret; + int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - *roots = ulist_alloc(GFP_NOFS); - if (!*roots) { - ulist_free(tmp); + ASSERT(ctx->refs == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) return -ENOMEM; + + if (!ctx->roots) { + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ulist_free(ctx->refs); + ctx->refs = NULL; + return -ENOMEM; + } + roots_ulist_allocated = true; } + ctx->ignore_extent_item_pos = true; + ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, NULL, NULL, ignore_offset); + struct ulist_node *node; + + ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - ulist_free(tmp); - ulist_free(*roots); - *roots = NULL; - return ret; + if (roots_ulist_allocated) { + ulist_free(ctx->roots); + ctx->roots = NULL; + } + break; } - node = ulist_next(tmp, &uiter); + ret = 0; + node = ulist_next(ctx->refs, &uiter); if (!node) break; - bytenr = node->val; + ctx->bytenr = node->val; cond_resched(); } - ulist_free(tmp); - return 0; + ulist_free(ctx->refs); + ctx->refs = NULL; + ctx->bytenr = orig_bytenr; + ctx->ignore_extent_item_pos = orig_ignore_extent_item_pos; + + return ret; } -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem) { int ret; - if (!trans && !skip_commit_root_sem) - down_read(&fs_info->commit_root_sem); - ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots, false); - if (!trans && !skip_commit_root_sem) - up_read(&fs_info->commit_root_sem); + if (!ctx->trans && !skip_commit_root_sem) + down_read(&ctx->fs_info->commit_root_sem); + ret = btrfs_find_all_roots_safe(ctx); + if (!ctx->trans && !skip_commit_root_sem) + up_read(&ctx->fs_info->commit_root_sem); return ret; } -/* - * The caller has joined a transaction or is holding a read lock on the - * fs_info->commit_root_sem semaphore, so no need to worry about the root's last - * snapshot field changing while updating or checking the cache. - */ -static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache, - struct btrfs_root *root, - u64 bytenr, int level, bool *is_shared) +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void) { - struct btrfs_backref_shared_cache_entry *entry; - - if (!cache->use_cache) - return false; - - if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) - return false; - - /* - * Level -1 is used for the data extent, which is not reliable to cache - * because its reference count can increase or decrease without us - * realizing. We cache results only for extent buffers that lead from - * the root node down to the leaf with the file extent item. - */ - ASSERT(level >= 0); - - entry = &cache->entries[level]; + struct btrfs_backref_share_check_ctx *ctx; - /* Unused cache entry or being used for some other extent buffer. */ - if (entry->bytenr != bytenr) - return false; - - /* - * We cached a false result, but the last snapshot generation of the - * root changed, so we now have a snapshot. Don't trust the result. - */ - if (!entry->is_shared && - entry->gen != btrfs_root_last_snapshot(&root->root_item)) - return false; - - /* - * If we cached a true result and the last generation used for dropping - * a root changed, we can not trust the result, because the dropped root - * could be a snapshot sharing this extent buffer. - */ - if (entry->is_shared && - entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) - return false; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; - *is_shared = entry->is_shared; - /* - * If the node at this level is shared, than all nodes below are also - * shared. Currently some of the nodes below may be marked as not shared - * because we have just switched from one leaf to another, and switched - * also other nodes above the leaf and below the current level, so mark - * them as shared. - */ - if (*is_shared) { - for (int i = 0; i < level; i++) { - cache->entries[i].is_shared = true; - cache->entries[i].gen = entry->gen; - } - } + ulist_init(&ctx->refs); - return true; + return ctx; } -/* - * The caller has joined a transaction or is holding a read lock on the - * fs_info->commit_root_sem semaphore, so no need to worry about the root's last - * snapshot field changing while updating or checking the cache. - */ -static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, - struct btrfs_root *root, - u64 bytenr, int level, bool is_shared) +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx) { - struct btrfs_backref_shared_cache_entry *entry; - u64 gen; - - if (!cache->use_cache) - return; - - if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + if (!ctx) return; - /* - * Level -1 is used for the data extent, which is not reliable to cache - * because its reference count can increase or decrease without us - * realizing. We cache results only for extent buffers that lead from - * the root node down to the leaf with the file extent item. - */ - ASSERT(level >= 0); - - if (is_shared) - gen = btrfs_get_last_root_drop_gen(root->fs_info); - else - gen = btrfs_root_last_snapshot(&root->root_item); - - entry = &cache->entries[level]; - entry->bytenr = bytenr; - entry->is_shared = is_shared; - entry->gen = gen; - - /* - * If we found an extent buffer is shared, set the cache result for all - * extent buffers below it to true. As nodes in the path are COWed, - * their sharedness is moved to their children, and if a leaf is COWed, - * then the sharedness of a data extent becomes direct, the refcount of - * data extent is increased in the extent item at the extent tree. - */ - if (is_shared) { - for (int i = 0; i < level; i++) { - entry = &cache->entries[i]; - entry->is_shared = is_shared; - entry->gen = gen; - } - } + ulist_release(&ctx->refs); + kfree(ctx); } /* * Check if a data extent is shared or not. * - * @root: The root the inode belongs to. - * @inum: Number of the inode whose extent we are checking. + * @inode: The inode whose extent we are checking. * @bytenr: Logical bytenr of the extent we are checking. * @extent_gen: Generation of the extent (file extent item) or 0 if it is * not known. - * @roots: List of roots this extent is shared among. - * @tmp: Temporary list used for iteration. - * @cache: A backref lookup result cache. + * @ctx: A backref sharedness check context. * * btrfs_is_data_extent_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the @@ -1680,11 +1837,12 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ -int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, - struct ulist *roots, struct ulist *tmp, - struct btrfs_backref_shared_cache *cache) + struct btrfs_backref_share_check_ctx *ctx) { + struct btrfs_backref_walk_ctx walk_ctx = { 0 }; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct ulist_iterator uiter; @@ -1692,15 +1850,23 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { - .root_objectid = root->root_key.objectid, - .inum = inum, + .ctx = ctx, + .root = root, + .inum = btrfs_ino(inode), + .data_bytenr = bytenr, + .data_extent_gen = extent_gen, .share_count = 0, + .self_ref_count = 0, .have_delayed_delete_refs = false, }; int level; - ulist_init(roots); - ulist_init(tmp); + for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { + if (ctx->prev_extents_cache[i].bytenr == bytenr) + return ctx->prev_extents_cache[i].is_shared; + } + + ulist_init(&ctx->refs); trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { @@ -1712,40 +1878,36 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, down_read(&fs_info->commit_root_sem); } else { btrfs_get_tree_mod_seq(fs_info, &elem); + walk_ctx.time_seq = elem.seq; } + walk_ctx.ignore_extent_item_pos = true; + walk_ctx.trans = trans; + walk_ctx.fs_info = fs_info; + walk_ctx.refs = &ctx->refs; + /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); - cache->use_cache = true; + ctx->use_path_cache = true; while (1) { bool is_shared; bool cached; - ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, - roots, NULL, &shared, false); - if (ret == BACKREF_FOUND_SHARED) { - /* this is the only condition under which we return 1 */ - ret = 1; + walk_ctx.bytenr = bytenr; + ret = find_parent_nodes(&walk_ctx, &shared); + if (ret == BACKREF_FOUND_SHARED || + ret == BACKREF_FOUND_NOT_SHARED) { + /* If shared must return 1, otherwise return 0. */ + ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0; if (level >= 0) - store_backref_shared_cache(cache, root, bytenr, - level, true); + store_backref_shared_cache(ctx, root, bytenr, + level, ret == 1); break; } if (ret < 0 && ret != -ENOENT) break; ret = 0; - /* - * If our data extent is not shared through reflinks and it was - * created in a generation after the last one used to create a - * snapshot of the inode's root, then it can not be shared - * indirectly through subtrees, as that can only happen with - * snapshots. In this case bail out, no need to check for the - * sharedness of extent buffers. - */ - if (level == -1 && - extent_gen > btrfs_root_last_snapshot(&root->root_item)) - break; /* * If our data extent was not directly shared (without multiple @@ -1762,18 +1924,18 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, * deal with), we can not use it if we have multiple leaves * (which implies multiple paths). */ - if (level == -1 && tmp->nnodes > 1) - cache->use_cache = false; + if (level == -1 && ctx->refs.nnodes > 1) + ctx->use_path_cache = false; if (level >= 0) - store_backref_shared_cache(cache, root, bytenr, + store_backref_shared_cache(ctx, root, bytenr, level, false); - node = ulist_next(tmp, &uiter); + node = ulist_next(&ctx->refs, &uiter); if (!node) break; bytenr = node->val; level++; - cached = lookup_backref_shared_cache(cache, root, bytenr, level, + cached = lookup_backref_shared_cache(ctx, root, bytenr, level, &is_shared); if (cached) { ret = (is_shared ? 1 : 0); @@ -1784,6 +1946,20 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, cond_resched(); } + /* + * Cache the sharedness result for the data extent if we know our inode + * has more than 1 file extent item that refers to the data extent. + */ + if (ret >= 0 && shared.self_ref_count > 1) { + int slot = ctx->prev_extents_cache_slot; + + ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr; + ctx->prev_extents_cache[slot].is_shared = (ret == 1); + + slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; + ctx->prev_extents_cache_slot = slot; + } + if (trans) { btrfs_put_tree_mod_seq(fs_info, &elem); btrfs_end_transaction(trans); @@ -1791,8 +1967,9 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, up_read(&fs_info->commit_root_sem); } out: - ulist_release(roots); - ulist_release(tmp); + ulist_release(&ctx->refs); + ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr; + return ret; } @@ -2139,7 +2316,7 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, "ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu", extent_item_objectid, eie->inum, eie->offset, root); - ret = iterate(eie->inum, eie->offset, root, ctx); + ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx); if (ret) { btrfs_debug(fs_info, "stopping iteration for %llu due to ret=%d", @@ -2156,82 +2333,128 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, * the given parameters. * when the iterator function returns a non-zero value, iteration stops. */ -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, u64 extent_item_pos, - int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset) +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx) { int ret; - struct btrfs_trans_handle *trans = NULL; - struct ulist *refs = NULL; - struct ulist *roots = NULL; - struct ulist_node *ref_node = NULL; - struct ulist_node *root_node = NULL; + struct ulist *refs; + struct ulist_node *ref_node; struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); struct ulist_iterator ref_uiter; - struct ulist_iterator root_uiter; - btrfs_debug(fs_info, "resolving all inodes for extent %llu", - extent_item_objectid); + btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu", + ctx->bytenr); + + ASSERT(ctx->trans == NULL); + ASSERT(ctx->roots == NULL); if (!search_commit_root) { - trans = btrfs_attach_transaction(fs_info->tree_root); + struct btrfs_trans_handle *trans; + + trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) return PTR_ERR(trans); trans = NULL; } + ctx->trans = trans; } - if (trans) - btrfs_get_tree_mod_seq(fs_info, &seq_elem); - else - down_read(&fs_info->commit_root_sem); + if (ctx->trans) { + btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem); + ctx->time_seq = seq_elem.seq; + } else { + down_read(&ctx->fs_info->commit_root_sem); + } - ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, &refs, - &extent_item_pos, ignore_offset); + ret = btrfs_find_all_leafs(ctx); if (ret) goto out; + refs = ctx->refs; + ctx->refs = NULL; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - seq_elem.seq, &roots, - ignore_offset); + const u64 leaf_bytenr = ref_node->val; + struct ulist_node *root_node; + struct ulist_iterator root_uiter; + struct extent_inode_elem *inode_list; + + inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux; + + if (ctx->cache_lookup) { + const u64 *root_ids; + int root_count; + bool cached; + + cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx, + &root_ids, &root_count); + if (cached) { + for (int i = 0; i < root_count; i++) { + ret = iterate_leaf_refs(ctx->fs_info, + inode_list, + root_ids[i], + leaf_bytenr, + iterate, + user_ctx); + if (ret) + break; + } + continue; + } + } + + if (!ctx->roots) { + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ret = -ENOMEM; + break; + } + } + + ctx->bytenr = leaf_bytenr; + ret = btrfs_find_all_roots_safe(ctx); if (ret) break; + + if (ctx->cache_store) + ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx); + ULIST_ITER_INIT(&root_uiter); - while (!ret && (root_node = ulist_next(roots, &root_uiter))) { - btrfs_debug(fs_info, + while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) { + btrfs_debug(ctx->fs_info, "root %llu references leaf %llu, data list %#llx", root_node->val, ref_node->val, ref_node->aux); - ret = iterate_leaf_refs(fs_info, - (struct extent_inode_elem *) - (uintptr_t)ref_node->aux, - root_node->val, - extent_item_objectid, - iterate, ctx); + ret = iterate_leaf_refs(ctx->fs_info, inode_list, + root_node->val, ctx->bytenr, + iterate, user_ctx); } - ulist_free(roots); + ulist_reinit(ctx->roots); } free_leaf_list(refs); out: - if (trans) { - btrfs_put_tree_mod_seq(fs_info, &seq_elem); - btrfs_end_transaction(trans); + if (ctx->trans) { + btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem); + btrfs_end_transaction(ctx->trans); + ctx->trans = NULL; } else { - up_read(&fs_info->commit_root_sem); + up_read(&ctx->fs_info->commit_root_sem); } + ulist_free(ctx->roots); + ctx->roots = NULL; + + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP) + ret = 0; + return ret; } -static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx) { struct btrfs_data_container *inodes = ctx; const size_t c = 3 * sizeof(u64); @@ -2255,8 +2478,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, bool ignore_offset) { + struct btrfs_backref_walk_ctx walk_ctx = { 0 }; int ret; - u64 extent_item_pos; u64 flags = 0; struct btrfs_key found_key; int search_commit_root = path->search_commit_root; @@ -2268,12 +2491,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return -EINVAL; - extent_item_pos = logical - found_key.objectid; - ret = iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, search_commit_root, - build_ino_list, ctx, ignore_offset); + walk_ctx.bytenr = found_key.objectid; + if (ignore_offset) + walk_ctx.ignore_extent_item_pos = true; + else + walk_ctx.extent_item_pos = logical - found_key.objectid; + walk_ctx.fs_info = fs_info; - return ret; + return iterate_extent_inodes(&walk_ctx, search_commit_root, + build_ino_list, ctx); } static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, @@ -2526,12 +2752,11 @@ void free_ipath(struct inode_fs_paths *ipath) kfree(ipath); } -struct btrfs_backref_iter *btrfs_backref_iter_alloc( - struct btrfs_fs_info *fs_info, gfp_t gfp_flag) +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info) { struct btrfs_backref_iter *ret; - ret = kzalloc(sizeof(*ret), gfp_flag); + ret = kzalloc(sizeof(*ret), GFP_NOFS); if (!ret) return NULL; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 8e69584d538d..ef6bbea3f456 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -7,10 +7,128 @@ #define BTRFS_BACKREF_H #include <linux/btrfs.h> +#include "messages.h" #include "ulist.h" #include "disk-io.h" #include "extent_io.h" +/* + * Used by implementations of iterate_extent_inodes_t (see definition below) to + * signal that backref iteration can stop immediately and no error happened. + * The value must be non-negative and must not be 0, 1 (which is a common return + * value from things like btrfs_search_slot() and used internally in the backref + * walking code) and different from BACKREF_FOUND_SHARED and + * BACKREF_FOUND_NOT_SHARED + */ +#define BTRFS_ITERATE_EXTENT_INODES_STOP 5 + +/* + * Should return 0 if no errors happened and iteration of backrefs should + * continue. Can return BTRFS_ITERATE_EXTENT_INODES_STOP or any other non-zero + * value to immediately stop iteration and possibly signal an error back to + * the caller. + */ +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *ctx); + +/* + * Context and arguments for backref walking functions. Some of the fields are + * to be filled by the caller of such functions while other are filled by the + * functions themselves, as described below. + */ +struct btrfs_backref_walk_ctx { + /* + * The address of the extent for which we are doing backref walking. + * Can be either a data extent or a metadata extent. + * + * Must always be set by the top level caller. + */ + u64 bytenr; + /* + * Offset relative to the target extent. This is only used for data + * extents, and it's meaningful because we can have file extent items + * that point only to a section of a data extent ("bookend" extents), + * and we want to filter out any that don't point to a section of the + * data extent containing the given offset. + * + * Must always be set by the top level caller. + */ + u64 extent_item_pos; + /* + * If true and bytenr corresponds to a data extent, then references from + * all file extent items that point to the data extent are considered, + * @extent_item_pos is ignored. + */ + bool ignore_extent_item_pos; + /* A valid transaction handle or NULL. */ + struct btrfs_trans_handle *trans; + /* + * The file system's info object, can not be NULL. + * + * Must always be set by the top level caller. + */ + struct btrfs_fs_info *fs_info; + /* + * Time sequence acquired from btrfs_get_tree_mod_seq(), in case the + * caller joined the tree mod log to get a consistent view of b+trees + * while we do backref walking, or BTRFS_SEQ_LAST. + * When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses + * commit roots when searching b+trees - this is a special case for + * qgroups used during a transaction commit. + */ + u64 time_seq; + /* + * Used to collect the bytenr of metadata extents that point to the + * target extent. + */ + struct ulist *refs; + /* + * List used to collect the IDs of the roots from which the target + * extent is accessible. Can be NULL in case the caller does not care + * about collecting root IDs. + */ + struct ulist *roots; + /* + * Used by iterate_extent_inodes() and the main backref walk code + * (find_parent_nodes()). Lookup and store functions for an optional + * cache which maps the logical address (bytenr) of leaves to an array + * of root IDs. + */ + bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx, + const u64 **root_ids_ret, int *root_count_ret); + void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids, + void *user_ctx); + /* + * If this is not NULL, then the backref walking code will call this + * for each indirect data extent reference as soon as it finds one, + * before collecting all the remaining backrefs and before resolving + * indirect backrefs. This allows for the caller to terminate backref + * walking as soon as it finds one backref that matches some specific + * criteria. The @cache_lookup and @cache_store callbacks should not + * be NULL in order to use this callback. + */ + iterate_extent_inodes_t *indirect_ref_iterator; + /* + * If this is not NULL, then the backref walking code will call this for + * each extent item it's meant to process before it actually starts + * processing it. If this returns anything other than 0, then it stops + * the backref walking code immediately. + */ + int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *user_ctx); + /* + * If this is not NULL, then the backref walking code will call this for + * each extent data ref it finds (BTRFS_EXTENT_DATA_REF_KEY keys) before + * processing that data ref. If this callback return false, then it will + * ignore this data ref and it will never resolve the indirect data ref, + * saving time searching for leaves in a fs tree with file extent items + * matching the data ref. + */ + bool (*skip_data_ref)(u64 root, u64 ino, u64 offset, void *user_ctx); + /* Context object to pass to the callbacks defined above. */ + void *user_ctx; +}; + struct inode_fs_paths { struct btrfs_path *btrfs_path; struct btrfs_root *fs_root; @@ -23,17 +141,59 @@ struct btrfs_backref_shared_cache_entry { bool is_shared; }; -struct btrfs_backref_shared_cache { +#define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8 + +struct btrfs_backref_share_check_ctx { + /* Ulists used during backref walking. */ + struct ulist refs; + /* + * The current leaf the caller of btrfs_is_data_extent_shared() is at. + * Typically the caller (at the moment only fiemap) tries to determine + * the sharedness of data extents point by file extent items from entire + * leaves. + */ + u64 curr_leaf_bytenr; + /* + * The previous leaf the caller was at in the previous call to + * btrfs_is_data_extent_shared(). This may be the same as the current + * leaf. On the first call it must be 0. + */ + u64 prev_leaf_bytenr; /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. */ - struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL]; - bool use_cache; + struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL]; + bool use_path_cache; + /* + * Cache the sharedness result for the last few extents we have found, + * but only for extents for which we have multiple file extent items + * that point to them. + * It's very common to have several file extent items that point to the + * same extent (bytenr) but with different offsets and lengths. This + * typically happens for COW writes, partial writes into prealloc + * extents, NOCOW writes after snapshoting a root, hole punching or + * reflinking within the same file (less common perhaps). + * So keep a small cache with the lookup results for the extent pointed + * by the last few file extent items. This cache is checked, with a + * linear scan, whenever btrfs_is_data_extent_shared() is called, so + * it must be small so that it does not negatively affect performance in + * case we don't have multiple file extent items that point to the same + * data extent. + */ + struct { + u64 bytenr; + bool is_shared; + } prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE]; + /* + * The slot in the prev_extents_cache array that will be used for + * storing the sharedness result of a new data extent. + */ + int prev_extents_cache_slot; }; -typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, - void *ctx); +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, @@ -43,11 +203,9 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, struct btrfs_key *key, struct btrfs_extent_item *ei, u32 item_size, u64 *out_root, u8 *out_level); -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, - u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset); +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, @@ -55,13 +213,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); -int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset); -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx); +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, @@ -77,10 +230,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, - struct ulist *roots, struct ulist *tmp, - struct btrfs_backref_shared_cache *cache); + struct btrfs_backref_share_check_ctx *ctx); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); @@ -111,8 +263,7 @@ struct btrfs_backref_iter { u32 end_ptr; }; -struct btrfs_backref_iter *btrfs_backref_iter_alloc( - struct btrfs_fs_info *fs_info, gfp_t gfp_flag); +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) { diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c new file mode 100644 index 000000000000..b8fb7ef6b520 --- /dev/null +++ b/fs/btrfs/bio.c @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. + */ + +#include <linux/bio.h> +#include "bio.h" +#include "ctree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "dev-replace.h" +#include "rcu-string.h" +#include "zoned.h" + +static struct bio_set btrfs_bioset; + +/* + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. + */ +static inline void btrfs_bio_init(struct btrfs_bio *bbio, + btrfs_bio_end_io_t end_io, void *private) +{ + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->end_io = end_io; + bbio->private = private; +} + +/* + * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for + * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * + * Just like the underlying bio_alloc_bioset it will not fail as it is backed by + * a mempool. + */ +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio), end_io, private); + return bio; +} + +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + struct btrfs_bio *bbio; + + ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, end_io, private); + + bio_trim(bio, offset >> 9, size >> 9); + bbio->iter = bio->bi_iter; + return bio; +} + +static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +{ + if (!dev || !dev->bdev) + return; + if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) + return; + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); +} + +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + if (bio->bi_opf & REQ_META) + return fs_info->endio_meta_workers; + return fs_info->endio_workers; +} + +static void btrfs_end_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + + bbio->end_io(bbio); +} + +static void btrfs_simple_end_io(struct bio *bio) +{ + struct btrfs_fs_info *fs_info = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(fs_info); + + if (bio->bi_status) + btrfs_log_dev_io_error(bio, bbio->device); + + if (bio_op(bio) == REQ_OP_READ) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); + } else { + bbio->end_io(bbio); + } +} + +static void btrfs_raid56_end_io(struct bio *bio) +{ + struct btrfs_io_context *bioc = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; + bbio->end_io(bbio); + + btrfs_put_bioc(bioc); +} + +static void btrfs_orig_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = stripe->bioc; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + + if (bio->bi_status) { + atomic_inc(&bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); + } + + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. + */ + if (atomic_read(&bioc->error) > bioc->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; + + bbio->end_io(bbio); + btrfs_put_bioc(bioc); +} + +static void btrfs_clone_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + + if (bio->bi_status) { + atomic_inc(&stripe->bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); + } + + /* Pass on control to the original bio this one was cloned from */ + bio_endio(stripe->bioc->orig_bio); + bio_put(bio); +} + +static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) +{ + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + (btrfs_op(bio) == BTRFS_MAP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { + bio_io_error(bio); + return; + } + + bio_set_dev(bio, dev->bdev); + + /* + * For zone append writing, bi_sector must point the beginning of the + * zone + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + + if (btrfs_dev_is_sequential(dev, physical)) { + u64 zone_start = round_down(physical, + dev->fs_info->zone_size); + + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } else { + bio->bi_opf &= ~REQ_OP_ZONE_APPEND; + bio->bi_opf |= REQ_OP_WRITE; + } + } + btrfs_debug_in_rcu(dev->fs_info, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, + (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), + dev->devid, bio->bi_iter.bi_size); + + btrfsic_check_bio(bio); + submit_bio(bio); +} + +static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) +{ + struct bio *orig_bio = bioc->orig_bio, *bio; + + ASSERT(bio_op(orig_bio) != REQ_OP_READ); + + /* Reuse the bio embedded into the btrfs_bio for the last mirror */ + if (dev_nr == bioc->num_stripes - 1) { + bio = orig_bio; + bio->bi_end_io = btrfs_orig_write_end_io; + } else { + bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); + bio_inc_remaining(orig_bio); + bio->bi_end_io = btrfs_clone_write_end_io; + } + + bio->bi_private = &bioc->stripes[dev_nr]; + bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; + bioc->stripes[dev_nr].bioc = bioc; + btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); +} + +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +{ + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + int ret; + + btrfs_bio_counter_inc_blocked(fs_info); + ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); + if (ret) { + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + return; + } + + if (map_length < length) { + btrfs_crit(fs_info, + "mapping failed logical %llu bio len %llu len %llu", + logical, length, map_length); + BUG(); + } + + if (!bioc) { + /* Single mirror read/write fast path */ + btrfs_bio(bio)->mirror_num = mirror_num; + btrfs_bio(bio)->device = smap.dev; + bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + bio->bi_private = fs_info; + bio->bi_end_io = btrfs_simple_end_io; + btrfs_submit_dev_bio(smap.dev, bio); + } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* Parity RAID write or read recovery */ + bio->bi_private = bioc; + bio->bi_end_io = btrfs_raid56_end_io; + if (bio_op(bio) == REQ_OP_READ) + raid56_parity_recover(bio, bioc, mirror_num); + else + raid56_parity_write(bio, bioc); + } else { + /* Write to multiple mirrors */ + int total_devs = bioc->num_stripes; + int dev_nr; + + bioc->orig_bio = bio; + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + btrfs_submit_mirrored_bio(bioc, dev_nr); + } +} + +/* + * Submit a repair write. + * + * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * RAID setup. Here we only want to write the one bad copy, so we do the + * mapping ourselves and submit the bio directly. + * + * The I/O is issued sychronously to block the repair read completion from + * freeing the bio. + */ +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) +{ + struct btrfs_device *dev; + struct bio_vec bvec; + struct bio bio; + u64 map_length = 0; + u64 sector; + struct btrfs_io_context *bioc = NULL; + int ret = 0; + + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); + BUG_ON(!mirror_num); + + if (btrfs_repair_one_zone(fs_info, logical)) + return 0; + + map_length = length; + + /* + * Avoid races with device replace and make sure our bioc has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); + if (btrfs_is_parity_mirror(fs_info, logical, length)) { + /* + * Note that we don't use BTRFS_MAP_WRITE because it's supposed + * to update all raid stripes, but here we just want to correct + * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad + * stripe's dev and sector. + */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &map_length, &bioc, 0); + if (ret) + goto out_counter_dec; + ASSERT(bioc->mirror_num == 1); + } else { + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, + &map_length, &bioc, mirror_num); + if (ret) + goto out_counter_dec; + BUG_ON(mirror_num != bioc->mirror_num); + } + + sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; + dev = bioc->stripes[bioc->mirror_num - 1].dev; + btrfs_put_bioc(bioc); + + if (!dev || !dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { + ret = -EIO; + goto out_counter_dec; + } + + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = sector; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { + /* try to remap that extent elsewhere? */ + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + goto out_bio_uninit; + } + + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", + ino, start, btrfs_dev_name(dev), sector); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: + btrfs_bio_counter_dec(fs_info); + return ret; +} + +int __init btrfs_bioset_init(void) +{ + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; + return 0; +} + +void __cold btrfs_bioset_exit(void) +{ + bioset_exit(&btrfs_bioset); +} diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h new file mode 100644 index 000000000000..b12f84b3b341 --- /dev/null +++ b/fs/btrfs/bio.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. + */ + +#ifndef BTRFS_BIO_H +#define BTRFS_BIO_H + +#include <linux/bio.h> +#include <linux/workqueue.h> +#include "tree-checker.h" + +struct btrfs_bio; +struct btrfs_fs_info; + +#define BTRFS_BIO_INLINE_CSUM_SIZE 64 + +/* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + +typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + +/* + * Additional info to pass along bio. + * + * Mostly for btrfs specific features like csum and mirror_num. + */ +struct btrfs_bio { + unsigned int mirror_num:7; + + /* + * Extra indicator for metadata bios. + * For some btrfs bios they use pages without a mapping, thus + * we can not rely on page->mapping->host to determine if + * it's a metadata bio. + */ + unsigned int is_metadata:1; + struct bvec_iter iter; + + /* for direct I/O */ + u64 file_offset; + + /* @device is for stripe IO submission. */ + struct btrfs_device *device; + union { + /* For data checksum verification. */ + struct { + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + }; + + /* For metadata parentness verification. */ + struct btrfs_tree_parent_check parent_check; + }; + + /* End I/O information supplied to btrfs_bio_alloc */ + btrfs_bio_end_io_t end_io; + void *private; + + /* For read end I/O handling */ + struct work_struct end_io_work; + + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_bio but relies on bio being last. + */ + struct bio bio; +}; + +static inline struct btrfs_bio *btrfs_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_bio, bio); +} + +int __init btrfs_bioset_init(void); +void __cold btrfs_bioset_exit(void); + +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private); +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private); + + +static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + bbio->end_io(bbio); +} + +static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) +{ + if (bbio->is_metadata) + return; + if (bbio->csum != bbio->csum_inline) { + kfree(bbio->csum); + bbio->csum = NULL; + } +} + +/* + * Iterate through a btrfs_bio (@bbio) on a per-sector basis. + * + * bvl - struct bio_vec + * bbio - struct btrfs_bio + * iters - struct bvec_iter + * bio_offset - unsigned int + */ +#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ + for ((iter) = (bbio)->iter, (bio_offset) = 0; \ + (iter).bi_size && \ + (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ + (bio_offset) += fs_info->sectorsize, \ + bio_advance_iter_single(&(bbio)->bio, &(iter), \ + (fs_info)->sectorsize)) + +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num); +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); + +#endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index deebc8ddbd93..708d843daa72 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -17,6 +17,21 @@ #include "discard.h" #include "raid56.h" #include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" + +#ifdef CONFIG_BTRFS_DEBUG +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + + return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(fs_info, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif /* * Return target flags in extended format or 0 if restripe for this chunk_type @@ -284,7 +299,7 @@ struct btrfs_block_group *btrfs_next_block_group( return cache; } -/** +/* * Check if we can do a NOCOW write for a given extent. * * @fs_info: The filesystem information object. @@ -325,11 +340,9 @@ struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, return bg; } -/** +/* * Decrement the number of NOCOW writers in a block group. * - * @bg: The block group. - * * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), * and on the block group returned by that call. Typically this is called after * creating an ordered extent for a NOCOW write, to prevent races with scrub and @@ -1527,6 +1540,30 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) return true; } +static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = mult_perc(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. + */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1594,6 +1631,40 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + if (bg->used == 0) { + /* + * It is possible that we trigger relocation on a block + * group as its extents are deleted and it first goes + * below the threshold, then shortly after goes empty. + * + * In this case, relocating it does delete it, but has + * some overhead in relocation specific metadata, looking + * for the non-existent extents and running some extra + * transactions, which we can avoid by using one of the + * other mechanisms for dealing with empty block groups. + */ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(bg); + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + + } + /* + * The block group might no longer meet the reclaim condition by + * the time we get around to reclaiming it, so to avoid + * reclaiming overly full block_groups, skip reclaiming them. + * + * Since the decision making process also depends on the amount + * being freed, pass in a fake giant value to skip that extra + * check, which is more meaningful when adding to the list in + * the first place. + */ + if (!should_reclaim_block_group(bg, bg->length)) { + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } spin_unlock(&bg->lock); /* Get out fast, in case we're unmounting the filesystem */ @@ -1740,8 +1811,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } -/** - * Map a physical disk address to a list of logical addresses +/* + * Map a physical disk address to a list of logical addresses. * * @fs_info: the filesystem * @chunk_start: logical address of block group @@ -2001,6 +2072,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); + cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); @@ -2481,7 +2553,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->global_root_id = calculate_global_root_id(fs_info, cache->start); if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) - cache->needs_free_space = 1; + set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); ret = btrfs_load_block_group_zone_info(cache, true); if (ret) { @@ -2692,6 +2764,25 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_block_group_item bgi; struct btrfs_key key; + u64 old_commit_used; + u64 used; + + /* + * Block group items update can be triggered out of commit transaction + * critical section, thus we need a consistent view of used bytes. + * We cannot use cache->used directly outside of the spin lock, as it + * may be changed. + */ + spin_lock(&cache->lock); + old_commit_used = cache->commit_used; + used = cache->used; + /* No change in used bytes, can safely skip it. */ + if (cache->commit_used == used) { + spin_unlock(&cache->lock); + return 0; + } + cache->commit_used = used; + spin_unlock(&cache->lock); key.objectid = cache->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2706,7 +2797,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - btrfs_set_stack_block_group_used(&bgi, cache->used); + btrfs_set_stack_block_group_used(&bgi, used); btrfs_set_stack_block_group_chunk_objectid(&bgi, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); @@ -2714,6 +2805,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); fail: btrfs_release_path(path); + /* We didn't update the block group item, need to revert @commit_used. */ + if (ret < 0) { + spin_lock(&cache->lock); + cache->commit_used = old_commit_used; + spin_unlock(&cache->lock); + } return ret; } @@ -3211,31 +3308,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) return ret; } -static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, - u64 bytes_freed) -{ - const struct btrfs_space_info *space_info = bg->space_info; - const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); - const u64 new_val = bg->used; - const u64 old_val = new_val + bytes_freed; - u64 thresh; - - if (reclaim_thresh == 0) - return false; - - thresh = div_factor_fine(bg->length, reclaim_thresh); - - /* - * If we were below the threshold before don't reclaim, we are likely a - * brand new block group and we don't want to relocate new block groups. - */ - if (old_val < thresh) - return false; - if (new_val >= thresh) - return false; - return true; -} - int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { @@ -3347,8 +3419,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, return ret; } -/** - * btrfs_add_reserved_bytes - update the block_group and space info counters +/* + * Update the block_group and space info counters. + * * @cache: The cache we are manipulating * @ram_bytes: The number of bytes of file content, and will be same to * @num_bytes except for the compress path. @@ -3391,8 +3464,9 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, return ret; } -/** - * btrfs_free_reserved_bytes - update the block_group and space info counters +/* + * Update the block_group and space info counters. + * * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @delalloc: The blocks are allocated for the delalloc write @@ -3449,13 +3523,13 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info, */ if (force == CHUNK_ALLOC_LIMITED) { thresh = btrfs_super_total_bytes(fs_info->super_copy); - thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); + thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); if (sinfo->total_bytes - bytes_used < thresh) return 1; } - if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) + if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) return 0; return 1; } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 8fb14b99a1d1..a02ea76fd6cf 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -55,6 +55,10 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + /* Does the block group need to be added to the free space tree? */ + BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + /* Indicate that the block group is placed on a sequential zone */ + BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, }; enum btrfs_caching_type { @@ -100,6 +104,12 @@ struct btrfs_block_group { u64 global_root_id; /* + * The last committed used bytes of this block group, if the above @used + * is still the same as @commit_used, we don't need to update block + * group item of this block group. + */ + u64 commit_used; + /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. */ @@ -203,15 +213,6 @@ struct btrfs_block_group { struct mutex free_space_lock; /* - * Does the block group need to be added to the free space tree? - * Protected by free_space_lock. - */ - int needs_free_space; - - /* Flag indicating this block group is placed on a sequential zone */ - bool seq_zone; - - /* * Number of extents in this block group used for swap files. * All accesses protected by the spinlock 'lock'. */ @@ -251,16 +252,7 @@ static inline bool btrfs_is_block_group_data_only( } #ifdef CONFIG_BTRFS_DEBUG -static inline int btrfs_should_fragment_free_space( - struct btrfs_block_group *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - - return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && - block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(fs_info, FRAGMENT_DATA) && - block_group->flags & BTRFS_BLOCK_GROUP_DATA); -} +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ec96285357e0..5367a14d44d2 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -7,6 +7,8 @@ #include "transaction.h" #include "block-group.h" #include "disk-io.h" +#include "fs.h" +#include "accessors.h" /* * HOW DO BLOCK RESERVES WORK @@ -225,7 +227,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -234,7 +236,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) return 0; spin_lock(&block_rsv->lock); - num_bytes = div_factor(block_rsv->size, min_factor); + num_bytes = mult_perc(block_rsv->size, min_percent); if (block_rsv->reserved >= num_bytes) ret = 0; spin_unlock(&block_rsv->lock); @@ -323,31 +325,6 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 min_bytes; - - if (global_rsv->space_info != dest->space_info) - return -ENOSPC; - - spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, min_factor); - if (global_rsv->reserved < min_bytes + num_bytes) { - spin_unlock(&global_rsv->lock); - return -ENOSPC; - } - global_rsv->reserved -= num_bytes; - if (global_rsv->reserved < global_rsv->size) - global_rsv->full = false; - spin_unlock(&global_rsv->lock); - - btrfs_block_rsv_add_bytes(dest, num_bytes, true); - return 0; -} - void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; @@ -552,5 +529,17 @@ try_reserve: if (!ret) return global_rsv; } + + /* + * All hope is lost, but of course our reservations are overly + * pessimistic, so instead of possibly having an ENOSPC abort here, try + * one last time to force a reservation if there's enough actual space + * on disk to make the reservation. + */ + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + BTRFS_RESERVE_FLUSH_EMERGENCY); + if (!ret) + return block_rsv; + return ERR_PTR(ret); } diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 578c3497a455..4cc41c9aaa82 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -4,6 +4,7 @@ #define BTRFS_BLOCK_RSV_H struct btrfs_trans_handle; +struct btrfs_root; enum btrfs_reserve_flush_enum; /* @@ -62,7 +63,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent); int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); @@ -70,9 +71,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, bool update_size); int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor); void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes, bool update_size); u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 54c2ccb36b61..195c09e20609 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -411,29 +411,142 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, - u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) -{ - struct btrfs_root *root = inode->root; - const u32 csum_size = root->fs_info->csum_size; - - /* Output minus objectid, which is more meaningful */ - if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) - btrfs_warn_rl(root->fs_info, -"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), - logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), - mirror_num); - else - btrfs_warn_rl(root->fs_info, -"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), - logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), - mirror_num); -} +void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); +void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); +blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); +blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, + struct bio *bio, + u64 dio_file_offset); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end); +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes, bool nowait, bool strict); + +void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, struct btrfs_inode *inode, + const struct fscrypt_str *name); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, + const struct fscrypt_str *name, int add_backref, u64 index); +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front); + +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, + bool in_reclaim_context); +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + unsigned int extra_bits, + struct extent_state **cached_state); + +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). */ + struct posix_acl *default_acl; + struct posix_acl *acl; + struct fscrypt_name fname; +}; + +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir); + void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, + u32 bits); +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, + struct extent_state *state, u32 bits); +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, + struct extent_state *other); +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, + struct extent_state *orig, u64 split); +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); +void btrfs_evict_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +void btrfs_free_inode(struct inode *inode); +int btrfs_drop_inode(struct inode *inode); +int __init btrfs_init_cachep(void); +void __cold btrfs_destroy_cachep(void); +struct inode *btrfs_iget_path(struct super_block *s, u64 ino, + struct btrfs_root *root, struct btrfs_path *path); +struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, + u64 start, u64 end); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); +int btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); +void btrfs_add_delayed_iput(struct btrfs_inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page); +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, + u64 end, bool uptodate); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); + +extern const struct dentry_operations btrfs_dentry_operations; + +/* Inode locking type flags, by default the exclusive lock is taken. */ +enum btrfs_ilock_type { + ENUM_BIT(BTRFS_ILOCK_SHARED), + ENUM_BIT(BTRFS_ILOCK_TRY), + ENUM_BIT(BTRFS_ILOCK_MMAP), +}; + +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags); +void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, + const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 98c6e5feab19..82e49d985019 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -82,6 +82,7 @@ #include <linux/mm.h> #include <linux/string.h> #include <crypto/hash.h> +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -92,6 +93,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "compression.h" +#include "accessors.h" #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 @@ -755,7 +757,7 @@ static int btrfsic_process_superblock_dev_mirror( btrfs_info_in_rcu(fs_info, "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", superblock_bdev, - rcu_str_deref(device->name), dev_bytenr, + btrfs_dev_name(device), dev_bytenr, dev_state->bdev, dev_bytenr, superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e6635fe70067..5122ca79f7ea 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -23,16 +23,19 @@ #include <crypto/hash.h> #include "misc.h" #include "ctree.h" +#include "fs.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "ordered-data.h" #include "compression.h" #include "extent_io.h" #include "extent_map.h" #include "subpage.h" #include "zoned.h" +#include "file-item.h" +#include "super.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -116,7 +119,7 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - unsigned char *data_in, struct page *dest_page, + const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { switch (type) { @@ -183,7 +186,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) u64 start = bbio->file_offset + offset; if (!status && - (!csum || !btrfs_check_data_csum(inode, bbio, offset, + (!csum || !btrfs_check_data_csum(bi, bbio, offset, bv.bv_page, bv.bv_offset))) { btrfs_clean_io_failure(bi, start, bv.bv_page, bv.bv_offset); @@ -191,9 +194,9 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) int ret; refcount_inc(&cb->pending_ios); - ret = btrfs_repair_one_sector(inode, bbio, offset, + ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, bv.bv_page, bv.bv_offset, - btrfs_submit_data_read_bio); + true); if (ret) { refcount_dec(&cb->pending_ios); status = errno_to_blk_status(ret); @@ -1229,7 +1232,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it. * start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { struct list_head *workspace; @@ -1243,12 +1246,13 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, return ret; } -void __init btrfs_init_compress(void) +int __init btrfs_init_compress(void) { btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); zstd_init_workspace_manager(); + return 0; } void __cold btrfs_exit_compress(void) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1aa02903de69..6209d40a1e08 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -6,6 +6,7 @@ #ifndef BTRFS_COMPRESSION_H #define BTRFS_COMPRESSION_H +#include <linux/blk_types.h> #include <linux/sizes.h> struct btrfs_inode; @@ -77,7 +78,7 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } -void __init btrfs_init_compress(void); +int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, @@ -85,7 +86,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); @@ -149,7 +150,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, +int zlib_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); @@ -160,7 +161,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, +int lzo_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); @@ -170,7 +171,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, +int zstd_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dcb510f38dda..4754c9101a4c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -8,6 +8,7 @@ #include <linux/rbtree.h> #include <linux/mm.h> #include <linux/error-injection.h> +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -17,6 +18,13 @@ #include "qgroup.h" #include "tree-mod-log.h" #include "tree-checker.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "relocation.h" +#include "file-item.h" + +static struct kmem_cache *btrfs_path_cachep; static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -44,6 +52,104 @@ static const struct btrfs_csums { .driver = "blake2b-256" }, }; +/* + * The leaf data grows from end-to-front in the node. this returns the address + * of the start of the last item, which is the stop of the leaf data stack. + */ +static unsigned int leaf_data_end(const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); + return btrfs_item_offset(leaf, nr - 1); +} + +/* + * Move data in a @leaf (using memmove, safe for overlapping ranges). + * + * @leaf: leaf that we're doing a memmove on + * @dst_offset: item data offset we're moving to + * @src_offset: item data offset were' moving from + * @len: length of the data we're moving + * + * Wrapper around memmove_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offset's start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. + */ +static inline void memmove_leaf_data(const struct extent_buffer *leaf, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long len) +{ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset, + btrfs_item_nr_offset(leaf, 0) + src_offset, len); +} + +/* + * Copy item data from @src into @dst at the given @offset. + * + * @dst: destination leaf that we're copying into + * @src: source leaf that we're copying from + * @dst_offset: item data offset we're copying to + * @src_offset: item data offset were' copying from + * @len: length of the data we're copying + * + * Wrapper around copy_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offset's start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. + */ +static inline void copy_leaf_data(const struct extent_buffer *dst, + const struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset, + btrfs_item_nr_offset(src, 0) + src_offset, len); +} + +/* + * Move items in a @leaf (using memmove). + * + * @dst: destination leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around memmove_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void memmove_leaf_items(const struct extent_buffer *leaf, + int dst_item, int src_item, int nr_items) +{ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item), + btrfs_item_nr_offset(leaf, src_item), + nr_items * sizeof(struct btrfs_item)); +} + +/* + * Copy items from @src into @dst at the given @offset. + * + * @dst: destination leaf for the items + * @src: source leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around copy_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void copy_leaf_items(const struct extent_buffer *dst, + const struct extent_buffer *src, + int dst_item, int src_item, int nr_items) +{ + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item), + btrfs_item_nr_offset(src, src_item), + nr_items * sizeof(struct btrfs_item)); +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); @@ -78,6 +184,8 @@ size_t __attribute_const__ btrfs_get_num_csums(void) struct btrfs_path *btrfs_alloc_path(void) { + might_sleep(); + return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); } @@ -487,7 +595,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_tree_mod_log_insert_key(parent, parent_slot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, @@ -850,19 +958,22 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - struct btrfs_key first_key; if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); BUG_ON(level == 0); - btrfs_node_key_to_cpu(parent, &first_key, slot); + check.level = level - 1; + check.transid = btrfs_node_ptr_generation(parent, slot); + check.owner_root = btrfs_header_owner(parent); + check.has_first_key = true; + btrfs_node_key_to_cpu(parent, &check.first_key, slot); + eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), - btrfs_header_owner(parent), - btrfs_node_ptr_generation(parent, slot), - level - 1, &first_key); + &check); if (IS_ERR(eb)) return eb; if (!extent_buffer_uptodate(eb)) { @@ -1016,7 +1127,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1062,7 +1173,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1164,7 +1275,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1218,7 +1329,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1421,10 +1532,10 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; u64 gen; struct extent_buffer *tmp; - struct btrfs_key first_key; int ret; int parent_level; bool unlock_up; @@ -1433,7 +1544,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); - btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); + btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); + check.has_first_key = true; + check.level = parent_level - 1; + check.transid = gen; + check.owner_root = root->root_key.objectid; /* * If we need to read an extent buffer from disk and we are holding locks @@ -1455,7 +1570,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * parents (shared tree blocks). */ if (btrfs_verify_level_key(tmp, - parent_level - 1, &first_key, gen)) { + parent_level - 1, &check.first_key, gen)) { free_extent_buffer(tmp); return -EUCLEAN; } @@ -1472,7 +1587,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, btrfs_unlock_up_safe(p, level + 1); /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); + ret = btrfs_read_extent_buffer(tmp, &check); if (ret) { free_extent_buffer(tmp); btrfs_release_path(p); @@ -1502,8 +1617,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, - gen, parent_level - 1, &first_key); + tmp = read_tree_block(fs_info, blocknr, &check); if (IS_ERR(tmp)) { btrfs_release_path(p); return PTR_ERR(tmp); @@ -1934,6 +2048,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int min_write_lock_level; int prev_cmp; + might_sleep(); + lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); @@ -2357,7 +2473,7 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, return ret; } -/** +/* * Search for a valid slot for the given path. * * @root: The root node of the tree. @@ -2416,7 +2532,7 @@ static void fixup_low_keys(struct btrfs_path *path, break; t = path->nodes[i]; ret = btrfs_tree_mod_log_insert_key(t, tslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); @@ -2585,8 +2701,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(dst_nritems), - btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(dst, dst_nritems), + btrfs_node_key_ptr_offset(src, 0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { @@ -2594,8 +2710,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, * Don't call btrfs_tree_mod_log_insert_move() here, key removal * was already fully logged by btrfs_tree_mod_log_eb_copy() above. */ - memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(push_items), + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), + btrfs_node_key_ptr_offset(src, push_items), (src_nritems - push_items) * sizeof(struct btrfs_key_ptr)); } @@ -2655,8 +2771,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, } ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); - memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), - btrfs_node_key_ptr_offset(0), + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), + btrfs_node_key_ptr_offset(dst, 0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); @@ -2667,8 +2783,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(src_nritems - push_items), + btrfs_node_key_ptr_offset(dst, 0), + btrfs_node_key_ptr_offset(src, src_nritems - push_items), push_items * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(src, src_nritems - push_items); @@ -2771,13 +2887,13 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(ret < 0); } memmove_extent_buffer(lower, - btrfs_node_key_ptr_offset(slot + 1), - btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(lower, slot + 1), + btrfs_node_key_ptr_offset(lower, slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, - BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + BTRFS_MOD_LOG_KEY_ADD); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -2854,8 +2970,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(split, c, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(mid), + btrfs_node_key_ptr_offset(split, 0), + btrfs_node_key_ptr_offset(c, mid), (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); @@ -2995,25 +3111,17 @@ static noinline int __push_leaf_right(struct btrfs_path *path, /* make room in the right data area */ data_end = leaf_data_end(right); - memmove_extent_buffer(right, - BTRFS_LEAF_DATA_OFFSET + data_end - push_space, - BTRFS_LEAF_DATA_OFFSET + data_end, - BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); + memmove_leaf_data(right, data_end - push_space, data_end, + BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ - copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + - BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), - push_space); + copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(left), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), - btrfs_item_nr_offset(0), - right_nritems * sizeof(struct btrfs_item)); + memmove_leaf_items(right, push_items, 0, right_nritems); /* copy the items from left to right */ - copy_extent_buffer(right, left, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(left_nritems - push_items), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); /* update the item pointers */ btrfs_init_map_token(&token, right); @@ -3205,19 +3313,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, WARN_ON(!empty && push_items == btrfs_header_nritems(right)); /* push data from right to left */ - copy_extent_buffer(left, right, - btrfs_item_nr_offset(btrfs_header_nritems(left)), - btrfs_item_nr_offset(0), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_offset(right, push_items - 1); - copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(left) - push_space, - BTRFS_LEAF_DATA_OFFSET + - btrfs_item_offset(right, push_items - 1), - push_space); + copy_leaf_data(left, right, leaf_data_end(left) - push_space, + btrfs_item_offset(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); @@ -3240,15 +3342,12 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (push_items < right_nritems) { push_space = btrfs_item_offset(right, push_items - 1) - leaf_data_end(right); - memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + - BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(right), push_space); + memmove_leaf_data(right, + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(right), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(push_items), - (btrfs_header_nritems(right) - push_items) * - sizeof(struct btrfs_item)); + memmove_leaf_items(right, 0, push_items, + btrfs_header_nritems(right) - push_items); } btrfs_init_map_token(&token, right); @@ -3380,14 +3479,10 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), - nritems * sizeof(struct btrfs_item)); + copy_leaf_items(right, l, 0, mid, nritems); - copy_extent_buffer(right, l, - BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - - data_copy_size, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(l), data_copy_size); + copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, + leaf_data_end(l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); @@ -3757,9 +3852,7 @@ static noinline int split_item(struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); if (slot != nritems) { /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -3870,9 +3963,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) /* shift the data */ if (from_end) { - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data_start + new_size - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start + new_size - data_end); } else { struct btrfs_disk_key disk_key; u64 offset; @@ -3897,9 +3989,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) } } - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data_start - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start - data_end); offset = btrfs_disk_key_offset(&disk_key); btrfs_set_disk_key_offset(&disk_key, offset + size_diff); @@ -3964,9 +4055,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } /* shift the data */ - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - data_size, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data - data_end); + memmove_leaf_data(leaf, data_end - data_size, data_end, + old_data - data_end); data_end = old_data; old_size = btrfs_item_size(leaf, slot); @@ -3979,14 +4069,15 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/** - * setup_items_for_insert - Helper called before inserting one or more items - * to a leaf. Main purpose is to save stack depth by doing the bulk of the work - * in a function that doesn't call btrfs_search_slot +/* + * Make space in the node before inserting one or more items. * * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items * @batch: information about the batch of items to insert + * + * Main purpose is to save stack depth by doing the bulk of the work in a + * function that doesn't call btrfs_search_slot */ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_item_batch *batch) @@ -4049,15 +4140,11 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); /* shift the data */ - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - batch->total_data_size, - BTRFS_LEAF_DATA_OFFSET + data_end, - old_data - data_end); + memmove_leaf_data(leaf, data_end - batch->total_data_size, + data_end, old_data - data_end); data_end = old_data; } @@ -4211,13 +4298,13 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(ret < 0); } memmove_extent_buffer(parent, - btrfs_node_key_ptr_offset(slot), - btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(parent, slot), + btrfs_node_key_ptr_offset(parent, slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, - BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); } @@ -4292,10 +4379,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, for (i = 0; i < nr; i++) dsize += btrfs_item_size(leaf, slot + i); - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + dsize, - BTRFS_LEAF_DATA_OFFSET + data_end, - last_off - data_end); + memmove_leaf_data(leaf, data_end + dsize, data_end, + last_off - data_end); btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { @@ -4305,10 +4390,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_token_item_offset(&token, i, ioff + dsize); } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), - btrfs_item_nr_offset(slot + nr), - sizeof(struct btrfs_item) * - (nritems - slot - nr)); + memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); } btrfs_set_header_nritems(leaf, nritems - nr); nritems -= nr; @@ -4850,6 +4932,14 @@ done: return ret; } +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) +{ + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) + return btrfs_next_old_leaf(root, path, time_seq); + return 0; +} + /* * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps * searching until it gets past min_objectid or finds an item of 'type' @@ -4933,3 +5023,18 @@ int btrfs_previous_extent_item(struct btrfs_root *root, } return 1; } + +int __init btrfs_ctree_init(void) +{ + btrfs_path_cachep = kmem_cache_create("btrfs_path", + sizeof(struct btrfs_path), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_path_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_ctree_exit(void) +{ + kmem_cache_destroy(btrfs_path_cachep); +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 919670d35919..6965703a81b6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,12 +28,15 @@ #include <linux/refcount.h> #include <linux/crc32c.h> #include <linux/iomap.h> +#include <linux/fscrypt.h> #include "extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" #include "locking.h" +#include "misc.h" +#include "fs.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -41,10 +44,6 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_trans_handle_cachep; -extern struct kmem_cache *btrfs_path_cachep; -extern struct kmem_cache *btrfs_free_space_cachep; -extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; @@ -55,340 +54,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ - -/* - * Maximum number of mirrors that can be available for all profiles counting - * the target device of dev-replace as one. During an active device replace - * procedure, the target device of the copy operation is a mirror for the - * filesystem data as well that can be used to read data in order to repair - * read errors on other disks. - * - * Current value is derived from RAID1C4 with 4 copies. - */ -#define BTRFS_MAX_MIRRORS (4 + 1) - -#define BTRFS_MAX_LEVEL 8 - -#define BTRFS_OLDEST_GENERATION 0ULL - -/* - * we can actually store much bigger names, but lets not confuse the rest - * of linux - */ -#define BTRFS_NAME_LEN 255 - -/* - * Theoretical limit is larger, but we keep this down to a sane - * value. That should limit greatly the possibility of collisions on - * inode ref items. - */ -#define BTRFS_LINK_MAX 65535U - -#define BTRFS_EMPTY_DIR_SIZE 0 - -/* ioprio of readahead is set to idle */ -#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) - -#define BTRFS_DIRTY_METADATA_THRESH SZ_32M - -/* - * Use large batch size to reduce overhead of metadata updates. On the reader - * side, we only read it when we are close to ENOSPC and the read overhead is - * mostly related to the number of CPUs, so it is OK to use arbitrary large - * value here. - */ -#define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M - -#define BTRFS_MAX_EXTENT_SIZE SZ_128M - -/* - * Deltas are an effective way to populate global statistics. Give macro names - * to make it clear what we're doing. An example is discard_extents in - * btrfs_free_space_ctl. - */ -#define BTRFS_STAT_NR_ENTRIES 2 -#define BTRFS_STAT_CURR 0 -#define BTRFS_STAT_PREV 1 - -static inline unsigned long btrfs_chunk_item_size(int num_stripes) -{ - BUG_ON(num_stripes == 0); - return sizeof(struct btrfs_chunk) + - sizeof(struct btrfs_stripe) * (num_stripes - 1); -} - -/* - * Runtime (in-memory) states of filesystem - */ -enum { - /* Global indicator of serious filesystem errors */ - BTRFS_FS_STATE_ERROR, - /* - * Filesystem is being remounted, allow to skip some operations, like - * defrag - */ - BTRFS_FS_STATE_REMOUNTING, - /* Filesystem in RO mode */ - BTRFS_FS_STATE_RO, - /* Track if a transaction abort has been reported on this filesystem */ - BTRFS_FS_STATE_TRANS_ABORTED, - /* - * Bio operations should be blocked on this filesystem because a source - * or target device is being destroyed as part of a device replace - */ - BTRFS_FS_STATE_DEV_REPLACING, - /* The btrfs_fs_info created for self-tests */ - BTRFS_FS_STATE_DUMMY_FS_INFO, - - BTRFS_FS_STATE_NO_CSUMS, - - /* Indicates there was an error cleaning up a log tree. */ - BTRFS_FS_STATE_LOG_CLEANUP_ERROR, - - BTRFS_FS_STATE_COUNT -}; - -#define BTRFS_BACKREF_REV_MAX 256 -#define BTRFS_BACKREF_REV_SHIFT 56 -#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ - BTRFS_BACKREF_REV_SHIFT) - -#define BTRFS_OLD_BACKREF_REV 0 -#define BTRFS_MIXED_BACKREF_REV 1 - -/* - * every tree block (leaf or node) starts with this header. - */ -struct btrfs_header { - /* these first four must match the super block */ - u8 csum[BTRFS_CSUM_SIZE]; - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - __le64 bytenr; /* which block this node is supposed to live in */ - __le64 flags; - - /* allowed to be different from the super from here on down */ - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - __le64 generation; - __le64 owner; - __le32 nritems; - u8 level; -} __attribute__ ((__packed__)); - -/* - * this is a very generous portion of the super block, giving us - * room to translate 14 chunks with 3 stripes each. - */ -#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 - -/* - * just in case we somehow lose the roots and are not able to mount, - * we store an array of the roots from previous transactions - * in the super. - */ -#define BTRFS_NUM_BACKUP_ROOTS 4 -struct btrfs_root_backup { - __le64 tree_root; - __le64 tree_root_gen; - - __le64 chunk_root; - __le64 chunk_root_gen; - - __le64 extent_root; - __le64 extent_root_gen; - - __le64 fs_root; - __le64 fs_root_gen; - - __le64 dev_root; - __le64 dev_root_gen; - - __le64 csum_root; - __le64 csum_root_gen; - - __le64 total_bytes; - __le64 bytes_used; - __le64 num_devices; - /* future */ - __le64 unused_64[4]; - - u8 tree_root_level; - u8 chunk_root_level; - u8 extent_root_level; - u8 fs_root_level; - u8 dev_root_level; - u8 csum_root_level; - /* future and to align */ - u8 unused_8[10]; -} __attribute__ ((__packed__)); - -#define BTRFS_SUPER_INFO_OFFSET SZ_64K -#define BTRFS_SUPER_INFO_SIZE 4096 - -/* - * The reserved space at the beginning of each device. - * It covers the primary super block and leaves space for potential use by other - * tools like bootloaders or to lower potential damage of accidental overwrite. - */ -#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) - -/* - * the super block basically lists the main trees of the FS - * it currently lacks any block count etc etc - */ -struct btrfs_super_block { - /* the first 4 fields must match struct btrfs_header */ - u8 csum[BTRFS_CSUM_SIZE]; - /* FS specific UUID, visible to user */ - u8 fsid[BTRFS_FSID_SIZE]; - __le64 bytenr; /* this block number */ - __le64 flags; - - /* allowed to be different from the btrfs_header from here own down */ - __le64 magic; - __le64 generation; - __le64 root; - __le64 chunk_root; - __le64 log_root; - - /* - * This member has never been utilized since the very beginning, thus - * it's always 0 regardless of kernel version. We always use - * generation + 1 to read log tree root. So here we mark it deprecated. - */ - __le64 __unused_log_root_transid; - __le64 total_bytes; - __le64 bytes_used; - __le64 root_dir_objectid; - __le64 num_devices; - __le32 sectorsize; - __le32 nodesize; - __le32 __unused_leafsize; - __le32 stripesize; - __le32 sys_chunk_array_size; - __le64 chunk_root_generation; - __le64 compat_flags; - __le64 compat_ro_flags; - __le64 incompat_flags; - __le16 csum_type; - u8 root_level; - u8 chunk_root_level; - u8 log_root_level; - struct btrfs_dev_item dev_item; - - char label[BTRFS_LABEL_SIZE]; - - __le64 cache_generation; - __le64 uuid_tree_generation; - - /* the UUID written into btree blocks */ - u8 metadata_uuid[BTRFS_FSID_SIZE]; - - /* future expansion */ - u8 reserved8[8]; - __le64 reserved[27]; - u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; - struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; - - /* Padded to 4096 bytes */ - u8 padding[565]; -} __attribute__ ((__packed__)); -static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); - -/* - * Compat flags that we support. If any incompat flags are set other than the - * ones specified below then we will fail to mount - */ -#define BTRFS_FEATURE_COMPAT_SUPP 0ULL -#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL -#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL - -#define BTRFS_FEATURE_COMPAT_RO_SUPP \ - (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ - BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ - BTRFS_FEATURE_COMPAT_RO_VERITY | \ - BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) - -#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL -#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL - -#ifdef CONFIG_BTRFS_DEBUG -/* - * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG - */ -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED | \ - BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) -#else -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) -#endif - -#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ - (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) -#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL - -/* - * A leaf is full of items. offset and size tell us where to find - * the item in the leaf (relative to the start of the data area) - */ -struct btrfs_item { - struct btrfs_disk_key key; - __le32 offset; - __le32 size; -} __attribute__ ((__packed__)); - -/* - * leaves have an item area and a data area: - * [item0, item1....itemN] [free space] [dataN...data1, data0] - * - * The data is separate from the items to get the keys closer together - * during searches. - */ -struct btrfs_leaf { - struct btrfs_header header; - struct btrfs_item items[]; -} __attribute__ ((__packed__)); - -/* - * all non-leaf blocks are nodes, they hold only keys and pointers to - * other blocks - */ -struct btrfs_key_ptr { - struct btrfs_disk_key key; - __le64 blockptr; - __le64 generation; -} __attribute__ ((__packed__)); - -struct btrfs_node { - struct btrfs_header header; - struct btrfs_key_ptr ptrs[]; -} __attribute__ ((__packed__)); - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, @@ -447,712 +112,6 @@ struct btrfs_path { unsigned int nowait:1; }; -struct btrfs_dev_replace { - u64 replace_state; /* see #define above */ - time64_t time_started; /* seconds since 1-Jan-1970 */ - time64_t time_stopped; /* seconds since 1-Jan-1970 */ - atomic64_t num_write_errors; - atomic64_t num_uncorrectable_read_errors; - - u64 cursor_left; - u64 committed_cursor_left; - u64 cursor_left_last_write_of_item; - u64 cursor_right; - - u64 cont_reading_from_srcdev_mode; /* see #define above */ - - int is_valid; - int item_needs_writeback; - struct btrfs_device *srcdev; - struct btrfs_device *tgtdev; - - struct mutex lock_finishing_cancel_unmount; - struct rw_semaphore rwsem; - - struct btrfs_scrub_progress scrub_progress; - - struct percpu_counter bio_counter; - wait_queue_head_t replace_wait; -}; - -/* - * free clusters are used to claim free space in relatively large chunks, - * allowing us to do less seeky writes. They are used for all metadata - * allocations. In ssd_spread mode they are also used for data allocations. - */ -struct btrfs_free_cluster { - spinlock_t lock; - spinlock_t refill_lock; - struct rb_root root; - - /* largest extent in this cluster */ - u64 max_size; - - /* first extent starting offset */ - u64 window_start; - - /* We did a full search and couldn't create a cluster */ - bool fragmented; - - struct btrfs_block_group *block_group; - /* - * when a cluster is allocated from a block group, we put the - * cluster onto a list in the block group so that it can - * be freed before the block group is freed. - */ - struct list_head block_group_list; -}; - -/* Discard control. */ -/* - * Async discard uses multiple lists to differentiate the discard filter - * parameters. Index 0 is for completely free block groups where we need to - * ensure the entire block group is trimmed without being lossy. Indices - * afterwards represent monotonically decreasing discard filter sizes to - * prioritize what should be discarded next. - */ -#define BTRFS_NR_DISCARD_LISTS 3 -#define BTRFS_DISCARD_INDEX_UNUSED 0 -#define BTRFS_DISCARD_INDEX_START 1 - -struct btrfs_discard_ctl { - struct workqueue_struct *discard_workers; - struct delayed_work work; - spinlock_t lock; - struct btrfs_block_group *block_group; - struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; - u64 prev_discard; - u64 prev_discard_time; - atomic_t discardable_extents; - atomic64_t discardable_bytes; - u64 max_discard_size; - u64 delay_ms; - u32 iops_limit; - u32 kbps_limit; - u64 discard_extent_bytes; - u64 discard_bitmap_bytes; - atomic64_t discard_bytes_saved; -}; - -enum { - BTRFS_FS_CLOSING_START, - BTRFS_FS_CLOSING_DONE, - BTRFS_FS_LOG_RECOVERING, - BTRFS_FS_OPEN, - BTRFS_FS_QUOTA_ENABLED, - BTRFS_FS_UPDATE_UUID_TREE_GEN, - BTRFS_FS_CREATING_FREE_SPACE_TREE, - BTRFS_FS_BTREE_ERR, - BTRFS_FS_LOG1_ERR, - BTRFS_FS_LOG2_ERR, - BTRFS_FS_QUOTA_OVERRIDE, - /* Used to record internally whether fs has been frozen */ - BTRFS_FS_FROZEN, - /* - * Indicate that balance has been set up from the ioctl and is in the - * main phase. The fs_info::balance_ctl is initialized. - */ - BTRFS_FS_BALANCE_RUNNING, - - /* - * Indicate that relocation of a chunk has started, it's set per chunk - * and is toggled between chunks. - */ - BTRFS_FS_RELOC_RUNNING, - - /* Indicate that the cleaner thread is awake and doing something. */ - BTRFS_FS_CLEANER_RUNNING, - - /* - * The checksumming has an optimized version and is considered fast, - * so we don't need to offload checksums to workqueues. - */ - BTRFS_FS_CSUM_IMPL_FAST, - - /* Indicate that the discard workqueue can service discards. */ - BTRFS_FS_DISCARD_RUNNING, - - /* Indicate that we need to cleanup space cache v1 */ - BTRFS_FS_CLEANUP_SPACE_CACHE_V1, - - /* Indicate that we can't trust the free space tree for caching yet */ - BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, - - /* Indicate whether there are any tree modification log users */ - BTRFS_FS_TREE_MOD_LOG_USERS, - - /* Indicate that we want the transaction kthread to commit right now. */ - BTRFS_FS_COMMIT_TRANS, - - /* Indicate we have half completed snapshot deletions pending. */ - BTRFS_FS_UNFINISHED_DROPS, - - /* Indicate we have to finish a zone to do next allocation. */ - BTRFS_FS_NEED_ZONE_FINISH, - -#if BITS_PER_LONG == 32 - /* Indicate if we have error/warn message printed on 32bit systems */ - BTRFS_FS_32BIT_ERROR, - BTRFS_FS_32BIT_WARN, -#endif -}; - -/* - * Exclusive operations (device replace, resize, device add/remove, balance) - */ -enum btrfs_exclusive_operation { - BTRFS_EXCLOP_NONE, - BTRFS_EXCLOP_BALANCE_PAUSED, - BTRFS_EXCLOP_BALANCE, - BTRFS_EXCLOP_DEV_ADD, - BTRFS_EXCLOP_DEV_REMOVE, - BTRFS_EXCLOP_DEV_REPLACE, - BTRFS_EXCLOP_RESIZE, - BTRFS_EXCLOP_SWAP_ACTIVATE, -}; - -/* Store data about transaction commits, exported via sysfs. */ -struct btrfs_commit_stats { - /* Total number of commits */ - u64 commit_count; - /* The maximum commit duration so far in ns */ - u64 max_commit_dur; - /* The last commit duration in ns */ - u64 last_commit_dur; - /* The total commit duration in ns */ - u64 total_commit_dur; -}; - -struct btrfs_fs_info { - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - unsigned long flags; - struct btrfs_root *tree_root; - struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *fs_root; - struct btrfs_root *quota_root; - struct btrfs_root *uuid_root; - struct btrfs_root *data_reloc_root; - struct btrfs_root *block_group_root; - - /* the log root tree is a directory of all the other log roots */ - struct btrfs_root *log_root_tree; - - /* The tree that holds the global roots (csum, extent, etc) */ - rwlock_t global_root_lock; - struct rb_root global_root_tree; - - spinlock_t fs_roots_radix_lock; - struct radix_tree_root fs_roots_radix; - - /* block group cache stuff */ - rwlock_t block_group_cache_lock; - struct rb_root_cached block_group_cache_tree; - - /* keep track of unallocated space */ - atomic64_t free_chunk_space; - - /* Track ranges which are used by log trees blocks/logged data extents */ - struct extent_io_tree excluded_extents; - - /* logical->physical extent mapping */ - struct extent_map_tree mapping_tree; - - /* - * block reservation for extent, checksum, root tree and - * delayed dir index item - */ - struct btrfs_block_rsv global_block_rsv; - /* block reservation for metadata operations */ - struct btrfs_block_rsv trans_block_rsv; - /* block reservation for chunk tree */ - struct btrfs_block_rsv chunk_block_rsv; - /* block reservation for delayed operations */ - struct btrfs_block_rsv delayed_block_rsv; - /* block reservation for delayed refs */ - struct btrfs_block_rsv delayed_refs_rsv; - - struct btrfs_block_rsv empty_block_rsv; - - u64 generation; - u64 last_trans_committed; - /* - * Generation of the last transaction used for block group relocation - * since the filesystem was last mounted (or 0 if none happened yet). - * Must be written and read while holding btrfs_fs_info::commit_root_sem. - */ - u64 last_reloc_trans; - u64 avg_delayed_ref_runtime; - - /* - * this is updated to the current trans every time a full commit - * is required instead of the faster short fsync log commits - */ - u64 last_trans_log_full_commit; - unsigned long mount_opt; - /* - * Track requests for actions that need to be done during transaction - * commit (like for some mount options). - */ - unsigned long pending_changes; - unsigned long compress_type:4; - unsigned int compress_level; - u32 commit_interval; - /* - * It is a suggestive number, the read side is safe even it gets a - * wrong number because we will write out the data into a regular - * extent. The write side(mount/remount) is under ->s_umount lock, - * so it is also safe. - */ - u64 max_inline; - - struct btrfs_transaction *running_transaction; - wait_queue_head_t transaction_throttle; - wait_queue_head_t transaction_wait; - wait_queue_head_t transaction_blocked_wait; - wait_queue_head_t async_submit_wait; - - /* - * Used to protect the incompat_flags, compat_flags, compat_ro_flags - * when they are updated. - * - * Because we do not clear the flags for ever, so we needn't use - * the lock on the read side. - * - * We also needn't use the lock when we mount the fs, because - * there is no other task which will update the flag. - */ - spinlock_t super_lock; - struct btrfs_super_block *super_copy; - struct btrfs_super_block *super_for_commit; - struct super_block *sb; - struct inode *btree_inode; - struct mutex tree_log_mutex; - struct mutex transaction_kthread_mutex; - struct mutex cleaner_mutex; - struct mutex chunk_mutex; - - /* - * this is taken to make sure we don't set block groups ro after - * the free space cache has been allocated on them - */ - struct mutex ro_block_group_mutex; - - /* this is used during read/modify/write to make sure - * no two ios are trying to mod the same stripe at the same - * time - */ - struct btrfs_stripe_hash_table *stripe_hash_table; - - /* - * this protects the ordered operations list only while we are - * processing all of the entries on it. This way we make - * sure the commit code doesn't find the list temporarily empty - * because another function happens to be doing non-waiting preflush - * before jumping into the main commit. - */ - struct mutex ordered_operations_mutex; - - struct rw_semaphore commit_root_sem; - - struct rw_semaphore cleanup_work_sem; - - struct rw_semaphore subvol_sem; - - spinlock_t trans_lock; - /* - * the reloc mutex goes with the trans lock, it is taken - * during commit to protect us from the relocation code - */ - struct mutex reloc_mutex; - - struct list_head trans_list; - struct list_head dead_roots; - struct list_head caching_block_groups; - - spinlock_t delayed_iput_lock; - struct list_head delayed_iputs; - atomic_t nr_delayed_iputs; - wait_queue_head_t delayed_iputs_wait; - - atomic64_t tree_mod_seq; - - /* this protects tree_mod_log and tree_mod_seq_list */ - rwlock_t tree_mod_log_lock; - struct rb_root tree_mod_log; - struct list_head tree_mod_seq_list; - - atomic_t async_delalloc_pages; - - /* - * this is used to protect the following list -- ordered_roots. - */ - spinlock_t ordered_root_lock; - - /* - * all fs/file tree roots in which there are data=ordered extents - * pending writeback are added into this list. - * - * these can span multiple transactions and basically include - * every dirty data page that isn't from nodatacow - */ - struct list_head ordered_roots; - - struct mutex delalloc_root_mutex; - spinlock_t delalloc_root_lock; - /* all fs/file tree roots that have delalloc inodes. */ - struct list_head delalloc_roots; - - /* - * there is a pool of worker threads for checksumming during writes - * and a pool for checksumming after reads. This is because readers - * can run with FS locks held, and the writers may be waiting for - * those locks. We don't want ordering in the pending list to cause - * deadlocks, and so the two are serviced separately. - * - * A third pool does submit_bio to avoid deadlocking with the other - * two - */ - struct btrfs_workqueue *workers; - struct btrfs_workqueue *hipri_workers; - struct btrfs_workqueue *delalloc_workers; - struct btrfs_workqueue *flush_workers; - struct workqueue_struct *endio_workers; - struct workqueue_struct *endio_meta_workers; - struct workqueue_struct *endio_raid56_workers; - struct workqueue_struct *rmw_workers; - struct workqueue_struct *compressed_write_workers; - struct btrfs_workqueue *endio_write_workers; - struct btrfs_workqueue *endio_freespace_worker; - struct btrfs_workqueue *caching_workers; - - /* - * fixup workers take dirty pages that didn't properly go through - * the cow mechanism and make them safe to write. It happens - * for the sys_munmap function call path - */ - struct btrfs_workqueue *fixup_workers; - struct btrfs_workqueue *delayed_workers; - - struct task_struct *transaction_kthread; - struct task_struct *cleaner_kthread; - u32 thread_pool_size; - - struct kobject *space_info_kobj; - struct kobject *qgroups_kobj; - struct kobject *discard_kobj; - - /* used to keep from writing metadata until there is a nice batch */ - struct percpu_counter dirty_metadata_bytes; - struct percpu_counter delalloc_bytes; - struct percpu_counter ordered_bytes; - s32 dirty_metadata_batch; - s32 delalloc_batch; - - struct list_head dirty_cowonly_roots; - - struct btrfs_fs_devices *fs_devices; - - /* - * The space_info list is effectively read only after initial - * setup. It is populated at mount time and cleaned up after - * all block groups are removed. RCU is used to protect it. - */ - struct list_head space_info; - - struct btrfs_space_info *data_sinfo; - - struct reloc_control *reloc_ctl; - - /* data_alloc_cluster is only used in ssd_spread mode */ - struct btrfs_free_cluster data_alloc_cluster; - - /* all metadata allocations go through this cluster */ - struct btrfs_free_cluster meta_alloc_cluster; - - /* auto defrag inodes go here */ - spinlock_t defrag_inodes_lock; - struct rb_root defrag_inodes; - atomic_t defrag_running; - - /* Used to protect avail_{data, metadata, system}_alloc_bits */ - seqlock_t profiles_lock; - /* - * these three are in extended format (availability of single - * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other - * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) - */ - u64 avail_data_alloc_bits; - u64 avail_metadata_alloc_bits; - u64 avail_system_alloc_bits; - - /* restriper state */ - spinlock_t balance_lock; - struct mutex balance_mutex; - atomic_t balance_pause_req; - atomic_t balance_cancel_req; - struct btrfs_balance_control *balance_ctl; - wait_queue_head_t balance_wait_q; - - /* Cancellation requests for chunk relocation */ - atomic_t reloc_cancel_req; - - u32 data_chunk_allocations; - u32 metadata_ratio; - - void *bdev_holder; - - /* private scrub information */ - struct mutex scrub_lock; - atomic_t scrubs_running; - atomic_t scrub_pause_req; - atomic_t scrubs_paused; - atomic_t scrub_cancel_req; - wait_queue_head_t scrub_pause_wait; - /* - * The worker pointers are NULL iff the refcount is 0, ie. scrub is not - * running. - */ - refcount_t scrub_workers_refcnt; - struct workqueue_struct *scrub_workers; - struct workqueue_struct *scrub_wr_completion_workers; - struct workqueue_struct *scrub_parity_workers; - struct btrfs_subpage_info *subpage_info; - - struct btrfs_discard_ctl discard_ctl; - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif - /* is qgroup tracking in a consistent state? */ - u64 qgroup_flags; - - /* holds configuration and tracking. Protected by qgroup_lock */ - struct rb_root qgroup_tree; - spinlock_t qgroup_lock; - - /* - * used to avoid frequently calling ulist_alloc()/ulist_free() - * when doing qgroup accounting, it must be protected by qgroup_lock. - */ - struct ulist *qgroup_ulist; - - /* - * Protect user change for quota operations. If a transaction is needed, - * it must be started before locking this lock. - */ - struct mutex qgroup_ioctl_lock; - - /* list of dirty qgroups to be written at next commit */ - struct list_head dirty_qgroups; - - /* used by qgroup for an efficient tree traversal */ - u64 qgroup_seq; - - /* qgroup rescan items */ - struct mutex qgroup_rescan_lock; /* protects the progress item */ - struct btrfs_key qgroup_rescan_progress; - struct btrfs_workqueue *qgroup_rescan_workers; - struct completion qgroup_rescan_completion; - struct btrfs_work qgroup_rescan_work; - bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ - u8 qgroup_drop_subtree_thres; - - /* filesystem state */ - unsigned long fs_state; - - struct btrfs_delayed_root *delayed_root; - - /* Extent buffer radix tree */ - spinlock_t buffer_lock; - /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; - - /* next backup root to be overwritten */ - int backup_root_index; - - /* device replace state */ - struct btrfs_dev_replace dev_replace; - - struct semaphore uuid_tree_rescan_sem; - - /* Used to reclaim the metadata space in the background. */ - struct work_struct async_reclaim_work; - struct work_struct async_data_reclaim_work; - struct work_struct preempt_reclaim_work; - - /* Reclaim partially filled block groups in the background */ - struct work_struct reclaim_bgs_work; - struct list_head reclaim_bgs; - int bg_reclaim_threshold; - - spinlock_t unused_bgs_lock; - struct list_head unused_bgs; - struct mutex unused_bg_unpin_mutex; - /* Protect block groups that are going to be deleted */ - struct mutex reclaim_bgs_lock; - - /* Cached block sizes */ - u32 nodesize; - u32 sectorsize; - /* ilog2 of sectorsize, use to avoid 64bit division */ - u32 sectorsize_bits; - u32 csum_size; - u32 csums_per_leaf; - u32 stripesize; - - /* - * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular - * filesystem, on zoned it depends on the device constraints. - */ - u64 max_extent_size; - - /* Block groups and devices containing active swapfiles. */ - spinlock_t swapfile_pins_lock; - struct rb_root swapfile_pins; - - struct crypto_shash *csum_shash; - - /* Type of exclusive operation running, protected by super_lock */ - enum btrfs_exclusive_operation exclusive_operation; - - /* - * Zone size > 0 when in ZONED mode, otherwise it's used for a check - * if the mode is enabled - */ - u64 zone_size; - - /* Max size to emit ZONE_APPEND write command */ - u64 max_zone_append_size; - struct mutex zoned_meta_io_lock; - spinlock_t treelog_bg_lock; - u64 treelog_bg; - - /* - * Start of the dedicated data relocation block group, protected by - * relocation_bg_lock. - */ - spinlock_t relocation_bg_lock; - u64 data_reloc_bg; - struct mutex zoned_data_reloc_io_lock; - - u64 nr_global_roots; - - spinlock_t zone_active_bgs_lock; - struct list_head zone_active_bgs; - - /* Updates are not protected by any lock */ - struct btrfs_commit_stats commit_stats; - - /* - * Last generation where we dropped a non-relocation root. - * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() - * to change it and to read it, respectively. - */ - u64 last_root_drop_gen; - - /* - * Annotations for transaction events (structures are empty when - * compiled without lockdep). - */ - struct lockdep_map btrfs_trans_num_writers_map; - struct lockdep_map btrfs_trans_num_extwriters_map; - struct lockdep_map btrfs_state_change_map[4]; - struct lockdep_map btrfs_trans_pending_ordered_map; - struct lockdep_map btrfs_ordered_extent_map; - -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - spinlock_t ref_verify_lock; - struct rb_root block_tree; -#endif - -#ifdef CONFIG_BTRFS_DEBUG - struct kobject *debug_kobj; - struct list_head allocated_roots; - - spinlock_t eb_leak_lock; - struct list_head allocated_ebs; -#endif -}; - -static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, - u64 gen) -{ - WRITE_ONCE(fs_info->last_root_drop_gen, gen); -} - -static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) -{ - return READ_ONCE(fs_info->last_root_drop_gen); -} - -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* - * Take the number of bytes to be checksummed and figure out how many leaves - * it would require to store the csums for that many bytes. - */ -static inline u64 btrfs_csum_bytes_to_leaves( - const struct btrfs_fs_info *fs_info, u64 csum_bytes) -{ - const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; - - return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); -} - -/* - * Use this if we would be adding new items, as we could split nodes as we cow - * down the tree. - */ -static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; -} - -/* - * Doing a truncate or a modification won't result in new nodes or leaves, just - * what we need for COW. - */ -static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; -} - -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ - sizeof(struct btrfs_item)) - -static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) -{ - return fs_info->zone_size > 0; -} - -/* - * Count how many fs_info->max_extent_size cover the @size - */ -static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) -{ -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (!fs_info) - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); -#endif - - return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); -} - -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op); - /* * The state of btrfs root */ @@ -1215,87 +174,6 @@ enum { BTRFS_ROOT_RESET_LOCKDEP_CLASS, }; -enum btrfs_lockdep_trans_states { - BTRFS_LOCKDEP_TRANS_COMMIT_START, - BTRFS_LOCKDEP_TRANS_UNBLOCKED, - BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, - BTRFS_LOCKDEP_TRANS_COMPLETED, -}; - -/* - * Lockdep annotation for wait events. - * - * @owner: The struct where the lockdep map is defined - * @lock: The lockdep map corresponding to a wait event - * - * This macro is used to annotate a wait event. In this case a thread acquires - * the lockdep map as writer (exclusive lock) because it has to block until all - * the threads that hold the lock as readers signal the condition for the wait - * event and release their locks. - */ -#define btrfs_might_wait_for_event(owner, lock) \ - do { \ - rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ - rwsem_release(&owner->lock##_map, _THIS_IP_); \ - } while (0) - -/* - * Protection for the resource/condition of a wait event. - * - * @owner: The struct where the lockdep map is defined - * @lock: The lockdep map corresponding to a wait event - * - * Many threads can modify the condition for the wait event at the same time - * and signal the threads that block on the wait event. The threads that modify - * the condition and do the signaling acquire the lock as readers (shared - * lock). - */ -#define btrfs_lockdep_acquire(owner, lock) \ - rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) - -/* - * Used after signaling the condition for a wait event to release the lockdep - * map held by a reader thread. - */ -#define btrfs_lockdep_release(owner, lock) \ - rwsem_release(&owner->lock##_map, _THIS_IP_) - -/* - * Macros for the transaction states wait events, similar to the generic wait - * event macros. - */ -#define btrfs_might_wait_for_state(owner, i) \ - do { \ - rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ - rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ - } while (0) - -#define btrfs_trans_state_lockdep_acquire(owner, i) \ - rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) - -#define btrfs_trans_state_lockdep_release(owner, i) \ - rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) - -/* Initialization of the lockdep map */ -#define btrfs_lockdep_init_map(owner, lock) \ - do { \ - static struct lock_class_key lock##_key; \ - lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ - } while (0) - -/* Initialization of the transaction states lockdep maps. */ -#define btrfs_state_lockdep_init_map(owner, lock, state) \ - do { \ - static struct lock_class_key lock##_key; \ - lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ - &lock##_key, 0); \ - } while (0) - -static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) -{ - clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); -} - /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. @@ -1457,6 +335,23 @@ struct btrfs_root { #endif }; +static inline bool btrfs_root_readonly(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +static inline bool btrfs_root_dead(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. @@ -1548,17 +443,14 @@ struct btrfs_drop_extents_args { struct btrfs_file_private { void *filldir_buf; + struct extent_state *llseek_cached_state; }; - static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { - return info->nodesize - sizeof(struct btrfs_header); } -#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) - static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); @@ -1569,1254 +461,14 @@ static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); } -#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ - (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) -{ - return BTRFS_MAX_ITEM_SIZE(info) - - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -/* - * Flags for mount options. - * - * Note: don't forget to add new options to btrfs_show_options() - */ -enum { - BTRFS_MOUNT_NODATASUM = (1UL << 0), - BTRFS_MOUNT_NODATACOW = (1UL << 1), - BTRFS_MOUNT_NOBARRIER = (1UL << 2), - BTRFS_MOUNT_SSD = (1UL << 3), - BTRFS_MOUNT_DEGRADED = (1UL << 4), - BTRFS_MOUNT_COMPRESS = (1UL << 5), - BTRFS_MOUNT_NOTREELOG = (1UL << 6), - BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), - BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), - BTRFS_MOUNT_NOSSD = (1UL << 9), - BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), - BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), - BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), - BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), - BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), - BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), - BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), - BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), - BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), - BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), - BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), - BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), - BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), - BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), - BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), - BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), - BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), - BTRFS_MOUNT_REF_VERIFY = (1UL << 27), - BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), - BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), - BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), -}; - -#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) -#define BTRFS_DEFAULT_MAX_INLINE (2048) - -#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) -#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) -#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) -#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ - BTRFS_MOUNT_##opt) - -#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (!btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_set_opt(fs_info->mount_opt, opt); \ -} while (0) - -#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_clear_opt(fs_info->mount_opt, opt); \ -} while (0) - -/* - * Requests for changes that need to be done during transaction commit. - * - * Internal mount options that are used for special handling of the real - * mount options (eg. cannot be set during remount and have to be set during - * transaction commit) - */ - -#define BTRFS_PENDING_COMMIT (0) - -#define btrfs_test_pending(info, opt) \ - test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -#define btrfs_set_pending(info, opt) \ - set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -#define btrfs_clear_pending(info, opt) \ - clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) - -/* - * Helpers for setting pending mount option changes. - * - * Expects corresponding macros - * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name - */ -#define btrfs_set_pending_and_info(info, opt, fmt, args...) \ -do { \ - if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ - btrfs_info((info), fmt, ##args); \ - btrfs_set_pending((info), SET_##opt); \ - btrfs_clear_pending((info), CLEAR_##opt); \ - } \ -} while(0) - -#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ -do { \ - if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ - btrfs_info((info), fmt, ##args); \ - btrfs_set_pending((info), CLEAR_##opt); \ - btrfs_clear_pending((info), SET_##opt); \ - } \ -} while(0) - -/* - * Inode flags - */ -#define BTRFS_INODE_NODATASUM (1U << 0) -#define BTRFS_INODE_NODATACOW (1U << 1) -#define BTRFS_INODE_READONLY (1U << 2) -#define BTRFS_INODE_NOCOMPRESS (1U << 3) -#define BTRFS_INODE_PREALLOC (1U << 4) -#define BTRFS_INODE_SYNC (1U << 5) -#define BTRFS_INODE_IMMUTABLE (1U << 6) -#define BTRFS_INODE_APPEND (1U << 7) -#define BTRFS_INODE_NODUMP (1U << 8) -#define BTRFS_INODE_NOATIME (1U << 9) -#define BTRFS_INODE_DIRSYNC (1U << 10) -#define BTRFS_INODE_COMPRESS (1U << 11) - -#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) - -#define BTRFS_INODE_FLAG_MASK \ - (BTRFS_INODE_NODATASUM | \ - BTRFS_INODE_NODATACOW | \ - BTRFS_INODE_READONLY | \ - BTRFS_INODE_NOCOMPRESS | \ - BTRFS_INODE_PREALLOC | \ - BTRFS_INODE_SYNC | \ - BTRFS_INODE_IMMUTABLE | \ - BTRFS_INODE_APPEND | \ - BTRFS_INODE_NODUMP | \ - BTRFS_INODE_NOATIME | \ - BTRFS_INODE_DIRSYNC | \ - BTRFS_INODE_COMPRESS | \ - BTRFS_INODE_ROOT_ITEM_INIT) - -#define BTRFS_INODE_RO_VERITY (1U << 0) - -#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) - -struct btrfs_map_token { - struct extent_buffer *eb; - char *kaddr; - unsigned long offset; -}; - #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -static inline void btrfs_init_map_token(struct btrfs_map_token *token, - struct extent_buffer *eb) -{ - token->eb = eb; - token->kaddr = page_address(eb->pages[0]); - token->offset = 0; -} - -/* some macros to generate set/get functions for the struct fields. This - * assumes there is a lefoo_to_cpu for every type, so lets make a simple - * one for u8: - */ -#define le8_to_cpu(v) (v) -#define cpu_to_le8(v) (v) -#define __le8 u8 - -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - -#define read_eb_member(eb, ptr, type, member, result) (\ - read_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define write_eb_member(eb, ptr, type, member, result) (\ - write_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off); \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val); \ -u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ - const void *ptr, unsigned long off); \ -void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ - unsigned long off, u##bits val); - -DECLARE_BTRFS_SETGET_BITS(8) -DECLARE_BTRFS_SETGET_BITS(16) -DECLARE_BTRFS_SETGET_BITS(32) -DECLARE_BTRFS_SETGET_BITS(64) - -#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - return btrfs_get_##bits(eb, s, offsetof(type, member)); \ -} \ -static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ - u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - btrfs_set_##bits(eb, s, offsetof(type, member), val); \ -} \ -static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - return btrfs_get_token_##bits(token, s, offsetof(type, member));\ -} \ -static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ - type *s, u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ -} - -#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ -{ \ - const type *p = page_address(eb->pages[0]) + \ - offset_in_page(eb->start); \ - return get_unaligned_le##bits(&p->member); \ -} \ -static inline void btrfs_set_##name(const struct extent_buffer *eb, \ - u##bits val) \ -{ \ - type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ - put_unaligned_le##bits(val, &p->member); \ -} - -#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const type *s) \ -{ \ - return get_unaligned_le##bits(&s->member); \ -} \ -static inline void btrfs_set_##name(type *s, u##bits val) \ -{ \ - put_unaligned_le##bits(val, &s->member); \ -} - -static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, - struct btrfs_dev_item *s) -{ - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0))->total_bytes); - return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, - total_bytes)); -} -static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, - struct btrfs_dev_item *s, - u64 val) -{ - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0))->total_bytes); - WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); - btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); -} - - -BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); -BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); -BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); -BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, - start_offset, 64); -BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); -BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); -BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); -BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); -BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, - io_align, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, - dev_group, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, - seek_speed, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, - bandwidth, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, - generation, 64); - -static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) -{ - return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); -} - -static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) -{ - return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); -} - -BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); -BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); -BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); -BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); -BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); -BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); -BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); - -static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) -{ - return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); -} - -BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, - stripe_len, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, - io_align, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, - num_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, - sub_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); - -static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, - int nr) -{ - unsigned long offset = (unsigned long)c; - offset += offsetof(struct btrfs_chunk, stripe); - offset += nr * sizeof(struct btrfs_stripe); - return (struct btrfs_stripe *)offset; -} - -static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); -} - -/* struct btrfs_block_group_item */ -BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); - -BTRFS_SETGET_FUNCS(block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(block_group_flags, - struct btrfs_block_group_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, - struct btrfs_block_group_item, flags, 64); - -/* struct btrfs_free_space_info */ -BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, - extent_count, 32); -BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); - -/* struct btrfs_inode_ref */ -BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); -BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); - -/* struct btrfs_inode_extref */ -BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, - parent_objectid, 64); -BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, - name_len, 16); -BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); - -/* struct btrfs_inode_item */ -BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); -BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); -BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); -BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); -BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); -BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, - sequence, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, - transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, - nbytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, - block_group, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); -BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); - -/* struct btrfs_dev_extent */ -BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, - chunk_tree, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, - chunk_objectid, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, - chunk_offset, 64); -BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); -BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); -BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); - -BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); - -static inline void btrfs_tree_block_key(const struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, - root, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, - objectid, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, - offset, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, - type, 8); -BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, - offset, 64); - -static inline u32 btrfs_extent_inline_ref_size(int type) -{ - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) - return sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_SHARED_DATA_REF_KEY) - return sizeof(struct btrfs_shared_data_ref) + - sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_EXTENT_DATA_REF_KEY) - return sizeof(struct btrfs_extent_data_ref) + - offsetof(struct btrfs_extent_inline_ref, offset); - return 0; -} - -/* struct btrfs_node */ -BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); -BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, - blockptr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, - generation, 64); - -static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline unsigned long btrfs_node_key_ptr_offset(int nr) -{ - return offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; -} - -void btrfs_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr); - -static inline void btrfs_set_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - unsigned long ptr; - ptr = btrfs_node_key_ptr_offset(nr); - write_eb_member(eb, (struct btrfs_key_ptr *)ptr, - struct btrfs_key_ptr, key, disk_key); -} - -/* struct btrfs_item */ -BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); - -static inline unsigned long btrfs_item_nr_offset(int nr) -{ - return offsetof(struct btrfs_leaf, items) + - sizeof(struct btrfs_item) * nr; -} - -static inline struct btrfs_item *btrfs_item_nr(int nr) -{ - return (struct btrfs_item *)btrfs_item_nr_offset(nr); -} - -#define BTRFS_ITEM_SETGET_FUNCS(member) \ -static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \ - int slot) \ -{ \ - return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ -} \ -static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ - int slot, u32 val) \ -{ \ - btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ -} \ -static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ - int slot) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(slot); \ - return btrfs_token_raw_item_##member(token, item); \ -} \ -static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ - int slot, u32 val) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(slot); \ - btrfs_set_token_raw_item_##member(token, item, val); \ -} - -BTRFS_ITEM_SETGET_FUNCS(offset) -BTRFS_ITEM_SETGET_FUNCS(size); - -static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); -} - -static inline void btrfs_item_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(nr); - read_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -static inline void btrfs_set_item_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(nr); - write_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); - -/* - * struct btrfs_root_ref - */ -BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); -BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); -BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); - -/* struct btrfs_dir_item */ -BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); -BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); -BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, - data_len, 16); -BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, - name_len, 16); -BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, - transid, 64); - -static inline void btrfs_dir_item_key(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, - struct btrfs_dir_item *item, - const struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, - num_entries, 64); -BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, - num_bitmaps, 64); -BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, - generation, 64); - -static inline void btrfs_free_space_key(const struct extent_buffer *eb, - const struct btrfs_free_space_header *h, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -static inline void btrfs_set_free_space_key(struct extent_buffer *eb, - struct btrfs_free_space_header *h, - const struct btrfs_disk_key *key) -{ - write_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -/* struct btrfs_disk_key */ -BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, - objectid, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); - -#ifdef __LITTLE_ENDIAN - -/* - * Optimized helpers for little-endian architectures where CPU and on-disk - * structures have the same endianness and we can skip conversions. - */ - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, - const struct btrfs_disk_key *disk_key) -{ - memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, - const struct btrfs_key *cpu_key) -{ - memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); -} - -static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *cpu_key, int nr) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_node_key(eb, disk_key, nr); -} - -static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *cpu_key, int nr) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_item_key(eb, disk_key, nr); -} - -static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_key *cpu_key) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_dir_item_key(eb, item, disk_key); -} - -#else - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, - const struct btrfs_disk_key *disk) -{ - cpu->offset = le64_to_cpu(disk->offset); - cpu->type = disk->type; - cpu->objectid = le64_to_cpu(disk->objectid); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, - const struct btrfs_key *cpu) -{ - disk->offset = cpu_to_le64(cpu->offset); - disk->type = cpu->type; - disk->objectid = cpu_to_le64(cpu->objectid); -} - -static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_node_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_item_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_key *key) -{ - struct btrfs_disk_key disk_key; - btrfs_dir_item_key(eb, item, &disk_key); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -#endif - -/* struct btrfs_header */ -BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); -BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); -BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); -BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); -BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, - nritems, 32); -BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); - -static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) -{ - return (btrfs_header_flags(eb) & flag) == flag; -} - -static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags | flag); -} - -static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags & ~flag); -} - -static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) -{ - u64 flags = btrfs_header_flags(eb); - return flags >> BTRFS_BACKREF_REV_SHIFT; -} - -static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, - int rev) -{ - u64 flags = btrfs_header_flags(eb); - flags &= ~BTRFS_BACKREF_REV_MASK; - flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; - btrfs_set_header_flags(eb, flags); -} - -static inline int btrfs_is_leaf(const struct extent_buffer *eb) -{ - return btrfs_header_level(eb) == 0; -} - -/* struct btrfs_root_item */ -BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); - -BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); -BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); -BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); -BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); -BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, - last_snapshot, 64); -BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, - generation_v2, 64); -BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, - ctransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, - otransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, - stransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, - rtransid, 64); - -static inline bool btrfs_root_readonly(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; -} - -static inline bool btrfs_root_dead(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; -} - -static inline u64 btrfs_root_id(const struct btrfs_root *root) -{ - return root->root_key.objectid; -} - -/* struct btrfs_root_backup */ -BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, - tree_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, - tree_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, - tree_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, - chunk_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, - chunk_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, - extent_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, - extent_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, - extent_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, - fs_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, - fs_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, - fs_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, - dev_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, - dev_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, - dev_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, - csum_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, - csum_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, - csum_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, - num_devices, 64); - -/* struct btrfs_balance_item */ -BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); - -static inline void btrfs_balance_data(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_set_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_balance_meta(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_set_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_balance_sys(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void btrfs_set_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void -btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - const struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); - cpu->limit = le64_to_cpu(disk->limit); - cpu->stripes_min = le32_to_cpu(disk->stripes_min); - cpu->stripes_max = le32_to_cpu(disk->stripes_max); -} - -static inline void -btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, - const struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); - disk->limit = cpu_to_le64(cpu->limit); - disk->stripes_min = cpu_to_le32(cpu->stripes_min); - disk->stripes_max = cpu_to_le32(cpu->stripes_max); -} - -/* struct btrfs_super_block */ -BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); -BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, - struct btrfs_super_block, sys_chunk_array_size, 32); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, - struct btrfs_super_block, chunk_root_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, - root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, - chunk_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, - log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, - log_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, - sectorsize, 32); -BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, - nodesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, - stripesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, - root_dir_objectid, 64); -BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, - num_devices, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, - compat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, - compat_ro_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, - incompat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, - csum_type, 16); -BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, - cache_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); -BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, - uuid_tree_generation, 64); - -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - - -/* - * The leaf data grows from end-to-front in the node. - * this returns the address of the start of the last item, - * which is the stop of the leaf data stack - */ -static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset(leaf, nr - 1); -} - -/* struct btrfs_file_extent_item */ -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, - type, 8); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, - struct btrfs_file_extent_item, disk_bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, - struct btrfs_file_extent_item, offset, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, - struct btrfs_file_extent_item, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, - struct btrfs_file_extent_item, num_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, - struct btrfs_file_extent_item, ram_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, - struct btrfs_file_extent_item, disk_num_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, - struct btrfs_file_extent_item, compression, 8); - -static inline unsigned long -btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e) -{ - return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) -{ - return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; -} - -BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); -BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, - disk_bytenr, 64); -BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, - disk_num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, - offset, 64); -BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, - num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, - ram_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, - compression, 8); -BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, - encryption, 8); -BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, - other_encoding, 16); - -/* - * this returns the number of bytes used by the item on disk, minus the - * size of any extent headers. If a file is compressed on disk, this is - * the compressed size - */ -static inline u32 btrfs_file_extent_inline_item_len( - const struct extent_buffer *eb, - int nr) -{ - return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -/* btrfs_qgroup_status_item */ -BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, - generation, 64); -BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, - version, 64); -BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, - flags, 64); -BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, - rescan, 64); - -/* btrfs_qgroup_info_item */ -BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, - generation, 64); -BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, - rfer_cmpr, 64); -BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); -BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, - excl_cmpr, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, - struct btrfs_qgroup_info_item, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, - rfer, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, - struct btrfs_qgroup_info_item, rfer_cmpr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, - excl, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, - struct btrfs_qgroup_info_item, excl_cmpr, 64); - -/* btrfs_qgroup_limit_item */ -BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, - flags, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, - max_rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, - max_excl, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, - rsv_rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, - rsv_excl, 64); - -/* btrfs_dev_replace_item */ -BTRFS_SETGET_FUNCS(dev_replace_src_devid, - struct btrfs_dev_replace_item, src_devid, 64); -BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, - struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, - 64); -BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, - replace_state, 64); -BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, - time_started, 64); -BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, - time_stopped, 64); -BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, - num_write_errors, 64); -BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, - struct btrfs_dev_replace_item, num_uncorrectable_read_errors, - 64); -BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, - cursor_left, 64); -BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, - cursor_right, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, - struct btrfs_dev_replace_item, src_devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, - struct btrfs_dev_replace_item, - cont_reading_from_srcdev_mode, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, - struct btrfs_dev_replace_item, replace_state, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, - struct btrfs_dev_replace_item, time_started, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, - struct btrfs_dev_replace_item, time_stopped, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, - struct btrfs_dev_replace_item, num_write_errors, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, - struct btrfs_dev_replace_item, - num_uncorrectable_read_errors, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, - struct btrfs_dev_replace_item, cursor_left, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, - struct btrfs_dev_replace_item, cursor_right, 64); - -/* helper function to cast into the data area of the leaf. */ -#define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset(leaf, slot))) - -#define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset(leaf, slot))) - static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) { return crc32c(crc, address, length); @@ -2846,163 +498,15 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -/* extent-tree.c */ - -enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID, - BTRFS_REF_TYPE_BLOCK, - BTRFS_REF_TYPE_DATA, - BTRFS_REF_TYPE_ANY, -}; - -int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, - enum btrfs_inline_ref_type is_data); -u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); - - -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes); -void btrfs_free_excluded_extents(struct btrfs_block_group *cache); -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long count); -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); -int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, - int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes); -int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict, - struct btrfs_path *path); -struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - const struct btrfs_disk_key *key, - int level, u64 hint, - u64 empty_size, - enum btrfs_lock_nesting nest); -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - u64 root_id, - struct extent_buffer *buf, - u64 parent, int last_ref); -int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 owner, - u64 offset, u64 ram_bytes, - struct btrfs_key *ins); -int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, u64 offset, - struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, - u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc); -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); -int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); -int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, int level); -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); - -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, - u64 len); -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_ref *generic_ref); - -void btrfs_clear_space_info_full(struct btrfs_fs_info *info); - -/* - * Different levels for to flush space when doing space reservations. - * - * The higher the level, the more methods we try to reclaim space. - */ -enum btrfs_reserve_flush_enum { - /* If we are in the transaction, we can't flush anything.*/ - BTRFS_RESERVE_NO_FLUSH, - - /* - * Flush space by: - * - Running delayed inode items - * - Allocating a new chunk - */ - BTRFS_RESERVE_FLUSH_LIMIT, - - /* - * Flush space by: - * - Running delayed inode items - * - Running delayed refs - * - Running delalloc and waiting for ordered extents - * - Allocating a new chunk - */ - BTRFS_RESERVE_FLUSH_EVICT, - - /* - * Flush space by above mentioned methods and by: - * - Running delayed iputs - * - Committing transaction - * - * Can be interrupted by a fatal signal. - */ - BTRFS_RESERVE_FLUSH_DATA, - BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, - BTRFS_RESERVE_FLUSH_ALL, - - /* - * Pretty much the same as FLUSH_ALL, but can also steal space from - * global rsv. - * - * Can be interrupted by a fatal signal. - */ - BTRFS_RESERVE_FLUSH_ALL_STEAL, -}; - -enum btrfs_flush_state { - FLUSH_DELAYED_ITEMS_NR = 1, - FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELAYED_REFS_NR = 3, - FLUSH_DELAYED_REFS = 4, - FLUSH_DELALLOC = 5, - FLUSH_DELALLOC_WAIT = 6, - FLUSH_DELALLOC_FULL = 7, - ALLOC_CHUNK = 8, - ALLOC_CHUNK_FORCE = 9, - RUN_DELAYED_IPUTS = 10, - COMMIT_TRANS = 11, -}; - -int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, - int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); - -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, - u64 disk_num_bytes, bool noflush); -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); -int btrfs_init_space_info(struct btrfs_fs_info *fs_info); -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_start_write_no_snapshotting(struct btrfs_root *root); -void btrfs_end_write_no_snapshotting(struct btrfs_root *root); -void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); - /* ctree.c */ +int __init btrfs_ctree_init(void); +void __cold btrfs_ctree_exit(void); int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); @@ -3163,14 +667,7 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, (path)->slots[0]++ \ ) -static inline int btrfs_next_old_item(struct btrfs_root *root, - struct btrfs_path *p, u64 time_seq) -{ - ++p->slots[0]; - if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_old_leaf(root, p, time_seq); - return 0; -} +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); /* * Search the tree again to find a leaf with greater keys. @@ -3188,865 +685,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(struct extent_buffer *leaf); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, - int for_reloc); -int btrfs_drop_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *node, - struct extent_buffer *parent); -static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) -{ - /* - * Do it this way so we only ever do one test_bit in the normal case. - */ - if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { - if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) - return 2; - return 1; - } - return 0; -} - -/* - * If we remount the fs to be R/O or umount the fs, the cleaner needn't do - * anything except sleeping. This function is used to check the status of - * the fs. - * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, - * since setting and checking for SB_RDONLY in the superblock's flags is not - * atomic. - */ -static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) -{ - return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || - btrfs_fs_closing(fs_info); -} - -static inline void btrfs_set_sb_rdonly(struct super_block *sb) -{ - sb->s_flags |= SB_RDONLY; - set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - -static inline void btrfs_clear_sb_rdonly(struct super_block *sb) -{ - sb->s_flags &= ~SB_RDONLY; - clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - -/* root-item.c */ -int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, const char *name, - int name_len); -int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, const char *name, - int name_len); -int btrfs_del_root(struct btrfs_trans_handle *trans, - const struct btrfs_key *key); -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct btrfs_key *key, - struct btrfs_root_item *item); -int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_root_item *item); -int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, - struct btrfs_path *path, struct btrfs_root_item *root_item, - struct btrfs_key *root_key); -int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); -void btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node); -void btrfs_check_and_init_root_item(struct btrfs_root_item *item); -void btrfs_update_root_times(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - -/* uuid-tree.c */ -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, - u64 subid); -int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, - u64 subid); -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); - -/* dir-item.c */ -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const char *name, int name_len); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, - int name_len, struct btrfs_inode *dir, - struct btrfs_key *location, u8 type, u64 index); -struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, int name_len, - int mod); -struct btrfs_dir_item * -btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - u64 index, const char *name, int name_len, - int mod); -struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const char *name, int name_len); -int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_dir_item *di); -int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - const char *name, u16 name_len, - const void *data, u16 data_len); -struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, u16 name_len, - int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - const char *name, - int name_len); - -/* orphan.c */ -int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); -int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); - -/* file-item.c */ -int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); -int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 pos, - u64 num_bytes); -int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - u64 bytenr, int mod); -int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered); -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); -void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, - const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, - const bool new_inline, - struct extent_map *em); -int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, - u64 len); -int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, - u64 len); -void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); -u64 btrfs_file_extent_end(const struct btrfs_path *path); - -/* inode.c */ -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict); - -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode); -struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); -int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len); -int btrfs_add_link(struct btrfs_trans_handle *trans, - struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const char *name, int name_len, int add_backref, u64 index); -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front); - -int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, - bool in_reclaim_context); -int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - unsigned int extra_bits, - struct extent_state **cached_state); -struct btrfs_new_inode_args { - /* Input */ - struct inode *dir; - struct dentry *dentry; - struct inode *inode; - bool orphan; - bool subvol; - - /* - * Output from btrfs_new_inode_prepare(), input to - * btrfs_create_new_inode(). - */ - struct posix_acl *default_acl; - struct posix_acl *acl; -}; -int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, - unsigned int *trans_num_items); -int btrfs_create_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_new_inode_args *args); -void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, - struct inode *dir); - void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - u32 bits); -void btrfs_clear_delalloc_extent(struct inode *inode, - struct extent_state *state, u32 bits); -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, - struct extent_state *other); -void btrfs_split_delalloc_extent(struct inode *inode, - struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); -void btrfs_evict_inode(struct inode *inode); -struct inode *btrfs_alloc_inode(struct super_block *sb); -void btrfs_destroy_inode(struct inode *inode); -void btrfs_free_inode(struct inode *inode); -int btrfs_drop_inode(struct inode *inode); -int __init btrfs_init_cachep(void); -void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path); -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); -struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 end); -int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); -int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); -int btrfs_orphan_add(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); -int btrfs_orphan_cleanup(struct btrfs_root *root); -int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); -void btrfs_add_delayed_iput(struct inode *inode); -void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); -int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); -int btrfs_prealloc_file_range(struct inode *inode, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_prealloc_file_range_trans(struct inode *inode, - struct btrfs_trans_handle *trans, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, unsigned long *nr_written, - struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate); -int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, - int compress_type); -int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, - struct page **pages); -ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded); -ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, - const struct btrfs_ioctl_encoded_io_args *encoded); - -ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); -struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); - -extern const struct dentry_operations btrfs_dentry_operations; - -/* Inode locking type flags, by default the exclusive lock is taken */ -#define BTRFS_ILOCK_SHARED (1U << 0) -#define BTRFS_ILOCK_TRY (1U << 1) -#define BTRFS_ILOCK_MMAP (1U << 2) - -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); -void btrfs_update_inode_bytes(struct btrfs_inode *inode, - const u64 add_bytes, - const u64 del_bytes); -void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); - -/* ioctl.c */ -long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int btrfs_fileattr_set(struct user_namespace *mnt_userns, - struct dentry *dentry, struct fileattr *fa); -int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(u8 *uuid); -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, - struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_to_defrag); -void btrfs_get_block_group_info(struct list_head *groups_list, - struct btrfs_ioctl_space_info *space); -void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, - struct btrfs_ioctl_balance_args *bargs); - -/* file.c */ -int __init btrfs_auto_defrag_init(void); -void __cold btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); -void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); -int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_drop_extents_args *args); -int btrfs_replace_file_extents(struct btrfs_inode *inode, - struct btrfs_path *path, const u64 start, - const u64 end, - struct btrfs_replace_extent_info *extent_info, - struct btrfs_trans_handle **trans_out); -int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u64 start, u64 end); -ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, - const struct btrfs_ioctl_encoded_io_args *encoded); -int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve); -int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait); -void btrfs_check_nocow_unlock(struct btrfs_inode *inode); -bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, - u64 *delalloc_start_ret, u64 *delalloc_end_ret); - -/* tree-defrag.c */ -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - -/* super.c */ -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags); -int btrfs_sync_fs(struct super_block *sb, int wait); -char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, - u64 subvol_objectid); - -static inline __printf(2, 3) __cold -void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ -} - -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_printk(fs_info, fmt, args...) \ -do { \ - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ - _btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#elif defined(CONFIG_PRINTK) - -#define btrfs_printk(fs_info, fmt, args...) \ - _btrfs_printk(fs_info, fmt, ##args) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#else - -#define btrfs_printk(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, fmt, ##args) -#endif - -#define btrfs_emerg(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use printk_in_rcu - */ -#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use a ratelimited printk_in_rcu - */ -#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use a ratelimited printk - */ -#define btrfs_emerg_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) - -#if defined(CONFIG_DYNAMIC_DEBUG) -#define btrfs_debug(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ - fs_info, KERN_DEBUG fmt, ##args) -#elif defined(DEBUG) -#define btrfs_debug(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) -#else -#define btrfs_debug(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#endif - -#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_no_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - if (__ratelimit(&_rs)) \ - btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk_ratelimited(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#ifdef CONFIG_BTRFS_ASSERT -__cold __noreturn -static inline void assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} - -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__)) - -#else -static inline void assertfail(const char *expr, const char* file, int line) { } -#define ASSERT(expr) (void)(expr) -#endif - -#if BITS_PER_LONG == 32 -#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) -/* - * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical - * addresses of extents. - * - * For 4K page size it's about 10T, for 64K it's 160T. - */ -#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) -void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); -void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); -#endif - -/* - * Get the correct offset inside the page of extent buffer. - * - * @eb: target extent buffer - * @start: offset inside the extent buffer - * - * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. - */ -static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, - unsigned long offset) -{ - /* - * For sectorsize == PAGE_SIZE case, eb->start will always be aligned - * to PAGE_SIZE, thus adding it won't cause any difference. - * - * For sectorsize < PAGE_SIZE, we must only read the data that belongs - * to the eb, thus we have to take the eb->start into consideration. - */ - return offset_in_page(offset + eb->start); -} - -static inline unsigned long get_eb_page_index(unsigned long offset) -{ - /* - * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. - * - * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, - * and have ensured that all tree blocks are contained in one page, - * thus we always get index == 0. - */ - return offset >> PAGE_SHIFT; -} - -/* - * Use that for functions that are conditionally exported for sanity tests but - * otherwise static - */ -#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define EXPORT_FOR_TESTS static -#else -#define EXPORT_FOR_TESTS -#endif - -__cold -static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); -} - -__printf(5, 6) -__cold -void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -const char * __attribute_const__ btrfs_decode_error(int errno); - -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - printk_index_subsys_emit( \ - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ - KERN_CRIT, fmt); \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -#else - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) - -#endif - -#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ - &(fs_info)->fs_state))) -#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ - (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ - &(fs_info)->fs_state))) - -__printf(5, 6) -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); -/* - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic - * will panic(). Otherwise we BUG() here. - */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ - BUG(); \ -} while (0) - - -/* compatibility and incompatibility defines */ - -#define btrfs_set_fs_incompat(__fs_info, opt) \ - __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ - #opt) - -static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag, const char* name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (!(features & flag)) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_incompat_flags(disk_super); - if (!(features & flag)) { - features |= flag; - btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, - "setting incompat feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_clear_fs_incompat(__fs_info, opt) \ - __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ - #opt) - -static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag, const char* name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (features & flag) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_incompat_flags(disk_super); - if (features & flag) { - features &= ~flag; - btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, - "clearing incompat feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_fs_incompat(fs_info, opt) \ - __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) - -static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_incompat_flags(disk_super) & flag); -} - -#define btrfs_set_fs_compat_ro(__fs_info, opt) \ - __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ - #opt) - -static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag, const char *name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_compat_ro_flags(disk_super); - if (!(features & flag)) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_compat_ro_flags(disk_super); - if (!(features & flag)) { - features |= flag; - btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, - "setting compat-ro feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ - __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ - #opt) - -static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag, const char *name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_compat_ro_flags(disk_super); - if (features & flag) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_compat_ro_flags(disk_super); - if (features & flag) { - features &= ~flag; - btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, - "clearing compat-ro feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_fs_compat_ro(fs_info, opt) \ - __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) - -static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_compat_ro_flags(disk_super) & flag); -} - -/* acl.c */ -#ifdef CONFIG_BTRFS_FS_POSIX_ACL -struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, - struct posix_acl *acl, int type); -int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, - struct posix_acl *acl, int type); -#else -#define btrfs_get_acl NULL -#define btrfs_set_acl NULL -static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct posix_acl *acl, - int type) -{ - return -EOPNOTSUPP; -} -#endif - -/* relocation.c */ -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); -int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow); -void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); -struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, - u64 bytenr); -int btrfs_should_ignore_reloc_root(struct btrfs_root *root); - -/* scrub.c */ -int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, - u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace); -void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); -void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); -int btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel_dev(struct btrfs_device *dev); -int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, - struct btrfs_scrub_progress *progress); - -/* dev-replace.c */ -void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); - -static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) -{ - btrfs_bio_counter_sub(fs_info, 1); -} static inline int is_fstree(u64 rootid) { @@ -4057,61 +695,16 @@ static inline int is_fstree(u64 rootid) return 0; } -static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) -{ - return signal_pending(current); -} - -/* verity.c */ -#ifdef CONFIG_FS_VERITY - -extern const struct fsverity_operations btrfs_verityops; -int btrfs_drop_verity_items(struct btrfs_inode *inode); -int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); - -BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, - encryption, 8); -BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, - size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, - struct btrfs_verity_descriptor_item, encryption, 8); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, - struct btrfs_verity_descriptor_item, size, 64); - -#else - -static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) -{ - return 0; -} - -static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, - size_t buf_size) -{ - return -EPERM; -} - -#endif - -/* Sanity test specific functions */ -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_destroy_inode(struct inode *inode); -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) -{ - return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); -} -#else -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) -{ - return 0; -} -#endif - static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + /* * We use page status Private2 to indicate there is an ordered extent with * unfinished IO. diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c new file mode 100644 index 000000000000..0a3c261b69c9 --- /dev/null +++ b/fs/btrfs/defrag.c @@ -0,0 +1,1376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" +#include "accessors.h" +#include "messages.h" +#include "delalloc-space.h" +#include "subpage.h" +#include "defrag.h" +#include "file-item.h" +#include "super.h" + +static struct kmem_cache *btrfs_inode_defrag_cachep; + +/* + * When auto defrag is enabled we queue up these defrag structs to remember + * which inodes need defragging passes. + */ +struct inode_defrag { + struct rb_node rb_node; + /* Inode number */ + u64 ino; + /* + * Transid where the defrag was added, we search for extents newer than + * this. + */ + u64 transid; + + /* Root objectid */ + u64 root; + + /* + * The extent size threshold for autodefrag. + * + * This value is different for compressed/non-compressed extents, thus + * needs to be passed from higher layer. + * (aka, inode_should_defrag()) + */ + u32 extent_thresh; +}; + +static int __compare_inode_defrag(struct inode_defrag *defrag1, + struct inode_defrag *defrag2) +{ + if (defrag1->root > defrag2->root) + return 1; + else if (defrag1->root < defrag2->root) + return -1; + else if (defrag1->ino > defrag2->ino) + return 1; + else if (defrag1->ino < defrag2->ino) + return -1; + else + return 0; +} + +/* + * Pop a record for an inode into the defrag tree. The lock must be held + * already. + * + * If you're inserting a record for an older transid than an existing record, + * the transid already in the tree is lowered. + * + * If an existing record is found the defrag item you pass in is freed. + */ +static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct inode_defrag *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + int ret; + + p = &fs_info->defrag_inodes.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(defrag, entry); + if (ret < 0) + p = &parent->rb_left; + else if (ret > 0) + p = &parent->rb_right; + else { + /* + * If we're reinserting an entry for an old defrag run, + * make sure to lower the transid of our existing + * record. + */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + entry->extent_thresh = min(defrag->extent_thresh, + entry->extent_thresh); + return -EEXIST; + } + } + set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); + rb_link_node(&defrag->rb_node, parent, p); + rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); + return 0; +} + +static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(fs_info)) + return 0; + + return 1; +} + +/* + * Insert a defrag record for this inode if auto defrag is enabled. + */ +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode_defrag *defrag; + u64 transid; + int ret; + + if (!__need_auto_defrag(fs_info)) + return 0; + + if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) + return 0; + + if (trans) + transid = trans->transid; + else + transid = inode->root->last_trans; + + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); + if (!defrag) + return -ENOMEM; + + defrag->ino = btrfs_ino(inode); + defrag->transid = transid; + defrag->root = root->root_key.objectid; + defrag->extent_thresh = extent_thresh; + + spin_lock(&fs_info->defrag_inodes_lock); + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { + /* + * If we set IN_DEFRAG flag and evict the inode from memory, + * and then re-read this inode, this new inode doesn't have + * IN_DEFRAG flag. At the case, we may find the existed defrag. + */ + ret = __btrfs_add_inode_defrag(inode, defrag); + if (ret) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + spin_unlock(&fs_info->defrag_inodes_lock); + return 0; +} + +/* + * Pick the defragable inode that we want, if it doesn't exist, we will get the + * next one. + */ +static struct inode_defrag *btrfs_pick_defrag_inode( + struct btrfs_fs_info *fs_info, u64 root, u64 ino) +{ + struct inode_defrag *entry = NULL; + struct inode_defrag tmp; + struct rb_node *p; + struct rb_node *parent = NULL; + int ret; + + tmp.ino = ino; + tmp.root = root; + + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; + while (p) { + parent = p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(&tmp, entry); + if (ret < 0) + p = parent->rb_left; + else if (ret > 0) + p = parent->rb_right; + else + goto out; + } + + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) + entry = rb_entry(parent, struct inode_defrag, rb_node); + else + entry = NULL; + } +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; +} + +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + cond_resched_lock(&fs_info->defrag_inodes_lock); + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ + struct btrfs_root *inode_root; + struct inode *inode; + struct btrfs_ioctl_defrag_range_args range; + int ret = 0; + u64 cur = 0; + +again: + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + goto cleanup; + if (!__need_auto_defrag(fs_info)) + goto cleanup; + + /* Get the inode */ + inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); + if (IS_ERR(inode_root)) { + ret = PTR_ERR(inode_root); + goto cleanup; + } + + inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); + btrfs_put_root(inode_root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto cleanup; + } + + if (cur >= i_size_read(inode)) { + iput(inode); + goto cleanup; + } + + /* Do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + memset(&range, 0, sizeof(range)); + range.len = (u64)-1; + range.start = cur; + range.extent_thresh = defrag->extent_thresh; + + sb_start_write(fs_info->sb); + ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); + iput(inode); + + if (ret < 0) + goto cleanup; + + cur = max(cur + fs_info->sectorsize, range.start); + goto again; + +cleanup: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; +} + +/* + * Run through the list of inodes in the FS that need defragging. + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; + + atomic_inc(&fs_info->defrag_running); + while (1) { + /* Pause the auto defragger. */ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + break; + + if (!__need_auto_defrag(fs_info)) + break; + + /* find an inode to defrag */ + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino); + if (!defrag) { + if (root_objectid || first_ino) { + root_objectid = 0; + first_ino = 0; + continue; + } else { + break; + } + } + + first_ino = defrag->ino + 1; + root_objectid = defrag->root; + + __btrfs_run_defrag_inode(fs_info, defrag); + } + atomic_dec(&fs_info->defrag_running); + + /* + * During unmount, we use the transaction_wait queue to wait for the + * defragger to stop. + */ + wake_up(&fs_info->transaction_wait); + return 0; +} + +/* + * Defrag all the leaves in a given btree. + * Read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int next_key_ret = 0; + u64 last_ret = 0; + + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(path); + /* + * We don't need a lock on a leaf. btrfs_realloc_node() will lock all + * leafs from path->nodes[1], so set lowest_level to 1 to avoid later + * a deadlock (attempting to write lock an already write locked leaf). + */ + path->lowest_level = 1; + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + /* + * The node at level 1 must always be locked when our path has + * keep_locks set and lowest_level is 1, regardless of the value of + * path->slots[1]. + */ + BUG_ON(path->locks[1] == 0); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + &last_ret, + &root->defrag_progress); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } + /* + * Now that we reallocated the node we can find the next key. Note that + * btrfs_find_next_key() can release our path and do another search + * without COWing, this is because even with path->keep_locks = 1, + * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a + * node when path->slots[node_level - 1] does not point to the last + * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore + * we search for the next key after reallocating our node. + */ + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, + BTRFS_OLDEST_GENERATION); + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } +out: + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + + return ret; +} + +/* + * Defrag specific helper to get an extent map. + * + * Differences between this and btrfs_get_extent() are: + * + * - No extent_map will be added to inode->extent_tree + * To reduce memory usage in the long run. + * + * - Extra optimization to skip file extents older than @newer_than + * By using btrfs_search_forward() we can skip entire file ranges that + * have extents created in past transactions, because btrfs_search_forward() + * will not visit leaves and nodes with a generation smaller than given + * minimal generation threshold (@newer_than). + * + * Return valid em if we find a file extent matching the requirement. + * Return NULL if we can not find a file extent matching the requirement. + * + * Return ERR_PTR() for error. + */ +static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, + u64 start, u64 newer_than) +{ + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path path = { 0 }; + struct extent_map *em; + struct btrfs_key key; + u64 ino = btrfs_ino(inode); + int ret; + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto err; + } + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (newer_than) { + ret = btrfs_search_forward(root, &key, &path, newer_than); + if (ret < 0) + goto err; + /* Can't find anything newer */ + if (ret > 0) + goto not_found; + } else { + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + goto err; + } + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { + /* + * If btrfs_search_slot() makes path to point beyond nritems, + * we should not have an empty leaf, as this inode must at + * least have its INODE_ITEM. + */ + ASSERT(btrfs_header_nritems(path.nodes[0])); + path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; + } + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + /* Perfect match, no need to go one slot back */ + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && + key.offset == start) + goto iterate; + + /* We didn't find a perfect match, needs to go one slot back */ + if (path.slots[0] > 0) { + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path.slots[0]--; + } + +iterate: + /* Iterate through the path to find a file extent covering @start */ + while (true) { + u64 extent_end; + + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + goto next; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + + /* + * We may go one slot back to INODE_REF/XATTR item, then + * need to go forward until we reach an EXTENT_DATA. + * But we should still has the correct ino as key.objectid. + */ + if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + + /* It's beyond our target range, definitely not extent found */ + if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) + goto not_found; + + /* + * | |<- File extent ->| + * \- start + * + * This means there is a hole between start and key.offset. + */ + if (key.offset > start) { + em->start = start; + em->orig_start = start; + em->block_start = EXTENT_MAP_HOLE; + em->len = key.offset - start; + break; + } + + fi = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_file_extent_item); + extent_end = btrfs_file_extent_end(&path); + + /* + * |<- file extent ->| | + * \- start + * + * We haven't reached start, search next slot. + */ + if (extent_end <= start) + goto next; + + /* Now this extent covers @start, convert it to em */ + btrfs_extent_item_to_extent_map(inode, &path, fi, em); + break; +next: + ret = btrfs_next_item(root, &path); + if (ret < 0) + goto err; + if (ret > 0) + goto not_found; + } + btrfs_release_path(&path); + return em; + +not_found: + btrfs_release_path(&path); + free_extent_map(em); + return NULL; + +err: + btrfs_release_path(&path); + free_extent_map(em); + return ERR_PTR(ret); +} + +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, + u64 newer_than, bool locked) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; + + /* + * Hopefully we have this extent in the tree already, try without the + * full extent lock. + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, sectorsize); + read_unlock(&em_tree->lock); + + /* + * We can get a merged extent, in that case, we need to re-search + * tree to get the original em for defrag. + * + * If @newer_than is 0 or em::generation < newer_than, we can trust + * this em, as either we don't care about the generation, or the + * merged extent map will be rejected anyway. + */ + if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + newer_than && em->generation >= newer_than) { + free_extent_map(em); + em = NULL; + } + + if (!em) { + struct extent_state *cached = NULL; + u64 end = start + sectorsize - 1; + + /* Get the big lock and read metadata off disk. */ + if (!locked) + lock_extent(io_tree, start, end, &cached); + em = defrag_get_extent(BTRFS_I(inode), start, newer_than); + if (!locked) + unlock_extent(io_tree, start, end, &cached); + + if (IS_ERR(em)) + return NULL; + } + + return em; +} + +static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, + const struct extent_map *em) +{ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + return BTRFS_MAX_COMPRESSED; + return fs_info->max_extent_size; +} + +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, + u32 extent_thresh, u64 newer_than, bool locked) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *next; + bool ret = false; + + /* This is the last extent */ + if (em->start + em->len >= i_size_read(inode)) + return false; + + /* + * Here we need to pass @newer_then when checking the next extent, or + * we will hit a case we mark current extent for defrag, but the next + * one will not be a target. + * This will just cause extra IO without really reducing the fragments. + */ + next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); + /* No more em or hole */ + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + goto out; + if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + goto out; + /* + * If the next extent is at its max capacity, defragging current extent + * makes no sense, as the total number of extents won't change. + */ + if (next->len >= get_extent_max_capacity(fs_info, em)) + goto out; + /* Skip older extent */ + if (next->generation < newer_than) + goto out; + /* Also check extent size */ + if (next->len >= extent_thresh) + goto out; + + ret = true; +out: + free_extent_map(next); + return ret; +} + +/* + * Prepare one page to be defragged. + * + * This will ensure: + * + * - Returned page is locked and has been set up properly. + * - No ordered extent exists in the page. + * - The page is uptodate. + * + * NOTE: Caller should also wait for page writeback after the cluster is + * prepared, here we don't do writeback wait for each page. + */ +static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->vfs_inode.i_mapping; + gfp_t mask = btrfs_alloc_write_mask(mapping); + u64 page_start = (u64)index << PAGE_SHIFT; + u64 page_end = page_start + PAGE_SIZE - 1; + struct extent_state *cached_state = NULL; + struct page *page; + int ret; + +again: + page = find_or_create_page(mapping, index, mask); + if (!page) + return ERR_PTR(-ENOMEM); + + /* + * Since we can defragment files opened read-only, we can encounter + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We + * can't do I/O using huge pages yet, so return an error for now. + * Filesystem transparent huge pages are typically only used for + * executables that explicitly enable them, so this isn't very + * restrictive. + */ + if (PageCompound(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ETXTBSY); + } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ERR_PTR(ret); + } + + /* Wait for any existing ordered extent in the range */ + while (1) { + struct btrfs_ordered_extent *ordered; + + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); + if (!ordered) + break; + + unlock_page(page); + btrfs_start_ordered_extent(ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * We unlocked the page above, so we need check if it was + * released or not. + */ + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + } + + /* + * Now the page range has no ordered extent any more. Read the page to + * make it uptodate. + */ + if (!PageUptodate(page)) { + btrfs_read_folio(NULL, page_folio(page)); + lock_page(page); + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } + } + return page; +} + +struct defrag_target_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Collect all valid target extents. + * + * @start: file offset to lookup + * @len: length to lookup + * @extent_thresh: file extent size threshold, any extent size >= this value + * will be ignored + * @newer_than: only defrag extents newer than this value + * @do_compress: whether the defrag is doing compression + * if true, @extent_thresh will be ignored and all regular + * file extents meeting @newer_than will be targets. + * @locked: if the range has already held extent lock + * @target_list: list of targets file extents + */ +static int defrag_collect_targets(struct btrfs_inode *inode, + u64 start, u64 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + bool locked, struct list_head *target_list, + u64 *last_scanned_ret) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + bool last_is_target = false; + u64 cur = start; + int ret = 0; + + while (cur < start + len) { + struct extent_map *em; + struct defrag_target_range *new; + bool next_mergeable = true; + u64 range_len; + + last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked); + if (!em) + break; + + /* + * If the file extent is an inlined one, we may still want to + * defrag it (fallthrough) if it will cause a regular extent. + * This is for users who want to convert inline extents to + * regular ones through max_inline= mount option. + */ + if (em->block_start == EXTENT_MAP_INLINE && + em->len <= inode->root->fs_info->max_inline) + goto next; + + /* Skip hole/delalloc/preallocated extents */ + if (em->block_start == EXTENT_MAP_HOLE || + em->block_start == EXTENT_MAP_DELALLOC || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + goto next; + + /* Skip older extent */ + if (em->generation < newer_than) + goto next; + + /* This em is under writeback, no need to defrag */ + if (em->generation == (u64)-1) + goto next; + + /* + * Our start offset might be in the middle of an existing extent + * map, so take that into account. + */ + range_len = em->len - (cur - em->start); + /* + * If this range of the extent map is already flagged for delalloc, + * skip it, because: + * + * 1) We could deadlock later, when trying to reserve space for + * delalloc, because in case we can't immediately reserve space + * the flusher can start delalloc and wait for the respective + * ordered extents to complete. The deadlock would happen + * because we do the space reservation while holding the range + * locked, and starting writeback, or finishing an ordered + * extent, requires locking the range; + * + * 2) If there's delalloc there, it means there's dirty pages for + * which writeback has not started yet (we clean the delalloc + * flag when starting writeback and after creating an ordered + * extent). If we mark pages in an adjacent range for defrag, + * then we will have a larger contiguous range for delalloc, + * very likely resulting in a larger extent after writeback is + * triggered (except in a case of free space fragmentation). + */ + if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC, 0, NULL)) + goto next; + + /* + * For do_compress case, we want to compress all valid file + * extents, thus no @extent_thresh or mergeable check. + */ + if (do_compress) + goto add; + + /* Skip too large extent */ + if (range_len >= extent_thresh) + goto next; + + /* + * Skip extents already at its max capacity, this is mostly for + * compressed extents, which max cap is only 128K. + */ + if (em->len >= get_extent_max_capacity(fs_info, em)) + goto next; + + /* + * Normally there are no more extents after an inline one, thus + * @next_mergeable will normally be false and not defragged. + * So if an inline extent passed all above checks, just add it + * for defrag, and be converted to regular extents. + */ + if (em->block_start == EXTENT_MAP_INLINE) + goto add; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, + extent_thresh, newer_than, locked); + if (!next_mergeable) { + struct defrag_target_range *last; + + /* Empty target list, no way to merge with last entry */ + if (list_empty(target_list)) + goto next; + last = list_entry(target_list->prev, + struct defrag_target_range, list); + /* Not mergeable with last entry */ + if (last->start + last->len != cur) + goto next; + + /* Mergeable, fall through to add it to @target_list. */ + } + +add: + last_is_target = true; + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into + * last range of the target list. + */ + if (!list_empty(target_list)) { + struct defrag_target_range *last; + + last = list_entry(target_list->prev, + struct defrag_target_range, list); + ASSERT(last->start + last->len <= cur); + if (last->start + last->len == cur) { + /* Mergeable, enlarge the last entry */ + last->len += range_len; + goto next; + } + /* Fall through to allocate a new entry */ + } + + /* Allocate new defrag_target_range */ + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) { + free_extent_map(em); + ret = -ENOMEM; + break; + } + new->start = cur; + new->len = range_len; + list_add_tail(&new->list, target_list); + +next: + cur = extent_map_end(em); + free_extent_map(em); + } + if (ret < 0) { + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + + list_for_each_entry_safe(entry, tmp, target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } + } + if (!ret && last_scanned_ret) { + /* + * If the last extent is not a target, the caller can skip to + * the end of that extent. + * Otherwise, we can only go the end of the specified range. + */ + if (!last_is_target) + *last_scanned_ret = max(cur, *last_scanned_ret); + else + *last_scanned_ret = max(start + len, *last_scanned_ret); + } + return ret; +} + +#define CLUSTER_SIZE (SZ_256K) +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + +/* + * Defrag one contiguous target range. + * + * @inode: target inode + * @target: target range to defrag + * @pages: locked pages covering the defrag range + * @nr_pages: number of locked pages + * + * Caller should ensure: + * + * - Pages are prepared + * Pages should be locked, no ordered extent in the pages range, + * no writeback. + * + * - Extent bits are locked + */ +static int defrag_one_locked_target(struct btrfs_inode *inode, + struct defrag_target_range *target, + struct page **pages, int nr_pages, + struct extent_state **cached_state) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_changeset *data_reserved = NULL; + const u64 start = target->start; + const u64 len = target->len; + unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; + unsigned long start_index = start >> PAGE_SHIFT; + unsigned long first_index = page_index(pages[0]); + int ret = 0; + int i; + + ASSERT(last_index - first_index + 1 <= nr_pages); + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); + if (ret < 0) + return ret; + clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, cached_state); + set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); + + /* Update the page status */ + for (i = start_index - first_index; i <= last_index - first_index; i++) { + ClearPageChecked(pages[i]); + btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); + } + btrfs_delalloc_release_extents(inode, len); + extent_changeset_free(data_reserved); + + return ret; +} + +static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + u32 extent_thresh, u64 newer_than, bool do_compress, + u64 *last_scanned_ret) +{ + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + struct page **pages; + const u32 sectorsize = inode->root->fs_info->sectorsize; + u64 last_index = (start + len - 1) >> PAGE_SHIFT; + u64 start_index = start >> PAGE_SHIFT; + unsigned int nr_pages = last_index - start_index + 1; + int ret = 0; + int i; + + ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + + /* Prepare all pages */ + for (i = 0; i < nr_pages; i++) { + pages[i] = defrag_prepare_one_page(inode, start_index + i); + if (IS_ERR(pages[i])) { + ret = PTR_ERR(pages[i]); + pages[i] = NULL; + goto free_pages; + } + } + for (i = 0; i < nr_pages; i++) + wait_on_page_writeback(pages[i]); + + /* Lock the pages range */ + lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); + /* + * Now we have a consistent view about the extent map, re-check + * which range really needs to be defragged. + * + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. + */ + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, true, + &target_list, last_scanned_ret); + if (ret < 0) + goto unlock_extent; + + list_for_each_entry(entry, &target_list, list) { + ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + &cached_state); + if (ret < 0) + break; + } + + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } +unlock_extent: + unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); +free_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + put_page(pages[i]); + } + } + kfree(pages); + return ret; +} + +static int defrag_one_cluster(struct btrfs_inode *inode, + struct file_ra_state *ra, + u64 start, u32 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + unsigned long *sectors_defragged, + unsigned long max_sectors, + u64 *last_scanned_ret) +{ + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + int ret; + + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, false, + &target_list, NULL); + if (ret < 0) + goto out; + + list_for_each_entry(entry, &target_list, list) { + u32 range_len = entry->len; + + /* Reached or beyond the limit */ + if (max_sectors && *sectors_defragged >= max_sectors) { + ret = 1; + break; + } + + if (max_sectors) + range_len = min_t(u32, range_len, + (max_sectors - *sectors_defragged) * sectorsize); + + /* + * If defrag_one_range() has updated last_scanned_ret, + * our range may already be invalid (e.g. hole punched). + * Skip if our range is before last_scanned_ret, as there is + * no need to defrag the range anymore. + */ + if (entry->start + range_len <= *last_scanned_ret) + continue; + + if (ra) + page_cache_sync_readahead(inode->vfs_inode.i_mapping, + ra, NULL, entry->start >> PAGE_SHIFT, + ((entry->start + range_len - 1) >> PAGE_SHIFT) - + (entry->start >> PAGE_SHIFT) + 1); + /* + * Here we may not defrag any range if holes are punched before + * we locked the pages. + * But that's fine, it only affects the @sectors_defragged + * accounting. + */ + ret = defrag_one_range(inode, entry->start, range_len, + extent_thresh, newer_than, do_compress, + last_scanned_ret); + if (ret < 0) + break; + *sectors_defragged += range_len >> + inode->root->fs_info->sectorsize_bits; + } +out: + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } + if (ret >= 0) + *last_scanned_ret = max(*last_scanned_ret, start + len); + return ret; +} + +/* + * Entry point to file defragmentation. + * + * @inode: inode to be defragged + * @ra: readahead state (can be NUL) + * @range: defrag options including range and flags + * @newer_than: minimum transid to defrag + * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode + * will be defragged. + * + * Return <0 for error. + * Return >=0 for the number of sectors defragged, and range->start will be updated + * to indicate the file offset where next defrag should be started at. + * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without + * defragging all the range). + */ +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + unsigned long sectors_defragged = 0; + u64 isize = i_size_read(inode); + u64 cur; + u64 last_byte; + bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); + bool ra_allocated = false; + int compress_type = BTRFS_COMPRESS_ZLIB; + int ret = 0; + u32 extent_thresh = range->extent_thresh; + pgoff_t start_index; + + if (isize == 0) + return 0; + + if (range->start >= isize) + return -EINVAL; + + if (do_compress) { + if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } + + if (extent_thresh == 0) + extent_thresh = SZ_256K; + + if (range->start + range->len > range->start) { + /* Got a specific range */ + last_byte = min(isize, range->start + range->len); + } else { + /* Defrag until file end */ + last_byte = isize; + } + + /* Align the range */ + cur = round_down(range->start, fs_info->sectorsize); + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + + /* + * If we were not given a ra, allocate a readahead context. As + * readahead is just an optimization, defrag will work without it so + * we don't error out. + */ + if (!ra) { + ra_allocated = true; + ra = kzalloc(sizeof(*ra), GFP_KERNEL); + if (ra) + file_ra_state_init(ra, inode->i_mapping); + } + + /* + * Make writeback start from the beginning of the range, so that the + * defrag range can be written sequentially. + */ + start_index = cur >> PAGE_SHIFT; + if (start_index < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = start_index; + + while (cur < last_byte) { + const unsigned long prev_sectors_defragged = sectors_defragged; + u64 last_scanned = cur; + u64 cluster_end; + + if (btrfs_defrag_cancelled(fs_info)) { + ret = -EAGAIN; + break; + } + + /* We want the cluster end at page boundary when possible */ + cluster_end = (((cur >> PAGE_SHIFT) + + (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; + cluster_end = min(cluster_end, last_byte); + + btrfs_inode_lock(BTRFS_I(inode), 0); + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + btrfs_inode_unlock(BTRFS_I(inode), 0); + break; + } + if (!(inode->i_sb->s_flags & SB_ACTIVE)) { + btrfs_inode_unlock(BTRFS_I(inode), 0); + break; + } + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; + ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + cluster_end + 1 - cur, extent_thresh, + newer_than, do_compress, §ors_defragged, + max_to_defrag, &last_scanned); + + if (sectors_defragged > prev_sectors_defragged) + balance_dirty_pages_ratelimited(inode->i_mapping); + + btrfs_inode_unlock(BTRFS_I(inode), 0); + if (ret < 0) + break; + cur = max(cluster_end + 1, last_scanned); + if (ret > 0) { + ret = 0; + break; + } + cond_resched(); + } + + if (ra_allocated) + kfree(ra); + /* + * Update range.start for autodefrag, this will indicate where to start + * in next run. + */ + range->start = cur; + if (sectors_defragged) { + /* + * We have defragged some sectors, for compression case they + * need to be written back immediately. + */ + if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + if (range->compress_type == BTRFS_COMPRESS_LZO) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + else if (range->compress_type == BTRFS_COMPRESS_ZSTD) + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + ret = sectors_defragged; + } + if (do_compress) { + btrfs_inode_lock(BTRFS_I(inode), 0); + BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; + btrfs_inode_unlock(BTRFS_I(inode), 0); + } + return ret; +} + +void __cold btrfs_auto_defrag_exit(void) +{ + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int __init btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h new file mode 100644 index 000000000000..5305f2283b5e --- /dev/null +++ b/fs/btrfs/defrag.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DEFRAG_H +#define BTRFS_DEFRAG_H + +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag); +int __init btrfs_auto_defrag_init(void); +void __cold btrfs_auto_defrag_exit(void); +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh); +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + +#endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 118b2e20b2e1..7ddb1d104e8e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "ctree.h" #include "delalloc-space.h" #include "block-rsv.h" @@ -8,6 +9,7 @@ #include "transaction.h" #include "qgroup.h" #include "block-group.h" +#include "fs.h" /* * HOW DOES THIS WORK @@ -200,8 +202,8 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, btrfs_qgroup_free_data(inode, reserved, start, len); } -/** - * Release any excessive reservation +/* + * Release any excessive reservations for an inode. * * @inode: the inode we need to release from * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup @@ -375,12 +377,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, return 0; } -/** - * Release a metadata reservation for an inode +/* + * Release a metadata reservation for an inode. * - * @inode: the inode to release the reservation for. - * @num_bytes: the number of bytes we are releasing. - * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata @@ -403,10 +405,11 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, btrfs_inode_rsv_release(inode, qgroup_free); } -/** - * btrfs_delalloc_release_extents - release our outstanding_extents - * @inode: the inode to balance the reservation for. - * @num_bytes: the number of bytes we originally reserved with +/* + * Release our outstanding_extents for an inode. + * + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -431,30 +434,30 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) btrfs_inode_rsv_release(inode, true); } -/** - * btrfs_delalloc_reserve_space - reserve data and metadata space for - * delalloc - * @inode: inode we're writing to - * @start: start range we are writing to - * @len: how long the range we are writing to - * @reserved: mandatory parameter, record actually reserved qgroup ranges of - * current reservation. +/* + * Reserve data and metadata space for delalloc + * + * @inode: inode we're writing to + * @start: start range we are writing to + * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. * * This will do the following things * - * - reserve space in data space info for num bytes - * and reserve precious corresponding qgroup space + * - reserve space in data space info for num bytes and reserve precious + * corresponding qgroup space * (Done in check_data_free_space) * * - reserve space for metadata space, based on the number of outstanding - * extents and how much csums will be needed - * also reserve metadata space in a per root over-reserve method. + * extents and how much csums will be needed also reserve metadata space in a + * per root over-reserve method. * - add to the inodes->delalloc_bytes * - add it to the fs_info's delalloc inodes list. * (Above 3 all done in delalloc_reserve_metadata) * * Return 0 for success - * Return <0 for error(-ENOSPC or -EQUOT) + * Return <0 for error(-ENOSPC or -EDQUOT) */ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len) @@ -473,7 +476,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, return ret; } -/** +/* * Release data and metadata space for delalloc * * @inode: inode we're releasing space for @@ -482,10 +485,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, * @len: length of the space already reserved * @qgroup_free: should qgroup reserved-space also be freed * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - * Also it will handle the qgroup reserved space. + * Release the metadata space that was not used and will decrement + * ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if + * there are no delalloc bytes left. Also it will handle the qgroup reserved + * space. */ void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index e07d46043455..c5d573f2366e 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -20,5 +20,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); #endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index cac5169eaf8d..0095c6e4c3d1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,14 +6,19 @@ #include <linux/slab.h> #include <linux/iversion.h> +#include "ctree.h" +#include "fs.h" +#include "messages.h" #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" -#include "ctree.h" #include "qgroup.h" #include "locking.h" #include "inode-item.h" +#include "space-info.h" +#include "accessors.h" +#include "file-item.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -1412,7 +1417,7 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 type, + struct btrfs_disk_key *disk_key, u8 flags, u64 index) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1443,7 +1448,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_transid(dir_item, trans->transid); btrfs_set_stack_dir_data_len(dir_item, 0); btrfs_set_stack_dir_name_len(dir_item, name_len); - btrfs_set_stack_dir_type(dir_item, type); + btrfs_set_stack_dir_flags(dir_item, flags); memcpy((char *)(dir_item + 1), name, name_len); data_len = delayed_item->data_len + sizeof(struct btrfs_item); @@ -1641,8 +1646,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * We can only do one readdir with delayed items at a time because of * item->readdir_list. */ - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - btrfs_inode_lock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), 0); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1753,7 +1758,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, name = (char *)(di + 1); name_len = btrfs_stack_dir_name_len(di); - d_type = fs_ftype_to_dtype(di->type); + d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type)); btrfs_disk_key_to_cpu(&location, &di->location); over = !dir_emit(ctx, name, name_len, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 0163ca637a96..4f21daa3dbc7 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -113,7 +113,7 @@ static inline void btrfs_init_delayed_root( int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 type, + struct btrfs_disk_key *disk_key, u8 flags, u64 index); int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 36a3debe9493..573ebab886e2 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -6,12 +6,14 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/sort.h> +#include "messages.h" #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" #include "qgroup.h" #include "space-info.h" #include "tree-mod-log.h" +#include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -69,14 +71,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) return btrfs_check_space_for_delayed_refs(trans->fs_info); } -/** - * Release a ref head's reservation +/* + * Release a ref head's reservation. * * @fs_info: the filesystem * @nr: number of items to drop * - * This drops the delayed ref head's count from the delayed refs rsv and frees - * any excess reservation we had. + * Drops the delayed ref head's count from the delayed refs rsv and free any + * excess reservation we had. */ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { @@ -102,8 +104,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) } /* - * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv - * @trans - the trans that may have generated delayed refs + * Adjust the size of the delayed refs rsv. * * This is to be called anytime we may have adjusted trans->delayed_ref_updates, * it'll calculate the additional size and add it to the delayed_refs_rsv. @@ -137,8 +138,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) trans->delayed_ref_updates = 0; } -/** - * Transfer bytes to our delayed refs rsv +/* + * Transfer bytes to our delayed refs rsv. * * @fs_info: the filesystem * @src: source block rsv to transfer from @@ -186,8 +187,8 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, delayed_refs_rsv->space_info, to_free); } -/** - * Refill based on our delayed refs usage +/* + * Refill based on our delayed refs usage. * * @fs_info: the filesystem * @flush: control how we can flush for this reservation. diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 61e58066b5fd..78696d331639 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -18,11 +18,13 @@ #include "volumes.h" #include "async-thread.h" #include "check-integrity.h" -#include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" #include "block-group.h" +#include "fs.h" +#include "accessors.h" +#include "scrub.h" /* * Device replace overview @@ -246,7 +248,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct block_device *bdev; - struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -290,19 +291,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } - device = btrfs_alloc_device(NULL, &devid, NULL); + device = btrfs_alloc_device(NULL, &devid, NULL, device_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - btrfs_free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error; @@ -456,14 +450,6 @@ out: return ret; } -static char* btrfs_dev_name(struct btrfs_device *device) -{ - if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - return "<missing disk>"; - else - return rcu_str_deref(device->name); -} - static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, struct btrfs_device *src_dev) { @@ -679,7 +665,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, "dev_replace from %s (devid %llu) to %s started", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_dev_name(tgt_device)); /* * from now on, the writes to the srcdev are all duplicated to @@ -938,7 +924,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_name(tgt_device), scrub_ret); error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); @@ -956,7 +942,7 @@ error: "dev_replace from %s (devid %llu) to %s finished", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_dev_name(tgt_device)); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state); tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 6084b313056a..675082ccec89 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,5 +25,13 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, struct btrfs_block_group *cache, u64 physical); +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); + +static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + btrfs_bio_counter_sub(fs_info, 1); +} + #endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 72fb2c518a2b..082eb0e19598 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -3,9 +3,12 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "accessors.h" +#include "dir-item.h" /* * insert a name into a directory, doing overflow properly if there is a hash @@ -81,7 +84,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, &location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR); btrfs_set_dir_name_len(leaf, dir_item, name_len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); btrfs_set_dir_data_len(leaf, dir_item, data_len); @@ -103,8 +106,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * to use for the second index (if one is created). * Will return 0 or -ENOMEM */ -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, - int name_len, struct btrfs_inode *dir, +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct fscrypt_str *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; @@ -120,7 +123,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, key.objectid = btrfs_ino(dir); key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -128,9 +131,9 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, btrfs_cpu_key_to_disk(&disk_key, location); - data_size = sizeof(*dir_item) + name_len; + data_size = sizeof(*dir_item) + name->len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, - name, name_len); + name->name, name->len); if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); if (ret == -EEXIST) @@ -138,15 +141,18 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, goto out_free; } + if (IS_ENCRYPTED(&dir->vfs_inode)) + type |= BTRFS_FT_ENCRYPTED; + leaf = path->nodes[0]; btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_flags(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); - btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); name_ptr = (unsigned long)(dir_item + 1); - write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, name->name, name_ptr, name->len); btrfs_mark_buffer_dirty(leaf); second_insert: @@ -157,7 +163,7 @@ second_insert: } btrfs_release_path(path); - ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir, + ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: btrfs_free_path(path); @@ -206,7 +212,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const char *name, int name_len, + const struct fscrypt_str *name, int mod) { struct btrfs_key key; @@ -214,9 +220,10 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); - di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; @@ -224,7 +231,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const char *name, int name_len) + const struct fscrypt_str *name) { int ret; struct btrfs_key key; @@ -240,9 +247,10 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); - di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0); + di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name, + name->len, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ @@ -262,11 +270,8 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, goto out; } - /* - * see if there is room in the item to insert this - * name - */ - data_size = sizeof(*di) + name_len; + /* See if there is room in the item to insert this name. */ + data_size = sizeof(*di) + name->len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + @@ -303,8 +308,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const char *name, int name_len, - int mod) + u64 index, const struct fscrypt_str *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -313,7 +317,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, key.type = BTRFS_DIR_INDEX_KEY; key.offset = index; - di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); if (di == ERR_PTR(-ENOENT)) return NULL; @@ -321,9 +326,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, } struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const char *name, int name_len) +btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, + u64 dirid, const struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -338,7 +342,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, break; di = btrfs_match_dir_item_name(root->fs_info, path, - name, name_len); + name->name, name->len); if (di) return di; } diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h new file mode 100644 index 000000000000..aab4b7cc7fa0 --- /dev/null +++ b/fs/btrfs/dir-item.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DIR_ITEM_H +#define BTRFS_DIR_ITEM_H + +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const struct fscrypt_str *name); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct fscrypt_str *name, struct btrfs_inode *dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const struct fscrypt_str *name, int mod); +struct btrfs_dir_item *btrfs_lookup_dir_index_item( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 index, const struct fscrypt_str *name, int mod); +struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const struct fscrypt_str *name); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + const char *name, + int name_len); + +#endif diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e1b7bd927d69..ff2e524d9937 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -11,6 +11,7 @@ #include "block-group.h" #include "discard.h" #include "free-space-cache.h" +#include "fs.h" /* * This contains the logic to handle async discard. @@ -61,7 +62,7 @@ #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) #define BTRFS_DISCARD_MAX_IOPS (10U) -/* Montonically decreasing minimum length filters after index 0 */ +/* Monotonically decreasing minimum length filters after index 0 */ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { 0, BTRFS_ASYNC_DISCARD_MAX_FILTER, @@ -146,10 +147,11 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, return running; } -/** - * find_next_block_group - find block_group that's up next for discarding - * @discard_ctl: discard control - * @now: current time +/* + * Find block_group that's up next for discarding. + * + * @discard_ctl: discard control + * @now: current time * * Iterate over the discard lists to find the next block_group up for * discarding checking the discard_eligible_time of block_group. @@ -184,17 +186,17 @@ static struct btrfs_block_group *find_next_block_group( return ret_block_group; } -/** - * Wrap find_next_block_group() +/* + * Look up next block group and set it for use. * * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management * @now: time when discard was invoked, in ns * - * This wraps find_next_block_group() and sets the block_group to be in use. - * discard_state's control flow is managed here. Variables related to - * discard_state are reset here as needed (eg discard_cursor). @discard_state + * Wrap find_next_block_group() and set the block_group to be in use. + * @discard_state's control flow is managed here. Variables related to + * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state * and @discard_index are remembered as it may change while we're discarding, * but we want the discard to execute in the context determined here. */ @@ -233,10 +235,11 @@ again: return block_group; } -/** - * btrfs_discard_check_filter - updates a block groups filters - * @block_group: block group of interest - * @bytes: recently freed region size after coalescing +/* + * Update a block group's filters. + * + * @block_group: block group of interest + * @bytes: recently freed region size after coalescing * * Async discard maintains multiple lists with progressively smaller filters * to prioritize discarding based on size. Should a free space that matches @@ -271,8 +274,9 @@ void btrfs_discard_check_filter(struct btrfs_block_group *block_group, } } -/** - * btrfs_update_discard_index - moves a block group along the discard lists +/* + * Move a block group along the discard lists. + * * @discard_ctl: discard control * @block_group: block_group of interest * @@ -291,13 +295,14 @@ static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, add_to_discard_list(discard_ctl, block_group); } -/** - * btrfs_discard_cancel_work - remove a block_group from the discard lists +/* + * Remove a block_group from the discard lists. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This removes @block_group from the discard lists. If necessary, it waits on - * the current work and then reschedules the delayed work. + * Remove @block_group from the discard lists. If necessary, wait on the + * current work and then reschedule the delayed work. */ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -308,12 +313,13 @@ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, } } -/** - * btrfs_discard_queue_work - handles queuing the block_groups +/* + * Handles queuing the block_groups. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This maintains the LRU order of the discard lists. + * Maintain the LRU order of the discard lists. */ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -383,7 +389,8 @@ static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, } /* - * btrfs_discard_schedule_work - responsible for scheduling the discard work + * Responsible for scheduling the discard work. + * * @discard_ctl: discard control * @override: override the current timer * @@ -401,15 +408,16 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, spin_unlock(&discard_ctl->lock); } -/** - * btrfs_finish_discard_pass - determine next step of a block_group +/* + * Determine next step of a block_group. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This determines the next step for a block group after it's finished going - * through a pass on a discard list. If it is unused and fully trimmed, we can - * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto - * the appropriate filter list or let it fall off. + * Determine the next step for a block group after it's finished going through + * a pass on a discard list. If it is unused and fully trimmed, we can mark it + * unused and send it to the unused_bgs path. Otherwise, pass it onto the + * appropriate filter list or let it fall off. */ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -426,12 +434,13 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, } } -/** - * btrfs_discard_workfn - discard work function +/* + * Discard work queue callback + * * @work: work * - * This finds the next block_group to start discarding and then discards a - * single region. It does this in a two-pass fashion: first extents and second + * Find the next block_group to start discarding and then discard a single + * region. It does this in a two-pass fashion: first extents and second * bitmaps. Completely discarded block groups are sent to the unused_bgs path. */ static void btrfs_discard_workfn(struct work_struct *work) @@ -507,11 +516,12 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_unlock(&discard_ctl->lock); } -/** - * btrfs_run_discard_work - determines if async discard should be running +/* + * Determine if async discard should be running. + * * @discard_ctl: discard control * - * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. */ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) { @@ -523,8 +533,9 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); } -/** - * btrfs_discard_calc_delay - recalculate the base delay +/* + * Recalculate the base delay. + * * @discard_ctl: discard control * * Recalculate the base delay which is based off the total number of @@ -545,7 +556,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) spin_lock(&discard_ctl->lock); /* - * The following is to fix a potential -1 discrepenancy that we're not + * The following is to fix a potential -1 discrepancy that we're not * sure how to reproduce. But given that this is the only place that * utilizes these numbers and this is only called by from * btrfs_finish_extent_commit() which is synchronized, we can correct @@ -578,13 +589,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) spin_unlock(&discard_ctl->lock); } -/** - * btrfs_discard_update_discardable - propagate discard counters +/* + * Propagate discard counters. + * * @block_group: block_group of interest * - * This propagates deltas of counters up to the discard_ctl. It maintains a - * current counter and a previous counter passing the delta up to the global - * stat. Then the current counter value becomes the previous counter value. + * Propagate deltas of counters up to the discard_ctl. It maintains a current + * counter and a previous counter passing the delta up to the global stat. + * Then the current counter value becomes the previous counter value. */ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) { @@ -619,8 +631,9 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) } } -/** - * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists +/* + * Punt unused_bgs list to discard lists. + * * @fs_info: fs_info of interest * * The unused_bgs list needs to be punted to the discard lists because the @@ -644,8 +657,9 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->unused_bgs_lock); } -/** - * btrfs_discard_purge_list - purge discard lists +/* + * Purge discard lists. + * * @discard_ctl: discard control * * If we are disabling async discard, we may have intercepted block groups that diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d99bf7c64611..0888d484df80 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -23,7 +23,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "print-tree.h" #include "locking.h" #include "tree-log.h" @@ -43,6 +43,15 @@ #include "space-info.h" #include "zoned.h" #include "subpage.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "uuid-tree.h" +#include "relocation.h" +#include "scrub.h" +#include "super.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -75,12 +84,12 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) * just before they are sent down the IO stack. */ struct async_submit_bio { - struct inode *inode; + struct btrfs_inode *inode; struct bio *bio; - extent_submit_bio_start_t *submit_bio_start; + enum btrfs_wq_submit_cmd submit_cmd; int mirror_num; - /* Optional parameter for submit_bio_start used by direct io */ + /* Optional parameter for used by direct io */ u64 dio_file_offset; struct btrfs_work work; blk_status_t status; @@ -246,40 +255,54 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, return ret; } +static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + u64 start = eb->start; + int i, num_pages = num_extent_pages(eb); + int ret = 0; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE, + start, p, start - page_offset(p), mirror_num); + if (ret) + break; + start += PAGE_SIZE; + } + + return ret; +} + /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. * - * @parent_transid: expected transid, skip check if 0 - * @level: expected level, mandatory check - * @first_key: expected key of first slot, skip check if NULL + * @check: expected tree parentness check, see the comments of the + * structure for details. */ int btrfs_read_extent_buffer(struct extent_buffer *eb, - u64 parent_transid, int level, - struct btrfs_key *first_key) + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *io_tree; int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; int failed_mirror = 0; - io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + ASSERT(check); + while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num); - if (!ret) { - if (verify_parent_transid(io_tree, eb, - parent_transid, 0)) - ret = -EIO; - else if (btrfs_verify_level_key(eb, level, - first_key, parent_transid)) - ret = -EUCLEAN; - else - break; - } + ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + if (!ret) + break; num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); @@ -455,7 +478,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb) } /* Do basic extent buffer checks at read time */ -static int validate_extent_buffer(struct extent_buffer *eb) +static int validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -465,6 +489,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) const u8 *header_csum; int ret = 0; + ASSERT(check); + found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { btrfs_err_rl(fs_info, @@ -503,6 +529,45 @@ static int validate_extent_buffer(struct extent_buffer *eb) goto out; } + if (found_level != check->level) { + ret = -EIO; + goto out; + } + if (unlikely(check->transid && + btrfs_header_generation(eb) != check->transid)) { + btrfs_err_rl(eb->fs_info, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, check->transid, + btrfs_header_generation(eb)); + ret = -EIO; + goto out; + } + if (check->has_first_key) { + struct btrfs_key *expect_key = &check->first_key; + struct btrfs_key found_key; + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) { + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, check->transid, + expect_key->objectid, + expect_key->type, expect_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + ret = -EUCLEAN; + goto out; + } + } + if (check->owner_root) { + ret = btrfs_check_eb_owner(eb, check->owner_root); + if (ret < 0) + goto out; + } + /* * If this is a leaf block and it is corrupt, set the corrupt bit so * that we don't try and read the other copies of this block, just @@ -527,13 +592,15 @@ out: } static int validate_subpage_buffer(struct page *page, u64 start, u64 end, - int mirror) + int mirror, struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct extent_buffer *eb; bool reads_done; int ret = 0; + ASSERT(check); + /* * We don't allow bio merge for subpage metadata read, so we should * only get one eb for each endio hook. @@ -557,7 +624,7 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end, ret = -EIO; goto err; } - ret = validate_extent_buffer(eb); + ret = validate_extent_buffer(eb, check); if (ret < 0) goto err; @@ -587,7 +654,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ASSERT(page->private); if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return validate_subpage_buffer(page, start, end, mirror); + return validate_subpage_buffer(page, start, end, mirror, + &bbio->parent_check); eb = (struct extent_buffer *)page->private; @@ -606,7 +674,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ret = -EIO; goto err; } - ret = validate_extent_buffer(eb); + ret = validate_extent_buffer(eb, &bbio->parent_check); err: if (ret) { /* @@ -628,8 +696,18 @@ static void run_one_async_start(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->bio, - async->dio_file_offset); + switch (async->submit_cmd) { + case WQ_SUBMIT_METADATA: + ret = btree_submit_bio_start(async->bio); + break; + case WQ_SUBMIT_DATA: + ret = btrfs_submit_bio_start(async->inode, async->bio); + break; + case WQ_SUBMIT_DATA_DIO: + ret = btrfs_submit_bio_start_direct_io(async->inode, + async->bio, async->dio_file_offset); + break; + } if (ret) async->status = ret; } @@ -646,7 +724,7 @@ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); - struct inode *inode = async->inode; + struct btrfs_inode *inode = async->inode; struct btrfs_bio *bbio = btrfs_bio(async->bio); /* If an error occurred we just want to clean up the bio and move on */ @@ -661,7 +739,7 @@ static void run_one_async_done(struct btrfs_work *work) * This changes nothing when cgroups aren't in use. */ async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); + btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -679,11 +757,10 @@ static void run_one_async_free(struct btrfs_work *work) * - true if the work has been succesfuly submitted * - false in case of error */ -bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start) +bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); @@ -693,7 +770,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, async->inode = inode; async->bio = bio; async->mirror_num = mirror_num; - async->submit_bio_start = submit_bio_start; + async->submit_cmd = cmd; btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); @@ -727,8 +804,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btree_submit_bio_start(struct bio *bio) { /* * when we're called for a write, we're already in the async @@ -749,13 +825,14 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, return true; } -void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) +void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t ret; bio->bi_opf |= REQ_META; + bbio->is_metadata = 1; if (btrfs_op(bio) != BTRFS_MAP_WRITE) { btrfs_submit_bio(fs_info, bio, mirror_num); @@ -766,8 +843,8 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * Kthread helpers are used to submit writes so that checksumming can * happen in parallel across all CPUs. */ - if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start)) + if (should_async_write(fs_info, inode) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); @@ -919,28 +996,28 @@ struct extent_buffer *btrfs_find_create_tree_block( * Read tree block at logical address @bytenr and do variant basic but critical * verification. * - * @owner_root: the objectid of the root owner for this block. - * @parent_transid: expected transid of this tree block, skip check if 0 - * @level: expected level, mandatory check - * @first_key: expected key in slot 0, skip check if NULL + * @check: expected tree parentness check, see comments of the + * structure for details. */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, u64 parent_transid, - int level, struct btrfs_key *first_key) + struct btrfs_tree_parent_check *check) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); + ASSERT(check); + + buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root, + check->level); if (IS_ERR(buf)) return buf; - ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key); + ret = btrfs_read_extent_buffer(buf, check); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } - if (btrfs_check_eb_owner(buf, owner_root)) { + if (btrfs_check_eb_owner(buf, check->owner_root)) { free_extent_buffer_stale(buf); return ERR_PTR(-EUCLEAN); } @@ -1027,9 +1104,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, - IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); + IO_TREE_ROOT_DIRTY_LOG_PAGES); extent_io_tree_init(fs_info, &root->log_csum_range, - IO_TREE_LOG_CSUM_RANGE, NULL); + IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); @@ -1167,6 +1244,13 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) return btrfs_global_root(fs_info, &key); } +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) + return fs_info->block_group_root; + return btrfs_extent_root(fs_info, 0); +} + struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { @@ -1197,7 +1281,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; - goto fail_unlock; + goto fail; } root->node = leaf; @@ -1232,9 +1316,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return root; -fail_unlock: - if (leaf) - btrfs_tree_unlock(leaf); fail: btrfs_put_root(root); @@ -1352,6 +1433,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_key *key) { struct btrfs_root *root; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_fs_info *fs_info = tree_root->fs_info; u64 generation; int ret; @@ -1371,9 +1453,11 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); level = btrfs_root_level(&root->root_item); - root->node = read_tree_block(fs_info, - btrfs_root_bytenr(&root->root_item), - key->objectid, generation, level, NULL); + check.level = level; + check.transid = generation; + check.owner_root = key->objectid; + root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), + &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; @@ -2084,8 +2168,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); - if (fs_info->endio_raid56_workers) - destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); if (fs_info->compressed_write_workers) @@ -2231,7 +2313,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO, NULL); + IO_TREE_BTREE_INODE_IO); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); @@ -2291,8 +2373,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = alloc_workqueue("btrfs-endio-meta", flags, max_active); - fs_info->endio_raid56_workers = - alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, @@ -2314,7 +2394,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->compressed_write_workers && - fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_write_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && fs_info->delayed_workers && fs_info->qgroup_rescan_workers && @@ -2350,6 +2430,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { int ret; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); @@ -2365,10 +2446,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (!log_tree_root) return -ENOMEM; - log_tree_root->node = read_tree_block(fs_info, bytenr, - BTRFS_TREE_LOG_OBJECTID, - fs_info->generation + 1, level, - NULL); + check.level = level; + check.transid = fs_info->generation + 1; + check.owner_root = BTRFS_TREE_LOG_OBJECTID; + log_tree_root->node = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2846,10 +2927,14 @@ out: static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) { + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen, + .owner_root = root->root_key.objectid + }; int ret = 0; - root->node = read_tree_block(root->fs_info, bytenr, - root->root_key.objectid, gen, level, NULL); + root->node = read_tree_block(root->fs_info, bytenr, &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; @@ -3057,7 +3142,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->block_group_cache_tree = RB_ROOT_CACHED; extent_io_tree_init(fs_info, &fs_info->excluded_extents, - IO_TREE_FS_EXCLUDED_EXTENTS, NULL); + IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -3743,10 +3828,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Mount does not set all options immediately, we can do it now and do - * not have to wait for transaction commit + * For devices supporting discard turn on discard=async automatically, + * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. */ - btrfs_apply_pending_changes(fs_info); + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable) { + btrfs_set_and_info(fs_info, DISCARD_ASYNC, + "auto enabling async discard"); + btrfs_clear_opt(fs_info->mount_opt, NODISCARD); + } #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { @@ -3875,7 +3968,7 @@ static void btrfs_end_super_write(struct bio *bio) if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, "lost page write due to IO error on %s (%d)", - rcu_str_deref(device->name), + btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); ClearPageUptodate(page); SetPageError(page); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9fa923e005a3..363935cfc084 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -27,14 +27,14 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; +struct btrfs_tree_parent_check; void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, u64 parent_transid, - int level, struct btrfs_key *first_key); + struct btrfs_tree_parent_check *check); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, @@ -75,6 +75,7 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key); struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); @@ -85,7 +86,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -106,24 +107,22 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) return NULL; } -static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) -{ - if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) - return fs_info->block_group_root; - return btrfs_extent_root(fs_info, 0); -} - void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, - int level, struct btrfs_key *first_key); -bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start); -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num); +int btrfs_read_extent_buffer(struct extent_buffer *buf, + struct btrfs_tree_parent_check *check); + +enum btrfs_wq_submit_cmd { + WQ_SUBMIT_METADATA, + WQ_SUBMIT_DATA, + WQ_SUBMIT_DATA_DIO, +}; + +bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); +blk_status_t btree_submit_bio_start(struct bio *bio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, @@ -136,8 +135,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); -int btree_lock_page_hook(struct page *page, void *data, - void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index fab7eb76e53b..744a02b7fd67 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -7,6 +7,8 @@ #include "btrfs_inode.h" #include "print-tree.h" #include "export.h" +#include "accessors.h" +#include "super.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) @@ -57,9 +59,20 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return type; } +/* + * Read dentry of inode with @objectid from filesystem root @root_objectid. + * + * @sb: the filesystem super block + * @objectid: inode objectid + * @root_objectid: object id of the subvolume root where to look up the inode + * @generation: optional, if not zero, verify that the found inode + * generation matches + * + * Return dentry alias for the inode, otherwise an error. In case the + * generation does not match return ESTALE. + */ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u64 generation, - int check_generation) + u64 root_objectid, u64 generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; @@ -77,7 +90,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (check_generation && generation != inode->i_generation) { + if (generation != 0 && generation != inode->i_generation) { iput(inode); return ERR_PTR(-ESTALE); } @@ -106,7 +119,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, objectid = fid->parent_objectid; generation = fid->parent_gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + return btrfs_get_dentry(sb, objectid, root_objectid, generation); } static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, @@ -128,7 +141,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, root_objectid = fid->root_objectid; generation = fid->gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + return btrfs_get_dentry(sb, objectid, root_objectid, generation); } struct dentry *btrfs_get_parent(struct dentry *child) @@ -188,7 +201,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { return btrfs_get_dentry(fs_info->sb, key.objectid, - found_key.offset, 0, 0); + found_key.offset, 0); } return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root)); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 5afb7ca42828..eba6bc4f5a61 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -19,8 +19,7 @@ struct btrfs_fid { } __attribute__ ((packed)); struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u64 generation, - int check_generation); + u64 root_objectid, u64 generation); struct dentry *btrfs_get_parent(struct dentry *child); #endif diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 83cb0378096f..9ae9cd1e7035 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -2,6 +2,7 @@ #include <linux/slab.h> #include <trace/events/btrfs.h> +#include "messages.h" #include "ctree.h" #include "extent-io-tree.h" #include "btrfs_inode.h" @@ -57,17 +58,17 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct inode *inode = tree->private_data; + struct btrfs_inode *inode = tree->inode; u64 isize; if (!inode) return; - isize = i_size_read(inode); + isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + btrfs_debug_rl(inode->root->fs_info, "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + caller, btrfs_ino(inode), isize, start, end); } } #else @@ -93,13 +94,12 @@ struct tree_entry { }; void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data) + struct extent_io_tree *tree, unsigned int owner) { tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->private_data = private_data; + tree->inode = NULL; tree->owner = owner; if (owner == IO_TREE_INODE_FILE_EXTENT) lockdep_set_class(&tree->lock, &file_extent_tree_class); @@ -346,9 +346,8 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) other = prev_state(state); if (other && other->end == state->start - 1 && other->state == state->state) { - if (tree->private_data) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -357,9 +356,8 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) other = next_state(state); if (other && other->start == state->end + 1 && other->state == state->state) { - if (tree->private_data) - btrfs_merge_delalloc_extent(tree->private_data, state, - other); + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -374,8 +372,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (tree->private_data) - btrfs_set_delalloc_extent(tree->private_data, state, bits); + if (tree->inode) + btrfs_set_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -397,7 +395,7 @@ static int insert_state(struct extent_io_tree *tree, u32 bits, struct extent_changeset *changeset) { struct rb_node **node; - struct rb_node *parent; + struct rb_node *parent = NULL; const u64 end = state->end; set_state_bits(tree, state, bits, changeset); @@ -462,8 +460,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (tree->private_data) - btrfs_split_delalloc_extent(tree->private_data, orig, split); + if (tree->inode) + btrfs_split_delalloc_extent(tree->inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -510,8 +508,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (tree->private_data) - btrfs_clear_delalloc_extent(tree->private_data, state, bits); + if (tree->inode) + btrfs_clear_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); @@ -572,7 +570,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) clear = 1; again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -636,7 +634,8 @@ hit_next: if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, start); if (err) extent_io_tree_panic(tree, err); @@ -657,7 +656,8 @@ hit_next: */ if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) extent_io_tree_panic(tree, err); @@ -714,7 +714,8 @@ static void wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; @@ -722,6 +723,16 @@ void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) spin_lock(&tree->lock); again: + /* + * Maintain cached_state, as we may not remove it from the tree if there + * are more bits than the bits we're waiting on set on this state. + */ + if (cached_state && *cached_state) { + state = *cached_state; + if (extent_state_in_tree(state) && + state->start <= start && start < state->end) + goto process_node; + } while (1) { /* * This search will find all the extents that end after our @@ -752,6 +763,12 @@ process_node: } } out: + /* This state is no longer useful, clear it and free it up. */ + if (cached_state && *cached_state) { + state = *cached_state; + *cached_state = NULL; + free_extent_state(state); + } spin_unlock(&tree->lock); } @@ -939,13 +956,17 @@ out: * sleeping, so the gfp mask is used to indicate what is allowed. * * If any of the exclusive bits are set, this will fail with -EEXIST if some - * part of the range already has the desired bits set. The start of the - * existing range is returned in failed_start in this case. + * part of the range already has the desired bits set. The extent_state of the + * existing range is returned in failed_state in this case, and the start of the + * existing range is returned in failed_start. failed_state is used as an + * optimization for wait_extent_bit, failed_start must be used as the source of + * truth as failed_state may have changed since we returned. * * [start, end] is inclusive This takes the tree lock. */ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u64 *failed_start, + struct extent_state **failed_state, struct extent_state **cached_state, struct extent_changeset *changeset, gfp_t mask) { @@ -964,9 +985,9 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (exclusive_bits) ASSERT(failed_start); else - ASSERT(failed_start == NULL); + ASSERT(failed_start == NULL && failed_state == NULL); again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -991,7 +1012,8 @@ again: state = tree_search_for_insert(tree, start, &p, &parent); if (!state) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; prealloc->start = start; prealloc->end = end; insert_state_fast(tree, prealloc, p, parent, bits, changeset); @@ -1012,6 +1034,7 @@ hit_next: if (state->start == start && state->end <= end) { if (state->state & exclusive_bits) { *failed_start = state->start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1047,6 +1070,7 @@ hit_next: if (state->start < start) { if (state->state & exclusive_bits) { *failed_start = start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1062,7 +1086,8 @@ hit_next: } prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, start); if (err) extent_io_tree_panic(tree, err); @@ -1099,7 +1124,8 @@ hit_next: this_end = last_start - 1; prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; /* * Avoid to free 'prealloc' if it can be merged with the later @@ -1125,12 +1151,14 @@ hit_next: if (state->start <= end && state->end > end) { if (state->state & exclusive_bits) { *failed_start = start; + cache_state(state, failed_state); err = -EEXIST; goto out; } prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) extent_io_tree_panic(tree, err); @@ -1162,8 +1190,8 @@ out: int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached_state, gfp_t mask) { - return __set_extent_bit(tree, start, end, bits, NULL, cached_state, - NULL, mask); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, + cached_state, NULL, mask); } /* @@ -1397,7 +1425,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; - struct extent_state *prev = NULL, *next; + struct extent_state *prev = NULL, *next = NULL; spin_lock(&tree->lock); @@ -1487,15 +1515,37 @@ out: } /* - * Count the number of bytes in the tree that have a given bit(s) set. This - * can be fairly slow, except for EXTENT_DIRTY which is cached. The total - * number found is returned. + * Count the number of bytes in the tree that have a given bit(s) set for a + * given range. + * + * @tree: The io tree to search. + * @start: The start offset of the range. This value is updated to the + * offset of the first byte found with the given bit(s), so it + * can end up being bigger than the initial value. + * @search_end: The end offset (inclusive value) of the search range. + * @max_bytes: The maximum byte count we are interested. The search stops + * once it reaches this count. + * @bits: The bits the range must have in order to be accounted for. + * If multiple bits are set, then only subranges that have all + * the bits set are accounted for. + * @contig: Indicate if we should ignore holes in the range or not. If + * this is true, then stop once we find a hole. + * @cached_state: A cached state to be used across multiple calls to this + * function in order to speedup searches. Use NULL if this is + * called only once or if each call does not start where the + * previous one ended. + * + * Returns the total number of bytes found within the given range that have + * all given bits set. If the returned number of bytes is greater than zero + * then @start is updated with the offset of the first byte with the bits set. */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig) + u32 bits, int contig, + struct extent_state **cached_state) { - struct extent_state *state; + struct extent_state *state = NULL; + struct extent_state *cached; u64 cur_start = *start; u64 total_bytes = 0; u64 last = 0; @@ -1506,11 +1556,41 @@ u64 count_range_bits(struct extent_io_tree *tree, spin_lock(&tree->lock); + if (!cached_state || !*cached_state) + goto search; + + cached = *cached_state; + + if (!extent_state_in_tree(cached)) + goto search; + + if (cached->start <= cur_start && cur_start <= cached->end) { + state = cached; + } else if (cached->start > cur_start) { + struct extent_state *prev; + + /* + * The cached state starts after our search range's start. Check + * if the previous state record starts at or before the range we + * are looking for, and if so, use it - this is a common case + * when there are holes between records in the tree. If there is + * no previous state record, we can start from our cached state. + */ + prev = prev_state(cached); + if (!prev) + state = cached; + else if (prev->start <= cur_start && cur_start <= prev->end) + state = prev; + } + /* * This search will find all the extents that end after our range * starts. */ - state = tree_search(tree, cur_start); +search: + if (!state) + state = tree_search(tree, cur_start); + while (state) { if (state->start > search_end) break; @@ -1531,7 +1611,16 @@ u64 count_range_bits(struct extent_io_tree *tree, } state = next_state(state); } + + if (cached_state) { + free_extent_state(*cached_state); + *cached_state = state; + if (state) + refcount_inc(&state->refs); + } + spin_unlock(&tree->lock); + return total_bytes; } @@ -1598,8 +1687,8 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCKED)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset, - GFP_NOFS); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, + changeset, GFP_NOFS); } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -1615,17 +1704,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, changeset); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { int err; u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - NULL, NULL, GFP_NOFS); + NULL, cached, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, NULL); + EXTENT_LOCKED, cached); return 0; } return 1; @@ -1638,20 +1728,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { + struct extent_state *failed_state = NULL; int err; u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - cached_state, NULL, GFP_NOFS); + &failed_state, cached_state, NULL, GFP_NOFS); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, EXTENT_LOCKED, cached_state); - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, + &failed_state); err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, - &failed_start, cached_state, NULL, - GFP_NOFS); + &failed_start, &failed_state, + cached_state, NULL, GFP_NOFS); } return err; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index a855f40dd61d..e3eeec380844 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,43 +3,48 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include "misc.h" + struct extent_changeset; struct io_failure_record; /* Bits for the extent state */ -#define EXTENT_DIRTY (1U << 0) -#define EXTENT_UPTODATE (1U << 1) -#define EXTENT_LOCKED (1U << 2) -#define EXTENT_NEW (1U << 3) -#define EXTENT_DELALLOC (1U << 4) -#define EXTENT_DEFRAG (1U << 5) -#define EXTENT_BOUNDARY (1U << 6) -#define EXTENT_NODATASUM (1U << 7) -#define EXTENT_CLEAR_META_RESV (1U << 8) -#define EXTENT_NEED_WAIT (1U << 9) -#define EXTENT_NORESERVE (1U << 11) -#define EXTENT_QGROUP_RESERVED (1U << 12) -#define EXTENT_CLEAR_DATA_RESV (1U << 13) -/* - * Must be cleared only during ordered extent completion or on error paths if we - * did not manage to submit bios and create the ordered extents for the range. - * Should not be cleared during page release and page invalidation (if there is - * an ordered extent in flight), that is left for the ordered extent completion. - */ -#define EXTENT_DELALLOC_NEW (1U << 14) -/* - * When an ordered extent successfully completes for a region marked as a new - * delalloc range, use this flag when clearing a new delalloc range to indicate - * that the VFS' inode number of bytes should be incremented and the inode's new - * delalloc bytes decremented, in an atomic way to prevent races with stat(2). - */ -#define EXTENT_ADD_INODE_BYTES (1U << 15) - -/* - * Set during truncate when we're clearing an entire range and we just want the - * extent states to go away. - */ -#define EXTENT_CLEAR_ALL_BITS (1U << 16) +enum { + ENUM_BIT(EXTENT_DIRTY), + ENUM_BIT(EXTENT_UPTODATE), + ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_NEW), + ENUM_BIT(EXTENT_DELALLOC), + ENUM_BIT(EXTENT_DEFRAG), + ENUM_BIT(EXTENT_BOUNDARY), + ENUM_BIT(EXTENT_NODATASUM), + ENUM_BIT(EXTENT_CLEAR_META_RESV), + ENUM_BIT(EXTENT_NEED_WAIT), + ENUM_BIT(EXTENT_NORESERVE), + ENUM_BIT(EXTENT_QGROUP_RESERVED), + ENUM_BIT(EXTENT_CLEAR_DATA_RESV), + /* + * Must be cleared only during ordered extent completion or on error + * paths if we did not manage to submit bios and create the ordered + * extents for the range. Should not be cleared during page release + * and page invalidation (if there is an ordered extent in flight), + * that is left for the ordered extent completion. + */ + ENUM_BIT(EXTENT_DELALLOC_NEW), + /* + * When an ordered extent successfully completes for a region marked as + * a new delalloc range, use this flag when clearing a new delalloc + * range to indicate that the VFS' inode number of bytes should be + * incremented and the inode's new delalloc bytes decremented, in an + * atomic way to prevent races with stat(2). + */ + ENUM_BIT(EXTENT_ADD_INODE_BYTES), + /* + * Set during truncate when we're clearing an entire range and we just + * want the extent states to go away. + */ + ENUM_BIT(EXTENT_CLEAR_ALL_BITS), +}; #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) @@ -75,7 +80,8 @@ enum { struct extent_io_tree { struct rb_root state; struct btrfs_fs_info *fs_info; - void *private_data; + /* Inode associated with this tree, or NULL. */ + struct btrfs_inode *inode; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; @@ -99,21 +105,22 @@ struct extent_state { }; void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data); + struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached); -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); int __init extent_state_init_cachep(void); void __cold extent_state_free_cachep(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig); + u64 max_bytes, u32 bits, int contig, + struct extent_state **cached_state); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -139,13 +146,6 @@ static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, GFP_NOFS, NULL); } -static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, - GFP_ATOMIC, NULL); -} - static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { @@ -217,13 +217,6 @@ static inline int set_extent_new(struct extent_io_tree *tree, u64 start, return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); } -static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, mask); -} - int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state); @@ -234,6 +227,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits); +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state); #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2801c991814f..892d78c1853c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -36,6 +36,13 @@ #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "file-item.h" +#include "orphan.h" +#include "tree-checker.h" #undef SCRAMBLE_DELAYED_REFS @@ -5255,8 +5262,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; - struct btrfs_key first_key; struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; @@ -5278,7 +5285,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - btrfs_node_key_to_cpu(path->nodes[level], &first_key, + + check.level = level - 1; + check.transid = generation; + check.owner_root = root->root_key.objectid; + check.has_first_key = true; + btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); next = find_extent_buffer(fs_info, bytenr); @@ -5340,8 +5352,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, root->root_key.objectid, - generation, level - 1, &first_key); + next = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { @@ -5973,40 +5984,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. - */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - struct btrfs_block_group *block_group; - u64 free_bytes = 0; - int factor; - - /* It's df, we don't care if it's racy */ - if (list_empty(&sinfo->ro_bgs)) - return 0; - - spin_lock(&sinfo->lock); - list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { - spin_lock(&block_group->lock); - - if (!block_group->ro) { - spin_unlock(&block_group->lock); - continue; - } - - factor = btrfs_bg_type_to_factor(block_group->flags); - free_bytes += (block_group->length - - block_group->used) * factor; - - spin_unlock(&block_group->lock); - } - spin_unlock(&sinfo->lock); - - return free_bytes; -} - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { @@ -6072,7 +6049,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, - rcu_str_deref(device->name), + btrfs_dev_name(device), device->total_bytes); mutex_unlock(&fs_info->chunk_mutex); ret = 0; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000000..ae5425253603 --- /dev/null +++ b/fs/btrfs/extent-tree.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_TREE_H +#define BTRFS_EXTENT_TREE_H + +enum btrfs_inline_ref_type { + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, + BTRFS_REF_TYPE_DATA, + BTRFS_REF_TYPE_ANY, +}; + +int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, + struct btrfs_extent_inline_ref *iref, + enum btrfs_inline_ref_type is_data); +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); + +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes); +void btrfs_free_excluded_extents(struct btrfs_block_group *cache); +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 offset, int metadata, u64 *refs, u64 *flags); +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, + int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct extent_buffer *eb); +int btrfs_cross_ref_exist(struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + const struct btrfs_disk_key *key, + int level, u64 hint, + u64 empty_size, + enum btrfs_lock_nesting nest); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 owner, + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, + u64 min_alloc_size, u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct extent_buffer *eb, u64 flags, int level); +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); + +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc); +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, + int for_reloc); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); + +#endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dcf22e051ff..83dd3aa59663 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,7 +20,7 @@ #include "extent_map.h" #include "ctree.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "check-integrity.h" #include "locking.h" #include "rcu-string.h" @@ -30,6 +30,12 @@ #include "zoned.h" #include "block-group.h" #include "compression.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" +#include "file.h" +#include "dev-replace.h" +#include "super.h" static struct kmem_cache *extent_buffer_cache; @@ -96,24 +102,22 @@ struct btrfs_bio_ctrl { u32 len_to_stripe_boundary; u32 len_to_oe_boundary; btrfs_bio_end_io_t end_io_func; -}; -struct extent_page_data { - struct btrfs_bio_ctrl bio_ctrl; - /* tells writepage not to lock the state bits for this range - * it still does the unlocking + /* + * Tell writepage not to lock the state bits for this range, it still + * does the unlocking. */ - unsigned int extent_locked:1; + bool extent_locked; - /* tells the submit_bio code to use REQ_SYNC */ - unsigned int sync_io:1; + /* Tell the submit_bio code to use REQ_SYNC */ + bool sync_io; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; struct bio_vec *bv; - struct inode *inode; + struct btrfs_inode *inode; int mirror_num; if (!bio_ctrl->bio) @@ -121,7 +125,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio = bio_ctrl->bio; bv = bio_first_bvec_all(bio); - inode = bv->bv_page->mapping->host; + inode = BTRFS_I(bv->bv_page->mapping->host); mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ @@ -129,7 +133,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - if (!is_data_inode(inode)) + if (!is_data_inode(&inode->vfs_inode)) btrfs_submit_metadata_bio(inode, bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_submit_data_write_bio(inode, bio, mirror_num); @@ -142,11 +146,11 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) } /* - * Submit or fail the current bio in an extent_page_data structure. + * Submit or fail the current bio in the bio_ctrl structure. */ -static void submit_write_bio(struct extent_page_data *epd, int ret) +static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) { - struct bio *bio = epd->bio_ctrl.bio; + struct bio *bio = bio_ctrl->bio; if (!bio) return; @@ -155,9 +159,9 @@ static void submit_write_bio(struct extent_page_data *epd, int ret) ASSERT(ret < 0); btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); /* The bio is owned by the end_io handler now */ - epd->bio_ctrl.bio = NULL; + bio_ctrl->bio = NULL; } else { - submit_one_bio(&epd->bio_ctrl); + submit_one_bio(bio_ctrl); } } @@ -527,120 +531,6 @@ static void free_io_failure(struct btrfs_inode *inode, kfree(rec); } -/* - * this bypasses the standard btrfs submit functions deliberately, as - * the standard behavior is to write all copies in a raid setup. here we only - * want to write the one bad copy. so we do the mapping for ourselves and issue - * submit_bio directly. - * to avoid any synchronization issues, wait for the data after writing, which - * actually prevents the read that triggered the error from finishing. - * currently, there can be no more than two copies of every data bit. thus, - * exactly one rewrite is required. - */ -static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) -{ - struct btrfs_device *dev; - struct bio_vec bvec; - struct bio bio; - u64 map_length = 0; - u64 sector; - struct btrfs_io_context *bioc = NULL; - int ret = 0; - - ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); - BUG_ON(!mirror_num); - - if (btrfs_repair_one_zone(fs_info, logical)) - return 0; - - map_length = length; - - /* - * Avoid races with device replace and make sure our bioc has devices - * associated to its stripes that don't go away while we are doing the - * read repair operation. - */ - btrfs_bio_counter_inc_blocked(fs_info); - if (btrfs_is_parity_mirror(fs_info, logical, length)) { - /* - * Note that we don't use BTRFS_MAP_WRITE because it's supposed - * to update all raid stripes, but here we just want to correct - * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad - * stripe's dev and sector. - */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bioc, 0); - if (ret) - goto out_counter_dec; - ASSERT(bioc->mirror_num == 1); - } else { - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bioc, mirror_num); - if (ret) - goto out_counter_dec; - BUG_ON(mirror_num != bioc->mirror_num); - } - - sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - dev = bioc->stripes[bioc->mirror_num - 1].dev; - btrfs_put_bioc(bioc); - - if (!dev || !dev->bdev || - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - ret = -EIO; - goto out_counter_dec; - } - - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = sector; - __bio_add_page(&bio, page, length, pg_offset); - - btrfsic_check_bio(&bio); - ret = submit_bio_wait(&bio); - if (ret) { - /* try to remap that extent elsewhere? */ - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - goto out_bio_uninit; - } - - btrfs_info_rl_in_rcu(fs_info, - "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, - rcu_str_deref(dev->name), sector); - ret = 0; - -out_bio_uninit: - bio_uninit(&bio); -out_counter_dec: - btrfs_bio_counter_dec(fs_info); - return ret; -} - -int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - u64 start = eb->start; - int i, num_pages = num_extent_pages(eb); - int ret = 0; - - if (sb_rdonly(fs_info->sb)) - return -EROFS; - - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, - start - page_offset(p), mirror_num); - if (ret) - break; - start += PAGE_SIZE; - } - - return ret; -} - static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) { if (cur_mirror == failrec->num_copies) @@ -688,7 +578,7 @@ int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, mirror = failrec->this_mirror; do { mirror = prev_mirror(failrec, mirror); - repair_io_failure(fs_info, ino, start, failrec->len, + btrfs_repair_io_failure(fs_info, ino, start, failrec->len, failrec->logical, page, pg_offset, mirror); } while (mirror != failrec->failed_mirror); @@ -791,13 +681,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, +int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, - submit_bio_hook_t *submit_bio_hook) + bool submit_buffered) { u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; @@ -808,7 +698,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); + failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); @@ -819,14 +709,14 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, * * Since we're only doing repair for one sector, we only need to get * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. + * everything for btrfs_repair_io_failure to do the rest for us. */ failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); if (failrec->this_mirror == failrec->failed_mirror) { btrfs_debug(fs_info, "failed to repair num_copies %d this_mirror %d failed_mirror %d", failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(inode, failrec); return -EIO; } @@ -847,16 +737,21 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, bio_add_page(repair_bio, page, failrec->len, pgoff); repair_bbio->iter = repair_bio->bi_iter; - btrfs_debug(btrfs_sb(inode->i_sb), + btrfs_debug(fs_info, "repair read error: submitting new read to mirror %d", failrec->this_mirror); /* - * At this point we have a bio, so any errors from submit_bio_hook() - * will be handled by the endio on the repair_bio, so we can't return an + * At this point we have a bio, so any errors from bio submission will + * be handled by the endio on the repair_bio, so we can't return an * error here. */ - submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); + if (submit_buffered) + btrfs_submit_data_read_bio(inode, repair_bio, + failrec->this_mirror, 0); + else + btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); + return BLK_STS_OK; } @@ -892,14 +787,9 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); const u32 sectorsize = inode->root->fs_info->sectorsize; - struct extent_state *cached = NULL; end_page_read(page, uptodate, offset, sectorsize); - if (uptodate) - set_extent_uptodate(&inode->io_tree, offset, - offset + sectorsize - 1, &cached, GFP_ATOMIC); - unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1, - &cached); + unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); } static void submit_data_read_repair(struct inode *inode, @@ -945,9 +835,9 @@ static void submit_data_read_repair(struct inode *inode, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bbio, + ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, bio_offset + offset, page, pgoff + offset, - btrfs_submit_data_read_bio); + true); if (!ret) { /* * We have submitted the read repair, the page release @@ -1103,7 +993,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed, * Now we don't have range contiguous to the processed range, release * the processed range now. */ - unlock_extent_atomic(tree, processed->start, processed->end, &cached); + unlock_extent(tree, processed->start, processed->end, &cached); update: /* Update processed to current range */ @@ -1296,7 +1186,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) bio_put(bio); } -/** +/* * Populate every free slot in a provided array with pages. * * @nr_pages: number of pages to allocate @@ -1332,16 +1222,16 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) return 0; } -/** - * Attempt to add a page to bio +/* + * Attempt to add a page to bio. * - * @bio_ctrl: record both the bio, and its bio_flags - * @page: page to add to the bio - * @disk_bytenr: offset of the new bio or to check whether we are adding - * a contiguous page to the previous one - * @size: portion of page that we want to write - * @pg_offset: starting offset in the page - * @compress_type: compression type of the current bio to see if we can merge them + * @bio_ctrl: record both the bio, and its bio_flags + * @page: page to add to the bio + * @disk_bytenr: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @size: portion of page that we want to write + * @pg_offset: starting offset in the page + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. * @@ -1772,13 +1662,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - struct extent_state *cached = NULL; - iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - set_extent_uptodate(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, NULL); end_page_read(page, true, cur, iosize); break; } @@ -1854,13 +1740,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - struct extent_state *cached = NULL; - memzero_page(page, pg_offset, iosize); - set_extent_uptodate(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, NULL); end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; @@ -2065,7 +1947,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, + struct btrfs_bio_ctrl *bio_ctrl, loff_t i_size, int *nr_ret) { @@ -2097,7 +1979,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ wbc->nr_to_write--; - epd->bio_ctrl.end_io_func = end_bio_extent_writepage; + bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; u64 em_end; @@ -2191,7 +2073,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_page_clear_dirty(fs_info, page, cur, iosize); ret = submit_extent_page(op | write_flags, wbc, - &epd->bio_ctrl, disk_bytenr, + bio_ctrl, disk_bytenr, page, iosize, cur - page_offset(page), 0, false); @@ -2231,7 +2113,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * Return <0 for error. */ static int __extent_writepage(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; @@ -2268,7 +2150,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - if (!epd->extent_locked) { + if (!bio_ctrl->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, wbc); if (ret == 1) return 0; @@ -2276,7 +2158,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, + ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size, &nr); if (ret == 1) return 0; @@ -2320,9 +2202,9 @@ done: */ if (PageError(page)) end_extent_writepage(page, ret, page_start, page_end); - if (epd->extent_locked) { + if (bio_ctrl->extent_locked) { /* - * If epd->extent_locked, it's from extent_write_locked_range(), + * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), * the page can either be locked by lock_page() or * process_one_page(). * Let btrfs_page_unlock_writer() handle both cases. @@ -2361,7 +2243,7 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) * Return <0 if something went wrong, no page is locked. */ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; int i, num_pages; @@ -2369,17 +2251,17 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; btrfs_tree_lock(eb); } if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); - if (!epd->sync_io) + if (!bio_ctrl->sync_io) return 0; if (!flush) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; } while (1) { @@ -2426,7 +2308,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; } lock_page(p); @@ -2647,15 +2529,19 @@ static void prepare_eb_write(struct extent_buffer *eb) /* Set btree blocks beyond nritems with 0 to avoid stale content */ nritems = btrfs_header_nritems(eb); if (btrfs_header_level(eb) > 0) { - end = btrfs_node_key_ptr_offset(nritems); + end = btrfs_node_key_ptr_offset(eb, nritems); memzero_extent_buffer(eb, end, eb->len - end); } else { /* * Leaf: * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ - start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); + start = btrfs_item_nr_offset(eb, nritems); + end = btrfs_item_nr_offset(eb, 0); + if (nritems == 0) + end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); + else + end += btrfs_item_offset(eb, nritems - 1); memzero_extent_buffer(eb, start, end - start); } } @@ -2666,7 +2552,7 @@ static void prepare_eb_write(struct extent_buffer *eb) */ static int write_one_subpage_eb(struct extent_buffer *eb, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; @@ -2686,10 +2572,10 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); - epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage; + bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, eb->start, page, eb->len, + bio_ctrl, eb->start, page, eb->len, eb->start - page_offset(page), 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); @@ -2712,7 +2598,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { u64 disk_bytenr = eb->start; int i, num_pages; @@ -2721,7 +2607,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, prepare_eb_write(eb); - epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage; + bio_ctrl->end_io_func = end_bio_extent_buffer_writepage; num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { @@ -2730,7 +2616,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, disk_bytenr, p, + bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0, 0, false); if (ret) { set_btree_ioerr(p, eb); @@ -2773,7 +2659,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, */ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); int submitted = 0; @@ -2826,7 +2712,7 @@ static int submit_eb_subpage(struct page *page, if (!eb) continue; - ret = lock_extent_buffer_for_io(eb, epd); + ret = lock_extent_buffer_for_io(eb, bio_ctrl); if (ret == 0) { free_extent_buffer(eb); continue; @@ -2835,7 +2721,7 @@ static int submit_eb_subpage(struct page *page, free_extent_buffer(eb); goto cleanup; } - ret = write_one_subpage_eb(eb, wbc, epd); + ret = write_one_subpage_eb(eb, wbc, bio_ctrl); free_extent_buffer(eb); if (ret < 0) goto cleanup; @@ -2845,7 +2731,7 @@ static int submit_eb_subpage(struct page *page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - submit_write_bio(epd, ret); + submit_write_bio(bio_ctrl, ret); return ret; } @@ -2870,7 +2756,7 @@ cleanup: * Return <0 for fatal error. */ static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, + struct btrfs_bio_ctrl *bio_ctrl, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; @@ -2882,7 +2768,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc, epd); + return submit_eb_subpage(page, wbc, bio_ctrl); spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { @@ -2925,7 +2811,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, *eb_context = eb; - ret = lock_extent_buffer_for_io(eb, epd); + ret = lock_extent_buffer_for_io(eb, bio_ctrl); if (ret <= 0) { btrfs_revert_meta_write_pointer(cache, eb); if (cache) @@ -2940,7 +2826,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } - ret = write_one_eb(eb, wbc, epd); + ret = write_one_eb(eb, wbc, bio_ctrl); free_extent_buffer(eb); if (ret < 0) return ret; @@ -2951,10 +2837,9 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_buffer *eb_context = NULL; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; @@ -2997,7 +2882,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - ret = submit_eb_page(page, wbc, &epd, &eb_context); + ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -3058,18 +2943,18 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - submit_write_bio(&epd, ret); + submit_write_bio(&bio_ctrl, ret); btrfs_zoned_meta_io_unlock(fs_info); return ret; } -/** +/* * Walk the list of dirty pages of the given address space and write all of them. * - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @epd: holds context for the write, namely the bio + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @bio_ctrl: holds context for the write, namely the bio * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, @@ -3081,7 +2966,7 @@ retry: */ static int extent_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = mapping->host; int ret = 0; @@ -3162,7 +3047,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); lock_page(page); } @@ -3173,7 +3058,7 @@ retry: if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); wait_on_page_writeback(page); } @@ -3183,7 +3068,7 @@ retry: continue; } - ret = __extent_writepage(page, wbc, epd); + ret = __extent_writepage(page, wbc, bio_ctrl); if (ret < 0) { done = 1; break; @@ -3213,14 +3098,14 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. */ - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) mapping->writeback_index = done_index; - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } @@ -3239,8 +3124,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) u64 cur = start; unsigned long nr_pages; const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 1, .sync_io = 1, }; @@ -3271,7 +3155,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) ASSERT(PageLocked(page)); ASSERT(PageDirty(page)); clear_page_dirty_for_io(page); - ret = __extent_writepage(page, &wbc_writepages, &epd); + ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl); ASSERT(ret <= 0); if (ret < 0) { found_error = true; @@ -3281,7 +3165,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) cur = cur_end + 1; } - submit_write_bio(&epd, found_error ? ret : 0); + submit_write_bio(&bio_ctrl, found_error ? ret : 0); wbc_detach_inode(&wbc_writepages); if (found_error) @@ -3294,10 +3178,9 @@ int extent_writepages(struct address_space *mapping, { struct inode *inode = mapping->host; int ret = 0; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; /* @@ -3305,8 +3188,8 @@ int extent_writepages(struct address_space *mapping, * protect the write pointer updates. */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); - ret = extent_write_cache_pages(mapping, wbc, &epd); - submit_write_bio(&epd, ret); + ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl); + submit_write_bio(&bio_ctrl, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; } @@ -3705,14 +3588,13 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path static int fiemap_process_hole(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, - struct btrfs_backref_shared_cache *backref_cache, + struct extent_state **delalloc_cached_state, + struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, - struct ulist *roots, struct ulist *tmp_ulist, u64 start, u64 end) { const u64 i_size = i_size_read(&inode->vfs_inode); - const u64 ino = btrfs_ino(inode); u64 cur_offset = start; u64 last_delalloc_end = 0; u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; @@ -3731,6 +3613,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, bool delalloc; delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + delalloc_cached_state, &delalloc_start, &delalloc_end); if (!delalloc) @@ -3752,11 +3635,10 @@ static int fiemap_process_hole(struct btrfs_inode *inode, if (prealloc_len > 0) { if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode->root, - ino, disk_bytenr, - extent_gen, roots, - tmp_ulist, - backref_cache); + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3802,11 +3684,10 @@ static int fiemap_process_hole(struct btrfs_inode *inode, } if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode->root, - ino, disk_bytenr, - extent_gen, roots, - tmp_ulist, - backref_cache); + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, + extent_gen, + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3903,12 +3784,10 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, { const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; + struct extent_state *delalloc_cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; - struct btrfs_backref_shared_cache *backref_cache; - struct ulist *roots; - struct ulist *tmp_ulist; + struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end; u64 prev_extent_end; u64 lockstart; @@ -3916,17 +3795,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, bool stopped = false; int ret; - backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); path = btrfs_alloc_path(); - roots = ulist_alloc(GFP_KERNEL); - tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!backref_cache || !path || !roots || !tmp_ulist) { + if (!backref_ctx || !path) { ret = -ENOMEM; goto out; } - lockstart = round_down(start, root->fs_info->sectorsize); - lockend = round_up(start + len, root->fs_info->sectorsize); + lockstart = round_down(start, inode->root->fs_info->sectorsize); + lockend = round_up(start + len, inode->root->fs_info->sectorsize); prev_extent_end = lockstart; lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); @@ -3975,13 +3852,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, if (extent_end <= lockstart) goto next_item; + backref_ctx->curr_leaf_bytenr = leaf->start; + /* We have in implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { const u64 range_end = min(key.offset, lockend) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, 0, 0, 0, - roots, tmp_ulist, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, prev_extent_end, range_end); if (ret < 0) { goto out_unlock; @@ -4021,25 +3900,24 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_len, flags); } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, + &delalloc_cached_state, + backref_ctx, disk_bytenr, extent_offset, - extent_gen, roots, tmp_ulist, - key.offset, extent_end - 1); + extent_gen, key.offset, + extent_end - 1); } else if (disk_bytenr == 0) { /* We have an explicit hole. */ ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, 0, 0, 0, - roots, tmp_ulist, + &delalloc_cached_state, + backref_ctx, 0, 0, 0, key.offset, extent_end - 1); } else { /* We have a regular extent. */ if (fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(root, ino, + ret = btrfs_is_data_extent_shared(inode, disk_bytenr, extent_gen, - roots, - tmp_ulist, - backref_cache); + backref_ctx); if (ret < 0) goto out_unlock; else if (ret > 0) @@ -4088,9 +3966,9 @@ check_eof_delalloc: path = NULL; if (!stopped && prev_extent_end < lockend) { - ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache, - 0, 0, 0, roots, tmp_ulist, - prev_extent_end, lockend - 1); + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, + 0, 0, 0, prev_extent_end, lockend - 1); if (ret < 0) goto out_unlock; prev_extent_end = lockend; @@ -4107,6 +3985,7 @@ check_eof_delalloc: delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, + &delalloc_cached_state, &delalloc_start, &delalloc_end); if (!delalloc) @@ -4121,10 +4000,9 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); out: - kfree(backref_cache); + free_extent_state(delalloc_cached_state); + btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); - ulist_free(roots); - ulist_free(tmp_ulist); return ret; } @@ -4266,7 +4144,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->start = start; eb->len = len; eb->fs_info = fs_info; - eb->bflags = 0; init_rwsem(&eb->lock); btrfs_leak_debug_add_eb(eb); @@ -4299,7 +4176,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - memset(new->pages, 0, sizeof(*new->pages) * num_pages); ret = btrfs_alloc_page_array(num_pages, new->pages); if (ret) { btrfs_release_extent_buffer(new); @@ -4944,11 +4820,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, - int mirror_num) + int mirror_num, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .mirror_num = mirror_num, }; @@ -4956,13 +4834,16 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); ASSERT(PagePrivate(page)); + ASSERT(check); io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { - if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) + if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state)) return -EAGAIN; } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); if (ret < 0) return ret; } @@ -4972,7 +4853,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); return ret; } @@ -4996,17 +4878,22 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } + memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); - if (ret || wait != WAIT_COMPLETE) + if (ret || wait != WAIT_COMPLETE) { + free_extent_state(cached_state); return ret; + } - wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, + EXTENT_LOCKED, &cached_state); if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) ret = -EIO; return ret; } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *check) { int i; struct page *page; @@ -5032,7 +4919,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) return -EIO; if (eb->fs_info->nodesize < PAGE_SIZE) - return read_extent_buffer_subpage(eb, wait, mirror_num); + return read_extent_buffer_subpage(eb, wait, mirror_num, check); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { @@ -5109,6 +4996,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } + memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) @@ -5467,11 +5355,12 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, *page_offset = offset_in_page(offset); } -/** - * extent_buffer_test_bit - determine whether a bit in a bitmap item is set - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @nr: bit number to test +/* + * Determine whether a bit in a bitmap item is set. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number to test */ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) @@ -5488,12 +5377,13 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } -/** - * extent_buffer_bitmap_set - set an area of a bitmap - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @pos: bit number of the first bit - * @len: number of bits to set +/* + * Set an area of a bitmap to 1. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to set */ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) @@ -5530,12 +5420,13 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star } -/** - * extent_buffer_bitmap_clear - clear an area of a bitmap - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @pos: bit number of the first bit - * @len: number of bits to clear +/* + * Clear an area of a bitmap. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to clear */ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, @@ -5841,6 +5732,11 @@ int try_release_extent_buffer(struct page *page) void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { + struct btrfs_tree_parent_check check = { + .has_first_key = 0, + .level = level, + .transid = gen + }; struct extent_buffer *eb; int ret; @@ -5853,7 +5749,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, return; } - ret = read_extent_buffer_pages(eb, WAIT_NONE, 0); + ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); if (ret < 0) free_extent_buffer_stale(eb); else diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7929f054dda3..a2c82448b2e0 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -9,6 +9,7 @@ #include <linux/btrfs_tree.h> #include "compression.h" #include "ulist.h" +#include "misc.h" enum { EXTENT_BUFFER_UPTODATE, @@ -29,13 +30,15 @@ enum { }; /* these are flags for __process_pages_contig */ -#define PAGE_UNLOCK (1 << 0) -/* Page starts writeback, clear dirty bit and set writeback bit */ -#define PAGE_START_WRITEBACK (1 << 1) -#define PAGE_END_WRITEBACK (1 << 2) -#define PAGE_SET_ORDERED (1 << 3) -#define PAGE_SET_ERROR (1 << 4) -#define PAGE_LOCK (1 << 5) +enum { + ENUM_BIT(PAGE_UNLOCK), + /* Page starts writeback, clear dirty bit and set writeback bit */ + ENUM_BIT(PAGE_START_WRITEBACK), + ENUM_BIT(PAGE_END_WRITEBACK), + ENUM_BIT(PAGE_SET_ORDERED), + ENUM_BIT(PAGE_SET_ERROR), + ENUM_BIT(PAGE_LOCK), +}; /* * page->private values. Every page that is controlled by the extent @@ -63,17 +66,11 @@ struct btrfs_inode; struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; +struct btrfs_tree_parent_check; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); -typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type); - -typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, - struct bio *bio, u64 dio_file_offset); - #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; @@ -99,6 +96,39 @@ struct extent_buffer { }; /* + * Get the correct offset inside the page of extent buffer. + * + * @eb: target extent buffer + * @start: offset inside the extent buffer + * + * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. + */ +static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, + unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, eb->start will always be aligned + * to PAGE_SIZE, thus adding it won't cause any difference. + * + * For sectorsize < PAGE_SIZE, we must only read the data that belongs + * to the eb, thus we have to take the eb->start into consideration. + */ + return offset_in_page(offset + eb->start); +} + +static inline unsigned long get_eb_page_index(unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. + * + * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, + * and have ensured that all tree blocks are contained in one page, + * thus we always get index == 0. + */ + return offset >> PAGE_SHIFT; +} + +/* * Structure to record how many bytes and which ranges are set/cleared */ struct extent_changeset { @@ -174,8 +204,8 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, - int mirror_num); +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *parent_check); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); @@ -248,7 +278,6 @@ int extent_invalidate_folio(struct extent_io_tree *tree, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); /* * When IO fails, either with EIO or csum verification fails, we @@ -272,9 +301,9 @@ struct io_failure_record { int num_copies; }; -int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, +int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, - submit_bio_hook_t *submit_bio_hook); + bool submit_buffered); void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, unsigned int pg_offset); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6092a4eedc92..be94030e1dfb 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,6 +3,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include "messages.h" #include "ctree.h" #include "volumes.h" #include "extent_map.h" @@ -27,12 +28,9 @@ void __cold extent_map_exit(void) kmem_cache_destroy(extent_map_cache); } -/** - * extent_map_tree_init - initialize extent map tree - * @tree: tree to initialize - * - * Initialize the extent tree @tree. Should be called for each new inode - * or other user of the extent_map interface. +/* + * Initialize the extent tree @tree. Should be called for each new inode or + * other user of the extent_map interface. */ void extent_map_tree_init(struct extent_map_tree *tree) { @@ -41,12 +39,9 @@ void extent_map_tree_init(struct extent_map_tree *tree) rwlock_init(&tree->lock); } -/** - * alloc_extent_map - allocate new extent map structure - * - * Allocate a new extent_map structure. The new structure is - * returned with a reference count of one and needs to be - * freed using free_extent_map() +/* + * Allocate a new extent_map structure. The new structure is returned with a + * reference count of one and needs to be freed using free_extent_map() */ struct extent_map *alloc_extent_map(void) { @@ -61,12 +56,9 @@ struct extent_map *alloc_extent_map(void) return em; } -/** - * free_extent_map - drop reference count of an extent_map - * @em: extent map being released - * - * Drops the reference out on @em by one and free the structure - * if the reference count hits zero. +/* + * Drop the reference out on @em by one and free the structure if the reference + * count hits zero. */ void free_extent_map(struct extent_map *em) { @@ -81,7 +73,7 @@ void free_extent_map(struct extent_map *em) } } -/* simple helper to do math around the end of an extent, handling wrap */ +/* Do the math around the end of an extent, handling wrapping. */ static u64 range_end(u64 start, u64 len) { if (start + len < start) @@ -137,8 +129,8 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em) } /* - * search through the tree for an extent_map with a given offset. If - * it can't be found, try to find some neighboring extents + * Search through the tree for an extent_map with a given offset. If it can't + * be found, try to find some neighboring extents */ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct rb_node **prev_or_next_ret) @@ -190,7 +182,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } -/* check to see if two extent_map structs are adjacent and safe to merge */ +/* Check to see if two extent_map structs are adjacent and safe to merge. */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) @@ -288,8 +280,9 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) } } -/** - * unpin_extent_cache - unpin an extent from the cache +/* + * Unpin an extent from the cache. + * * @tree: tree to unpin the extent in * @start: logical offset in the file * @len: length of the extent @@ -392,7 +385,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) } } -/** +/* * Add new extent map to the extent tree * * @tree: tree to insert new map in @@ -451,8 +444,9 @@ __lookup_extent_mapping(struct extent_map_tree *tree, return em; } -/** - * lookup_extent_mapping - lookup extent_map +/* + * Lookup extent_map that intersects @start + @len range. + * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range @@ -468,8 +462,9 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, return __lookup_extent_mapping(tree, start, len, 1); } -/** - * search_extent_mapping - find a nearby extent map +/* + * Find a nearby extent map intersecting @start + @len (not an exact search). + * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range @@ -485,13 +480,14 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, return __lookup_extent_mapping(tree, start, len, 0); } -/** - * remove_extent_mapping - removes an extent_map from the extent tree +/* + * Remove an extent_map from the extent tree. + * * @tree: extent tree to remove from * @em: extent map being removed * - * Removes @em from @tree. No reference counts are dropped, and no checks - * are done to see if the range is in use + * Remove @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use. */ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { @@ -523,7 +519,7 @@ void replace_extent_mapping(struct extent_map_tree *tree, setup_extent_mapping(tree, new, modified); } -static struct extent_map *next_extent_map(struct extent_map *em) +static struct extent_map *next_extent_map(const struct extent_map *em) { struct rb_node *next; @@ -585,8 +581,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, return add_extent_mapping(em_tree, em, 0); } -/** - * Add extent mapping into em_tree +/* + * Add extent mapping into em_tree. * * @fs_info: the filesystem * @em_tree: extent tree into which we want to insert the extent mapping @@ -613,6 +609,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, int ret; struct extent_map *em = *em_in; + /* + * Tree-checker should have rejected any inline extent with non-zero + * file offset. Here just do a sanity check. + */ + if (em->block_start == EXTENT_MAP_INLINE) + ASSERT(em->start == 0); + ret = add_extent_mapping(em_tree, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6bb9fa961a6a..5de73466b2ca 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -9,13 +9,18 @@ #include <linux/highmem.h> #include <linux/sched/mm.h> #include <crypto/hash.h> +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" -#include "volumes.h" +#include "bio.h" #include "print-tree.h" #include "compression.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" +#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -24,8 +29,8 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) -/** - * Set inode's size according to filesystem options +/* + * Set inode's size according to filesystem options. * * @inode: inode we want to update the disk_i_size for * @new_i_size: i_size we want to set to, 0 if we use i_size @@ -64,8 +69,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz spin_unlock(&inode->lock); } -/** - * Mark range within a file as having a new extent inserted +/* + * Mark range within a file as having a new extent inserted. * * @inode: inode being modified * @start: start file offset of the file extent we've inserted @@ -92,8 +97,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, EXTENT_DIRTY); } -/** - * Marks an inode range as not having a backing extent +/* + * Mark an inode range as not having a backing extent. * * @inode: inode being modified * @start: start file offset of the file extent we've inserted @@ -121,12 +126,26 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, start + len - 1, EXTENT_DIRTY, NULL); } -static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, - u16 csum_size) +static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes) +{ + ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize)); + + return (bytes >> fs_info->sectorsize_bits) * fs_info->csum_size; +} + +static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info, u32 csum_size) { - u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; + ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size)); - return ncsums * fs_info->sectorsize; + return (csum_size / fs_info->csum_size) << fs_info->sectorsize_bits; +} + +static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) +{ + u32 max_csum_size = round_down(PAGE_SIZE - sizeof(struct btrfs_ordered_sum), + fs_info->csum_size); + + return csum_size_to_bytes(fs_info, max_csum_size); } /* @@ -135,9 +154,7 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, */ static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { - int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - - return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; + return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, @@ -254,7 +271,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, /* * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and - * estore the result to @dst. + * store the result to @dst. * * Return >0 for the number of sectors we found. * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum @@ -360,15 +377,15 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, return ret; } -/** +/* * Lookup the checksum for the read bio in csum tree. * - * @inode: inode that the bio is for. - * @bio: bio to look up. - * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return - * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If - * NULL, the checksum buffer is allocated and returned in - * btrfs_bio(bio)->csum instead. + * @inode: inode that the bio is for. + * @bio: bio to look up. + * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return + * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If + * NULL, the checksum buffer is allocated and returned in + * btrfs_bio(bio)->csum instead. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ @@ -510,9 +527,9 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return ret; } -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait) +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -521,11 +538,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_ordered_sum *sums; struct btrfs_csum_item *item; LIST_HEAD(tmplist); - unsigned long offset; int ret; - size_t size; - u64 csum_end; - const u32 csum_size = fs_info->csum_size; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -551,16 +564,33 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. + */ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY) { - offset = (start - key.offset) >> fs_info->sectorsize_bits; - if (offset * csum_size < + if (bytes_to_csum_size(fs_info, start - key.offset) < btrfs_item_size(leaf, path->slots[0] - 1)) path->slots[0]--; } } while (start <= end) { + u64 csum_end; + leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -580,8 +610,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (key.offset > start) start = key.offset; - size = btrfs_item_size(leaf, path->slots[0]); - csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); if (csum_end <= start) { path->slots[0]++; continue; @@ -591,8 +621,11 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); while (start < csum_end) { + unsigned long offset; + size_t size; + size = min_t(size_t, csum_end - start, - max_ordered_sum_bytes(fs_info, csum_size)); + max_ordered_sum_bytes(fs_info)); sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { @@ -603,16 +636,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, sums->bytenr = start; sums->len = (int)size; - offset = (start - key.offset) >> fs_info->sectorsize_bits; - offset *= csum_size; - size >>= fs_info->sectorsize_bits; + offset = bytes_to_csum_size(fs_info, start - key.offset); read_extent_buffer(path->nodes[0], sums->sums, ((unsigned long)item) + offset, - csum_size * size); + bytes_to_csum_size(fs_info, size)); - start += fs_info->sectorsize * size; + start += size; list_add_tail(&sums->list, &tmplist); } path->slots[0]++; @@ -630,8 +661,129 @@ fail: return ret; } -/** - * Calculate checksums of the data contained inside a bio +/* + * Do the same work as btrfs_lookup_csums_list(), the difference is in how + * we return the result. + * + * This version will set the corresponding bits in @csum_bitmap to represent + * that there is a csum found. + * Each bit represents a sector. Thus caller should ensure @csum_buf passed + * in is large enough to contain all csums. + */ +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + u8 *csum_buf, unsigned long *csum_bitmap) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_csum_item *item; + const u64 orig_start = start; + int ret; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. + */ + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + if (bytes_to_csum_size(fs_info, start - key.offset) < + btrfs_item_size(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + u64 csum_end; + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + unsigned long offset; + size_t size; + u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info, + start - orig_start); + + size = min_t(size_t, csum_end - start, end + 1 - start); + + offset = bytes_to_csum_size(fs_info, start - key.offset); + + read_extent_buffer(path->nodes[0], csum_dest, + ((unsigned long)item) + offset, + bytes_to_csum_size(fs_info, size)); + + bitmap_set(csum_bitmap, + (start - orig_start) >> fs_info->sectorsize_bits, + size >> fs_info->sectorsize_bits); + + start += size; + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + +/* + * Calculate checksums of the data contained inside a bio. * * @inode: Owner of the data inside the bio * @bio: Contains the data to be checksummed @@ -746,15 +898,16 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, } /* - * helper function for csum removal, this expects the - * key to describe the csum pointed to by the path, and it expects - * the csum to overlap the range [bytenr, len] + * Remove one checksum overlapping a range. * - * The csum should not be entirely contained in the range and the - * range should not be entirely contained in the csum. + * This expects the key to describe the csum pointed to by the path, and it + * expects the csum to overlap the range [bytenr, len] * - * This calls btrfs_truncate_item with the correct args based on the - * overlap, and fixes up the key as required. + * The csum should not be entirely contained in the range and the range should + * not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the overlap, + * and fixes up the key as required. */ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, struct btrfs_path *path, @@ -803,8 +956,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, } /* - * deletes the csum items from the csum tree for a given - * range of bytes. + * Delete the csum items from the csum tree for a given range of bytes. */ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) @@ -1209,7 +1361,6 @@ out: void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, - const bool new_inline, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1261,10 +1412,9 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, */ em->orig_start = EXTENT_MAP_HOLE; em->block_len = (u64)-1; - if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { + em->compress_type = compress_type; + if (compress_type != BTRFS_COMPRESS_NONE) set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h new file mode 100644 index 000000000000..031225668434 --- /dev/null +++ b/fs/btrfs/file-item.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_ITEM_H +#define BTRFS_FILE_ITEM_H + +#include "accessors.h" + +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) + +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) +{ + return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +/* + * Return the number of bytes used by the item on disk, minus the size of any + * extent headers. If a file is compressed on disk, this is the compressed + * size. + */ +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + int nr) +{ + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline unsigned long btrfs_file_extent_inline_start( + const struct btrfs_file_extent_item *e) +{ + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; +} + +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 pos, + u64 num_bytes); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + u64 offset, bool one_ordered); +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + u8 *csum_buf, unsigned long *csum_bitmap); +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + struct extent_map *em); +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len); +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); +u64 btrfs_file_extent_end(const struct btrfs_path *path); + +#endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d01631d47806..91b00eb2440e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -30,329 +30,13 @@ #include "delalloc-space.h" #include "reflink.h" #include "subpage.h" - -static struct kmem_cache *btrfs_inode_defrag_cachep; -/* - * when auto defrag is enabled we - * queue up these defrag structs to remember which - * inodes need defragging passes - */ -struct inode_defrag { - struct rb_node rb_node; - /* objectid */ - u64 ino; - /* - * transid where the defrag was added, we search for - * extents newer than this - */ - u64 transid; - - /* root objectid */ - u64 root; - - /* - * The extent size threshold for autodefrag. - * - * This value is different for compressed/non-compressed extents, - * thus needs to be passed from higher layer. - * (aka, inode_should_defrag()) - */ - u32 extent_thresh; -}; - -static int __compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) -{ - if (defrag1->root > defrag2->root) - return 1; - else if (defrag1->root < defrag2->root) - return -1; - else if (defrag1->ino > defrag2->ino) - return 1; - else if (defrag1->ino < defrag2->ino) - return -1; - else - return 0; -} - -/* pop a record for an inode into the defrag tree. The lock - * must be held already - * - * If you're inserting a record for an older transid than an - * existing record, the transid already in the tree is lowered - * - * If an existing record is found the defrag item you - * pass in is freed - */ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct inode_defrag *entry; - struct rb_node **p; - struct rb_node *parent = NULL; - int ret; - - p = &fs_info->defrag_inodes.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - ret = __compare_inode_defrag(defrag, entry); - if (ret < 0) - p = &parent->rb_left; - else if (ret > 0) - p = &parent->rb_right; - else { - /* if we're reinserting an entry for - * an old defrag run, make sure to - * lower the transid of our existing record - */ - if (defrag->transid < entry->transid) - entry->transid = defrag->transid; - entry->extent_thresh = min(defrag->extent_thresh, - entry->extent_thresh); - return -EEXIST; - } - } - set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); - rb_link_node(&defrag->rb_node, parent, p); - rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); - return 0; -} - -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) -{ - if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) - return 0; - - if (btrfs_fs_closing(fs_info)) - return 0; - - return 1; -} - -/* - * insert a defrag record for this inode if auto defrag is - * enabled - */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode_defrag *defrag; - u64 transid; - int ret; - - if (!__need_auto_defrag(fs_info)) - return 0; - - if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; - - if (trans) - transid = trans->transid; - else - transid = inode->root->last_trans; - - defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); - if (!defrag) - return -ENOMEM; - - defrag->ino = btrfs_ino(inode); - defrag->transid = transid; - defrag->root = root->root_key.objectid; - defrag->extent_thresh = extent_thresh; - - spin_lock(&fs_info->defrag_inodes_lock); - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { - /* - * If we set IN_DEFRAG flag and evict the inode from memory, - * and then re-read this inode, this new inode doesn't have - * IN_DEFRAG flag. At the case, we may find the existed defrag. - */ - ret = __btrfs_add_inode_defrag(inode, defrag); - if (ret) - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } else { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } - spin_unlock(&fs_info->defrag_inodes_lock); - return 0; -} - -/* - * pick the defragable inode that we want, if it doesn't exist, we will get - * the next one. - */ -static struct inode_defrag * -btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) -{ - struct inode_defrag *entry = NULL; - struct inode_defrag tmp; - struct rb_node *p; - struct rb_node *parent = NULL; - int ret; - - tmp.ino = ino; - tmp.root = root; - - spin_lock(&fs_info->defrag_inodes_lock); - p = fs_info->defrag_inodes.rb_node; - while (p) { - parent = p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - ret = __compare_inode_defrag(&tmp, entry); - if (ret < 0) - p = parent->rb_left; - else if (ret > 0) - p = parent->rb_right; - else - goto out; - } - - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { - parent = rb_next(parent); - if (parent) - entry = rb_entry(parent, struct inode_defrag, rb_node); - else - entry = NULL; - } -out: - if (entry) - rb_erase(parent, &fs_info->defrag_inodes); - spin_unlock(&fs_info->defrag_inodes_lock); - return entry; -} - -void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - struct rb_node *node; - - spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - - cond_resched_lock(&fs_info->defrag_inodes_lock); - - node = rb_first(&fs_info->defrag_inodes); - } - spin_unlock(&fs_info->defrag_inodes_lock); -} - -#define BTRFS_DEFRAG_BATCH 1024 - -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) -{ - struct btrfs_root *inode_root; - struct inode *inode; - struct btrfs_ioctl_defrag_range_args range; - int ret = 0; - u64 cur = 0; - -again: - if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) - goto cleanup; - if (!__need_auto_defrag(fs_info)) - goto cleanup; - - /* get the inode */ - inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); - if (IS_ERR(inode_root)) { - ret = PTR_ERR(inode_root); - goto cleanup; - } - - inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); - btrfs_put_root(inode_root); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto cleanup; - } - - if (cur >= i_size_read(inode)) { - iput(inode); - goto cleanup; - } - - /* do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); - memset(&range, 0, sizeof(range)); - range.len = (u64)-1; - range.start = cur; - range.extent_thresh = defrag->extent_thresh; - - sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); - sb_end_write(fs_info->sb); - iput(inode); - - if (ret < 0) - goto cleanup; - - cur = max(cur + fs_info->sectorsize, range.start); - goto again; - -cleanup: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return ret; -} - -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - u64 first_ino = 0; - u64 root_objectid = 0; - - atomic_inc(&fs_info->defrag_running); - while (1) { - /* Pause the auto defragger. */ - if (test_bit(BTRFS_FS_STATE_REMOUNTING, - &fs_info->fs_state)) - break; - - if (!__need_auto_defrag(fs_info)) - break; - - /* find an inode to defrag */ - defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, - first_ino); - if (!defrag) { - if (root_objectid || first_ino) { - root_objectid = 0; - first_ino = 0; - continue; - } else { - break; - } - } - - first_ino = defrag->ino + 1; - root_objectid = defrag->root; - - __btrfs_run_defrag_inode(fs_info, defrag); - } - atomic_dec(&fs_info->defrag_running); - - /* - * during unmount, we use the transaction_wait queue to - * wait for the defragger to stop - */ - wake_up(&fs_info->transaction_wait); - return 0; -} +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "file-item.h" +#include "ioctl.h" +#include "file.h" +#include "super.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. @@ -696,7 +380,10 @@ next_slot: args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } } key.offset = args->start; } @@ -783,7 +470,10 @@ delete_extent_item: key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } args->bytes_found += extent_end - key.offset; } @@ -1302,7 +992,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, struct btrfs_ordered_extent *ordered; if (nowait) { - if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) { + if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state)) { for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); @@ -1372,6 +1063,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; u64 lockstart, lockend; u64 num_bytes; int ret; @@ -1388,12 +1080,14 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, num_bytes = lockend - lockstart + 1; if (nowait) { - if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) { + if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, + &cached_state)) { btrfs_drew_write_unlock(&root->snapshot_lock); return -EAGAIN; } } else { - btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, + &cached_state); } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, NULL, NULL, NULL, nowait, false); @@ -1402,7 +1096,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - unlock_extent(&inode->io_tree, lockstart, lockend, NULL); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); return ret; } @@ -1505,7 +1199,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; - ret = btrfs_inode_lock(inode, ilock_flags); + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) return ret; @@ -1740,7 +1434,7 @@ again: iocb->ki_pos += num_written; } out: - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return num_written ? num_written : ret; } @@ -1780,19 +1474,19 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) ilock_flags |= BTRFS_ILOCK_SHARED; relock: - err = btrfs_inode_lock(inode, ilock_flags); + err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (err < 0) return err; err = generic_write_checks(iocb, from); if (err <= 0) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return err; } err = btrfs_write_check(iocb, from, err); if (err < 0) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; } @@ -1803,13 +1497,13 @@ relock: */ if ((ilock_flags & BTRFS_ILOCK_SHARED) && pos + iov_iter_count(from) > i_size_read(inode)) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); ilock_flags &= ~BTRFS_ILOCK_SHARED; goto relock; } if (check_direct_IO(fs_info, from, pos)) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } @@ -1840,7 +1534,7 @@ relock: * iocb, and that needs to lock the inode. So unlock it before calling * iomap_dio_complete() to avoid a deadlock. */ - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); if (IS_ERR_OR_NULL(dio)) err = PTR_ERR_OR_ZERO(dio); @@ -1887,8 +1581,8 @@ buffered: /* * If we are in a NOWAIT context, then return -EAGAIN to signal the caller * it must retry the operation in a context where blocking is acceptable, - * since we currently don't have NOWAIT semantics support for buffered IO - * and may block there for many reasons (reserving space for example). + * because even if we end up not blocking during the buffered IO attempt + * below, we will block when flushing and waiting for the IO. */ if (iocb->ki_flags & IOCB_NOWAIT) { err = -EAGAIN; @@ -1928,7 +1622,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, loff_t count; ssize_t ret; - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); count = encoded->len; ret = generic_write_checks_count(iocb, &count); if (ret == 0 && count != encoded->len) { @@ -1947,7 +1641,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = btrfs_do_encoded_write(iocb, from, encoded); out: - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); return ret; } @@ -2008,10 +1702,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; - if (private && private->filldir_buf) + if (private) { kfree(private->filldir_buf); - kfree(private); - filp->private_data = NULL; + free_extent_state(private->llseek_cached_state); + kfree(private); + filp->private_data = NULL; + } /* * Set by setattr when we are about to truncate a file from a non-zero @@ -2118,7 +1814,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -2142,7 +1838,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); goto out; } @@ -2245,7 +1941,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); @@ -2313,7 +2009,7 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); goto out; } @@ -2908,7 +2604,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) bool truncated_block = false; bool updated_inode = false; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2956,7 +2652,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } } @@ -3055,7 +2751,7 @@ out_only_mutex: ret = ret2; } } - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } @@ -3366,7 +3062,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_PUNCH_HOLE) return btrfs_punch_hole(file, offset, len); - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); @@ -3416,7 +3112,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_ZERO_RANGE) { ret = btrfs_zero_range(inode, offset, len, mode); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } @@ -3514,7 +3210,7 @@ out_unlock: unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); out: - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); return ret; } @@ -3526,117 +3222,106 @@ out: * looping while it gets adjacent subranges, and merging them together. */ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, + bool *search_io_tree, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { - const u64 len = end + 1 - start; - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - u64 em_end; - u64 delalloc_len; + u64 len = end + 1 - start; + u64 delalloc_len = 0; + struct btrfs_ordered_extent *oe; + u64 oe_start; + u64 oe_end; /* * Search the io tree first for EXTENT_DELALLOC. If we find any, it * means we have delalloc (dirty pages) for which writeback has not * started yet. */ - *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1); - /* - * If delalloc was found then *delalloc_start_ret has a sector size - * aligned value (rounded down). - */ - if (delalloc_len > 0) + if (*search_io_tree) { + spin_lock(&inode->lock); + if (inode->delalloc_bytes > 0) { + spin_unlock(&inode->lock); + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1, + cached_state); + } else { + spin_unlock(&inode->lock); + } + } + + if (delalloc_len > 0) { + /* + * If delalloc was found then *delalloc_start_ret has a sector size + * aligned value (rounded down). + */ *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; + if (*delalloc_start_ret == start) { + /* Delalloc for the whole range, nothing more to do. */ + if (*delalloc_end_ret == end) + return true; + /* Else trim our search range for ordered extents. */ + start = *delalloc_end_ret + 1; + len = end + 1 - start; + } + } else { + /* No delalloc, future calls don't need to search again. */ + *search_io_tree = false; + } + /* - * Now also check if there's any extent map in the range that does not - * map to a hole or prealloc extent. We do this because: + * Now also check if there's any ordered extent in the range. + * We do this because: * * 1) When delalloc is flushed, the file range is locked, we clear the - * EXTENT_DELALLOC bit from the io tree and create an extent map for - * an allocated extent. So we might just have been called after - * delalloc is flushed and before the ordered extent completes and - * inserts the new file extent item in the subvolume's btree; + * EXTENT_DELALLOC bit from the io tree and create an extent map and + * an ordered extent for the write. So we might just have been called + * after delalloc is flushed and before the ordered extent completes + * and inserts the new file extent item in the subvolume's btree; * - * 2) We may have an extent map created by flushing delalloc for a + * 2) We may have an ordered extent created by flushing delalloc for a * subrange that starts before the subrange we found marked with * EXTENT_DELALLOC in the io tree. + * + * We could also use the extent map tree to find such delalloc that is + * being flushed, but using the ordered extents tree is more efficient + * because it's usually much smaller as ordered extents are removed from + * the tree once they complete. With the extent maps, we mau have them + * in the extent map tree for a very long time, and they were either + * created by previous writes or loaded by read operations. */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - read_unlock(&em_tree->lock); - - /* extent_map_end() returns a non-inclusive end offset. */ - em_end = em ? extent_map_end(em) : 0; - - /* - * If we have a hole/prealloc extent map, check the next one if this one - * ends before our range's end. - */ - if (em && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { - struct extent_map *next_em; - - read_lock(&em_tree->lock); - next_em = lookup_extent_mapping(em_tree, em_end, len - em_end); - read_unlock(&em_tree->lock); - - free_extent_map(em); - em_end = next_em ? extent_map_end(next_em) : 0; - em = next_em; - } + oe = btrfs_lookup_first_ordered_range(inode, start, len); + if (!oe) + return (delalloc_len > 0); - if (em && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - free_extent_map(em); - em = NULL; - } + /* The ordered extent may span beyond our search range. */ + oe_start = max(oe->file_offset, start); + oe_end = min(oe->file_offset + oe->num_bytes - 1, end); - /* - * No extent map or one for a hole or prealloc extent. Use the delalloc - * range we found in the io tree if we have one. - */ - if (!em) - return (delalloc_len > 0); + btrfs_put_ordered_extent(oe); - /* - * We don't have any range as EXTENT_DELALLOC in the io tree, so the - * extent map is the only subrange representing delalloc. - */ + /* Don't have unflushed delalloc, return the ordered extent range. */ if (delalloc_len == 0) { - *delalloc_start_ret = em->start; - *delalloc_end_ret = min(end, em_end - 1); - free_extent_map(em); + *delalloc_start_ret = oe_start; + *delalloc_end_ret = oe_end; return true; } /* - * The extent map represents a delalloc range that starts before the - * delalloc range we found in the io tree. + * We have both unflushed delalloc (io_tree) and an ordered extent. + * If the ranges are adjacent returned a combined range, otherwise + * return the leftmost range. */ - if (em->start < *delalloc_start_ret) { - *delalloc_start_ret = em->start; - /* - * If the ranges are adjacent, return a combined range. - * Otherwise return the extent map's range. - */ - if (em_end < *delalloc_start_ret) - *delalloc_end_ret = min(end, em_end - 1); - - free_extent_map(em); - return true; + if (oe_start < *delalloc_start_ret) { + if (oe_end < *delalloc_start_ret) + *delalloc_end_ret = oe_end; + *delalloc_start_ret = oe_start; + } else if (*delalloc_end_ret + 1 == oe_start) { + *delalloc_end_ret = oe_end; } - /* - * The extent map starts after the delalloc range we found in the io - * tree. If it's adjacent, return a combined range, otherwise return - * the range found in the io tree. - */ - if (*delalloc_end_ret + 1 == em->start) - *delalloc_end_ret = min(end, em_end - 1); - - free_extent_map(em); return true; } @@ -3648,6 +3333,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * sector size aligned. * @end: The end offset (inclusive value) of the search range. * It does not need to be sector size aligned. + * @cached_state: Extent state record used for speeding up delalloc + * searches in the inode's io_tree. Can be NULL. * @delalloc_start_ret: Output argument, set to the start offset of the * subrange found with delalloc (may not be sector size * aligned). @@ -3659,10 +3346,12 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * end offsets of the subrange. */ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); u64 prev_delalloc_end = 0; + bool search_io_tree = true; bool ret = false; while (cur_offset < end) { @@ -3671,6 +3360,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, bool delalloc; delalloc = find_delalloc_subrange(inode, cur_offset, end, + cached_state, &search_io_tree, &delalloc_start, &delalloc_end); if (!delalloc) @@ -3716,13 +3406,14 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, * is found, it updates @start_ret with the start of the subrange. */ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, + struct extent_state **cached_state, u64 start, u64 end, u64 *start_ret) { u64 delalloc_start; u64 delalloc_end; bool delalloc; - delalloc = btrfs_find_delalloc_in_range(inode, start, end, + delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state, &delalloc_start, &delalloc_end); if (delalloc && whence == SEEK_DATA) { *start_ret = delalloc_start; @@ -3765,11 +3456,13 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, return false; } -static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, - int whence) +static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) { + struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); + struct btrfs_file_private *private = file->private_data; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; + struct extent_state **delalloc_cached_state; const loff_t i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); struct btrfs_root *root = inode->root; @@ -3794,6 +3487,22 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, inode_get_bytes(&inode->vfs_inode) == i_size) return i_size; + if (!private) { + private = kzalloc(sizeof(*private), GFP_KERNEL); + /* + * No worries if memory allocation failed. + * The private structure is used only for speeding up multiple + * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, + * so everything will still be correct. + */ + file->private_data = private; + } + + if (private) + delalloc_cached_state = &private->llseek_cached_state; + else + delalloc_cached_state = NULL; + /* * offset can be negative, in this case we start finding DATA/HOLE from * the very start of the file. @@ -3871,6 +3580,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, search_start = offset; found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, search_start, key.offset - 1, &found_start); @@ -3905,6 +3615,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, search_start = offset; found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, search_start, extent_end - 1, &found_start); @@ -3946,7 +3657,8 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, /* We have an implicit hole from the last extent found up to i_size. */ if (!found && start < i_size) { - found = find_desired_extent_in_hole(inode, whence, start, + found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, start, i_size - 1, &start); if (!found) start = i_size; @@ -3974,9 +3686,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - offset = find_desired_extent(BTRFS_I(inode), offset, whence); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); + offset = find_desired_extent(file, offset, whence); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); break; } @@ -4031,7 +3743,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) return 0; - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); again: /* * This is similar to what we do for direct IO writes, see the comment @@ -4080,7 +3792,7 @@ again: goto again; } } - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); return ret < 0 ? ret : read; } @@ -4117,23 +3829,6 @@ const struct file_operations btrfs_file_operations = { .remap_file_range = btrfs_remap_file_range, }; -void __cold btrfs_auto_defrag_exit(void) -{ - kmem_cache_destroy(btrfs_inode_defrag_cachep); -} - -int __init btrfs_auto_defrag_init(void) -{ - btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", - sizeof(struct inode_defrag), 0, - SLAB_MEM_SPREAD, - NULL); - if (!btrfs_inode_defrag_cachep) - return -ENOMEM; - - return 0; -} - int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) { int ret; diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h new file mode 100644 index 000000000000..82b34fbb295f --- /dev/null +++ b/fs/btrfs/file.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_H +#define BTRFS_FILE_H + +extern const struct file_operations btrfs_file_operations; + +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args); +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, + struct btrfs_replace_extent_info *extent_info, + struct btrfs_trans_handle **trans_out); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); +int btrfs_release_file(struct inode *inode, struct file *file); +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, + size_t num_pages, loff_t pos, size_t write_bytes, + struct extent_state **cached, bool noreserve); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait); +void btrfs_check_nocow_unlock(struct btrfs_inode *inode); +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, + u64 *delalloc_start_ret, u64 *delalloc_end_ret); + +#endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f4023651dd68..0d250d052487 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,8 +11,10 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> -#include "misc.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" +#include "misc.h" #include "free-space-cache.h" #include "transaction.h" #include "disk-io.h" @@ -24,11 +26,18 @@ #include "discard.h" #include "subpage.h" #include "inode-item.h" +#include "accessors.h" +#include "file-item.h" +#include "file.h" +#include "super.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K #define FORCE_EXTENT_THRESHOLD SZ_1M +static struct kmem_cache *btrfs_free_space_cachep; +static struct kmem_cache *btrfs_free_space_bitmap_cachep; + struct btrfs_trim_range { u64 start; u64 bytes; @@ -251,7 +260,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); goto out; } clear_nlink(inode); @@ -265,7 +274,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for the lookup ref */ - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; @@ -1363,8 +1372,8 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, path, block_group->start); } -/** - * Write out cached info to an inode +/* + * Write out cached info to an inode. * * @root: root the inode belongs to * @inode: freespace inode we are writing out @@ -2717,8 +2726,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && reclaimable_unusable >= - div_factor_fine(block_group->zone_capacity, - bg_reclaim_threshold)) { + mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } @@ -3028,10 +3036,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) } -/** - * btrfs_is_free_space_trimmed - see if everything is trimmed - * @block_group: block_group of interest - * +/* * Walk @block_group's free space rb_tree to determine if everything is trimmed. */ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) @@ -4132,6 +4137,31 @@ out: return ret; } +int __init btrfs_free_space_init(void) +{ + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", + sizeof(struct btrfs_free_space), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + return -ENOMEM; + + btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", + PAGE_SIZE, PAGE_SIZE, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_bitmap_cachep) { + kmem_cache_destroy(btrfs_free_space_cachep); + return -ENOMEM; + } + + return 0; +} + +void __cold btrfs_free_space_exit(void) +{ + kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_free_space_bitmap_cachep); +} + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * Use this if you need to make a bitmap or extent entry specifically, it diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 6d419ba53e95..a855e0483e03 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -43,6 +43,17 @@ static inline bool btrfs_free_space_trimming_bitmap( return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); } +/* + * Deltas are an effective way to populate global statistics. Give macro names + * to make it clear what we're doing. An example is discard_extents in + * btrfs_free_space_ctl. + */ +enum { + BTRFS_STAT_CURR, + BTRFS_STAT_PREV, + BTRFS_STAT_NR_ENTRIES, +}; + struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; @@ -79,6 +90,8 @@ struct btrfs_io_ctl { int bitmaps; }; +int __init btrfs_free_space_init(void); +void __cold btrfs_free_space_exit(void); struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, struct btrfs_path *path); int create_free_space_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 367bcfcf68f5..c667e878ef1a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -5,12 +5,17 @@ #include <linux/kernel.h> #include <linux/sched/mm.h> +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" #include "free-space-tree.h" #include "transaction.h" #include "block-group.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, @@ -803,7 +808,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, u32 flags; int ret; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; @@ -996,7 +1001,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, u32 flags; int ret; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; @@ -1299,7 +1304,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, { int ret; - block_group->needs_free_space = 0; + clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); ret = add_new_free_space_info(trans, block_group, path); if (ret) @@ -1321,7 +1326,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans, return 0; mutex_lock(&block_group->free_space_lock); - if (!block_group->needs_free_space) + if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) goto out; path = btrfs_alloc_path(); @@ -1354,7 +1359,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { /* We never added this block group to the free space tree. */ return 0; } diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c new file mode 100644 index 000000000000..5553e1f8afe8 --- /dev/null +++ b/fs/btrfs/fs.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "messages.h" +#include "ctree.h" +#include "fs.h" +#include "accessors.h" + +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "setting incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "clearing incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "setting compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "clearing compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h new file mode 100644 index 000000000000..a749367e5ae2 --- /dev/null +++ b/fs/btrfs/fs.h @@ -0,0 +1,976 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FS_H +#define BTRFS_FS_H + +#include <linux/fs.h> +#include <linux/btrfs_tree.h> +#include <linux/sizes.h> +#include "extent-io-tree.h" +#include "extent_map.h" +#include "async-thread.h" +#include "block-rsv.h" + +#define BTRFS_MAX_EXTENT_SIZE SZ_128M + +#define BTRFS_OLDEST_GENERATION 0ULL + +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_DIRTY_METADATA_THRESH SZ_32M + +#define BTRFS_SUPER_INFO_OFFSET SZ_64K +#define BTRFS_SUPER_INFO_SIZE 4096 +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); + +/* + * The reserved space at the beginning of each device. It covers the primary + * super block and leaves space for potential use by other tools like + * bootloaders or to lower potential damage of accidental overwrite. + */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) +/* + * Runtime (in-memory) states of filesystem + */ +enum { + /* Global indicator of serious filesystem errors */ + BTRFS_FS_STATE_ERROR, + /* + * Filesystem is being remounted, allow to skip some operations, like + * defrag + */ + BTRFS_FS_STATE_REMOUNTING, + /* Filesystem in RO mode */ + BTRFS_FS_STATE_RO, + /* Track if a transaction abort has been reported on this filesystem */ + BTRFS_FS_STATE_TRANS_ABORTED, + /* + * Bio operations should be blocked on this filesystem because a source + * or target device is being destroyed as part of a device replace + */ + BTRFS_FS_STATE_DEV_REPLACING, + /* The btrfs_fs_info created for self-tests */ + BTRFS_FS_STATE_DUMMY_FS_INFO, + + BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT +}; + +enum { + BTRFS_FS_CLOSING_START, + BTRFS_FS_CLOSING_DONE, + BTRFS_FS_LOG_RECOVERING, + BTRFS_FS_OPEN, + BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_UPDATE_UUID_TREE_GEN, + BTRFS_FS_CREATING_FREE_SPACE_TREE, + BTRFS_FS_BTREE_ERR, + BTRFS_FS_LOG1_ERR, + BTRFS_FS_LOG2_ERR, + BTRFS_FS_QUOTA_OVERRIDE, + /* Used to record internally whether fs has been frozen */ + BTRFS_FS_FROZEN, + /* + * Indicate that balance has been set up from the ioctl and is in the + * main phase. The fs_info::balance_ctl is initialized. + */ + BTRFS_FS_BALANCE_RUNNING, + + /* + * Indicate that relocation of a chunk has started, it's set per chunk + * and is toggled between chunks. + */ + BTRFS_FS_RELOC_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, + + /* + * The checksumming has an optimized version and is considered fast, + * so we don't need to offload checksums to workqueues. + */ + BTRFS_FS_CSUM_IMPL_FAST, + + /* Indicate that the discard workqueue can service discards. */ + BTRFS_FS_DISCARD_RUNNING, + + /* Indicate that we need to cleanup space cache v1 */ + BTRFS_FS_CLEANUP_SPACE_CACHE_V1, + + /* Indicate that we can't trust the free space tree for caching yet */ + BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, + + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + + /* Indicate that we want the transaction kthread to commit right now. */ + BTRFS_FS_COMMIT_TRANS, + + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + + /* Indicate we have to finish a zone to do next allocation. */ + BTRFS_FS_NEED_ZONE_FINISH, + + /* Indicate that we want to commit the transaction. */ + BTRFS_FS_NEED_TRANS_COMMIT, + +#if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, + BTRFS_FS_32BIT_WARN, +#endif +}; + +/* + * Flags for mount options. + * + * Note: don't forget to add new options to btrfs_show_options() + */ +enum { + BTRFS_MOUNT_NODATASUM = (1UL << 0), + BTRFS_MOUNT_NODATACOW = (1UL << 1), + BTRFS_MOUNT_NOBARRIER = (1UL << 2), + BTRFS_MOUNT_SSD = (1UL << 3), + BTRFS_MOUNT_DEGRADED = (1UL << 4), + BTRFS_MOUNT_COMPRESS = (1UL << 5), + BTRFS_MOUNT_NOTREELOG = (1UL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), + BTRFS_MOUNT_NOSSD = (1UL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), + BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), + BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), + BTRFS_MOUNT_REF_VERIFY = (1UL << 27), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), + BTRFS_MOUNT_NODISCARD = (1UL << 31), +}; + +/* + * Compat flags that we support. If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL + +#define BTRFS_FEATURE_COMPAT_RO_SUPP \ + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ + BTRFS_FEATURE_COMPAT_RO_VERITY | \ + BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) + +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED) +#endif + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL + +#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_DEFAULT_MAX_INLINE (2048) + +struct btrfs_dev_replace { + /* See #define above */ + u64 replace_state; + /* Seconds since 1-Jan-1970 */ + time64_t time_started; + /* Seconds since 1-Jan-1970 */ + time64_t time_stopped; + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + /* See #define above */ + u64 cont_reading_from_srcdev_mode; + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + struct mutex lock_finishing_cancel_unmount; + struct rw_semaphore rwsem; + + struct btrfs_scrub_progress scrub_progress; + + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; +}; + +/* + * Free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations. In ssd_spread mode they are also used for data allocations. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* Largest extent in this cluster */ + u64 max_size; + + /* First extent starting offset */ + u64 window_start; + + /* We did a full search and couldn't create a cluster */ + bool fragmented; + + struct btrfs_block_group *block_group; + /* + * When a cluster is allocated from a block group, we put the cluster + * onto a list in the block group so that it can be freed before the + * block group is freed. + */ + struct list_head block_group_list; +}; + +/* Discard control. */ +/* + * Async discard uses multiple lists to differentiate the discard filter + * parameters. Index 0 is for completely free block groups where we need to + * ensure the entire block group is trimmed without being lossy. Indices + * afterwards represent monotonically decreasing discard filter sizes to + * prioritize what should be discarded next. + */ +#define BTRFS_NR_DISCARD_LISTS 3 +#define BTRFS_DISCARD_INDEX_UNUSED 0 +#define BTRFS_DISCARD_INDEX_START 1 + +struct btrfs_discard_ctl { + struct workqueue_struct *discard_workers; + struct delayed_work work; + spinlock_t lock; + struct btrfs_block_group *block_group; + struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; + u64 prev_discard; + u64 prev_discard_time; + atomic_t discardable_extents; + atomic64_t discardable_bytes; + u64 max_discard_size; + u64 delay_ms; + u32 iops_limit; + u32 kbps_limit; + u64 discard_extent_bytes; + u64 discard_bitmap_bytes; + atomic64_t discard_bytes_saved; +}; + +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE_PAUSED, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + +/* Store data about transaction commits, exported via sysfs. */ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + +struct btrfs_fs_info { + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + unsigned long flags; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; + struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; + + /* The log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + + /* The tree that holds the global roots (csum, extent, etc) */ + rwlock_t global_root_lock; + struct rb_root global_root_tree; + + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* Block group cache stuff */ + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; + + /* Keep track of unallocated space */ + atomic64_t free_chunk_space; + + /* Track ranges which are used by log trees blocks/logged data extents */ + struct extent_io_tree excluded_extents; + + /* logical->physical extent mapping */ + struct extent_map_tree mapping_tree; + + /* + * Block reservation for extent, checksum, root tree and delayed dir + * index item. + */ + struct btrfs_block_rsv global_block_rsv; + /* Block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* Block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + /* Block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; + /* Block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + u64 generation; + u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; + u64 avg_delayed_ref_runtime; + + /* + * This is updated to the current trans every time a full commit is + * required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; + unsigned long mount_opt; + + unsigned long compress_type:4; + unsigned int compress_level; + u32 commit_interval; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ + u64 max_inline; + + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; + wait_queue_head_t async_submit_wait; + + /* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. + * + * Because we do not clear the flags for ever, so we needn't use + * the lock on the read side. + * + * We also needn't use the lock when we mount the fs, because + * there is no other task which will update the flag. + */ + spinlock_t super_lock; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; + struct super_block *sb; + struct inode *btree_inode; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; + + /* + * This is taken to make sure we don't set block groups ro after the + * free space cache has been allocated on them. + */ + struct mutex ro_block_group_mutex; + + /* + * This is used during read/modify/write to make sure no two ios are + * trying to mod the same stripe at the same time. + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + + /* + * This protects the ordered operations list only while we are + * processing all of the entries on it. This way we make sure the + * commit code doesn't find the list temporarily empty because another + * function happens to be doing non-waiting preflush before jumping + * into the main commit. + */ + struct mutex ordered_operations_mutex; + + struct rw_semaphore commit_root_sem; + + struct rw_semaphore cleanup_work_sem; + + struct rw_semaphore subvol_sem; + + spinlock_t trans_lock; + /* + * The reloc mutex goes with the trans lock, it is taken during commit + * to protect us from the relocation code. + */ + struct mutex reloc_mutex; + + struct list_head trans_list; + struct list_head dead_roots; + struct list_head caching_block_groups; + + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + atomic_t nr_delayed_iputs; + wait_queue_head_t delayed_iputs_wait; + + atomic64_t tree_mod_seq; + + /* This protects tree_mod_log and tree_mod_seq_list */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; + + atomic_t async_delalloc_pages; + + /* This is used to protect the following list -- ordered_roots. */ + spinlock_t ordered_root_lock; + + /* + * All fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * + * These can span multiple transactions and basically include every + * dirty data page that isn't from nodatacow. + */ + struct list_head ordered_roots; + + struct mutex delalloc_root_mutex; + spinlock_t delalloc_root_lock; + /* All fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; + + /* + * There is a pool of worker threads for checksumming during writes and + * a pool for checksumming after reads. This is because readers can + * run with FS locks held, and the writers may be waiting for those + * locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other two. + */ + struct btrfs_workqueue *workers; + struct btrfs_workqueue *hipri_workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; + struct workqueue_struct *rmw_workers; + struct workqueue_struct *compressed_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *caching_workers; + + /* + * Fixup workers take dirty pages that didn't properly go through the + * cow mechanism and make them safe to write. It happens for the + * sys_munmap function call path. + */ + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; + + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + u32 thread_pool_size; + + struct kobject *space_info_kobj; + struct kobject *qgroups_kobj; + struct kobject *discard_kobj; + + /* Used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + struct percpu_counter ordered_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + + /* + * The space_info list is effectively read only after initial setup. + * It is populated at mount time and cleaned up after all block groups + * are removed. RCU is used to protect it. + */ + struct list_head space_info; + + struct btrfs_space_info *data_sinfo; + + struct reloc_control *reloc_ctl; + + /* data_alloc_cluster is only used in ssd_spread mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* All metadata allocations go through this cluster. */ + struct btrfs_free_cluster meta_alloc_cluster; + + /* Auto defrag inodes go here. */ + spinlock_t defrag_inodes_lock; + struct rb_root defrag_inodes; + atomic_t defrag_running; + + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; + /* + * These three are in extended format (availability of single chunks is + * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted + * by corresponding BTRFS_BLOCK_GROUP_* bits) + */ + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + + /* Balance state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; + + /* Cancellation requests for chunk relocation */ + atomic_t reloc_cancel_req; + + u32 data_chunk_allocations; + u32 metadata_ratio; + + void *bdev_holder; + + /* Private scrub information */ + struct mutex scrub_lock; + atomic_t scrubs_running; + atomic_t scrub_pause_req; + atomic_t scrubs_paused; + atomic_t scrub_cancel_req; + wait_queue_head_t scrub_pause_wait; + /* + * The worker pointers are NULL iff the refcount is 0, ie. scrub is not + * running. + */ + refcount_t scrub_workers_refcnt; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity_workers; + struct btrfs_subpage_info *subpage_info; + + struct btrfs_discard_ctl discard_ctl; + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* Is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* Holds configuration and tracking. Protected by qgroup_lock. */ + struct rb_root qgroup_tree; + spinlock_t qgroup_lock; + + /* + * Used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + + /* + * Protect user change for quota operations. If a transaction is needed, + * it must be started before locking this lock. + */ + struct mutex qgroup_ioctl_lock; + + /* List of dirty qgroups to be written at next commit. */ + struct list_head dirty_qgroups; + + /* Used by qgroup for an efficient tree traversal. */ + u64 qgroup_seq; + + /* Qgroup rescan items. */ + /* Protects the progress item */ + struct mutex qgroup_rescan_lock; + struct btrfs_key qgroup_rescan_progress; + struct btrfs_workqueue *qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; + /* Protected by qgroup_rescan_lock */ + bool qgroup_rescan_running; + u8 qgroup_drop_subtree_thres; + + /* Filesystem state */ + unsigned long fs_state; + + struct btrfs_delayed_root *delayed_root; + + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + /* Entries are eb->start / sectorsize */ + struct radix_tree_root buffer_radix; + + /* Next backup root to be overwritten */ + int backup_root_index; + + /* Device replace state */ + struct btrfs_dev_replace dev_replace; + + struct semaphore uuid_tree_rescan_sem; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; + struct work_struct preempt_reclaim_work; + + /* Reclaim partially filled block groups in the background */ + struct work_struct reclaim_bgs_work; + struct list_head reclaim_bgs; + int bg_reclaim_threshold; + + spinlock_t unused_bgs_lock; + struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; + /* Protect block groups that are going to be deleted */ + struct mutex reclaim_bgs_lock; + + /* Cached block sizes */ + u32 nodesize; + u32 sectorsize; + /* ilog2 of sectorsize, use to avoid 64bit division */ + u32 sectorsize_bits; + u32 csum_size; + u32 csums_per_leaf; + u32 stripesize; + + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; + + struct crypto_shash *csum_shash; + + /* Type of exclusive operation running, protected by super_lock */ + enum btrfs_exclusive_operation exclusive_operation; + + /* + * Zone size > 0 when in ZONED mode, otherwise it's used for a check + * if the mode is enabled + */ + u64 zone_size; + + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; + + /* + * Start of the dedicated data relocation block group, protected by + * relocation_bg_lock. + */ + spinlock_t relocation_bg_lock; + u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; + + u64 nr_global_roots; + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; + + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; + + /* + * Last generation where we dropped a non-relocation root. + * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() + * to change it and to read it, respectively. + */ + u64 last_root_drop_gen; + + /* + * Annotations for transaction events (structures are empty when + * compiled without lockdep). + */ + struct lockdep_map btrfs_trans_num_writers_map; + struct lockdep_map btrfs_trans_num_extwriters_map; + struct lockdep_map btrfs_state_change_map[4]; + struct lockdep_map btrfs_trans_pending_ordered_map; + struct lockdep_map btrfs_ordered_extent_map; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; + struct rb_root block_tree; +#endif + +#ifdef CONFIG_BTRFS_DEBUG + struct kobject *debug_kobj; + struct list_head allocated_roots; + + spinlock_t eb_leak_lock; + struct list_head allocated_ebs; +#endif +}; + +static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, + u64 gen) +{ + WRITE_ONCE(fs_info->last_root_drop_gen, gen); +} + +static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_root_drop_gen); +} + +/* + * Take the number of bytes to be checksummed and figure out how many leaves + * it would require to store the csums for that many bytes. + */ +static inline u64 btrfs_csum_bytes_to_leaves( + const struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; + + return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); +} + +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; +} + +/* + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. + */ +static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; +} + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ + sizeof(struct btrfs_item)) + +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return fs_info->zone_size > 0; +} + +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + +/* Compatibility and incompatibility defines */ +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); + +#define __btrfs_fs_incompat(fs_info, flags) \ + (!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags))) + +#define __btrfs_fs_compat_ro(fs_info, flags) \ + (!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags))) + +#define btrfs_set_fs_incompat(__fs_info, opt) \ + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_clear_fs_incompat(__fs_info, opt) \ + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_fs_incompat(fs_info, opt) \ + __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +#define btrfs_set_fs_compat_ro(__fs_info, opt) \ + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_fs_compat_ro(fs_info, opt) \ + __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ + BTRFS_MOUNT_##opt) + +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ +} while (0) + +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ +} while (0) + +static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +{ + /* Do it this way so we only ever do one test_bit in the normal case. */ + if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { + if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) + return 2; + return 1; + } + return 0; +} + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, + * since setting and checking for SB_RDONLY in the superblock's flags is not + * atomic. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || + btrfs_fs_closing(fs_info); +} + +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + +#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ + &(fs_info)->fs_state))) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + +#define EXPORT_FOR_TESTS + +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); +} + +void btrfs_test_destroy_inode(struct inode *inode); + +#else + +#define EXPORT_FOR_TESTS static + +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return 0; +} +#endif + +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 0eeb5ea87894..b65c45b5d681 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -4,14 +4,20 @@ */ #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "inode-item.h" #include "disk-io.h" #include "transaction.h" #include "print-tree.h" +#include "space-info.h" +#include "accessors.h" +#include "extent-tree.h" +#include "file-item.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len) + int slot, + const struct fscrypt_str *name) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -27,9 +33,10 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, len = btrfs_inode_ref_name_len(leaf, ref); name_ptr = (unsigned long)(ref + 1); cur_offset += len + sizeof(*ref); - if (len != name_len) + if (len != name->len) continue; - if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + if (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0) return ref; } return NULL; @@ -37,7 +44,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len) + const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -60,9 +67,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( name_ptr = (unsigned long)(&extref->name); ref_name_len = btrfs_inode_extref_name_len(leaf, extref); - if (ref_name_len == name_len && + if (ref_name_len == name->len && btrfs_inode_extref_parent(leaf, extref) == ref_objectid && - (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) + (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0)) return extref; cur_offset += ref_name_len + sizeof(*extref); @@ -75,7 +83,7 @@ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const char *name, int name_len, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow) { @@ -84,7 +92,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) @@ -92,13 +100,13 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, if (ret > 0) return NULL; return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len); + ref_objectid, name); } static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { @@ -107,14 +115,14 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_inode_extref *extref; struct extent_buffer *leaf; int ret; - int del_len = name_len + sizeof(*extref); + int del_len = name->len + sizeof(*extref); unsigned long ptr; unsigned long item_start; u32 item_size; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -132,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * readonly. */ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len); + ref_objectid, name); if (!extref) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; @@ -168,8 +176,7 @@ out: } int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; @@ -182,7 +189,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, u32 sub_item_len; int ret; int search_ext_refs = 0; - int del_len = name_len + sizeof(*ref); + int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; @@ -201,8 +208,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } - ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name, - name_len); + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); if (!ref) { ret = -ENOENT; search_ext_refs = 1; @@ -219,7 +225,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } ptr = (unsigned long)ref; - sub_item_len = name_len + sizeof(*ref); + sub_item_len = name->len + sizeof(*ref); item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); @@ -233,7 +239,7 @@ out: * name in our ref array. Find and remove the extended * inode ref then. */ - return btrfs_del_inode_extref(trans, root, name, name_len, + return btrfs_del_inode_extref(trans, root, name, inode_objectid, ref_objectid, index); } @@ -247,12 +253,13 @@ out: */ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 index) + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, + u64 index) { struct btrfs_inode_extref *extref; int ret; - int ins_len = name_len + sizeof(*extref); + int ins_len = name->len + sizeof(*extref); unsigned long ptr; struct btrfs_path *path; struct btrfs_key key; @@ -260,7 +267,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -272,7 +279,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, - name, name_len)) + name)) goto out; btrfs_extend_item(path, ins_len); @@ -286,12 +293,12 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; extref = (struct btrfs_inode_extref *)ptr; - btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len); btrfs_set_inode_extref_index(path->nodes[0], extref, index); btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); ptr = (unsigned long)&extref->name; - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -301,8 +308,7 @@ out: /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -311,7 +317,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; unsigned long ptr; int ret; - int ins_len = name_len + sizeof(*ref); + int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; @@ -327,7 +333,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EEXIST) { u32 old_size; ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len); + name); if (ref) goto out; @@ -336,7 +342,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); ret = 0; @@ -344,7 +350,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EOVERFLOW) { if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len)) + name)) ret = -EEXIST; else ret = -EMLINK; @@ -353,11 +359,11 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, } else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); } - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -370,7 +376,6 @@ out: if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) ret = btrfs_insert_inode_extref(trans, root, name, - name_len, inode_objectid, ref_objectid, index); } diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index a8fc16d0147f..b80aeb715701 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -64,33 +64,31 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index); + struct btrfs_root *root, const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); -int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); struct btrfs_inode_extref *btrfs_lookup_inode_extref( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const char *name, int name_len, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len); + int slot, + const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len); + const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5a54bb93c413..8bcad9940154 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -43,7 +43,7 @@ #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" -#include "volumes.h" +#include "bio.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" @@ -55,6 +55,21 @@ #include "zoned.h" #include "subpage.h" #include "inode-item.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "dir-item.h" +#include "file-item.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "file.h" +#include "acl.h" +#include "relocation.h" +#include "verity.h" +#include "super.h" +#include "orphan.h" struct btrfs_iget_args { u64 ino; @@ -69,7 +84,7 @@ struct btrfs_dio_data { }; struct btrfs_dio_private { - struct inode *inode; + struct btrfs_inode *inode; /* * Since DIO can use anonymous page, we cannot use page_offset() to @@ -107,13 +122,9 @@ static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_trans_handle_cachep; -struct kmem_cache *btrfs_path_cachep; -struct kmem_cache *btrfs_free_space_cachep; -struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); -static int btrfs_truncate(struct inode *inode, bool skip_writeback); +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -125,6 +136,32 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type); +static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, + u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) +{ + struct btrfs_root *root = inode->root; + const u32 csum_size = root->fs_info->csum_size; + + /* Output without objectid, which is more meaningful */ + if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + btrfs_warn_rl(root->fs_info, +"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } else { + btrfs_warn_rl(root->fs_info, +"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } +} + /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * @@ -135,27 +172,27 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { - if (!inode_trylock_shared(inode)) + if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0; } - inode_lock_shared(inode); + inode_lock_shared(&inode->vfs_inode); } else { if (ilock_flags & BTRFS_ILOCK_TRY) { - if (!inode_trylock(inode)) + if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0; } - inode_lock(inode); + inode_lock(&inode->vfs_inode); } if (ilock_flags & BTRFS_ILOCK_MMAP) - down_write(&BTRFS_I(inode)->i_mmap_lock); + down_write(&inode->i_mmap_lock); return 0; } @@ -165,14 +202,14 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. */ -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_MMAP) - up_write(&BTRFS_I(inode)->i_mmap_lock); + up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) - inode_unlock_shared(inode); + inode_unlock_shared(&inode->vfs_inode); else - inode_unlock(inode); + inode_unlock(&inode->vfs_inode); } /* @@ -249,7 +286,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } -static int btrfs_dirty_inode(struct inode *inode); +static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) @@ -483,7 +520,7 @@ struct async_extent { }; struct async_chunk { - struct inode *inode; + struct btrfs_inode *inode; struct page *locked_page; u64 start; u64 end; @@ -611,8 +648,8 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, */ static noinline int compress_file_range(struct async_chunk *async_chunk) { - struct inode *inode = async_chunk->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = async_chunk->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -629,8 +666,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) int compressed_extents = 0; int redirty = 0; - inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, - SZ_16K); + inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* * We need to save i_size before now because it could change in between @@ -642,7 +678,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) * does that for us. */ barrier(); - i_size = i_size_read(inode); + i_size = i_size_read(&inode->vfs_inode); barrier(); actual_end = min_t(u64, i_size, end + 1); again: @@ -671,7 +707,7 @@ again: * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; /* @@ -695,7 +731,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(BTRFS_I(inode), start, end)) { + if (inode_need_compress(inode, start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -704,10 +740,10 @@ again: goto cont; } - if (BTRFS_I(inode)->defrag_compress) - compress_type = BTRFS_I(inode)->defrag_compress; - else if (BTRFS_I(inode)->prop_compress) - compress_type = BTRFS_I(inode)->prop_compress; + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; /* * we need to call clear_page_dirty_for_io on each @@ -722,14 +758,14 @@ again: * has moved, the end is the original one. */ if (!redirty) { - extent_range_clear_dirty_for_io(inode, start, end); + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); redirty = 1; } /* Compression level is applied here and only here */ ret = btrfs_compress_pages( compress_type | (fs_info->compress_level << 4), - inode->i_mapping, start, + inode->vfs_inode.i_mapping, start, pages, &nr_pages, &total_in, @@ -758,12 +794,12 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(BTRFS_I(inode), actual_end, + ret = cow_file_range_inline(inode, actual_end, 0, BTRFS_COMPRESS_NONE, NULL, false); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(BTRFS_I(inode), actual_end, + ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, false); @@ -786,7 +822,7 @@ cont: * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + extent_clear_unlock_delalloc(inode, start, end, NULL, clear_flags, PAGE_UNLOCK | @@ -861,8 +897,8 @@ cont: /* flag the file so we don't compress in the future */ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(BTRFS_I(inode)->prop_compress)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + !(inode->prop_compress)) { + inode->flags |= BTRFS_INODE_NOCOMPRESS; } } cleanup_and_bail_uncompressed: @@ -880,7 +916,7 @@ cleanup_and_bail_uncompressed: } if (redirty) - extent_range_redirty_for_io(inode, start, end); + extent_range_redirty_for_io(&inode->vfs_inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); compressed_extents++; @@ -1076,7 +1112,7 @@ out_free: */ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_extent *async_extent; u64 alloc_hint = 0; @@ -1565,7 +1601,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, */ ihold(&inode->vfs_inode); async_chunk[i].async_cow = ctx; - async_chunk[i].inode = &inode->vfs_inode; + async_chunk[i].inode = inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; @@ -1673,9 +1709,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, int ret; LIST_HEAD(list); - ret = btrfs_lookup_csums_range(csum_root, bytenr, - bytenr + num_bytes - 1, &list, 0, - nowait); + ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, + &list, 0, nowait); if (ret == 0 && list_empty(&list)) return 0; @@ -1733,7 +1768,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * when starting writeback. */ count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0); + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -2240,10 +2275,10 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page return ret; } -void btrfs_split_delalloc_extent(struct inode *inode, +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; /* not delalloc, ignore it */ @@ -2267,9 +2302,9 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; } - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); } /* @@ -2277,10 +2312,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * that are just merged onto old extents, such as when we are doing sequential * writes, so we can properly account for the metadata space we'll need. */ -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 new_size, old_size; u32 num_extents; @@ -2295,9 +2330,9 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= fs_info->max_extent_size) { - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); return; } @@ -2326,22 +2361,20 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, if (count_max_extents(fs_info, new_size) >= num_extents) return; - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); } static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); + if (list_empty(&inode->delalloc_inodes)) { + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); root->nr_delalloc_inodes++; if (root->nr_delalloc_inodes == 1) { spin_lock(&fs_info->delalloc_root_lock); @@ -2354,7 +2387,6 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, spin_unlock(&root->delalloc_lock); } - void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode) { @@ -2387,10 +2419,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. */ -void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, +void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); @@ -2400,14 +2432,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); + bool do_list = !btrfs_is_free_space_inode(inode); - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, num_extents); + spin_unlock(&inode->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) @@ -2415,22 +2447,21 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->delalloc_bytes += len; + spin_lock(&inode->lock); + inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) - BTRFS_I(inode)->defrag_bytes += len; + inode->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) btrfs_add_delalloc_inodes(root, inode); - spin_unlock(&BTRFS_I(inode)->lock); + spin_unlock(&inode->lock); } if (!(state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - - state->start; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->new_delalloc_bytes += state->end + 1 - state->start; + spin_unlock(&inode->lock); } } @@ -2438,11 +2469,10 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * Once a range is no longer delalloc this function ensures that proper * accounting happens. */ -void btrfs_clear_delalloc_extent(struct inode *vfs_inode, +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { - struct btrfs_inode *inode = BTRFS_I(vfs_inode); - struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); @@ -2513,10 +2543,9 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); + return btrfs_csum_one_bio(inode, bio, (u64)-1, false); } /* @@ -2694,14 +2723,13 @@ out: return errno_to_blk_status(ret); } -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) +void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *bi = BTRFS_I(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = extract_ordered_extent(bi, bio, + ret = extract_ordered_extent(inode, bio, page_offset(bio_first_bvec_all(bio)->bv_page)); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); @@ -2717,15 +2745,14 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro * Csum items for reloc roots have already been cloned at this point, * so they are handled as part of the no-checksum case. */ - if (!(bi->flags & BTRFS_INODE_NODATASUM) && + if (!(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(bi->root)) { - if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - btrfs_submit_bio_start)) + !btrfs_is_data_reloc_root(inode->root)) { + if (!atomic_read(&inode->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; - ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); + ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; @@ -2734,10 +2761,10 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro btrfs_submit_bio(fs_info, bio, mirror_num); } -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, +void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; if (compress_type != BTRFS_COMPRESS_NONE) { @@ -2745,7 +2772,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * btrfs_submit_compressed_read will handle completing the bio * if there were any errors, so just return here. */ - btrfs_submit_compressed_read(inode, bio, mirror_num); + btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); return; } @@ -2756,7 +2783,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * Lookup bio sums does extra checks around whether we need to csum or * not, which is why we ignore skip_sum here. */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); + ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; @@ -2859,7 +2886,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2878,7 +2905,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; - inode = BTRFS_I(fixup->inode); + inode = fixup->inode; page_start = page_offset(page); page_end = page_offset(page) + PAGE_SIZE - 1; @@ -2988,7 +3015,7 @@ out_page: * that could need flushing space. Recursing back to fixup worker would * deadlock. */ - btrfs_add_delayed_iput(&inode->vfs_inode); + btrfs_add_delayed_iput(inode); } /* @@ -3037,7 +3064,7 @@ int btrfs_writepage_cow_fixup(struct page *page) get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; - fixup->inode = inode; + fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EAGAIN; @@ -3459,10 +3486,10 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of * When csum mismatch is detected, we will also report the error and fill the * corrupted range with zero. (Thus it needs the extra parameters) */ -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, +int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u32 len = fs_info->sectorsize; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; @@ -3476,8 +3503,7 @@ int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), - bbio->file_offset + bio_offset, + btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, @@ -3502,10 +3528,10 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; unsigned int result = 0; @@ -3518,7 +3544,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, if (bbio->csum == NULL) return 0; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) return 0; if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) @@ -3563,18 +3589,17 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, * the inode to the delayed iput machinery. Delayed iputs are processed at * transaction commit time/superblock commit/cleaner kthread. */ -void btrfs_add_delayed_iput(struct inode *inode) +void btrfs_add_delayed_iput(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *binode = BTRFS_I(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (atomic_add_unless(&inode->i_count, -1, 1)) + if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); spin_lock(&fs_info->delayed_iput_lock); - ASSERT(list_empty(&binode->delayed_iput)); - list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + ASSERT(list_empty(&inode->delayed_iput)); + list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); @@ -3617,7 +3642,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->delayed_iput_lock); } -/** +/* * Wait for flushing all delayed iputs * * @fs_info: the filesystem @@ -4262,7 +4287,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; @@ -4280,8 +4305,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto out; } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto err; @@ -4309,12 +4333,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, } } - ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, - dir_ino, &index); + ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { btrfs_info(fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", - name_len, name, ino, dir_ino); + name->len, name->name, ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } @@ -4335,10 +4358,8 @@ skip_backref: * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, - index); + btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, dir, index); } /* @@ -4356,7 +4377,7 @@ err: if (ret) goto out; - btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); @@ -4369,10 +4390,11 @@ out: int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len) + const struct fscrypt_str *name) { int ret; - ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); + + ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode->root, inode); @@ -4388,9 +4410,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, * plenty of slack room in the global reserve to migrate, otherwise we cannot * allow the unlink to occur. */ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) +static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; /* * 1 for the possible orphan item @@ -4408,47 +4430,62 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; + struct fscrypt_name fname; - trans = __unlink_start_trans(dir); - if (IS_ERR(trans)) - return PTR_ERR(trans); + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; + + /* This needs to handle no-key deletions later on */ + + trans = __unlink_start_trans(BTRFS_I(dir)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fscrypt_free; + } btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(d_inode(dentry)), dentry->d_name.name, - dentry->d_name.len); + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &fname.disk_name); if (ret) - goto out; + goto end_trans; if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) - goto out; + goto end_trans; } -out: +end_trans: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); +fscrypt_free: + fscrypt_free_filename(&fname); return ret; } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct inode *dir, struct dentry *dentry) + struct btrfs_inode *dir, struct dentry *dentry) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - const char *name = dentry->d_name.name; - int name_len = dentry->d_name.len; u64 index; int ret; u64 objectid; - u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + u64 dir_ino = btrfs_ino(dir); + struct fscrypt_name fname; + + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); + if (ret) + return ret; + + /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { objectid = inode->root->root_key.objectid; @@ -4456,15 +4493,18 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, objectid = inode->location.objectid; } else { WARN_ON(1); + fscrypt_free_filename(&fname); return -EINVAL; } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); + &fname.disk_name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -4490,8 +4530,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, - name, name_len); + di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4508,28 +4547,29 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, name, name_len); + &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } - ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); + ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); - inode_inc_iversion(dir); - dir->i_mtime = current_time(dir); - dir->i_ctime = dir->i_mtime; - ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); + inode_inc_iversion(&dir->vfs_inode); + dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode); + dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime; + ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); + fscrypt_free_filename(&fname); return ret; } @@ -4543,6 +4583,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; int ret; @@ -4553,7 +4594,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) /* Make sure this root isn't set as the default subvol */ dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, - dir_id, "default", 7, 0); + dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { @@ -4652,10 +4693,10 @@ again: spin_unlock(&root->inode_lock); } -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -4712,7 +4753,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); + btrfs_record_snapshot_destroy(trans, dir); ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { @@ -4792,6 +4833,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; + struct fscrypt_name fname; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4801,15 +4843,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } - return btrfs_delete_subvolume(dir, dentry); + return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } - trans = __unlink_start_trans(dir); - if (IS_ERR(trans)) - return PTR_ERR(trans); + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (err) + return err; + + /* This needs to handle no-key deletions later on */ + + trans = __unlink_start_trans(BTRFS_I(dir)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_notrans; + } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, dir, dentry); + err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } @@ -4820,9 +4870,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(d_inode(dentry)), dentry->d_name.name, - dentry->d_name.len); + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &fname.disk_name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -4841,7 +4890,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); +out_notrans: btrfs_btree_balance_dirty(fs_info); + fscrypt_free_filename(&fname); return err; } @@ -5210,7 +5261,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode_dio_wait(inode); - ret = btrfs_truncate(inode, newsize == oldsize); + ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { int err; @@ -5253,7 +5304,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (attr->ia_valid) { setattr_copy(mnt_userns, inode, attr); inode_inc_iversion(inode); - err = btrfs_dirty_inode(inode); + err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); @@ -5511,22 +5562,27 @@ no_delete: * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN. */ -static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, +static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; struct btrfs_dir_item *di; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret = 0; + struct fscrypt_name fname; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - name, namelen, 0); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); + if (ret) + goto out; + + /* This needs to handle no-key deletions later on */ + + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), + &fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -5538,12 +5594,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name, btrfs_ino(BTRFS_I(dir)), + __func__, fname.disk_name.name, btrfs_ino(dir), location->objectid, location->type, location->offset); } if (!ret) - *type = btrfs_dir_type(path->nodes[0], di); + *type = btrfs_dir_ftype(path->nodes[0], di); out: + fscrypt_free_filename(&fname); btrfs_free_path(path); return ret; } @@ -5554,7 +5611,7 @@ out: * is kind of like crossing a mount point. */ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, - struct inode *dir, + struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, struct btrfs_root **sub_root) @@ -5566,6 +5623,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key key; int ret; int err = 0; + struct fscrypt_name fname; + + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); + if (ret) + return ret; path = btrfs_alloc_path(); if (!path) { @@ -5574,7 +5636,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, } err = -ENOENT; - key.objectid = BTRFS_I(dir)->root->root_key.objectid; + key.objectid = dir->root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; @@ -5587,13 +5649,12 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || - btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || + btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) goto out; - ret = memcmp_extent_buffer(leaf, dentry->d_name.name, - (unsigned long)(ref + 1), - dentry->d_name.len); + ret = memcmp_extent_buffer(leaf, fname.disk_name.name, + (unsigned long)(ref + 1), fname.disk_name.len); if (ret) goto out; @@ -5612,19 +5673,20 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, err = 0; out: btrfs_free_path(path); + fscrypt_free_filename(&fname); return err; } -static void inode_tree_add(struct inode *inode) +static void inode_tree_add(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; - struct rb_node *new = &BTRFS_I(inode)->rb_node; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct rb_node *new = &inode->rb_node; + u64 ino = btrfs_ino(inode); - if (inode_unhashed(inode)) + if (inode_unhashed(&inode->vfs_inode)) return; parent = NULL; spin_lock(&root->inode_lock); @@ -5736,7 +5798,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, ret = btrfs_read_locked_inode(inode, path); if (!ret) { - inode_tree_add(inode); + inode_tree_add(BTRFS_I(inode)); unlock_new_inode(inode); } else { iget_failed(inode); @@ -5816,7 +5878,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); + ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); @@ -5837,7 +5899,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - ret = fixup_tree_root_location(fs_info, dir, dentry, + ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, &location, &sub_root); if (ret < 0) { if (ret != -ENOENT) @@ -5985,6 +6047,7 @@ again: btrfs_for_each_slot(root, &key, &found_key, path, ret) { struct dir_entry *entry; struct extent_buffer *leaf = path->nodes[0]; + u8 ftype; if (found_key.objectid != key.objectid) break; @@ -6008,13 +6071,13 @@ again: goto again; } + ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di)); entry = addr; - put_unaligned(name_len, &entry->name_len); name_ptr = (char *)(entry + 1); - read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), - name_len); - put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), - &entry->type); + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + put_unaligned(name_len, &entry->name_len); + put_unaligned(fs_ftype_to_dtype(ftype), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); put_unaligned(found_key.offset, &entry->offset); @@ -6072,21 +6135,21 @@ err: * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -static int btrfs_dirty_inode(struct inode *inode) +static int btrfs_dirty_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; int ret; - if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) + if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags)) return 0; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); @@ -6094,10 +6157,10 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); } btrfs_end_transaction(trans); - if (BTRFS_I(inode)->delayed_node) + if (inode->delayed_node) btrfs_balance_delayed_items(fs_info); return ret; @@ -6124,7 +6187,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, inode->i_mtime = *now; if (flags & S_ATIME) inode->i_atime = *now; - return dirty ? btrfs_dirty_inode(inode) : 0; + return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } /* @@ -6220,9 +6283,18 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, struct inode *inode = args->inode; int ret; + if (!args->orphan) { + ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0, + &args->fname); + if (ret) + return ret; + } + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); - if (ret) + if (ret) { + fscrypt_free_filename(&args->fname); return ret; + } /* 1 to add inode item */ *trans_num_items = 1; @@ -6262,6 +6334,7 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) { posix_acl_release(args->acl); posix_acl_release(args->default_acl); + fscrypt_free_filename(&args->fname); } /* @@ -6269,27 +6342,27 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) * * Currently only the compression flags and the cow flags are inherited. */ -static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir) { unsigned int flags; - flags = BTRFS_I(dir)->flags; + flags = dir->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags |= BTRFS_INODE_NOCOMPRESS; } else if (flags & BTRFS_INODE_COMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; } if (flags & BTRFS_INODE_NODATACOW) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + inode->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->vfs_inode.i_mode)) + inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(inode); + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6297,8 +6370,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, { struct inode *dir = args->dir; struct inode *inode = args->inode; - const char *name = args->orphan ? NULL : args->dentry->d_name.name; - int name_len = args->orphan ? 0 : args->dentry->d_name.len; + const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_root *root; struct btrfs_inode_item *inode_item; @@ -6349,7 +6421,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, * change it now without compatibility issues. */ if (!args->subvol) - btrfs_inherit_iflags(inode, dir); + btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) @@ -6399,7 +6471,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, sizes[1] = 2 + sizeof(*ref); } else { key[1].offset = btrfs_ino(BTRFS_I(dir)); - sizes[1] = name_len + sizeof(*ref); + sizes[1] = name->len + sizeof(*ref); } } @@ -6438,10 +6510,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_ref_index(path->nodes[0], ref, 0); write_extent_buffer(path->nodes[0], "..", ptr, 2); } else { - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, + name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->dir_index); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, + name->len); } } @@ -6491,7 +6565,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - inode_tree_add(inode); + inode_tree_add(BTRFS_I(inode)); trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); @@ -6502,7 +6576,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, ret = btrfs_orphan_add(trans, BTRFS_I(inode)); } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len, 0, BTRFS_I(inode)->dir_index); + 0, BTRFS_I(inode)->dir_index); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -6531,7 +6605,7 @@ out: */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const char *name, int name_len, int add_backref, u64 index) + const struct fscrypt_str *name, int add_backref, u64 index) { int ret = 0; struct btrfs_key key; @@ -6550,17 +6624,17 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, - index, name, name_len); + index, name); } else if (add_backref) { - ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, - parent_ino, index); + ret = btrfs_insert_inode_ref(trans, root, name, + ino, parent_ino, index); } /* Nothing to clean up yet */ if (ret) return ret; - ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, + ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; @@ -6570,7 +6644,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, } btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + - name_len * 2); + name->len * 2); inode_inc_iversion(&parent_inode->vfs_inode); /* * If we are replaying a log tree, we do not want to update the mtime @@ -6595,15 +6669,15 @@ fail_dir_item: int err; err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, - &local_index, name, name_len); + &local_index, name); if (err) btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; - err = btrfs_del_inode_ref(trans, root, name, name_len, - ino, parent_ino, &local_index); + err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, + &local_index); if (err) btrfs_abort_transaction(trans, err); } @@ -6686,6 +6760,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct fscrypt_name fname; u64 index; int err; int drop_inode = 0; @@ -6697,6 +6772,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto fail; + err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; @@ -6723,7 +6802,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - dentry->d_name.name, dentry->d_name.len, 1, index); + &fname.disk_name, 1, index); if (err) { drop_inode = 1; @@ -6747,6 +6826,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } fail: + fscrypt_free_filename(&fname); if (trans) btrfs_end_transaction(trans); if (drop_inode) { @@ -6773,7 +6853,6 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, static noinline int uncompress_inline(struct btrfs_path *path, struct page *page, - size_t pg_offset, u64 extent_offset, struct btrfs_file_extent_item *item) { int ret; @@ -6784,7 +6863,6 @@ static noinline int uncompress_inline(struct btrfs_path *path, unsigned long ptr; int compress_type; - WARN_ON(pg_offset != 0); compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); @@ -6796,8 +6874,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, - extent_offset, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6807,25 +6884,52 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size + pg_offset < PAGE_SIZE) - memzero_page(page, pg_offset + max_size, - PAGE_SIZE - max_size - pg_offset); + if (max_size < PAGE_SIZE) + memzero_page(page, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } -/** - * btrfs_get_extent - Lookup the first extent overlapping a range in a file. +static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, + struct page *page) +{ + struct btrfs_file_extent_item *fi; + void *kaddr; + size_t copy_size; + + if (!page || PageUptodate(page)) + return 0; + + ASSERT(page_offset(page) == 0); + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) + return uncompress_inline(path, page, fi); + + copy_size = min_t(u64, PAGE_SIZE, + btrfs_file_extent_ram_bytes(path->nodes[0], fi)); + kaddr = kmap_local_page(page); + read_extent_buffer(path->nodes[0], kaddr, + btrfs_file_extent_inline_start(fi), copy_size); + kunmap_local(kaddr); + if (copy_size < PAGE_SIZE) + memzero_page(page, copy_size, PAGE_SIZE - copy_size); + return 0; +} + +/* + * Lookup the first extent overlapping a range in a file. + * * @inode: file to search in * @page: page to read extent data into if the extent is inline * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * - * This returns the first &struct extent_map which overlaps with the given - * range, reading it from the B-tree and caching it if necessary. Note that - * there may be more extents which overlap the given range after the returned - * extent_map. + * Return the first &struct extent_map which overlaps the given range, reading + * it from the B-tree and caching it if necessary. Note that there may be more + * extents which overlap the given range after the returned extent_map. * * If @page is not NULL and the extent is inline, this also reads the extent * data directly into the page and marks the extent up to date in the io_tree. @@ -6966,51 +7070,33 @@ next: goto insert; } - btrfs_extent_item_to_extent_map(inode, path, item, !page, em); + btrfs_extent_item_to_extent_map(inode, path, item, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - unsigned long ptr; - char *map; - size_t size; - size_t extent_offset; - size_t copy_size; - - if (!page) - goto out; + /* + * Inline extent can only exist at file offset 0. This is + * ensured by tree-checker and inline extent creation path. + * Thus all members representing file offsets should be zero. + */ + ASSERT(pg_offset == 0); + ASSERT(extent_start == 0); + ASSERT(em->start == 0); - size = btrfs_file_extent_ram_bytes(leaf, item); - extent_offset = page_offset(page) + pg_offset - extent_start; - copy_size = min_t(u64, PAGE_SIZE - pg_offset, - size - extent_offset); - em->start = extent_start + extent_offset; - em->len = ALIGN(copy_size, fs_info->sectorsize); - em->orig_block_len = em->len; - em->orig_start = em->start; - ptr = btrfs_file_extent_inline_start(item) + extent_offset; + /* + * btrfs_extent_item_to_extent_map() should have properly + * initialized em members already. + * + * Other members are not utilized for inline extents. + */ + ASSERT(em->block_start == EXTENT_MAP_INLINE); + ASSERT(em->len = fs_info->sectorsize); - if (!PageUptodate(page)) { - if (btrfs_file_extent_compression(leaf, item) != - BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, page, pg_offset, - extent_offset, item); - if (ret) - goto out; - } else { - map = kmap_local_page(page); - read_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); - if (pg_offset + copy_size < PAGE_SIZE) { - memset(map + pg_offset + copy_size, 0, - PAGE_SIZE - pg_offset - - copy_size); - } - kunmap_local(map); - } - flush_dcache_page(page); - } + ret = read_inline_extent(inode, path, page); + if (ret < 0) + goto out; goto insert; } not_found: @@ -7255,7 +7341,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, while (1) { if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend)) + if (!try_lock_extent(io_tree, lockstart, lockend, + cached_state)) return -EAGAIN; } else { lock_extent(io_tree, lockstart, lockend, cached_state); @@ -7811,11 +7898,11 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + btrfs_mark_ordered_io_finished(dip->inode, NULL, dip->file_offset, dip->bytes, !dip->bio.bi_status); } else { - unlock_extent(&BTRFS_I(dip->inode)->io_tree, + unlock_extent(&dip->inode->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1, NULL); } @@ -7824,24 +7911,21 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) bio_endio(&dip->bio); } -static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type) +void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); BUG_ON(bio_op(bio) == REQ_OP_WRITE); refcount_inc(&dip->refs); - btrfs_submit_bio(fs_info, bio, mirror_num); + btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, struct btrfs_bio *bbio, const bool uptodate) { - struct inode *inode = dip->inode; + struct inode *inode = &dip->inode->vfs_inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); blk_status_t err = BLK_STS_OK; @@ -7853,16 +7937,15 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, u64 start = bbio->file_offset + offset; if (uptodate && - (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, - bv.bv_offset))) { + (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, + bv.bv_page, bv.bv_offset))) { btrfs_clean_io_failure(BTRFS_I(inode), start, bv.bv_page, bv.bv_offset); } else { int ret; - ret = btrfs_repair_one_sector(inode, bbio, offset, - bv.bv_page, bv.bv_offset, - submit_dio_repair_bio); + ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, + bv.bv_page, bv.bv_offset, false); if (ret) err = errno_to_blk_status(ret); } @@ -7871,11 +7954,11 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, return err; } -static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, - struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, + struct bio *bio, + u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); + return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); } static void btrfs_end_dio_bio(struct btrfs_bio *bbio) @@ -7885,9 +7968,9 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) blk_status_t err = bio->bi_status; if (err) - btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, + btrfs_warn(dip->inode->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), + btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -7897,16 +7980,16 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) if (err) dip->bio.bi_status = err; - btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); + btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); } -static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, +static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, u64 file_offset, int async_submit) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_dio_private *dip = btrfs_bio(bio)->private; blk_status_t ret; @@ -7914,21 +7997,21 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (btrfs_op(bio) == BTRFS_MAP_READ) btrfs_bio(bio)->iter = bio->bi_iter; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) goto map; if (btrfs_op(bio) == BTRFS_MAP_WRITE) { /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && + if (async_submit && !atomic_read(&inode->sync_writers) && btrfs_wq_submit_bio(inode, bio, 0, file_offset, - btrfs_submit_bio_start_direct_io)) + WQ_SUBMIT_DATA_DIO)) return; /* * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); + ret = btrfs_csum_one_bio(inode, bio, file_offset, false); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; @@ -7964,7 +8047,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, struct btrfs_dio_data *dio_data = iter->private; struct extent_map *em = NULL; - dip->inode = inode; + dip->inode = BTRFS_I(inode); dip->file_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; refcount_set(&dip->refs, 1); @@ -8050,7 +8133,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); + btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; @@ -8540,16 +8623,16 @@ out_noreserve: return ret; } -static int btrfs_truncate(struct inode *inode, bool skip_writeback) +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) { struct btrfs_truncate_control control = { - .inode = BTRFS_I(inode), - .ino = btrfs_ino(BTRFS_I(inode)), + .inode = inode, + .ino = btrfs_ino(inode), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, }; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *rsv; int ret; struct btrfs_trans_handle *trans; @@ -8557,7 +8640,8 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + ret = btrfs_wait_ordered_range(&inode->vfs_inode, + inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) return ret; @@ -8616,34 +8700,32 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) while (1) { struct extent_state *cached_state = NULL; - const u64 new_size = inode->i_size; + const u64 new_size = inode->vfs_inode.i_size; const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); + lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. */ - btrfs_drop_extent_map_range(BTRFS_I(inode), + btrfs_drop_extent_map_range(inode, ALIGN(new_size, fs_info->sectorsize), (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); - inode_sub_bytes(inode, control.sub_bytes); - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); + inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); + unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -8674,7 +8756,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -8682,14 +8764,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) ret = PTR_ERR(trans); goto out; } - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + btrfs_inode_safe_disk_i_size_write(inode, 0); } if (trans) { int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret2 = btrfs_update_inode(trans, root, inode); if (ret2 && !ret) ret = ret2; @@ -8715,7 +8797,7 @@ out: * extents beyond i_size to drop. */ if (control.extents_found > 0) - btrfs_set_inode_full_sync(BTRFS_I(inode)); + btrfs_set_inode_full_sync(inode); return ret; } @@ -8784,9 +8866,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); + extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT, NULL); + IO_TREE_INODE_FILE_EXTENT); ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); @@ -8899,10 +8982,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_trans_handle_cachep); - kmem_cache_destroy(btrfs_path_cachep); - kmem_cache_destroy(btrfs_free_space_cachep); - kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } int __init btrfs_init_cachep(void) @@ -8914,30 +8993,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", - sizeof(struct btrfs_trans_handle), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); - if (!btrfs_trans_handle_cachep) - goto fail; - - btrfs_path_cachep = kmem_cache_create("btrfs_path", - sizeof(struct btrfs_path), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_path_cachep) - goto fail; - - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", - sizeof(struct btrfs_free_space), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_cachep) - goto fail; - - btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", - PAGE_SIZE, PAGE_SIZE, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_bitmap_cachep) - goto fail; - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_dio_private, bio), BIOSET_NEED_BVECS)) @@ -9013,6 +9068,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + struct fscrypt_name old_fname, new_fname; + struct fscrypt_str *old_name, *new_name; /* * For non-subvolumes allow exchange only within one subvolume, in the @@ -9024,6 +9081,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + + old_name = &old_fname.disk_name; + new_name = &new_fname.disk_name; + /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9091,10 +9161,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, - old_ino, + ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) @@ -9107,10 +9174,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, - old_dentry->d_name.name, - old_dentry->d_name.len, - new_ino, + ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { @@ -9141,13 +9205,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - old_dentry->d_name.name, - old_dentry->d_name.len, - &old_rename_ctx); + old_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9158,13 +9220,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, - &new_rename_ctx); + new_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9174,16 +9234,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, 0, old_idx); + new_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - old_dentry->d_name.name, - old_dentry->d_name.len, 0, new_idx); + old_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9226,6 +9284,8 @@ out_notrans: old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); + fscrypt_free_filename(&new_fname); + fscrypt_free_filename(&old_fname); return ret; } @@ -9265,6 +9325,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); + struct fscrypt_name old_fname, new_fname; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9281,22 +9342,28 @@ static int btrfs_rename(struct user_namespace *mnt_userns, new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); + if (ret) + return ret; - /* check for collisions, even if the name isn't there */ - ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, - new_dentry->d_name.name, - new_dentry->d_name.len); + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + /* check for collisions, even if the name isn't there */ + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get * eexist without a new_inode */ if (WARN_ON(!new_inode)) { - return ret; + goto out_fscrypt_names; } } else { /* maybe -EOVERFLOW */ - return ret; + goto out_fscrypt_names; } } ret = 0; @@ -9379,11 +9446,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, - old_ino, - btrfs_ino(BTRFS_I(new_dir)), index); + ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name, + old_ino, btrfs_ino(BTRFS_I(new_dir)), + index); if (ret) goto out_fail; } @@ -9402,13 +9467,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, BTRFS_I(old_inode), 1); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), - BTRFS_I(d_inode(old_dentry)), - old_dentry->d_name.name, - old_dentry->d_name.len, - &rename_ctx); + BTRFS_I(d_inode(old_dentry)), + &old_fname.disk_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9422,13 +9485,12 @@ static int btrfs_rename(struct user_namespace *mnt_userns, new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - new_dentry->d_name.name, - new_dentry->d_name.len); + &new_fname.disk_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9440,8 +9502,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, 0, index); + &new_fname.disk_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9476,6 +9537,9 @@ out_notrans: out_whiteout_inode: if (flags & RENAME_WHITEOUT) iput(whiteout_args.inode); +out_fscrypt_names: + fscrypt_free_filename(&old_fname); + fscrypt_free_filename(&new_fname); return ret; } @@ -9595,7 +9659,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, &work->work); } else { ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret || wbc->nr_to_write <= 0) goto out; } @@ -10205,7 +10269,7 @@ static ssize_t btrfs_encoded_read_inline( read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); unlock_extent(io_tree, start, lockend, cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; ret = copy_to_iter(tmp, count, iter); @@ -10265,7 +10329,7 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) pgoff = bvec->bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + if (btrfs_check_data_csum(inode, bbio, bio_offset, bvec->bv_page, pgoff)) return BLK_STS_IOERR; bio_offset += sectorsize; @@ -10408,7 +10472,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, goto out; unlock_extent(io_tree, start, lockend, cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; if (compressed) { @@ -10457,10 +10521,10 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, file_accessed(iocb->ki_filp); - btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); if (iocb->ki_pos >= inode->vfs_inode.i_size) { - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return 0; } start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); @@ -10558,7 +10622,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, if (disk_bytenr == EXTENT_MAP_HOLE) { unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) @@ -10581,7 +10645,7 @@ out_unlock_extent: unlock_extent(io_tree, start, lockend, &cached_state); out_unlock_inode: if (!unlocked) - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } @@ -11247,7 +11311,7 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, spin_unlock(&inode->lock); } -/** +/* * Verify that there are no ordered extents for a given file range. * * @inode: The target inode. diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f897be9ec1e9..7e348bd2ccde 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -50,6 +50,17 @@ #include "delalloc-space.h" #include "block-group.h" #include "subpage.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "dir-item.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "file.h" +#include "scrub.h" +#include "super.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -949,6 +960,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct inode *dir = d_inode(parent->dentry); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct dentry *dentry; + struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); @@ -969,8 +981,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, * check for them now when we can safely fail */ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, - dir->i_ino, name, - namelen); + dir->i_ino, &name_str); if (error) goto out_dput; @@ -991,7 +1002,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - btrfs_inode_unlock(dir, 0); + btrfs_inode_unlock(BTRFS_I(dir), 0); return error; } @@ -1036,908 +1047,6 @@ out: } /* - * Defrag specific helper to get an extent map. - * - * Differences between this and btrfs_get_extent() are: - * - * - No extent_map will be added to inode->extent_tree - * To reduce memory usage in the long run. - * - * - Extra optimization to skip file extents older than @newer_than - * By using btrfs_search_forward() we can skip entire file ranges that - * have extents created in past transactions, because btrfs_search_forward() - * will not visit leaves and nodes with a generation smaller than given - * minimal generation threshold (@newer_than). - * - * Return valid em if we find a file extent matching the requirement. - * Return NULL if we can not find a file extent matching the requirement. - * - * Return ERR_PTR() for error. - */ -static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, - u64 start, u64 newer_than) -{ - struct btrfs_root *root = inode->root; - struct btrfs_file_extent_item *fi; - struct btrfs_path path = { 0 }; - struct extent_map *em; - struct btrfs_key key; - u64 ino = btrfs_ino(inode); - int ret; - - em = alloc_extent_map(); - if (!em) { - ret = -ENOMEM; - goto err; - } - - key.objectid = ino; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; - - if (newer_than) { - ret = btrfs_search_forward(root, &key, &path, newer_than); - if (ret < 0) - goto err; - /* Can't find anything newer */ - if (ret > 0) - goto not_found; - } else { - ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); - if (ret < 0) - goto err; - } - if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { - /* - * If btrfs_search_slot() makes path to point beyond nritems, - * we should not have an empty leaf, as this inode must at - * least have its INODE_ITEM. - */ - ASSERT(btrfs_header_nritems(path.nodes[0])); - path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; - } - btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); - /* Perfect match, no need to go one slot back */ - if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && - key.offset == start) - goto iterate; - - /* We didn't find a perfect match, needs to go one slot back */ - if (path.slots[0] > 0) { - btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); - if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) - path.slots[0]--; - } - -iterate: - /* Iterate through the path to find a file extent covering @start */ - while (true) { - u64 extent_end; - - if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) - goto next; - - btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); - - /* - * We may go one slot back to INODE_REF/XATTR item, then - * need to go forward until we reach an EXTENT_DATA. - * But we should still has the correct ino as key.objectid. - */ - if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) - goto next; - - /* It's beyond our target range, definitely not extent found */ - if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) - goto not_found; - - /* - * | |<- File extent ->| - * \- start - * - * This means there is a hole between start and key.offset. - */ - if (key.offset > start) { - em->start = start; - em->orig_start = start; - em->block_start = EXTENT_MAP_HOLE; - em->len = key.offset - start; - break; - } - - fi = btrfs_item_ptr(path.nodes[0], path.slots[0], - struct btrfs_file_extent_item); - extent_end = btrfs_file_extent_end(&path); - - /* - * |<- file extent ->| | - * \- start - * - * We haven't reached start, search next slot. - */ - if (extent_end <= start) - goto next; - - /* Now this extent covers @start, convert it to em */ - btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); - break; -next: - ret = btrfs_next_item(root, &path); - if (ret < 0) - goto err; - if (ret > 0) - goto not_found; - } - btrfs_release_path(&path); - return em; - -not_found: - btrfs_release_path(&path); - free_extent_map(em); - return NULL; - -err: - btrfs_release_path(&path); - free_extent_map(em); - return ERR_PTR(ret); -} - -static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, - u64 newer_than, bool locked) -{ - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em; - const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; - - /* - * hopefully we have this extent in the tree already, try without - * the full extent lock - */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, sectorsize); - read_unlock(&em_tree->lock); - - /* - * We can get a merged extent, in that case, we need to re-search - * tree to get the original em for defrag. - * - * If @newer_than is 0 or em::generation < newer_than, we can trust - * this em, as either we don't care about the generation, or the - * merged extent map will be rejected anyway. - */ - if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && - newer_than && em->generation >= newer_than) { - free_extent_map(em); - em = NULL; - } - - if (!em) { - struct extent_state *cached = NULL; - u64 end = start + sectorsize - 1; - - /* get the big lock and read metadata off disk */ - if (!locked) - lock_extent(io_tree, start, end, &cached); - em = defrag_get_extent(BTRFS_I(inode), start, newer_than); - if (!locked) - unlock_extent(io_tree, start, end, &cached); - - if (IS_ERR(em)) - return NULL; - } - - return em; -} - -static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, - const struct extent_map *em) -{ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - return BTRFS_MAX_COMPRESSED; - return fs_info->max_extent_size; -} - -static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, - u32 extent_thresh, u64 newer_than, bool locked) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map *next; - bool ret = false; - - /* this is the last extent */ - if (em->start + em->len >= i_size_read(inode)) - return false; - - /* - * Here we need to pass @newer_then when checking the next extent, or - * we will hit a case we mark current extent for defrag, but the next - * one will not be a target. - * This will just cause extra IO without really reducing the fragments. - */ - next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); - /* No more em or hole */ - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) - goto out; - if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) - goto out; - /* - * If the next extent is at its max capacity, defragging current extent - * makes no sense, as the total number of extents won't change. - */ - if (next->len >= get_extent_max_capacity(fs_info, em)) - goto out; - /* Skip older extent */ - if (next->generation < newer_than) - goto out; - /* Also check extent size */ - if (next->len >= extent_thresh) - goto out; - - ret = true; -out: - free_extent_map(next); - return ret; -} - -/* - * Prepare one page to be defragged. - * - * This will ensure: - * - * - Returned page is locked and has been set up properly. - * - No ordered extent exists in the page. - * - The page is uptodate. - * - * NOTE: Caller should also wait for page writeback after the cluster is - * prepared, here we don't do writeback wait for each page. - */ -static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, - pgoff_t index) -{ - struct address_space *mapping = inode->vfs_inode.i_mapping; - gfp_t mask = btrfs_alloc_write_mask(mapping); - u64 page_start = (u64)index << PAGE_SHIFT; - u64 page_end = page_start + PAGE_SIZE - 1; - struct extent_state *cached_state = NULL; - struct page *page; - int ret; - -again: - page = find_or_create_page(mapping, index, mask); - if (!page) - return ERR_PTR(-ENOMEM); - - /* - * Since we can defragment files opened read-only, we can encounter - * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We - * can't do I/O using huge pages yet, so return an error for now. - * Filesystem transparent huge pages are typically only used for - * executables that explicitly enable them, so this isn't very - * restrictive. - */ - if (PageCompound(page)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-ETXTBSY); - } - - ret = set_page_extent_mapped(page); - if (ret < 0) { - unlock_page(page); - put_page(page); - return ERR_PTR(ret); - } - - /* Wait for any existing ordered extent in the range */ - while (1) { - struct btrfs_ordered_extent *ordered; - - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); - if (!ordered) - break; - - unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); - btrfs_put_ordered_extent(ordered); - lock_page(page); - /* - * We unlocked the page above, so we need check if it was - * released or not. - */ - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); - goto again; - } - } - - /* - * Now the page range has no ordered extent any more. Read the page to - * make it uptodate. - */ - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); - goto again; - } - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EIO); - } - } - return page; -} - -struct defrag_target_range { - struct list_head list; - u64 start; - u64 len; -}; - -/* - * Collect all valid target extents. - * - * @start: file offset to lookup - * @len: length to lookup - * @extent_thresh: file extent size threshold, any extent size >= this value - * will be ignored - * @newer_than: only defrag extents newer than this value - * @do_compress: whether the defrag is doing compression - * if true, @extent_thresh will be ignored and all regular - * file extents meeting @newer_than will be targets. - * @locked: if the range has already held extent lock - * @target_list: list of targets file extents - */ -static int defrag_collect_targets(struct btrfs_inode *inode, - u64 start, u64 len, u32 extent_thresh, - u64 newer_than, bool do_compress, - bool locked, struct list_head *target_list, - u64 *last_scanned_ret) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - bool last_is_target = false; - u64 cur = start; - int ret = 0; - - while (cur < start + len) { - struct extent_map *em; - struct defrag_target_range *new; - bool next_mergeable = true; - u64 range_len; - - last_is_target = false; - em = defrag_lookup_extent(&inode->vfs_inode, cur, - newer_than, locked); - if (!em) - break; - - /* - * If the file extent is an inlined one, we may still want to - * defrag it (fallthrough) if it will cause a regular extent. - * This is for users who want to convert inline extents to - * regular ones through max_inline= mount option. - */ - if (em->block_start == EXTENT_MAP_INLINE && - em->len <= inode->root->fs_info->max_inline) - goto next; - - /* Skip hole/delalloc/preallocated extents */ - if (em->block_start == EXTENT_MAP_HOLE || - em->block_start == EXTENT_MAP_DELALLOC || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - goto next; - - /* Skip older extent */ - if (em->generation < newer_than) - goto next; - - /* This em is under writeback, no need to defrag */ - if (em->generation == (u64)-1) - goto next; - - /* - * Our start offset might be in the middle of an existing extent - * map, so take that into account. - */ - range_len = em->len - (cur - em->start); - /* - * If this range of the extent map is already flagged for delalloc, - * skip it, because: - * - * 1) We could deadlock later, when trying to reserve space for - * delalloc, because in case we can't immediately reserve space - * the flusher can start delalloc and wait for the respective - * ordered extents to complete. The deadlock would happen - * because we do the space reservation while holding the range - * locked, and starting writeback, or finishing an ordered - * extent, requires locking the range; - * - * 2) If there's delalloc there, it means there's dirty pages for - * which writeback has not started yet (we clean the delalloc - * flag when starting writeback and after creating an ordered - * extent). If we mark pages in an adjacent range for defrag, - * then we will have a larger contiguous range for delalloc, - * very likely resulting in a larger extent after writeback is - * triggered (except in a case of free space fragmentation). - */ - if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC, 0, NULL)) - goto next; - - /* - * For do_compress case, we want to compress all valid file - * extents, thus no @extent_thresh or mergeable check. - */ - if (do_compress) - goto add; - - /* Skip too large extent */ - if (range_len >= extent_thresh) - goto next; - - /* - * Skip extents already at its max capacity, this is mostly for - * compressed extents, which max cap is only 128K. - */ - if (em->len >= get_extent_max_capacity(fs_info, em)) - goto next; - - /* - * Normally there are no more extents after an inline one, thus - * @next_mergeable will normally be false and not defragged. - * So if an inline extent passed all above checks, just add it - * for defrag, and be converted to regular extents. - */ - if (em->block_start == EXTENT_MAP_INLINE) - goto add; - - next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, - extent_thresh, newer_than, locked); - if (!next_mergeable) { - struct defrag_target_range *last; - - /* Empty target list, no way to merge with last entry */ - if (list_empty(target_list)) - goto next; - last = list_entry(target_list->prev, - struct defrag_target_range, list); - /* Not mergeable with last entry */ - if (last->start + last->len != cur) - goto next; - - /* Mergeable, fall through to add it to @target_list. */ - } - -add: - last_is_target = true; - range_len = min(extent_map_end(em), start + len) - cur; - /* - * This one is a good target, check if it can be merged into - * last range of the target list. - */ - if (!list_empty(target_list)) { - struct defrag_target_range *last; - - last = list_entry(target_list->prev, - struct defrag_target_range, list); - ASSERT(last->start + last->len <= cur); - if (last->start + last->len == cur) { - /* Mergeable, enlarge the last entry */ - last->len += range_len; - goto next; - } - /* Fall through to allocate a new entry */ - } - - /* Allocate new defrag_target_range */ - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) { - free_extent_map(em); - ret = -ENOMEM; - break; - } - new->start = cur; - new->len = range_len; - list_add_tail(&new->list, target_list); - -next: - cur = extent_map_end(em); - free_extent_map(em); - } - if (ret < 0) { - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - - list_for_each_entry_safe(entry, tmp, target_list, list) { - list_del_init(&entry->list); - kfree(entry); - } - } - if (!ret && last_scanned_ret) { - /* - * If the last extent is not a target, the caller can skip to - * the end of that extent. - * Otherwise, we can only go the end of the specified range. - */ - if (!last_is_target) - *last_scanned_ret = max(cur, *last_scanned_ret); - else - *last_scanned_ret = max(start + len, *last_scanned_ret); - } - return ret; -} - -#define CLUSTER_SIZE (SZ_256K) -static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); - -/* - * Defrag one contiguous target range. - * - * @inode: target inode - * @target: target range to defrag - * @pages: locked pages covering the defrag range - * @nr_pages: number of locked pages - * - * Caller should ensure: - * - * - Pages are prepared - * Pages should be locked, no ordered extent in the pages range, - * no writeback. - * - * - Extent bits are locked - */ -static int defrag_one_locked_target(struct btrfs_inode *inode, - struct defrag_target_range *target, - struct page **pages, int nr_pages, - struct extent_state **cached_state) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_changeset *data_reserved = NULL; - const u64 start = target->start; - const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); - int ret = 0; - int i; - - ASSERT(last_index - first_index + 1 <= nr_pages); - - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); - if (ret < 0) - return ret; - clear_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, cached_state); - set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); - - /* Update the page status */ - for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); - } - btrfs_delalloc_release_extents(inode, len); - extent_changeset_free(data_reserved); - - return ret; -} - -static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, - u32 extent_thresh, u64 newer_than, bool do_compress, - u64 *last_scanned_ret) -{ - struct extent_state *cached_state = NULL; - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - LIST_HEAD(target_list); - struct page **pages; - const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; - int ret = 0; - int i; - - ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); - ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) - return -ENOMEM; - - /* Prepare all pages */ - for (i = 0; i < nr_pages; i++) { - pages[i] = defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; - } - } - for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); - - /* Lock the pages range */ - lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); - /* - * Now we have a consistent view about the extent map, re-check - * which range really needs to be defragged. - * - * And this time we have extent locked already, pass @locked = true - * so that we won't relock the extent range and cause deadlock. - */ - ret = defrag_collect_targets(inode, start, len, extent_thresh, - newer_than, do_compress, true, - &target_list, last_scanned_ret); - if (ret < 0) - goto unlock_extent; - - list_for_each_entry(entry, &target_list, list) { - ret = defrag_one_locked_target(inode, entry, pages, nr_pages, - &cached_state); - if (ret < 0) - break; - } - - list_for_each_entry_safe(entry, tmp, &target_list, list) { - list_del_init(&entry->list); - kfree(entry); - } -unlock_extent: - unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); -free_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - put_page(pages[i]); - } - } - kfree(pages); - return ret; -} - -static int defrag_one_cluster(struct btrfs_inode *inode, - struct file_ra_state *ra, - u64 start, u32 len, u32 extent_thresh, - u64 newer_than, bool do_compress, - unsigned long *sectors_defragged, - unsigned long max_sectors, - u64 *last_scanned_ret) -{ - const u32 sectorsize = inode->root->fs_info->sectorsize; - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - LIST_HEAD(target_list); - int ret; - - ret = defrag_collect_targets(inode, start, len, extent_thresh, - newer_than, do_compress, false, - &target_list, NULL); - if (ret < 0) - goto out; - - list_for_each_entry(entry, &target_list, list) { - u32 range_len = entry->len; - - /* Reached or beyond the limit */ - if (max_sectors && *sectors_defragged >= max_sectors) { - ret = 1; - break; - } - - if (max_sectors) - range_len = min_t(u32, range_len, - (max_sectors - *sectors_defragged) * sectorsize); - - /* - * If defrag_one_range() has updated last_scanned_ret, - * our range may already be invalid (e.g. hole punched). - * Skip if our range is before last_scanned_ret, as there is - * no need to defrag the range anymore. - */ - if (entry->start + range_len <= *last_scanned_ret) - continue; - - if (ra) - page_cache_sync_readahead(inode->vfs_inode.i_mapping, - ra, NULL, entry->start >> PAGE_SHIFT, - ((entry->start + range_len - 1) >> PAGE_SHIFT) - - (entry->start >> PAGE_SHIFT) + 1); - /* - * Here we may not defrag any range if holes are punched before - * we locked the pages. - * But that's fine, it only affects the @sectors_defragged - * accounting. - */ - ret = defrag_one_range(inode, entry->start, range_len, - extent_thresh, newer_than, do_compress, - last_scanned_ret); - if (ret < 0) - break; - *sectors_defragged += range_len >> - inode->root->fs_info->sectorsize_bits; - } -out: - list_for_each_entry_safe(entry, tmp, &target_list, list) { - list_del_init(&entry->list); - kfree(entry); - } - if (ret >= 0) - *last_scanned_ret = max(*last_scanned_ret, start + len); - return ret; -} - -/* - * Entry point to file defragmentation. - * - * @inode: inode to be defragged - * @ra: readahead state (can be NUL) - * @range: defrag options including range and flags - * @newer_than: minimum transid to defrag - * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode - * will be defragged. - * - * Return <0 for error. - * Return >=0 for the number of sectors defragged, and range->start will be updated - * to indicate the file offset where next defrag should be started at. - * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without - * defragging all the range). - */ -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, - struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_to_defrag) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - unsigned long sectors_defragged = 0; - u64 isize = i_size_read(inode); - u64 cur; - u64 last_byte; - bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; - bool ra_allocated = false; - int compress_type = BTRFS_COMPRESS_ZLIB; - int ret = 0; - u32 extent_thresh = range->extent_thresh; - pgoff_t start_index; - - if (isize == 0) - return 0; - - if (range->start >= isize) - return -EINVAL; - - if (do_compress) { - if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) - return -EINVAL; - if (range->compress_type) - compress_type = range->compress_type; - } - - if (extent_thresh == 0) - extent_thresh = SZ_256K; - - if (range->start + range->len > range->start) { - /* Got a specific range */ - last_byte = min(isize, range->start + range->len); - } else { - /* Defrag until file end */ - last_byte = isize; - } - - /* Align the range */ - cur = round_down(range->start, fs_info->sectorsize); - last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - - /* - * If we were not given a ra, allocate a readahead context. As - * readahead is just an optimization, defrag will work without it so - * we don't error out. - */ - if (!ra) { - ra_allocated = true; - ra = kzalloc(sizeof(*ra), GFP_KERNEL); - if (ra) - file_ra_state_init(ra, inode->i_mapping); - } - - /* - * Make writeback start from the beginning of the range, so that the - * defrag range can be written sequentially. - */ - start_index = cur >> PAGE_SHIFT; - if (start_index < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = start_index; - - while (cur < last_byte) { - const unsigned long prev_sectors_defragged = sectors_defragged; - u64 last_scanned = cur; - u64 cluster_end; - - if (btrfs_defrag_cancelled(fs_info)) { - ret = -EAGAIN; - break; - } - - /* We want the cluster end at page boundary when possible */ - cluster_end = (((cur >> PAGE_SHIFT) + - (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; - cluster_end = min(cluster_end, last_byte); - - btrfs_inode_lock(inode, 0); - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - btrfs_inode_unlock(inode, 0); - break; - } - if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(inode, 0); - break; - } - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, - cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, §ors_defragged, - max_to_defrag, &last_scanned); - - if (sectors_defragged > prev_sectors_defragged) - balance_dirty_pages_ratelimited(inode->i_mapping); - - btrfs_inode_unlock(inode, 0); - if (ret < 0) - break; - cur = max(cluster_end + 1, last_scanned); - if (ret > 0) { - ret = 0; - break; - } - cond_resched(); - } - - if (ra_allocated) - kfree(ra); - /* - * Update range.start for autodefrag, this will indicate where to start - * in next run. - */ - range->start = cur; - if (sectors_defragged) { - /* - * We have defragged some sectors, for compression case they - * need to be written back immediately. - */ - if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); - } - if (range->compress_type == BTRFS_COMPRESS_LZO) - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - else if (range->compress_type == BTRFS_COMPRESS_ZSTD) - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - ret = sectors_defragged; - } - if (do_compress) { - btrfs_inode_lock(inode, 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(inode, 0); - } - return ret; -} - -/* * Try to start exclusive operation @type or cancel it if it's running. * * Return: @@ -2119,7 +1228,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret == 0 && new_size != old_size) btrfs_info_in_rcu(fs_info, "resize device %s (devid %llu) from %llu to %llu", - rcu_str_deref(device->name), device->devid, + btrfs_dev_name(device), device->devid, old_size, new_size); out_finish: btrfs_exclop_finish(fs_info); @@ -3274,7 +2383,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, - vol_args2->subvolid, 0, 0); + vol_args2->subvolid, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_drop_write; @@ -3419,16 +2528,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - btrfs_inode_lock(inode, 0); - err = btrfs_delete_subvolume(dir, dentry); - btrfs_inode_unlock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); + err = btrfs_delete_subvolume(BTRFS_I(dir), dentry); + btrfs_inode_unlock(BTRFS_I(inode), 0); if (!err) d_delete_notify(dir, dentry); out_dput: dput(dentry); out_unlock_dir: - btrfs_inode_unlock(dir, 0); + btrfs_inode_unlock(BTRFS_I(dir), 0); free_subvol_name: kfree(subvol_name_ptr); free_parent: @@ -3750,13 +2859,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->bytes_used = btrfs_device_get_bytes_used(dev); di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - if (dev->name) { - strncpy(di_args->path, rcu_str_deref(dev->name), - sizeof(di_args->path) - 1); - di_args->path[sizeof(di_args->path) - 1] = 0; - } else { + if (dev->name) + strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path)); + else di_args->path[0] = '\0'; - } out: rcu_read_unlock(); @@ -3777,6 +2883,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; struct btrfs_disk_key disk_key; + struct fscrypt_str name = FSTR_INIT("default", 7); u64 objectid = 0; u64 dir_id; int ret; @@ -3820,7 +2927,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, - dir_id, "default", 7, 1); + dir_id, &name, 1); if (IS_ERR_OR_NULL(di)) { btrfs_release_path(path); btrfs_end_transaction(trans); @@ -4341,7 +3448,7 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->balance_lock); } -/** +/* * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as * required. * diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 000000000000..8a855d5ac2fa --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_IOCTL_H +#define BTRFS_IOCTL_H + +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); +int btrfs_ioctl_get_supported_features(void __user *arg); +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +int __pure btrfs_is_empty_uuid(u8 *uuid); +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs); + +#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 0eab3cb274a1..870528d87526 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -12,6 +12,7 @@ #include "ctree.h" #include "extent_io.h" #include "locking.h" +#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. For a given diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 490c7a79e995..11c2269b4b6f 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -78,6 +78,82 @@ enum btrfs_lock_nesting { BTRFS_NESTING_MAX, }; +enum btrfs_lockdep_trans_states { + BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_UNBLOCKED, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, + BTRFS_LOCKDEP_TRANS_COMPLETED, +}; + +/* + * Lockdep annotation for wait events. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * This macro is used to annotate a wait event. In this case a thread acquires + * the lockdep map as writer (exclusive lock) because it has to block until all + * the threads that hold the lock as readers signal the condition for the wait + * event and release their locks. + */ +#define btrfs_might_wait_for_event(owner, lock) \ + do { \ + rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ + rwsem_release(&owner->lock##_map, _THIS_IP_); \ + } while (0) + +/* + * Protection for the resource/condition of a wait event. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * Many threads can modify the condition for the wait event at the same time + * and signal the threads that block on the wait event. The threads that modify + * the condition and do the signaling acquire the lock as readers (shared + * lock). + */ +#define btrfs_lockdep_acquire(owner, lock) \ + rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) + +/* + * Used after signaling the condition for a wait event to release the lockdep + * map held by a reader thread. + */ +#define btrfs_lockdep_release(owner, lock) \ + rwsem_release(&owner->lock##_map, _THIS_IP_) + +/* + * Macros for the transaction states wait events, similar to the generic wait + * event macros. + */ +#define btrfs_might_wait_for_state(owner, i) \ + do { \ + rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ + } while (0) + +#define btrfs_trans_state_lockdep_acquire(owner, i) \ + rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) + +#define btrfs_trans_state_lockdep_release(owner, i) \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) + +/* Initialization of the lockdep map */ +#define btrfs_lockdep_init_map(owner, lock) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ + } while (0) + +/* Initialization of the transaction states lockdep maps. */ +#define btrfs_state_lockdep_init_map(owner, lock, state) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ + &lock##_key, 0); \ + } while (0) + static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 89bc5f825e0a..d5e78cbc8fbc 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -13,8 +13,10 @@ #include <linux/bio.h> #include <linux/lzo.h> #include <linux/refcount.h> +#include "messages.h" #include "compression.h" #include "ctree.h" +#include "super.h" #define LZO_LEN 4 @@ -425,7 +427,7 @@ out: return ret; } -int lzo_decompress(struct list_head *ws, unsigned char *data_in, +int lzo_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c new file mode 100644 index 000000000000..625bbbbb2608 --- /dev/null +++ b/fs/btrfs/messages.c @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "fs.h" +#include "messages.h" +#include "discard.h" +#include "transaction.h" +#include "space-info.h" +#include "super.h" + +#ifdef CONFIG_PRINTK + +#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) + +/* + * Characters to print to indicate error conditions or uncommon filesystem state. + * RO is not an error. + */ +static const char fs_state_chars[] = { + [BTRFS_FS_STATE_ERROR] = 'E', + [BTRFS_FS_STATE_REMOUNTING] = 'M', + [BTRFS_FS_STATE_RO] = 0, + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, + [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', +}; + +static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) +{ + unsigned int bit; + bool states_printed = false; + unsigned long fs_state = READ_ONCE(info->fs_state); + char *curr = buf; + + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); + curr += sizeof(STATE_STRING_PREFACE) - 1; + + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { + *curr++ = fs_state_chars[bit]; + states_printed = true; + } + } + + /* If no states were printed, reset the buffer */ + if (!states_printed) + curr = buf; + + *curr++ = 0; +} +#endif + +/* + * Generally the error codes correspond to their respective errors, but there + * are a few special cases. + * + * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for + * instance will return EUCLEAN if any of the blocks are corrupted in + * a way that is problematic. We want to reserve EUCLEAN for these + * sort of corruptions. + * + * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we + * need to use EROFS for this case. We will have no idea of the + * original failure, that will have been reported at the time we tripped + * over the error. Each subsequent error that doesn't have any context + * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. + */ +const char * __attribute_const__ btrfs_decode_error(int errno) +{ + char *errstr = "unknown"; + + switch (errno) { + case -ENOENT: /* -2 */ + errstr = "No such entry"; + break; + case -EIO: /* -5 */ + errstr = "IO failure"; + break; + case -ENOMEM: /* -12*/ + errstr = "Out of memory"; + break; + case -EEXIST: /* -17 */ + errstr = "Object already exists"; + break; + case -ENOSPC: /* -28 */ + errstr = "No space left"; + break; + case -EROFS: /* -30 */ + errstr = "Readonly filesystem"; + break; + case -EOPNOTSUPP: /* -95 */ + errstr = "Operation not supported"; + break; + case -EUCLEAN: /* -117 */ + errstr = "Filesystem corrupted"; + break; + case -EDQUOT: /* -122 */ + errstr = "Quota exceeded"; + break; + } + + return errstr; +} + +/* + * __btrfs_handle_fs_error decodes expected errors from the caller and + * invokes the appropriate error response. + */ +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK + char statestr[STATE_STRING_BUF_LEN]; + const char *errstr; +#endif + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit( + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", KERN_CRIT, fmt); +#endif + + /* + * Special case: if the error is EROFS, and we're already under + * SB_RDONLY, then it is safe here. + */ + if (errno == -EROFS && sb_rdonly(sb)) + return; + +#ifdef CONFIG_PRINTK + errstr = btrfs_decode_error(errno); + btrfs_state_to_string(fs_info, statestr); + if (fmt) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, statestr, function, line, errno, errstr, &vaf); + va_end(args); + } else { + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", + sb->s_id, statestr, function, line, errno, errstr); + } +#endif + + /* + * Today we only save the error info to memory. Long term we'll also + * send it down to the disk. + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + + /* Don't go through full error handling during mount. */ + if (!(sb->s_flags & SB_BORN)) + return; + + if (sb_rdonly(sb)) + return; + + btrfs_discard_stop(fs_info); + + /* Handle error by forcing the filesystem readonly. */ + btrfs_set_sb_rdonly(sb); + btrfs_info(fs_info, "forced readonly"); + /* + * Note that a running device replace operation is not canceled here + * although there is no way to update the progress. It would add the + * risk of a deadlock, therefore the canceling is omitted. The only + * penalty is that some I/O remains active until the procedure + * completes. The next time when the filesystem is mounted writable + * again, the device replace operation continues. + */ +} + +#ifdef CONFIG_PRINTK +static const char * const logtypes[] = { + "emergency", + "alert", + "critical", + "error", + "warning", + "notice", + "info", + "debug", +}; + +/* + * Use one ratelimit state per log level so that a flood of less important + * messages doesn't cause more important ones to be dropped. + */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + +void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; + struct va_format vaf; + va_list args; + int kern_level; + const char *type = logtypes[4]; + struct ratelimit_state *ratelimit = &printk_limits[4]; + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); +#endif + + va_start(args, fmt); + + while ((kern_level = printk_get_level(fmt)) != 0) { + size_t size = printk_skip_level(fmt) - fmt; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } + fmt += size; + } + + vaf.fmt = fmt; + vaf.va = &args; + + if (__ratelimit(ratelimit)) { + if (fs_info) { + char statestr[STATE_STRING_BUF_LEN]; + + btrfs_state_to_string(fs_info, statestr); + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + fs_info->sb->s_id, statestr, &vaf); + } else { + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } + } + + va_end(args); +} +#endif + +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line) +{ + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); +} +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) +{ + btrfs_err(fs_info, +"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); +} + +#if BITS_PER_LONG == 32 +void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { + btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); + btrfs_warn(fs_info, +"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_warn(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} + +void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { + btrfs_err(fs_info, "reached 32bit limit for logical addresses"); + btrfs_err(fs_info, +"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_err(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} +#endif + +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +__cold +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an + * alert, and either panics or BUGs, depending on mount options. + */ +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + char *s_id = "<unknown>"; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; + + if (fs_info) + s_id = fs_info->sb->s_id; + + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(errno); + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); + + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); + va_end(args); + /* Caller calls BUG() */ +} diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h new file mode 100644 index 000000000000..190af1f698d9 --- /dev/null +++ b/fs/btrfs/messages.h @@ -0,0 +1,245 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_MESSAGES_H +#define BTRFS_MESSAGES_H + +#include <linux/types.h> + +struct btrfs_fs_info; +struct btrfs_trans_handle; + +static inline __printf(2, 3) __cold +void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ +} + +#ifdef CONFIG_PRINTK + +#define btrfs_printk(fs_info, fmt, args...) \ + _btrfs_printk(fs_info, fmt, ##args) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#else + +#define btrfs_printk(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, fmt, ##args) +#endif + +#define btrfs_emerg(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) + +#if defined(CONFIG_DYNAMIC_DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + fs_info, KERN_DEBUG fmt, ##args) +#elif defined(DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#endif + +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_no_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line); + +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +#else +#define ASSERT(expr) (void)(expr) +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); + +__printf(5, 6) +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +const char * __attribute_const__ btrfs_decode_error(int errno); + +__cold +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); + +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_ERR \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. */ \ + } else { \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + +#if BITS_PER_LONG == 32 +#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) +/* + * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical + * addresses of extents. + * + * For 4K page size it's about 10T, for 64K it's 160T. + */ +#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) +void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); +void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); +#endif + +#endif diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index f9850edfd726..768583a440e1 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -10,6 +10,14 @@ #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +/* + * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. + */ +#define ENUM_BIT(name) \ + __ ## name ## _BIT, \ + name = (1U << __ ## name ## _BIT), \ + __ ## name ## _SEQ = __ ## name ## _BIT + static inline void cond_wake_up(struct wait_queue_head *wq) { /* @@ -32,22 +40,10 @@ static inline void cond_wake_up_nomb(struct wait_queue_head *wq) wake_up(wq); } -static inline u64 div_factor(u64 num, int factor) +static inline u64 mult_perc(u64 num, u32 percent) { - if (factor == 10) - return num; - num *= factor; - return div_u64(num, 10); + return div_u64(num * percent, 100); } - -static inline u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - return div_u64(num, 100); -} - /* Copy of is_power_of_two that is 64bit safe */ static inline bool is_power_of_two_u64(u64 n) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 100d9f4836b1..57d8c72737e1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -7,6 +7,7 @@ #include <linux/blkdev.h> #include <linux/writeback.h> #include <linux/sched/mm.h> +#include "messages.h" #include "misc.h" #include "ctree.h" #include "transaction.h" @@ -17,6 +18,8 @@ #include "delalloc-space.h" #include "qgroup.h" #include "subpage.h" +#include "file.h" +#include "super.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -143,7 +146,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/** +/* * Add an ordered extent to the per-inode tree. * * @inode: Inode that this extent is for. @@ -501,7 +504,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) - btrfs_add_delayed_iput(entry->inode); + btrfs_add_delayed_iput(BTRFS_I(entry->inode)); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); @@ -1019,17 +1022,18 @@ out: } /* - * btrfs_flush_ordered_range - Lock the passed range and ensures all pending - * ordered extents in it are run to completion. + * Lock the passed range and ensures all pending ordered extents in it are run + * to completion. * * @inode: Inode whose ordered tree is to be searched * @start: Beginning of range to flush * @end: Last byte of range to lock * @cached_state: If passed, will return the extent state responsible for the - * locked range. It's the caller's responsibility to free the cached state. + * locked range. It's the caller's responsibility to free the + * cached state. * - * This function always returns with the given range locked, ensuring after it's - * called no order extent can be pending. + * Always return with the given range locked, ensuring after it's called no + * order extent can be pending. */ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, @@ -1069,11 +1073,12 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, * Return true if btrfs_lock_ordered_range does not return any extents, * otherwise false. */ -bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end)) + if (!try_lock_extent(&inode->io_tree, start, end, cached_state)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); @@ -1081,7 +1086,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) return true; btrfs_put_ordered_extent(ordered); - unlock_extent(&inode->io_tree, start, end, NULL); + unlock_extent(&inode->io_tree, start, end, cached_state); return false; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f59f2dbdb25e..89f82b78f590 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -206,7 +206,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); -bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end); +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state); int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, u64 post); int __init ordered_data_init(void); diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index aa534108c1e2..7a1b021b5669 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -5,6 +5,7 @@ #include "ctree.h" #include "disk-io.h" +#include "orphan.h" int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h new file mode 100644 index 000000000000..3faab5cbb59a --- /dev/null +++ b/fs/btrfs/orphan.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ORPHAN_H +#define BTRFS_ORPHAN_H + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + +#endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index dd8777872143..b93c96213304 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -3,9 +3,12 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "print-tree.h" +#include "accessors.h" +#include "tree-checker.h" struct root_name_map { u64 id; @@ -240,9 +243,9 @@ void btrfs_print_leaf(struct extent_buffer *l) case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(l, di, &found_key); - pr_info("\t\tdir oid %llu type %u\n", + pr_info("\t\tdir oid %llu flags %u\n", found_key.objectid, - btrfs_dir_type(l, di)); + btrfs_dir_flags(l, di)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); @@ -384,14 +387,16 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) if (!follow) return; for (i = 0; i < nr; i++) { - struct btrfs_key first_key; + struct btrfs_tree_parent_check check = { + .level = level - 1, + .transid = btrfs_node_ptr_generation(c, i), + .owner_root = btrfs_header_owner(c), + .has_first_key = true + }; struct extent_buffer *next; - btrfs_node_key_to_cpu(c, &first_key, i); - next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), - btrfs_header_owner(c), - btrfs_node_ptr_generation(c, i), - level - 1, &first_key); + btrfs_node_key_to_cpu(c, &check.first_key, i); + next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check); if (IS_ERR(next)) continue; if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 055a631276ce..0755af0e53e3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,12 +4,17 @@ */ #include <linux/hashtable.h> +#include "messages.h" #include "props.h" #include "btrfs_inode.h" #include "transaction.h" #include "ctree.h" #include "xattr.h" #include "compression.h" +#include "space-info.h" +#include "fs.h" +#include "accessors.h" +#include "super.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); @@ -453,7 +458,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return 0; } -void __init btrfs_props_init(void) +int __init btrfs_props_init(void) { int i; @@ -463,5 +468,6 @@ void __init btrfs_props_init(void) hash_add(prop_handlers_ht, &p->node, h); } + return 0; } diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index ca9dd3df129b..6e283196e38a 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -8,7 +8,7 @@ #include "ctree.h" -void __init btrfs_props_init(void); +int __init btrfs_props_init(void); int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b74105a10f16..5c636e00d77d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -24,6 +24,11 @@ #include "block-group.h" #include "sysfs.h" #include "tree-mod-log.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "tree-checker.h" /* * Helpers to access qgroup reservation @@ -1790,8 +1795,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord) { - struct ulist *old_root; - u64 bytenr = qrecord->bytenr; + struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; /* @@ -1818,8 +1822,10 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, - true); + ctx.bytenr = qrecord->bytenr; + ctx.fs_info = trans->fs_info; + + ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { qgroup_mark_inconsistent(trans->fs_info); btrfs_warn(trans->fs_info, @@ -1835,12 +1841,12 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, * * So modifying qrecord->old_roots is safe here */ - qrecord->old_roots = old_root; + qrecord->old_roots = ctx.roots; return 0; } int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, - u64 num_bytes, gfp_t gfp_flag) + u64 num_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; @@ -1850,7 +1856,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || bytenr == 0 || num_bytes == 0) return 0; - record = kzalloc(sizeof(*record), gfp_flag); + record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) return -ENOMEM; @@ -1902,8 +1908,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes, - GFP_NOFS); + ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); if (ret) return ret; } @@ -2102,12 +2107,11 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, * blocks for qgroup accounting. */ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, - nodesize, GFP_NOFS); + nodesize); if (ret < 0) goto out; - ret = btrfs_qgroup_trace_extent(trans, - dst_path->nodes[dst_level]->start, - nodesize, GFP_NOFS); + ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, + nodesize); if (ret < 0) goto out; @@ -2336,7 +2340,13 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, } if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL); + struct btrfs_tree_parent_check check = { + .has_first_key = false, + .transid = root_gen, + .level = root_level + }; + + ret = btrfs_read_extent_buffer(root_eb, &check); if (ret) goto out; } @@ -2391,8 +2401,7 @@ walk_down: path->locks[level] = BTRFS_READ_LOCK; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, - fs_info->nodesize, - GFP_NOFS); + fs_info->nodesize); if (ret) goto out; } @@ -2749,17 +2758,22 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + + ctx.bytenr = record->bytenr; + ctx.fs_info = fs_info; + /* * Old roots should be searched when inserting qgroup * extent record */ if (WARN_ON(!record->old_roots)) { /* Search commit root to find old_roots */ - ret = btrfs_find_all_roots(NULL, fs_info, - record->bytenr, 0, - &record->old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; + record->old_roots = ctx.roots; + ctx.roots = NULL; } /* Free the reserved data space */ @@ -2772,10 +2786,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) * which doesn't lock tree or delayed_refs and search * current root. It's safe inside commit_transaction(). */ - ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); + ctx.trans = trans; + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; + new_roots = ctx.roots; if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); ulist_del(record->old_roots, qgroup_to_skip, @@ -3241,7 +3256,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; - struct ulist *roots = NULL; u64 num_bytes; bool done; int slot; @@ -3291,6 +3305,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) @@ -3300,13 +3316,15 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, else num_bytes = found.offset; - ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots, false); + ctx.bytenr = found.objectid; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ ret = btrfs_qgroup_account_extent(trans, found.objectid, - num_bytes, NULL, roots); + num_bytes, NULL, ctx.roots); if (ret < 0) goto out; } @@ -4292,6 +4310,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct extent_buffer *subvol_eb) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; @@ -4340,10 +4359,13 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, blocks->swapped = swapped; spin_unlock(&blocks->lock); + check.level = block->level; + check.transid = block->reloc_generation; + check.has_first_key = true; + memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); + /* Read out reloc subtree root */ - reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0, - block->reloc_generation, block->level, - &block->first_key); + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); if (IS_ERR(reloc_eb)) { ret = PTR_ERR(reloc_eb); reloc_eb = NULL; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 578c77e94200..7bffa10589d6 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -11,6 +11,7 @@ #include <linux/kobject.h> #include "ulist.h" #include "delayed-ref.h" +#include "misc.h" /* * Btrfs qgroup overview @@ -242,9 +243,11 @@ static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) /* * For qgroup event trace points only */ -#define QGROUP_RESERVE (1<<0) -#define QGROUP_RELEASE (1<<1) -#define QGROUP_FREE (1<<2) +enum { + ENUM_BIT(QGROUP_RESERVE), + ENUM_BIT(QGROUP_RELEASE), + ENUM_BIT(QGROUP_FREE), +}; int btrfs_quota_enable(struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); @@ -318,7 +321,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, * (NULL trans) */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, - u64 num_bytes, gfp_t gfp_flag); + u64 num_bytes); /* * Inform qgroup to trace all leaf items of data diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 82c8e991300e..2d90a6b5eb00 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -13,12 +13,15 @@ #include <linux/list_sort.h> #include <linux/raid/xor.h> #include <linux/mm.h> +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" #include "raid56.h" #include "async-thread.h" +#include "file-item.h" +#include "btrfs_inode.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -63,19 +66,45 @@ struct sector_ptr { unsigned int uptodate:8; }; -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); -static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct work_struct *work); -static void read_rebuild_work(struct work_struct *work); -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); -static void __free_raid_bio(struct btrfs_raid_bio *rbio); +static void rmw_rbio_work(struct work_struct *work); +static void rmw_rbio_work_locked(struct work_struct *work); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check); -static void scrub_parity_work(struct work_struct *work); +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); +static void scrub_rbio_work_locked(struct work_struct *work); + +static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) +{ + bitmap_free(rbio->error_bitmap); + kfree(rbio->stripe_pages); + kfree(rbio->bio_sectors); + kfree(rbio->stripe_sectors); + kfree(rbio->finish_pointers); +} + +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + if (!refcount_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + + btrfs_put_bioc(rbio->bioc); + free_raid_bio_pointers(rbio); + kfree(rbio); +} static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { @@ -146,8 +175,16 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) + if (!rbio->bio_sectors[i].page) { + /* + * Even if the sector is not covered by bio, if it is + * a data sector it should still be uptodate as it is + * read from disk. + */ + if (i < rbio->nr_data * rbio->stripe_nsectors) + ASSERT(rbio->stripe_sectors[i].uptodate); continue; + } ASSERT(rbio->stripe_sectors[i].page); memcpy_page(rbio->stripe_sectors[i].page, @@ -234,6 +271,21 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, dest->stripe_sectors[i].uptodate = true; } +static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) +{ + const int sector_nr = (page_nr << PAGE_SHIFT) >> + rbio->bioc->fs_info->sectorsize_bits; + + /* + * We have ensured PAGE_SIZE is aligned with sectorsize, thus + * we won't have a page which is half data half parity. + * + * Thus if the first sector of the page belongs to data stripes, then + * the full page belongs to data stripes. + */ + return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); +} + /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. @@ -244,16 +296,26 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; - struct page *s; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { - s = src->stripe_pages[i]; - if (!s || !full_page_sectors_uptodate(src, i)) + struct page *p = src->stripe_pages[i]; + + /* + * We don't need to steal P/Q pages as they will always be + * regenerated for RMW or full write anyway. + */ + if (!is_data_stripe_page(src, i)) continue; + /* + * If @src already has RBIO_CACHE_READY_BIT, it should have + * all data stripe pages present and uptodate. + */ + ASSERT(p); + ASSERT(full_page_sectors_uptodate(src, i)); steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); @@ -336,7 +398,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) spin_unlock(&h->lock); if (freeit) - __free_raid_bio(rbio); + free_raid_bio(rbio); } /* @@ -526,28 +588,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING) + if (last->operation == BTRFS_RBIO_REBUILD_MISSING || + last->operation == BTRFS_RBIO_READ_REBUILD) return 0; - if (last->operation == BTRFS_RBIO_READ_REBUILD) { - int fa = last->faila; - int fb = last->failb; - int cur_fa = cur->faila; - int cur_fb = cur->failb; - - if (last->faila >= last->failb) { - fa = last->failb; - fb = last->faila; - } - - if (cur->faila >= cur->failb) { - cur_fa = cur->failb; - cur_fb = cur->faila; - } - - if (fa != cur_fa || fb != cur_fb) - return 0; - } return 1; } @@ -684,10 +728,12 @@ out: if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) - __free_raid_bio(freeit); + free_raid_bio(freeit); return ret; } +static void recover_rbio_work_locked(struct work_struct *work); + /* * called as rmw or parity rebuild is completed. If the plug list has more * rbios waiting for this stripe, the next one on the list will be started @@ -745,16 +791,16 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock_irqrestore(&h->lock, flags); if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, read_rebuild_work); + start_async_work(next, recover_rbio_work_locked); else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { steal_rbio(rbio, next); - start_async_work(next, read_rebuild_work); + start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); - start_async_work(next, rmw_work); + start_async_work(next, rmw_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); - start_async_work(next, scrub_parity_work); + start_async_work(next, scrub_rbio_work_locked); } goto done_nolock; @@ -769,28 +815,6 @@ done_nolock: remove_rbio_from_cache(rbio); } -static void __free_raid_bio(struct btrfs_raid_bio *rbio) -{ - int i; - - if (!refcount_dec_and_test(&rbio->refs)) - return; - - WARN_ON(!list_empty(&rbio->stripe_cache)); - WARN_ON(!list_empty(&rbio->hash_list)); - WARN_ON(!bio_list_empty(&rbio->bio_list)); - - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) { - __free_page(rbio->stripe_pages[i]); - rbio->stripe_pages[i] = NULL; - } - } - - btrfs_put_bioc(rbio->bioc); - kfree(rbio); -} - static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) { struct bio *next; @@ -813,6 +837,11 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; + /* * Clear the data bitmap, as the rbio may be cached for later usage. * do this before before unlock_stripe() so there will be no new bio @@ -830,7 +859,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) */ unlock_stripe(rbio); extra = bio_list_get(&rbio->bio_list); - __free_raid_bio(rbio); + free_raid_bio(rbio); rbio_endio_bio_list(cur, err); if (extra) @@ -838,36 +867,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) } /* - * end io function used by finish_rmw. When we finally - * get here, we've written a full stripe - */ -static void raid_write_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - int max_errors; - - if (err) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = BLK_STS_OK; - - /* OK, we have read all the stripes we need to. */ - max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? - 0 : rbio->bioc->max_errors; - if (atomic_read(&rbio->error) > max_errors) - err = BLK_STS_IOERR; - - rbio_orig_end_io(rbio, err); -} - -/** - * Get a sector pointer specified by its @stripe_nr and @sector_nr + * Get a sector pointer specified by its @stripe_nr and @sector_nr. * * @rbio: The raid bio * @stripe_nr: Stripe number, valid range [0, real_stripe) @@ -919,7 +919,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - void *p; /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); @@ -929,16 +928,27 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); - rbio = kzalloc(sizeof(*rbio) + - sizeof(*rbio->stripe_pages) * num_pages + - sizeof(*rbio->bio_sectors) * num_sectors + - sizeof(*rbio->stripe_sectors) * num_sectors + - sizeof(*rbio->finish_pointers) * real_stripes, - GFP_NOFS); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); + rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), + GFP_NOFS); + rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); + rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); + + if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || + !rbio->finish_pointers || !rbio->error_bitmap) { + free_raid_bio_pointers(rbio); + kfree(rbio); + return ERR_PTR(-ENOMEM); + } bio_list_init(&rbio->bio_list); + init_waitqueue_head(&rbio->io_wait); INIT_LIST_HEAD(&rbio->plug_list); spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); @@ -950,27 +960,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; - rbio->faila = -1; - rbio->failb = -1; refcount_set(&rbio->refs, 1); - atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); - /* - * The stripe_pages, bio_sectors, etc arrays point to the extra memory - * we allocated past the end of the rbio. - */ - p = rbio + 1; -#define CONSUME_ALLOC(ptr, count) do { \ - ptr = p; \ - p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ - } while (0) - CONSUME_ALLOC(rbio->stripe_pages, num_pages); - CONSUME_ALLOC(rbio->bio_sectors, num_sectors); - CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); - CONSUME_ALLOC(rbio->finish_pointers, real_stripes); -#undef CONSUME_ALLOC - ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); @@ -1006,6 +998,45 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) } /* + * Return the total numer of errors found in the vertical stripe of @sector_nr. + * + * @faila and @failb will also be updated to the first and second stripe + * number of the errors. + */ +static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) +{ + int stripe_nr; + int found_errors = 0; + + if (faila || failb) { + /* + * Both @faila and @failb should be valid pointers if any of + * them is specified. + */ + ASSERT(faila && failb); + *faila = -1; + *failb = -1; + } + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; + + if (test_bit(total_sector_nr, rbio->error_bitmap)) { + found_errors++; + if (faila) { + /* Update faila and failb. */ + if (*faila < 0) + *faila = stripe_nr; + else if (*failb < 0) + *failb = stripe_nr; + } + } + } + return found_errors; +} + +/* * Add a single sector @sector into our list of bios for IO. * * Return 0 if everything went well. @@ -1038,8 +1069,19 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ - if (!stripe->dev->bdev) - return fail_rbio_index(rbio, stripe_nr); + if (!stripe->dev->bdev) { + int found_errors; + + set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + + /* Check if we have reached tolerance early. */ + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + NULL, NULL); + if (found_errors > rbio->bioc->max_errors) + return -EIO; + return 0; + } /* see if we can add this page onto our existing bio */ if (last) { @@ -1071,23 +1113,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, return 0; } -/* - * while we're doing the read/modify/write cycle, we could - * have errors in reading pages off the disk. This checks - * for errors and if we're not able to read the page it'll - * trigger parity reconstruction. The rmw will be finished - * after we've reconstructed the failed stripes - */ -static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) -{ - if (rbio->faila >= 0 || rbio->failb >= 0) { - BUG_ON(rbio->faila == rbio->real_stripes - 1); - __raid56_parity_recover(rbio); - } else { - finish_rmw(rbio); - } -} - static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; @@ -1158,109 +1183,71 @@ not_found: trace_info->stripe_nr = -1; } -/* - * this is called from one of two situations. We either - * have a full stripe from the higher layers, or we've read all - * the missing bits off disk. - * - * This will calculate the parity and then send down any - * changed blocks. - */ -static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +/* Generate PQ for one veritical stripe. */ +static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { - struct btrfs_io_context *bioc = rbio->bioc; - const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; - int nr_data = rbio->nr_data; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct sector_ptr *sector; + int stripe; + const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; + + /* First collect one sector from each data stripe */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; + } + + /* Then add the parity stripe */ + sector = rbio_pstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + + if (has_qstripe) { + /* + * RAID6, add the qstripe and call the library function + * to fill in our p/q + */ + sector = rbio_qstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + + sector->pgoff; + + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, + pointers); + } else { + /* raid5 */ + memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); + } + for (stripe = stripe - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); +} + +static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; /* The total sector number inside the full stripe. */ int total_sector_nr; - int stripe; - /* Sector number inside a stripe. */ int sectornr; - bool has_qstripe; - struct bio_list bio_list; - struct bio *bio; + int stripe; int ret; - bio_list_init(&bio_list); - - if (rbio->real_stripes - rbio->nr_data == 1) - has_qstripe = false; - else if (rbio->real_stripes - rbio->nr_data == 2) - has_qstripe = true; - else - BUG(); + ASSERT(bio_list_size(bio_list) == 0); /* We should have at least one data sector. */ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); - /* at this point we either have a full stripe, - * or we've read the full stripe from the drive. - * recalculate the parity and write the new results. - * - * We're not allowed to add any new bios to the - * bio list here, anyone else that wants to - * change this stripe needs to do their own rmw. - */ - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - - atomic_set(&rbio->error, 0); - /* - * now that we've set rmw_locked, run through the - * bio list one last time and map the page pointers - * - * We don't cache full rbios because we're assuming - * the higher layers are unlikely to use this area of - * the disk again soon. If they do use it again, - * hopefully they will send another full bio. + * Reset errors, as we may have errors inherited from from degraded + * write. */ - index_rbio_pages(rbio); - if (!rbio_is_full(rbio)) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* First collect one sector from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - } - - /* Then add the parity stripe */ - sector = rbio_pstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; - - if (has_qstripe) { - /* - * RAID6, add the qstripe and call the library function - * to fill in our p/q - */ - sector = rbio_qstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + - sector->pgoff; - - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); - } else { - /* raid5 */ - memcpy(pointers[nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, nr_data - 1, sectorsize); - } - for (stripe = stripe - 1; stripe >= 0; stripe--) - kunmap_local(pointers[stripe]); - } + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* - * Start writing. Make bios for everything from the higher layers (the + * Start assembly. Make bios for everything from the higher layers (the * bio_list in our rbio) and our P/Q. Ignore everything else. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; @@ -1282,15 +1269,16 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_WRITE); if (ret) - goto cleanup; + goto error; } - if (likely(!bioc->num_tgtdevs)) - goto write_data; + if (likely(!rbio->bioc->num_tgtdevs)) + return 0; + /* Make a copy for the replace target device. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; @@ -1298,7 +1286,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; - if (!bioc->tgtdev_map[stripe]) { + if (!rbio->bioc->tgtdev_map[stripe]) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway. @@ -1320,125 +1308,52 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, &bio_list, sector, + ret = rbio_add_io_sector(rbio, bio_list, sector, rbio->bioc->tgtdev_map[stripe], sectornr, REQ_OP_WRITE); if (ret) - goto cleanup; - } - -write_data: - atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); - BUG_ON(atomic_read(&rbio->stripes_pending) == 0); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); + goto error; } - return; -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) + return 0; +error: + while ((bio = bio_list_pop(bio_list))) bio_put(bio); + return -EIO; } -/* - * helper to find the stripe number for a given bio. Used to figure out which - * stripe has failed. This expects the bio to correspond to a physical disk, - * so it looks up based on physical sector numbers. - */ -static int find_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 physical = bio->bi_iter.bi_sector; - int i; - struct btrfs_io_stripe *stripe; - - physical <<= 9; - - for (i = 0; i < rbio->bioc->num_stripes; i++) { - stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && - stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { - return i; - } - } - return -1; -} - -/* - * helper to find the stripe number for a given - * bio (before mapping). Used to figure out which stripe has - * failed. This looks up based on logical block numbers. - */ -static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - int i; - - for (i = 0; i < rbio->nr_data; i++) { - u64 stripe_start = rbio->bioc->raid_map[i]; - - if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) - return i; - } - return -1; -} - -/* - * returns -EIO if we had too many failures - */ -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { - unsigned long flags; - int ret = 0; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->raid_map[0]; + int total_nr_sector = offset >> fs_info->sectorsize_bits; - spin_lock_irqsave(&rbio->bio_list_lock, flags); + ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); - /* we already know this stripe is bad, move on */ - if (rbio->faila == failed || rbio->failb == failed) - goto out; + bitmap_set(rbio->error_bitmap, total_nr_sector, + bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - if (rbio->faila == -1) { - /* first failure on this rbio */ - rbio->faila = failed; - atomic_inc(&rbio->error); - } else if (rbio->failb == -1) { - /* second failure on this rbio */ - rbio->failb = failed; - atomic_inc(&rbio->error); - } else { - ret = -EIO; + /* + * Special handling for raid56_alloc_missing_rbio() used by + * scrub/replace. Unlike call path in raid56_parity_recover(), they + * pass an empty bio here. Thus we have to find out the missing device + * and mark the stripe error instead. + */ + if (bio->bi_iter.bi_size == 0) { + bool found_missing = false; + int stripe_nr; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { + found_missing = true; + bitmap_set(rbio->error_bitmap, + stripe_nr * rbio->stripe_nsectors, + rbio->stripe_nsectors); + } + } + ASSERT(found_missing); } -out: - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); - - return ret; -} - -/* - * helper to fail a stripe based on a physical disk - * bio. - */ -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - int failed = find_bio_stripe(rbio, bio); - - if (failed < 0) - return -EIO; - - return fail_rbio_index(rbio, failed); } /* @@ -1486,191 +1401,163 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } -static void raid56_bio_end_io(struct bio *bio) +static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); + struct bio_vec *bv = bio_first_bvec_all(bio); + int i; - bio_put(bio); + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector; - if (atomic_dec_and_test(&rbio->stripes_pending)) - queue_work(rbio->bioc->fs_info->endio_raid56_workers, - &rbio->end_io_work); + sector = &rbio->stripe_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + sector = &rbio->bio_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + } + ASSERT(i < rbio->nr_sectors); + return i; } -/* - * End io handler for the read phase of the RMW cycle. All the bios here are - * physical stripe bios we've read from the disk so we can recalculate the - * parity of the stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_rmw_end_io_work(struct work_struct *work) +static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 bio_size = 0; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - rbio_orig_end_io(rbio, BLK_STS_IOERR); - return; - } + bio_for_each_segment_all(bvec, bio, iter_all) + bio_size += bvec->bv_len; - /* - * This will normally call finish_rmw to start our write but if there - * are any failed stripes we'll reconstruct from parity first. - */ - validate_rbio_for_rmw(rbio); + bitmap_set(rbio->error_bitmap, total_sector_nr, + bio_size >> rbio->bioc->fs_info->sectorsize_bits); } -/* - * the stripe must be locked by the caller. It will - * unlock after all the writes are done - */ -static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +/* Verify the data sectors at read time. */ +static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, + struct bio *bio) { - int bios_to_read = 0; - struct bio_list bio_list; - const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; - int ret; - int total_sector_nr; - struct bio *bio; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + int total_sector_nr = get_bio_sector_nr(rbio, bio); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; - bio_list_init(&bio_list); + /* No data csum for the whole stripe, no need to verify. */ + if (!rbio->csum_bitmap || !rbio->csum_buf) + return; - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; + /* P/Q stripes, they have no data csum to verify against. */ + if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) + return; - index_rbio_pages(rbio); + bio_for_each_segment_all(bvec, bio, iter_all) { + int bv_offset; + + for (bv_offset = bvec->bv_offset; + bv_offset < bvec->bv_offset + bvec->bv_len; + bv_offset += fs_info->sectorsize, total_sector_nr++) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum = rbio->csum_buf + + total_sector_nr * fs_info->csum_size; + int ret; - atomic_set(&rbio->error, 0); - /* Build a list of bios to read all the missing data sectors. */ - for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; - total_sector_nr++) { - struct sector_ptr *sector; - int stripe = total_sector_nr / rbio->stripe_nsectors; - int sectornr = total_sector_nr % rbio->stripe_nsectors; + /* No csum for this sector, skip to the next sector. */ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; - /* - * We want to find all the sectors missing from the rbio and - * read them from the disk. If sector_in_rbio() finds a page - * in the bio list we don't need to read it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, + bv_offset, csum_buf, expected_csum); + if (ret < 0) + set_bit(total_sector_nr, rbio->error_bitmap); + } + } +} - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate page. If so, - * use it. - */ - if (sector->uptodate) - continue; +static void raid_wait_read_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, REQ_OP_READ); - if (ret) - goto cleanup; + if (bio->bi_status) { + rbio_update_error_bitmap(rbio, bio); + } else { + set_bio_pages_uptodate(rbio, bio); + verify_bio_data_sectors(rbio, bio); } - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. - */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; +static void submit_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_read_end_io; - if (trace_raid56_read_partial_enabled()) { + if (trace_raid56_scrub_read_recover_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_read_partial(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); } submit_bio(bio); } - /* the actual write will happen once the reads are done */ - return 0; +} -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); +static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + int total_sector_nr; + int ret = 0; - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + ASSERT(bio_list_size(bio_list) == 0); - return -EIO; + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behaviro is to compensate the later csum verification and + * recovery. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; -finish: - validate_rbio_for_rmw(rbio); + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) + goto cleanup; + } return 0; + +cleanup: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return ret; } -/* - * if the upper layers pass in a full stripe, we thank them by only allocating - * enough pages to hold the parity, and sending it all down quickly. - */ -static int full_stripe_write(struct btrfs_raid_bio *rbio) +static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) { + const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; - ret = alloc_rbio_parity_pages(rbio); - if (ret) + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); + if (ret < 0) return ret; - ret = lock_stripe_add(rbio); - if (ret == 0) - finish_rmw(rbio); - return 0; -} - -/* - * partial stripe writes get handed over to async helpers. - * We're really hoping to merge a few more writes into this - * rbio before calculating new parity - */ -static int partial_stripe_write(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = lock_stripe_add(rbio); - if (ret == 0) - start_async_work(rbio, rmw_work); + index_stripe_sectors(rbio); return 0; } /* - * sometimes while we were reading from the drive to - * recalculate parity, enough new bios come into create - * a full stripe. So we do a check here to see if we can - * go directly to finish_rmw - */ -static int __raid56_parity_write(struct btrfs_raid_bio *rbio) -{ - /* head off into rmw land if we don't have a full stripe */ - if (!rbio_is_full(rbio)) - return partial_stripe_write(rbio); - return full_stripe_write(rbio); -} - -/* * We use plugging call backs to collect full stripes. * Any time we get a partial stripe write while plugged * we collect it into a list. When the unplug comes down, @@ -1704,71 +1591,39 @@ static int plug_cmp(void *priv, const struct list_head *a, return 0; } -static void run_plug(struct btrfs_plug_cb *plug) +static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) { + struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); struct btrfs_raid_bio *cur; struct btrfs_raid_bio *last = NULL; - /* - * sort our plug list then try to merge - * everything we can in hopes of creating full - * stripes. - */ list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { cur = list_entry(plug->rbio_list.next, struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { - int ret; - - /* we have a full stripe, send it down */ - ret = full_stripe_write(cur); - BUG_ON(ret); + /* We have a full stripe, queue it down. */ + start_async_work(cur, rmw_rbio_work); continue; } if (last) { if (rbio_can_merge(last, cur)) { merge_rbio(last, cur); - __free_raid_bio(cur); + free_raid_bio(cur); continue; - } - __raid56_parity_write(last); + start_async_work(last, rmw_rbio_work); } last = cur; } - if (last) { - __raid56_parity_write(last); - } + if (last) + start_async_work(last, rmw_rbio_work); kfree(plug); } -/* - * if the unplug comes from schedule, we have to push the - * work off to a helper thread - */ -static void unplug_work(struct work_struct *work) -{ - struct btrfs_plug_cb *plug; - plug = container_of(work, struct btrfs_plug_cb, work); - run_plug(plug); -} - -static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct btrfs_plug_cb *plug; - plug = container_of(cb, struct btrfs_plug_cb, cb); - - if (from_schedule) { - INIT_WORK(&plug->work, unplug_work); - queue_work(plug->info->rmw_workers, &plug->work); - return; - } - run_plug(plug); -} - /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { @@ -1816,19 +1671,13 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) rbio_add_bio(rbio, bio); /* - * don't plug on full rbios, just get them out the door + * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) { - ret = full_stripe_write(rbio); - if (ret) { - __free_raid_bio(rbio); - goto fail; - } - return; - } + if (rbio_is_full(rbio)) + goto queue_rbio; - cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); if (cb) { plug = container_of(cb, struct btrfs_plug_cb, cb); if (!plug->info) { @@ -1836,13 +1685,14 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - } else { - ret = __raid56_parity_write(rbio); - if (ret) { - __free_raid_bio(rbio); - goto fail; - } + return; } +queue_rbio: + /* + * Either we don't have any existing plug, or we're doing a full stripe, + * can queue the rmw work now. + */ + start_async_work(rbio, rmw_rbio_work); return; @@ -1851,268 +1701,254 @@ fail: bio_endio(bio); } -/* - * all parity reconstruction happens here. We've read in everything - * we can find from the drives and this does the heavy lifting of - * sorting the good from the bad. - */ -static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +static int verify_one_sector(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int sectornr, stripe; - void **pointers; - void **unmap_array; - int faila = -1, failb = -1; - blk_status_t err; - int i; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct sector_ptr *sector; + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *csum_expected; + int ret; - /* - * This array stores the pointer for each sector, thus it has the extra - * pgoff value added from each sector - */ - pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - if (!pointers) { - err = BLK_STS_RESOURCE; - goto cleanup_io; - } + if (!rbio->csum_bitmap || !rbio->csum_buf) + return 0; + /* No way to verify P/Q as they are not covered by data csum. */ + if (stripe_nr >= rbio->nr_data) + return 0; /* - * Store copy of pointers that does not get reordered during - * reconstruction so that kunmap_local works. + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. */ - unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - if (!unmap_array) { - err = BLK_STS_RESOURCE; - goto cleanup_pointers; + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } - faila = rbio->faila; - failb = rbio->failb; + ASSERT(sector->page); - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - } + csum_expected = rbio->csum_buf + + (stripe_nr * rbio->stripe_nsectors + sector_nr) * + fs_info->csum_size; + ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, + csum_buf, csum_expected); + return ret; +} - index_rbio_pages(rbio); +/* + * Recover a vertical stripe specified by @sector_nr. + * @*pointers are the pre-allocated pointers by the caller, so we don't + * need to allocate/free the pointers again and again. + */ +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct sector_ptr *sector; + const u32 sectorsize = fs_info->sectorsize; + int found_errors; + int faila; + int failb; + int stripe_nr; + int ret = 0; - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + /* + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(sector_nr, &rbio->dbitmap)) + return 0; - /* - * Now we just use bitmap to mark the horizontal stripes in - * which we have data when doing parity scrub. - */ - if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sectornr, &rbio->dbitmap)) - continue; + found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the veritical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. + */ + if (!found_errors) + return 0; + if (found_errors > rbio->bioc->max_errors) + return -EIO; + + /* + * Setup our array of pointers with sectors from each stripe + * + * NOTE: store a duplicate array of pointers to preserve the + * pointer order. + */ + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { /* - * Setup our array of pointers with sectors from each stripe - * - * NOTE: store a duplicate array of pointers to preserve the - * pointer order + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - /* - * If we're rebuilding a read, we have to use - * pages from the bio list - */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } - ASSERT(sector->page); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - unmap_array[stripe] = pointers[stripe]; + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } + ASSERT(sector->page); + pointers[stripe_nr] = kmap_local_page(sector->page) + + sector->pgoff; + unmap_array[stripe_nr] = pointers[stripe_nr]; + } - /* All raid6 handling here */ - if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* Single failure, rebuild from parity raid5 style */ - if (failb < 0) { - if (faila == rbio->nr_data) { - /* - * Just the P stripe has failed, without - * a bad data or Q stripe. - * TODO, we should redo the xor here. - */ - err = BLK_STS_IOERR; - goto cleanup; - } + /* All raid6 handling here */ + if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { + /* Single failure, rebuild from parity raid5 style */ + if (failb < 0) { + if (faila == rbio->nr_data) /* - * a single failure in raid6 is rebuilt - * in the pstripe code below + * Just the P stripe has failed, without + * a bad data or Q stripe. + * We have nothing to do, just skip the + * recovery for this stripe. */ - goto pstripe; - } - - /* make sure our ps and qs are in order */ - if (faila > failb) - swap(faila, failb); - - /* if the q stripe is failed, do a pstripe reconstruction - * from the xors. - * If both the q stripe and the P stripe are failed, we're - * here due to a crc mismatch and we can't give them the - * data they want + goto cleanup; + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below */ - if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bioc->raid_map[faila] == - RAID5_P_STRIPE) { - err = BLK_STS_IOERR; - goto cleanup; - } + goto pstripe; + } + + /* + * If the q stripe is failed, do a pstripe reconstruction from + * the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want. + */ + if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bioc->raid_map[faila] == + RAID5_P_STRIPE) /* - * otherwise we have one bad data stripe and - * a good P stripe. raid5! + * Only P and Q are corrupted. + * We only care about data stripes recovery, + * can skip this vertical stripe. */ - goto pstripe; - } + goto cleanup; + /* + * Otherwise we have one bad data stripe and + * a good P stripe. raid5! + */ + goto pstripe; + } - if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { - raid6_datap_recov(rbio->real_stripes, - sectorsize, faila, pointers); - } else { - raid6_2data_recov(rbio->real_stripes, - sectorsize, faila, failb, - pointers); - } + if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { + raid6_datap_recov(rbio->real_stripes, sectorsize, + faila, pointers); } else { - void *p; + raid6_2data_recov(rbio->real_stripes, sectorsize, + faila, failb, pointers); + } + } else { + void *p; - /* rebuild from P stripe here (raid5 or raid6) */ - BUG_ON(failb != -1); + /* Rebuild from P stripe here (raid5 or raid6). */ + ASSERT(failb == -1); pstripe: - /* Copy parity block into failed block to start with */ - memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); - /* rearrange the pointer array */ - p = pointers[faila]; - for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) - pointers[stripe] = pointers[stripe + 1]; - pointers[rbio->nr_data - 1] = p; + /* Rearrange the pointer array */ + p = pointers[faila]; + for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; + stripe_nr++) + pointers[stripe_nr] = pointers[stripe_nr + 1]; + pointers[rbio->nr_data - 1] = p; - /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, sectorsize); - } - /* if we're doing this rebuild as part of an rmw, go through - * and set all of our private rbio pages in the - * failed stripes as uptodate. This way finish_rmw will - * know they can be trusted. If this was a read reconstruction, - * other endio functions will fiddle the uptodate bits - */ - if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < rbio->stripe_nsectors; i++) { - if (faila != -1) { - sector = rbio_stripe_sector(rbio, faila, i); - sector->uptodate = 1; - } - if (failb != -1) { - sector = rbio_stripe_sector(rbio, failb, i); - sector->uptodate = 1; - } - } - } - for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) - kunmap_local(unmap_array[stripe]); + /* Xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, sectorsize); + + } + + /* + * No matter if this is a RMW or recovery, we should have all + * failed sectors repaired in the vertical stripe, thus they are now + * uptodate. + * Especially if we determine to cache the rbio, we need to + * have at least all data sectors uptodate. + * + * If possible, also check if the repaired sector matches its data + * checksum. + */ + if (faila >= 0) { + ret = verify_one_sector(rbio, faila, sector_nr); + if (ret < 0) + goto cleanup; + + sector = rbio_stripe_sector(rbio, faila, sector_nr); + sector->uptodate = 1; + } + if (failb >= 0) { + ret = verify_one_sector(rbio, faila, sector_nr); + if (ret < 0) + goto cleanup; + + sector = rbio_stripe_sector(rbio, failb, sector_nr); + sector->uptodate = 1; } - err = BLK_STS_OK; cleanup: - kfree(unmap_array); -cleanup_pointers: - kfree(pointers); + for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) + kunmap_local(unmap_array[stripe_nr]); + return ret; +} + +static int recover_sectors(struct btrfs_raid_bio *rbio) +{ + void **pointers = NULL; + void **unmap_array = NULL; + int sectornr; + int ret = 0; -cleanup_io: /* - * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a - * valid rbio which is consistent with ondisk content, thus such a - * valid rbio can be cached to avoid further disk reads. + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. */ + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - /* - * - In case of two failures, where rbio->failb != -1: - * - * Do not cache this rbio since the above read reconstruction - * (raid6_datap_recov() or raid6_2data_recov()) may have - * changed some content of stripes which are not identical to - * on-disk content any more, otherwise, a later write/recover - * may steal stripe_pages from this rbio and end up with - * corruptions or rebuild failures. - * - * - In case of single failure, where rbio->failb == -1: - * - * Cache this rbio iff the above read reconstruction is - * executed without problems. - */ - if (err == BLK_STS_OK && rbio->failb < 0) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + } - rbio_orig_end_io(rbio, err); - } else if (err == BLK_STS_OK) { - rbio->faila = -1; - rbio->failb = -1; + index_rbio_pages(rbio); - if (rbio->operation == BTRFS_RBIO_WRITE) - finish_rmw(rbio); - else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) - finish_parity_scrub(rbio, 0); - else - BUG(); - } else { - rbio_orig_end_io(rbio, err); + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + ret = recover_vertical(rbio, sectornr, pointers, unmap_array); + if (ret < 0) + break; } -} - -/* - * This is called only for stripes we've read from disk to reconstruct the - * parity. - */ -static void raid_recover_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - rbio_orig_end_io(rbio, BLK_STS_IOERR); - else - __raid_recover_end_io(rbio); +out: + kfree(pointers); + kfree(unmap_array); + return ret; } -/* - * reads everything we need off the disk to reconstruct - * the parity. endio handlers trigger final reconstruction - * when the IO is done. - * - * This is used both for reads from the higher layers and for - * parity construction required to finish a rmw cycle. - */ -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - atomic_set(&rbio->error, 0); - + ASSERT(bio_list_size(bio_list) == 0); /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. @@ -2127,64 +1963,139 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int sectornr = total_sector_nr % rbio->stripe_nsectors; struct sector_ptr *sector; - if (rbio->faila == stripe || rbio->failb == stripe) { - atomic_inc(&rbio->error); - /* Skip the current stripe. */ - ASSERT(sectornr == 0); - total_sector_nr += rbio->stripe_nsectors - 1; + /* + * Skip the range which has error. It can be a range which is + * marked error (for csum mismatch), or it can be a missing + * device. + */ + if (!rbio->bioc->stripes[stripe].dev->bdev || + test_bit(total_sector_nr, rbio->error_bitmap)) { + /* + * Also set the error bit for missing device, which + * may not yet have its error bit set. + */ + set_bit(total_sector_nr, rbio->error_bitmap); continue; } + sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret < 0) - goto cleanup; + goto error; } + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * we might have no bios to read just because the pages - * were up to date, or we might have no bios to read because - * the devices were gone. - */ - if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { - __raid_recover_end_io(rbio); - return 0; - } else { - goto cleanup; - } - } + return -EIO; +} + +static int recover_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + struct bio *bio; + int ret; /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. + * Either we're doing recover for a read failure or degraded write, + * caller should have set error bitmap correctly. */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); + bio_list_init(&bio_list); - if (trace_raid56_scrub_read_recover_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read_recover(rbio, bio, &trace_info); - } - submit_bio(bio); - } + index_rbio_pages(rbio); - return 0; + ret = recover_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto out; -cleanup: - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) - rbio_orig_end_io(rbio, BLK_STS_IOERR); + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + ret = recover_sectors(rbio); +out: while ((bio = bio_list_pop(&bio_list))) bio_put(bio); - return -EIO; + return ret; +} + +static void recover_rbio_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = lock_stripe_add(rbio); + if (ret == 0) { + ret = recover_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } +} + +static void recover_rbio_work_locked(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = recover_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +} + +static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) +{ + bool found = false; + int sector_nr; + + /* + * This is for RAID6 extra recovery tries, thus mirror number should + * be large than 2. + * Mirror 1 means read from data stripes. Mirror 2 means rebuild using + * RAID5 methods. + */ + ASSERT(mirror_num > 2); + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + int faila; + int failb; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + /* This vertical stripe doesn't have errors. */ + if (!found_errors) + continue; + + /* + * If we found errors, there should be only one error marked + * by previous set_rbio_range_error(). + */ + ASSERT(found_errors == 1); + found = true; + + /* Now select another stripe to mark as error. */ + failb = rbio->real_stripes - (mirror_num - 1); + if (failb <= faila) + failb--; + + /* Set the extra bit in error bitmap. */ + if (failb >= 0) + set_bit(failb * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + } + + /* We should found at least one vertical stripe with error.*/ + ASSERT(found); } /* @@ -2202,68 +2113,284 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); - goto out_end_bio; + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_READ_REBUILD; rbio_add_bio(rbio, bio); - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn(fs_info, -"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", - __func__, bio->bi_iter.bi_sector << 9, - (u64)bio->bi_iter.bi_size, bioc->map_type); - __free_raid_bio(rbio); - bio->bi_status = BLK_STS_IOERR; - goto out_end_bio; - } + set_rbio_range_error(rbio, bio); /* * Loop retry: * for 'mirror == 2', reconstruct from all other stripes. * for 'mirror_num > 2', select a stripe to fail on every retry. */ - if (mirror_num > 2) { + if (mirror_num > 2) + set_rbio_raid6_extra_error(rbio, mirror_num); + + start_async_work(rbio, recover_rbio_work); +} + +static void fill_data_csums(struct btrfs_raid_bio *rbio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, + rbio->bioc->raid_map[0]); + const u64 start = rbio->bioc->raid_map[0]; + const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << + fs_info->sectorsize_bits; + int ret; + + /* The rbio should not have its csum buffer initialized. */ + ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); + + /* + * Skip the csum search if: + * + * - The rbio doesn't belong to data block groups + * Then we are doing IO for tree blocks, no need to search csums. + * + * - The rbio belongs to mixed block groups + * This is to avoid deadlock, as we're already holding the full + * stripe lock, if we trigger a metadata read, and it needs to do + * raid56 recovery, we will deadlock. + */ + if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || + rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) + return; + + rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * + fs_info->csum_size, GFP_NOFS); + rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, + GFP_NOFS); + if (!rbio->csum_buf || !rbio->csum_bitmap) { + ret = -ENOMEM; + goto error; + } + + ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, + rbio->csum_buf, rbio->csum_bitmap); + if (ret < 0) + goto error; + if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) + goto no_csum; + return; + +error: + /* + * We failed to allocate memory or grab the csum, but it's not fatal, + * we can still continue. But better to warn users that RMW is no + * longer safe for this particular sub-stripe write. + */ + btrfs_warn_rl(fs_info, +"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", + rbio->bioc->raid_map[0], ret); +no_csum: + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; +} + +static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + struct bio *bio; + int ret; + + bio_list_init(&bio_list); + + /* + * Fill the data csums we need for data verification. We need to fill + * the csum_bitmap/csum_buf first, as our endio function will try to + * verify the data sectors. + */ + fill_data_csums(rbio); + + ret = rmw_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto out; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* + * We may or may not have any corrupted sectors (including missing dev + * and csum mismatch), just let recover_sectors() to handle them all. + */ + ret = recover_sectors(rbio); + return ret; +out: + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + + return ret; +} + +static void raid_wait_write_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + blk_status_t err = bio->bi_status; + + if (err) + rbio_update_error_bitmap(rbio, bio); + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_write_end_io; + + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } + submit_bio(bio); + } +} + +/* + * To determine if we need to read any sector from the disk. + * Should only be utilized in RMW path, to skip cached rbio. + */ +static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + int i; + + for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { + struct sector_ptr *sector = &rbio->stripe_sectors[i]; + /* - * 'mirror == 3' is to fail the p stripe and - * reconstruct from the q stripe. 'mirror > 3' is to - * fail a data stripe and reconstruct from p+q stripe. + * We have a sector which doesn't have page nor uptodate, + * thus this rbio can not be cached one, as cached one must + * have all its data sectors present and uptodate. */ - rbio->failb = rbio->real_stripes - (mirror_num - 1); - ASSERT(rbio->failb > 0); - if (rbio->failb <= rbio->faila) - rbio->failb--; + if (!sector->page || !sector->uptodate) + return true; } + return false; +} - if (lock_stripe_add(rbio)) - return; +static int rmw_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + int sectornr; + int ret = 0; /* - * This adds our rbio to the list of rbios that will be handled after - * the current lock owner is done. + * Allocate the pages for parity first, as P/Q pages will always be + * needed for both full-stripe and sub-stripe writes. */ - __raid56_parity_recover(rbio); - return; + ret = alloc_rbio_parity_pages(rbio); + if (ret < 0) + return ret; -out_end_bio: - bio_endio(bio); + /* + * Either full stripe write, or we have every data sector already + * cached, can go to write path immediately. + */ + if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) + goto write; + + /* + * Now we're doing sub-stripe write, also need all data stripes to do + * the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + return ret; + + index_rbio_pages(rbio); + + ret = rmw_read_wait_recover(rbio); + if (ret < 0) + return ret; + +write: + /* + * At this stage we're not allowed to add any new bios to the + * bio list any more, anyone else that wants to change this stripe + * needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + + index_rbio_pages(rbio); + + /* + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); + + bio_list_init(&bio_list); + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) + return ret; + + /* We should have at least one bio assembled. */ + ASSERT(bio_list_size(&bio_list)); + submit_write_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We may have more errors than our tolerance during the read. */ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } + return ret; } -static void rmw_work(struct work_struct *work) +static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - raid56_rmw_stripe(rbio); + + ret = lock_stripe_add(rbio); + if (ret == 0) { + ret = rmw_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } } -static void read_rebuild_work(struct work_struct *work) +static void rmw_rbio_work_locked(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - __raid56_parity_recover(rbio); + + ret = rmw_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } /* @@ -2358,8 +2485,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) return 0; } -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check) +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; @@ -2402,7 +2528,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) - goto cleanup; + return -ENOMEM; p_sector.pgoff = 0; p_sector.uptodate = 1; @@ -2412,14 +2538,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!q_sector.page) { __free_page(p_sector.page); p_sector.page = NULL; - goto cleanup; + return -ENOMEM; } q_sector.pgoff = 0; q_sector.uptodate = 1; pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } - atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); @@ -2499,33 +2625,13 @@ writeback: } submit_write: - nr_data = bio_list_size(&bio_list); - if (!nr_data) { - /* Every parity is right */ - rbio_orig_end_io(rbio, BLK_STS_OK); - return; - } - - atomic_set(&rbio->stripes_pending, nr_data); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_scrub_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); - } - return; + submit_write_bios(rbio, &bio_list); + return 0; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - while ((bio = bio_list_pop(&bio_list))) bio_put(bio); + return ret; } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2535,102 +2641,99 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) return 0; } -/* - * While we're doing the parity check and repair, we could have errors - * in reading pages off the disk. This checks for errors and if we're - * not able to read the page it'll trigger parity reconstruction. The - * parity scrub will be finished after we've reconstructed the failed - * stripes - */ -static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) +static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; + void **pointers = NULL; + void **unmap_array = NULL; + int sector_nr; + int ret; + + /* + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. + */ + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } - if (rbio->faila >= 0 || rbio->failb >= 0) { + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int dfail = 0, failp = -1; + int faila; + int failb; + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + goto out; + } + if (found_errors == 0) + continue; - if (is_data_stripe(rbio, rbio->faila)) - dfail++; - else if (is_parity_stripe(rbio->faila)) - failp = rbio->faila; + /* We should have at least one error here. */ + ASSERT(faila >= 0 || failb >= 0); - if (is_data_stripe(rbio, rbio->failb)) + if (is_data_stripe(rbio, faila)) dfail++; - else if (is_parity_stripe(rbio->failb)) - failp = rbio->failb; + else if (is_parity_stripe(faila)) + failp = faila; + if (is_data_stripe(rbio, failb)) + dfail++; + else if (is_parity_stripe(failb)) + failp = failb; /* - * Because we can not use a scrubbing parity to repair - * the data, so the capability of the repair is declined. - * (In the case of RAID5, we can not repair anything) + * Because we can not use a scrubbing parity to repair the + * data, so the capability of the repair is declined. (In the + * case of RAID5, we can not repair anything.) */ - if (dfail > rbio->bioc->max_errors - 1) - goto cleanup; - + if (dfail > rbio->bioc->max_errors - 1) { + ret = -EIO; + goto out; + } /* - * If all data is good, only parity is correctly, just - * repair the parity. + * If all data is good, only parity is correctly, just repair + * the parity, no need to recover data stripes. */ - if (dfail == 0) { - finish_parity_scrub(rbio, 0); - return; - } + if (dfail == 0) + continue; /* * Here means we got one corrupted data stripe and one - * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckily, use the other one to repair - * the data, or we can not repair the data stripe. + * corrupted parity on RAID6, if the corrupted parity is + * scrubbing parity, luckily, use the other one to repair the + * data, or we can not repair the data stripe. */ - if (failp != rbio->scrubp) - goto cleanup; + if (failp != rbio->scrubp) { + ret = -EIO; + goto out; + } - __raid_recover_end_io(rbio); - } else { - finish_parity_scrub(rbio, 1); + ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); + if (ret < 0) + goto out; } - return; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); -} - -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_parity_scrub_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - - /* - * This will normally call finish_rmw to start our write, but if there - * are any failed stripes we'll reconstruct from parity first - */ - validate_rbio_for_parity_scrub(rbio); +out: + kfree(pointers); + kfree(unmap_array); + return ret; } -static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_essential_pages(rbio); - if (ret) - goto cleanup; + ASSERT(bio_list_size(bio_list) == 0); - atomic_set(&rbio->error, 0); /* Build a list of bios to read all the missing parts. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2659,67 +2762,84 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) - goto cleanup; + goto error; } + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return ret; +} - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } +static int scrub_rbio(struct btrfs_raid_bio *rbio) +{ + bool need_check = false; + struct bio_list bio_list; + int sector_nr; + int ret; + struct bio *bio; - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. - */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; + bio_list_init(&bio_list); - if (trace_raid56_scrub_read_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; + ret = alloc_rbio_essential_pages(rbio); + if (ret) + goto cleanup; - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read(rbio, bio, &trace_info); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + + ret = scrub_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We may have some failures, recover the failed sectors first. */ + ret = recover_scrub_rbio(rbio); + if (ret < 0) + goto cleanup; + + /* + * We have every sector properly prepared. Can finish the scrub + * and writeback the good content. + */ + ret = finish_parity_scrub(rbio, need_check); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; } - submit_bio(bio); } - /* the actual write will happen once the reads are done */ - return; + return ret; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - while ((bio = bio_list_pop(&bio_list))) bio_put(bio); - return; - -finish: - validate_rbio_for_parity_scrub(rbio); + return ret; } -static void scrub_parity_work(struct work_struct *work) +static void scrub_rbio_work_locked(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - raid56_parity_scrub_stripe(rbio); + ret = scrub_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) { if (!lock_stripe_add(rbio)) - start_async_work(rbio, scrub_parity_work); + start_async_work(rbio, scrub_rbio_work_locked); } /* The following code is used for dev replace of a missing RAID 5/6 device. */ @@ -2742,20 +2862,12 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) */ ASSERT(!bio->bi_iter.bi_size); - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn_rl(fs_info, - "can not determine the failed stripe number for full stripe %llu", - bioc->raid_map[0]); - __free_raid_bio(rbio); - return NULL; - } + set_rbio_range_error(rbio, bio); return rbio; } void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) { - if (!lock_stripe_add(rbio)) - start_async_work(rbio, read_rebuild_work); + start_async_work(rbio, recover_rbio_work); } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 91d5c0adad15..7c73a443939e 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -74,12 +74,6 @@ struct btrfs_raid_bio { /* How many sectors there are for each stripe */ u8 stripe_nsectors; - /* First bad stripe, -1 means no corruption */ - s8 faila; - - /* Second bad stripe (for RAID6 use) */ - s8 failb; - /* Stripe number that we're scrubbing */ u8 scrubp; @@ -93,9 +87,7 @@ struct btrfs_raid_bio { atomic_t stripes_pending; - atomic_t error; - - struct work_struct end_io_work; + wait_queue_head_t io_wait; /* Bitmap to record which horizontal stripe has data */ unsigned long dbitmap; @@ -126,6 +118,29 @@ struct btrfs_raid_bio { /* Allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; + + /* + * The bitmap recording where IO errors happened. + * Each bit is corresponding to one sector in either bio_sectors[] or + * stripe_sectors[] array. + * + * The reason we don't use another bit in sector_ptr is, we have two + * arrays of sectors, and a lot of IO can use sectors in both arrays. + * Thus making it much harder to iterate. + */ + unsigned long *error_bitmap; + + /* + * Checksum buffer if the rbio is for data. The buffer should cover + * all data sectors (exlcuding P/Q sectors). + */ + u8 *csum_buf; + + /* + * Each bit represents if the corresponding sector has data csum found. + * Should only cover data sectors (excluding P/Q sectors). + */ + unsigned long *csum_bitmap; }; /* diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c1a617eb25d..5c2b66d155ef 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) (len * sizeof(char)), mask); if (!ret) return ret; - strncpy(ret->str, src, len); + /* Warn if the source got unexpectedly truncated. */ + if (WARN_ON(strscpy(ret->str, src, len) < 0)) { + kfree(ret); + return NULL; + } return ret; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index a248f46cfe72..95d28497de7c 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -5,11 +5,14 @@ #include <linux/sched.h> #include <linux/stacktrace.h> +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" #include "delayed-ref.h" #include "ref-verify.h" +#include "fs.h" +#include "accessors.h" /* * Used to keep track the roots and number of refs each root has for a given diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f50586ff85c8..0474bbe39da7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -2,13 +2,19 @@ #include <linux/blkdev.h> #include <linux/iversion.h> -#include "compression.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" +#include "compression.h" #include "delalloc-space.h" #include "disk-io.h" #include "reflink.h" #include "transaction.h" #include "subpage.h" +#include "accessors.h" +#include "file-item.h" +#include "file.h" +#include "super.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M @@ -318,16 +324,16 @@ copy_to_page: goto out; } -/** - * btrfs_clone() - clone a range from inode file to another +/* + * Clone a range from inode file to another. * - * @src: Inode to clone from - * @inode: Inode to clone to - * @off: Offset within source to start clone from - * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen - * @destoff: Offset within @inode to start clone - * @no_time_update: Whether to update mtime/ctime on the target inode + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen + * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, @@ -887,7 +893,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { lock_two_nondirectories(src_inode, dst_inode); btrfs_double_mmap_lock(src_inode, dst_inode); @@ -905,7 +911,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, out_unlock: if (same_inode) { - btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(src_inode, dst_inode); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 666a37a0ee89..31ec4a7658ce 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -27,6 +27,15 @@ #include "subpage.h" #include "zoned.h" #include "inode-item.h" +#include "space-info.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "file-item.h" +#include "relocation.h" +#include "super.h" +#include "tree-checker.h" /* * Relocation overview @@ -470,7 +479,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( int ret; int err = 0; - iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS); + iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); if (!iter) return ERR_PTR(-ENOMEM); path = btrfs_alloc_path(); @@ -1109,10 +1118,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans, inode = find_next_inode(root, key.objectid); first = 0; } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); inode = find_next_inode(root, key.objectid); } if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { + struct extent_state *cached_state = NULL; + end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); WARN_ON(!IS_ALIGNED(key.offset, @@ -1120,14 +1131,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end); + key.offset, end, + &cached_state); if (!ret) continue; btrfs_drop_extent_map_range(BTRFS_I(inode), key.offset, end, true); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, NULL); + key.offset, end, &cached_state); } } @@ -1170,7 +1182,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (dirty) btrfs_mark_buffer_dirty(leaf); if (inode) - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } @@ -1516,6 +1528,8 @@ static int invalidate_extent_cache(struct btrfs_root *root, objectid = min_key->objectid; while (1) { + struct extent_state *cached_state = NULL; + cond_resched(); iput(inode); @@ -1566,9 +1580,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); } return 0; } @@ -2597,10 +2611,14 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) static int get_tree_block_key(struct btrfs_fs_info *fs_info, struct tree_block *block) { + struct btrfs_tree_parent_check check = { + .level = block->level, + .owner_root = block->owner, + .transid = block->key.offset + }; struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, block->owner, - block->key.offset, block->level, NULL); + eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); if (!extent_buffer_uptodate(eb)) { @@ -2861,25 +2879,27 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; - btrfs_inode_lock(&inode->vfs_inode, 0); + btrfs_inode_lock(inode, 0); for (nr = 0; nr < cluster->nr; nr++) { + struct extent_state *cached_state = NULL; + start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) end = cluster->boundary[nr + 1] - 1 - offset; else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end, NULL); + lock_extent(&inode->io_tree, start, end, &cached_state); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); if (ret) break; } - btrfs_inode_unlock(&inode->vfs_inode, 0); + btrfs_inode_unlock(inode, 0); if (cur_offset < prealloc_end) btrfs_free_reserved_data_space_noquota(inode->root->fs_info, @@ -2891,6 +2911,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod u64 start, u64 end, u64 block_start) { struct extent_map *em; + struct extent_state *cached_state = NULL; int ret = 0; em = alloc_extent_map(); @@ -2903,9 +2924,9 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->block_start = block_start; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); free_extent_map(em); return ret; @@ -2983,6 +3004,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, */ cur = max(page_start, cluster->boundary[*cluster_nr] - offset); while (cur <= page_end) { + struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; @@ -2998,13 +3020,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, goto release_page; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, - clamped_end, 0, NULL); + clamped_end, 0, &cached_state); if (ret) { - clear_extent_bits(&BTRFS_I(inode)->io_tree, - clamped_start, clamped_end, - EXTENT_LOCKED | EXTENT_BOUNDARY); + clear_extent_bit(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY, + &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), @@ -3031,7 +3055,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, boundary_start, boundary_end, EXTENT_BOUNDARY); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; @@ -3388,24 +3413,28 @@ int add_data_references(struct reloc_control *rc, struct btrfs_path *path, struct rb_root *blocks) { - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct ulist *leaves = NULL; + struct btrfs_backref_walk_ctx ctx = { 0 }; struct ulist_iterator leaf_uiter; struct ulist_node *ref_node = NULL; - const u32 blocksize = fs_info->nodesize; + const u32 blocksize = rc->extent_root->fs_info->nodesize; int ret = 0; btrfs_release_path(path); - ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, - 0, &leaves, NULL, true); + + ctx.bytenr = extent_key->objectid; + ctx.ignore_extent_item_pos = true; + ctx.fs_info = rc->extent_root->fs_info; + + ret = btrfs_find_all_leafs(&ctx); if (ret < 0) return ret; ULIST_ITER_INIT(&leaf_uiter); - while ((ref_node = ulist_next(leaves, &leaf_uiter))) { + while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL); + eb = read_tree_block(ctx.fs_info, ref_node->val, &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; @@ -3421,7 +3450,7 @@ int add_data_references(struct reloc_control *rc, } if (ret < 0) free_block_list(blocks); - ulist_free(leaves); + ulist_free(ctx.refs); return ret; } @@ -3905,8 +3934,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->dirty_subvol_roots); btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(fs_info, &rc->processed_blocks, - IO_TREE_RELOC_BLOCKS, NULL); + extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } @@ -4330,8 +4358,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + inode->index_cnt; csum_root = btrfs_csum_root(fs_info, disk_bytenr); - ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0, false); + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + disk_bytenr + len - 1, &list, 0, false); if (ret) goto out; diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h new file mode 100644 index 000000000000..2041a86186de --- /dev/null +++ b/fs/btrfs/relocation.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_RELOCATION_H +#define BTRFS_RELOCATION_H + +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); +int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); +int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); +struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); +int btrfs_should_ignore_reloc_root(struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index e1f599d7a916..859874579456 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -6,11 +6,16 @@ #include <linux/err.h> #include <linux/uuid.h> #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "qgroup.h" #include "space-info.h" +#include "accessors.h" +#include "root-tree.h" +#include "orphan.h" /* * Read a root item from the tree. In case we detect a root item smaller then @@ -327,9 +332,8 @@ out: } int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, const char *name, - int name_len) - + u64 ref_id, u64 dirid, u64 *sequence, + const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_path *path; @@ -356,8 +360,8 @@ again: struct btrfs_root_ref); ptr = (unsigned long)(ref + 1); if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || - (btrfs_root_ref_name_len(leaf, ref) != name_len) || - memcmp_extent_buffer(leaf, name, ptr, name_len)) { + (btrfs_root_ref_name_len(leaf, ref) != name->len) || + memcmp_extent_buffer(leaf, name->name, ptr, name->len)) { ret = -ENOENT; goto out; } @@ -400,8 +404,8 @@ out: * Will return 0, -ENOMEM, or anything from the CoW path */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, const char *name, - int name_len) + u64 ref_id, u64 dirid, u64 sequence, + const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; @@ -420,7 +424,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, - sizeof(*ref) + name_len); + sizeof(*ref) + name->len); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_free_path(path); @@ -431,9 +435,9 @@ again: ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); btrfs_set_root_ref_dirid(leaf, ref, dirid); btrfs_set_root_ref_sequence(leaf, ref, sequence); - btrfs_set_root_ref_name_len(leaf, ref, name_len); + btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); - write_extent_buffer(leaf, name, ptr, name_len); + write_extent_buffer(leaf, name->name, ptr, name->len); btrfs_mark_buffer_dirty(leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h new file mode 100644 index 000000000000..cbbaca32126e --- /dev/null +++ b/fs/btrfs/root-tree.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ROOT_TREE_H +#define BTRFS_ROOT_TREE_H + +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, bool use_global_rsv); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 sequence, + const struct fscrypt_str *name); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 *sequence, + const struct fscrypt_str *name); +int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_root_item *item); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); +int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); +void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 196c4c6ed1ed..52b346795f66 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -17,10 +17,13 @@ #include "extent_io.h" #include "dev-replace.h" #include "check-integrity.h" -#include "rcu-string.h" #include "raid56.h" #include "block-group.h" #include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" +#include "scrub.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -56,6 +59,17 @@ struct scrub_ctx; #define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) +/* + * Maximum number of mirrors that can be available for all profiles counting + * the target device of dev-replace as one. During an active device replace + * procedure, the target device of the copy operation is a mirror for the + * filesystem data as well that can be used to read data in order to repair + * read errors on other disks. + * + * Current value is derived from RAID1C4 with 4 copies. + */ +#define BTRFS_MAX_MIRRORS (4 + 1) + struct scrub_recover { refcount_t refs; struct btrfs_io_context *bioc; @@ -284,7 +298,7 @@ static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx, * Will also allocate new pages for @sblock if needed. */ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, - u64 logical, gfp_t gfp) + u64 logical) { const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT; struct scrub_sector *ssector; @@ -292,7 +306,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, /* We must never have scrub_block exceed U32_MAX in size. */ ASSERT(logical - sblock->logical < U32_MAX); - ssector = kzalloc(sizeof(*ssector), gfp); + ssector = kzalloc(sizeof(*ssector), GFP_KERNEL); if (!ssector) return NULL; @@ -300,7 +314,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, if (!sblock->pages[page_index]) { int ret; - sblock->pages[page_index] = alloc_page(gfp); + sblock->pages[page_index] = alloc_page(GFP_KERNEL); if (!sblock->pages[page_index]) { kfree(ssector); return NULL; @@ -794,8 +808,8 @@ nomem: return ERR_PTR(-ENOMEM); } -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, - void *warn_ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *warn_ctx) { u32 nlink; int ret; @@ -862,7 +876,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", swarn->errstr, swarn->logical, - rcu_str_deref(swarn->dev->name), + btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, fs_info->sectorsize, nlink, @@ -876,7 +890,7 @@ err: btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, - rcu_str_deref(swarn->dev->name), + btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, ret); @@ -894,7 +908,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) struct btrfs_extent_item *ei; struct scrub_warning swarn; unsigned long ptr = 0; - u64 extent_item_pos; u64 flags = 0; u64 ref_root; u32 item_size; @@ -908,8 +921,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) /* Super block error, no need to search extent tree. */ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", - errstr, rcu_str_deref(dev->name), - sblock->physical); + errstr, btrfs_dev_name(dev), sblock->physical); return; } path = btrfs_alloc_path(); @@ -926,7 +938,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (ret < 0) goto out; - extent_item_pos = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; @@ -941,7 +952,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, - rcu_str_deref(dev->name), + btrfs_dev_name(dev), swarn.physical, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, @@ -949,12 +960,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) } while (ret != 1); btrfs_release_path(path); } else { + struct btrfs_backref_walk_ctx ctx = { 0 }; + btrfs_release_path(path); + + ctx.bytenr = found_key.objectid; + ctx.extent_item_pos = swarn.logical - found_key.objectid; + ctx.fs_info = fs_info; + swarn.path = path; swarn.dev = dev; - iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, - scrub_print_warning_inode, &swarn, false); + + iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); } out: @@ -1358,7 +1375,7 @@ corrected_error: spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "fixed up error at logical %llu on dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } } else { did_not_correct_error: @@ -1367,7 +1384,7 @@ did_not_correct_error: spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "unable to fixup (regular) error at logical %llu on dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } out: @@ -1480,7 +1497,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, return -EIO; } - recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); + recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL); if (!recover) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -1503,7 +1520,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, sblock = sblocks_for_recheck[mirror_index]; sblock->sctx = sctx; - sector = alloc_scrub_sector(sblock, logical, GFP_NOFS); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2313,14 +2330,14 @@ static void scrub_missing_raid56_worker(struct work_struct *work) spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "IO error rebuilding logical %llu for dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } else if (sblock->header_error || sblock->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "failed to rebuild valid logical %llu for dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } else { scrub_write_block_to_dev_replace(sblock); } @@ -2425,7 +2442,7 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, */ u32 l = min(sectorsize, len); - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2756,7 +2773,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity, for (index = 0; len > 0; index++) { struct scrub_sector *sector; - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -3221,9 +3238,9 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, extent_dev = bioc->stripes[0].dev; btrfs_put_bioc(bioc); - ret = btrfs_lookup_csums_range(csum_root, extent_start, - extent_start + extent_size - 1, - &sctx->csum_list, 1, false); + ret = btrfs_lookup_csums_list(csum_root, extent_start, + extent_start + extent_size - 1, + &sctx->csum_list, 1, false); if (ret) { scrub_parity_mark_sectors_error(sparity, extent_start, extent_size); @@ -3447,7 +3464,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, cur_logical; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_range(csum_root, cur_logical, + ret = btrfs_lookup_csums_list(csum_root, cur_logical, cur_logical + scrub_len - 1, &sctx->csum_list, 1, false); if (ret) @@ -4284,7 +4301,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub on devid %llu: filesystem on %s is not writable", - devid, rcu_str_deref(dev->name)); + devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; } diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h new file mode 100644 index 000000000000..7639103ebf9d --- /dev/null +++ b/fs/btrfs/scrub.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SCRUB_H +#define BTRFS_SCRUB_H + +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace); +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_device *dev); +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, + struct btrfs_scrub_progress *progress); + +#endif diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1c4b693ee4a3..67f7c698ade3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -27,6 +27,11 @@ #include "compression.h" #include "xattr.h" #include "print-tree.h" +#include "accessors.h" +#include "dir-item.h" +#include "file-item.h" +#include "ioctl.h" +#include "verity.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -34,7 +39,7 @@ * avoid hitting limitations of the backreference walking code (taking a lot of * time and using too much memory for extents with large number of references). */ -#define SEND_MAX_EXTENT_REFS 64 +#define SEND_MAX_EXTENT_REFS 1024 /* * A fs_path is a helper to dynamically build path names with unknown size. @@ -71,13 +76,46 @@ struct clone_root { struct btrfs_root *root; u64 ino; u64 offset; - - u64 found_refs; + u64 num_bytes; + bool found_ref; }; #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +/* + * Limit the root_ids array of struct backref_cache_entry to 12 elements. + * This makes the size of a cache entry to be exactly 128 bytes on x86_64. + * The most common case is to have a single root for cloning, which corresponds + * to the send root. Having the user specify more than 11 clone roots is not + * common, and in such rare cases we simply don't use caching if the number of + * cloning roots that lead down to a leaf is more than 12. + */ +#define SEND_MAX_BACKREF_CACHE_ROOTS 12 + +/* + * Max number of entries in the cache. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding + * maple tree's internal nodes, is 16K. + */ +#define SEND_MAX_BACKREF_CACHE_SIZE 128 + +/* + * A backref cache entry maps a leaf to a list of IDs of roots from which the + * leaf is accessible and we can use for clone operations. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on + * x86_64). + */ +struct backref_cache_entry { + /* List to link to the cache's lru list. */ + struct list_head list; + /* The key for this entry in the cache. */ + u64 key; + u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; + /* Number of valid elements in the root_ids array. */ + int num_roots; +}; + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -246,6 +284,14 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; + + struct { + u64 last_reloc_trans; + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. */ + int size; + } backref_cache; }; struct pending_dir_move { @@ -1093,7 +1139,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, data_len = btrfs_dir_data_len(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) { + if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { if (name_len > XATTR_NAME_MAX) { ret = -ENAMETOOLONG; goto out; @@ -1236,8 +1282,12 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; - /* Just to check for bugs in backref resolving */ - int found_itself; + /* The bytenr the file extent item we are processing refers to. */ + u64 bytenr; + /* The owner (root id) of the data backref for the current extent. */ + u64 backref_owner; + /* The offset of the data backref for the current extent. */ + u64 backref_offset; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1266,32 +1316,33 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) /* * Called for every backref that is found for the current extent. - * Results are collected in sctx->clone_roots->ino/offset/found_refs + * Results are collected in sctx->clone_roots->ino/offset. */ -static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) +static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, + void *ctx_) { struct backref_ctx *bctx = ctx_; - struct clone_root *found; + struct clone_root *clone_root; /* First check if the root is in the list of accepted clone sources */ - found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, - bctx->sctx->clone_roots_cnt, - sizeof(struct clone_root), - __clone_root_cmp_bsearch); - if (!found) + clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots, + bctx->sctx->clone_roots_cnt, + sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!clone_root) return 0; - if (found->root == bctx->sctx->send_root && + /* This is our own reference, bail out as we can't clone from it. */ + if (clone_root->root == bctx->sctx->send_root && ino == bctx->cur_objectid && - offset == bctx->cur_offset) { - bctx->found_itself = 1; - } + offset == bctx->cur_offset) + return 0; /* * Make sure we don't consider clones from send_root that are * behind the current inode/offset. */ - if (found->root == bctx->sctx->send_root) { + if (clone_root->root == bctx->sctx->send_root) { /* * If the source inode was not yet processed we can't issue a * clone operation, as the source extent does not exist yet at @@ -1312,21 +1363,217 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) } bctx->found++; - found->found_refs++; - if (ino < found->ino) { - found->ino = ino; - found->offset = offset; - } else if (found->ino == ino) { + clone_root->found_ref = true; + + /* + * If the given backref refers to a file extent item with a larger + * number of bytes than what we found before, use the new one so that + * we clone more optimally and end up doing less writes and getting + * less exclusive, non-shared extents at the destination. + */ + if (num_bytes > clone_root->num_bytes) { + clone_root->ino = ino; + clone_root->offset = offset; + clone_root->num_bytes = num_bytes; + + /* + * Found a perfect candidate, so there's no need to continue + * backref walking. + */ + if (num_bytes >= bctx->extent_len) + return BTRFS_ITERATE_EXTENT_INODES_STOP; + } + + return 0; +} + +static void empty_backref_cache(struct send_ctx *sctx) +{ + struct backref_cache_entry *entry; + struct backref_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) + kfree(entry); + + INIT_LIST_HEAD(&sctx->backref_cache.lru_list); + mtree_destroy(&sctx->backref_cache.entries); + sctx->backref_cache.size = 0; +} + +static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + const u64 **root_ids_ret, int *root_count_ret) +{ + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct backref_cache_entry *entry; + + if (sctx->backref_cache.size == 0) + return false; + + /* + * If relocation happened since we first filled the cache, then we must + * empty the cache and can not use it, because even though we operate on + * read-only roots, their leaves and nodes may have been reallocated and + * now be used for different nodes/leaves of the same tree or some other + * tree. + * + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { + empty_backref_cache(sctx); + return false; + } + + entry = mtree_load(&sctx->backref_cache.entries, key); + if (!entry) + return false; + + *root_ids_ret = entry->root_ids; + *root_count_ret = entry->num_roots; + list_move_tail(&entry->list, &sctx->backref_cache.lru_list); + + return true; +} + +static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + void *ctx) +{ + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + struct backref_cache_entry *new_entry; + struct ulist_iterator uiter; + struct ulist_node *node; + int ret; + + /* + * We're called while holding a transaction handle or while holding + * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a + * NOFS allocation. + */ + new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS); + /* No worries, cache is optional. */ + if (!new_entry) + return; + + new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->num_roots = 0; + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(root_ids, &uiter)) != NULL) { + const u64 root_id = node->val; + struct clone_root *root; + + root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots, + sctx->clone_roots_cnt, sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!root) + continue; + + /* Too many roots, just exit, no worries as caching is optional. */ + if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) { + kfree(new_entry); + return; + } + + new_entry->root_ids[new_entry->num_roots] = root_id; + new_entry->num_roots++; + } + + /* + * We may have not added any roots to the new cache entry, which means + * none of the roots is part of the list of roots from which we are + * allowed to clone. Cache the new entry as it's still useful to avoid + * backref walking to determine which roots have a path to the leaf. + */ + + if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { + struct backref_cache_entry *lru_entry; + struct backref_cache_entry *mt_entry; + + lru_entry = list_first_entry(&sctx->backref_cache.lru_list, + struct backref_cache_entry, list); + mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); + ASSERT(mt_entry == lru_entry); + list_del(&mt_entry->list); + kfree(mt_entry); + sctx->backref_cache.size--; + } + + ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, + new_entry, GFP_NOFS); + ASSERT(ret == 0 || ret == -ENOMEM); + if (ret) { + /* Caching is optional, no worries. */ + kfree(new_entry); + return; + } + + list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); + + /* + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (sctx->backref_cache.size == 0) + sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; + + sctx->backref_cache.size++; +} + +static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *ctx) +{ + const u64 refs = btrfs_extent_refs(leaf, ei); + const struct backref_ctx *bctx = ctx; + const struct send_ctx *sctx = bctx->sctx; + + if (bytenr == bctx->bytenr) { + const u64 flags = btrfs_extent_flags(leaf, ei); + + if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return -EUCLEAN; + /* - * same extent found more then once in the same file. + * If we have only one reference and only the send root as a + * clone source - meaning no clone roots were given in the + * struct btrfs_ioctl_send_args passed to the send ioctl - then + * it's our reference and there's no point in doing backref + * walking which is expensive, so exit early. */ - if (found->offset > offset + bctx->extent_len) - found->offset = offset; + if (refs == 1 && sctx->clone_roots_cnt == 1) + return -ENOENT; } + /* + * Backreference walking (iterate_extent_inodes() below) is currently + * too expensive when an extent has a large number of references, both + * in time spent and used memory. So for now just fallback to write + * operations instead of clone operations when an extent has more than + * a certain amount of references. + */ + if (refs > SEND_MAX_EXTENT_REFS) + return -ENOENT; + return 0; } +static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx) +{ + const struct backref_ctx *bctx = ctx; + + if (ino == bctx->cur_objectid && + root == bctx->backref_owner && + offset == bctx->backref_offset) + return true; + + return false; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes @@ -1348,79 +1595,36 @@ static int find_extent_clone(struct send_ctx *sctx, u64 logical; u64 disk_byte; u64 num_bytes; - u64 extent_item_pos; - u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; - struct backref_ctx backref_ctx = {0}; + struct backref_ctx backref_ctx = { 0 }; + struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct clone_root *cur_clone_root; - struct btrfs_key found_key; - struct btrfs_path *tmp_path; - struct btrfs_extent_item *ei; int compressed; u32 i; - tmp_path = alloc_path_for_send(); - if (!tmp_path) - return -ENOMEM; + /* + * With fallocate we can get prealloc extents beyond the inode's i_size, + * so we don't do anything here because clone operations can not clone + * to a range beyond i_size without increasing the i_size of the + * destination inode. + */ + if (data_offset >= ino_size) + return 0; - /* We only use this path under the commit sem */ - tmp_path->need_commit_sem = 0; + fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + return -ENOENT; - if (data_offset >= ino_size) { - /* - * There may be extents that lie behind the file's size. - * I at least had this in combination with snapshotting while - * writing large files. - */ - ret = 0; - goto out; - } + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == 0) + return -ENOENT; - fi = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -ENOENT; - goto out; - } compressed = btrfs_file_extent_compression(eb, fi); - num_bytes = btrfs_file_extent_num_bytes(eb, fi); - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == 0) { - ret = -ENOENT; - goto out; - } logical = disk_byte + btrfs_file_extent_offset(eb, fi); - down_read(&fs_info->commit_root_sem); - ret = extent_from_logical(fs_info, disk_byte, tmp_path, - &found_key, &flags); - up_read(&fs_info->commit_root_sem); - - if (ret < 0) - goto out; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = -EIO; - goto out; - } - - ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], - struct btrfs_extent_item); - /* - * Backreference walking (iterate_extent_inodes() below) is currently - * too expensive when an extent has a large number of references, both - * in time spent and used memory. So for now just fallback to write - * operations instead of clone operations when an extent has more than - * a certain amount of references. - */ - if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) { - ret = -ENOENT; - goto out; - } - btrfs_release_path(tmp_path); - /* * Setup the clone roots. */ @@ -1428,37 +1632,59 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root = sctx->clone_roots + i; cur_clone_root->ino = (u64)-1; cur_clone_root->offset = 0; - cur_clone_root->found_refs = 0; + cur_clone_root->num_bytes = 0; + cur_clone_root->found_ref = false; } backref_ctx.sctx = sctx; - backref_ctx.found = 0; backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; - backref_ctx.found_itself = 0; - backref_ctx.extent_len = num_bytes; + backref_ctx.bytenr = disk_byte; + /* + * Use the header owner and not the send root's id, because in case of a + * snapshot we can have shared subtrees. + */ + backref_ctx.backref_owner = btrfs_header_owner(eb); + backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. * We need to adjust extent_len in this case so that the checks in - * __iterate_backrefs work. + * iterate_backrefs() work. */ if (data_offset + num_bytes >= ino_size) backref_ctx.extent_len = ino_size - data_offset; + else + backref_ctx.extent_len = num_bytes; /* * Now collect all backrefs. */ + backref_walk_ctx.bytenr = disk_byte; if (compressed == BTRFS_COMPRESS_NONE) - extent_item_pos = logical - found_key.objectid; - else - extent_item_pos = 0; - ret = iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, __iterate_backrefs, - &backref_ctx, false); + backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi); + backref_walk_ctx.fs_info = fs_info; + backref_walk_ctx.cache_lookup = lookup_backref_cache; + backref_walk_ctx.cache_store = store_backref_cache; + backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; + backref_walk_ctx.check_extent_item = check_extent_item; + backref_walk_ctx.user_ctx = &backref_ctx; + + /* + * If have a single clone root, then it's the send root and we can tell + * the backref walking code to skip our own backref and not resolve it, + * since we can not use it for cloning - the source and destination + * ranges can't overlap and in case the leaf is shared through a subtree + * due to snapshots, we can't use those other roots since they are not + * in the list of clone roots. + */ + if (sctx->clone_roots_cnt == 1) + backref_walk_ctx.skip_data_ref = skip_self_data_ref; + ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, + &backref_ctx); if (ret < 0) - goto out; + return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { @@ -1475,37 +1701,42 @@ static int find_extent_clone(struct send_ctx *sctx, * was already reallocated after the relocation. */ up_read(&fs_info->commit_root_sem); - ret = -ENOENT; - goto out; + return -ENOENT; } up_read(&fs_info->commit_root_sem); - if (!backref_ctx.found_itself) { - /* found a bug in backref code? */ - ret = -EIO; - btrfs_err(fs_info, - "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu", - ino, data_offset, disk_byte, found_key.objectid); - goto out; - } - btrfs_debug(fs_info, "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", data_offset, ino, num_bytes, logical); - if (!backref_ctx.found) + if (!backref_ctx.found) { btrfs_debug(fs_info, "no clones found"); + return -ENOENT; + } cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { - if (sctx->clone_roots[i].found_refs) { - if (!cur_clone_root) - cur_clone_root = sctx->clone_roots + i; - else if (sctx->clone_roots[i].root == sctx->send_root) - /* prefer clones from send_root over others */ - cur_clone_root = sctx->clone_roots + i; - } + struct clone_root *clone_root = &sctx->clone_roots[i]; + + if (!clone_root->found_ref) + continue; + /* + * Choose the root from which we can clone more bytes, to + * minimize write operations and therefore have more extent + * sharing at the destination (the same as in the source). + */ + if (!cur_clone_root || + clone_root->num_bytes > cur_clone_root->num_bytes) { + cur_clone_root = clone_root; + + /* + * We found an optimal clone candidate (any inode from + * any root is fine), so we're done. + */ + if (clone_root->num_bytes >= backref_ctx.extent_len) + break; + } } if (cur_clone_root) { @@ -1515,8 +1746,6 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -ENOENT; } -out: - btrfs_free_path(tmp_path); return ret; } @@ -1596,13 +1825,17 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { + struct fscrypt_str tmp_name; + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); ASSERT(len < sizeof(tmp)); + tmp_name.name = tmp; + tmp_name.len = strlen(tmp); di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, - tmp, strlen(tmp), 0); + &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -1622,7 +1855,7 @@ static int gen_unique_name(struct send_ctx *sctx, di = btrfs_lookup_dir_item(NULL, sctx->parent_root, path, BTRFS_FIRST_FREE_OBJECTID, - tmp, strlen(tmp), 0); + &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -1752,13 +1985,13 @@ static int lookup_dir_item_inode(struct btrfs_root *root, struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_path *path; + struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); path = alloc_path_for_send(); if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(NULL, root, path, - dir, name, name_len, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -7863,6 +8096,9 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); + INIT_LIST_HEAD(&sctx->backref_cache.lru_list); + mt_init(&sctx->backref_cache.entries); + sctx->flags = arg->flags; if (arg->flags & BTRFS_SEND_FLAG_VERSION) { @@ -7901,7 +8137,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (sctx->proto >= 2) { u32 send_buf_num_pages; - sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2; sctx->send_buf = vmalloc(sctx->send_max_size); if (!sctx->send_buf) { ret = -ENOMEM; @@ -8125,6 +8361,8 @@ out: close_current_inode(sctx); + empty_backref_cache(sctx); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index f7585cfa7e52..4f5509cb1803 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -18,10 +18,12 @@ #endif /* - * In send stream v1, no command is larger than 64K. In send stream v2, no limit - * should be assumed. + * In send stream v1, no command is larger than 64K. In send stream v2, no + * limit should be assumed, the buffer size is set to be a header with + * compressed extent size. */ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K +#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) struct inode; struct btrfs_ioctl_send_args; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f171bf875633..d28ee4e36f3d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -10,6 +10,9 @@ #include "transaction.h" #include "block-group.h" #include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" /* * HOW DOES SPACE RESERVATION WORK @@ -856,7 +859,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, u64 thresh; u64 used; - thresh = div_factor_fine(total, 90); + thresh = mult_perc(total, 90); lockdep_assert_held(&space_info->lock); @@ -974,7 +977,7 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, return false; spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, 1); + min_bytes = mult_perc(global_rsv->size, 10); if (global_rsv->reserved < min_bytes + ticket->bytes) { spin_unlock(&global_rsv->lock); return false; @@ -1490,8 +1493,8 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -/** - * Do the appropriate flushing and waiting for a ticket +/* + * Do the appropriate flushing and waiting for a ticket. * * @fs_info: the filesystem * @space_info: space info for the reservation @@ -1583,8 +1586,18 @@ static inline bool can_steal(enum btrfs_reserve_flush_enum flush) flush == BTRFS_RESERVE_FLUSH_EVICT); } -/** - * Try to reserve bytes from the block_rsv's space +/* + * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to + * fail as quickly as possible. + */ +static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) +{ + return (flush != BTRFS_RESERVE_NO_FLUSH && + flush != BTRFS_RESERVE_FLUSH_EMERGENCY); +} + +/* + * Try to reserve bytes from the block_rsv's space. * * @fs_info: the filesystem * @space_info: space info we want to allocate from @@ -1645,13 +1658,28 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, } /* + * Things are dire, we need to make a reservation so we don't abort. We + * will let this reservation go through as long as we have actual space + * left to allocate for the block. + */ + if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { + used = btrfs_space_info_used(space_info, false); + if (used + orig_bytes <= + writable_total_bytes(fs_info, space_info)) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + ret = 0; + } + } + + /* * If we couldn't make a reservation then setup our reservation ticket * and kick the async worker if it's not already running. * * If we are a priority flusher then we just need to add our ticket to * the list and we will do our own flushing further down. */ - if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + if (ret && can_ticket(flush)) { ticket.bytes = orig_bytes; ticket.error = 0; space_info->reclaim_size += ticket.bytes; @@ -1701,15 +1729,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, } } spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) + if (!ret || !can_ticket(flush)) return ret; return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, orig_bytes, flush); } -/** - * Trye to reserve metadata bytes from the block_rsv's space +/* + * Try to reserve metadata bytes from the block_rsv's space. * * @fs_info: the filesystem * @block_rsv: block_rsv we're allocating for @@ -1743,8 +1771,8 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, return ret; } -/** - * Try to reserve data bytes for an allocation +/* + * Try to reserve data bytes for an allocation. * * @fs_info: the filesystem * @bytes: number of bytes we need @@ -1787,3 +1815,37 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) } dump_global_block_rsv(fs_info); } + +/* + * Account the unused space of all the readonly block group in the space_info. + * takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + struct btrfs_block_group *block_group; + u64 free_bytes = 0; + int factor; + + /* It's df, we don't care if it's racy */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + factor = btrfs_bg_type_to_factor(block_group->flags); + free_bytes += (block_group->length - + block_group->used) * factor; + + spin_unlock(&block_group->lock); + } + spin_unlock(&sinfo->lock); + + return free_bytes; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index ce66023a9eb8..fc99ea2b0c34 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -5,6 +5,83 @@ #include "volumes.h" +/* + * Different levels for to flush space when doing space reservations. + * + * The higher the level, the more methods we try to reclaim space. + */ +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + + /* + * Flush space by: + * - Running delayed inode items + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_LIMIT, + + /* + * Flush space by: + * - Running delayed inode items + * - Running delayed refs + * - Running delalloc and waiting for ordered extents + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_EVICT, + + /* + * Flush space by above mentioned methods and by: + * - Running delayed iputs + * - Committing transaction + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, + BTRFS_RESERVE_FLUSH_ALL, + + /* + * Pretty much the same as FLUSH_ALL, but can also steal space from + * global rsv. + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_ALL_STEAL, + + /* + * This is for btrfs_use_block_rsv only. We have exhausted our block + * rsv and our global block rsv. This can happen for things like + * delalloc where we are overwriting a lot of extents with a single + * extent and didn't reserve enough space. Alternatively it can happen + * with delalloc where we reserve 1 extents worth for a large extent but + * fragmentation leads to multiple extents being created. This will + * give us the reservation in the case of + * + * if (num_bytes < (space_info->total_bytes - + * btrfs_space_info_used(space_info, false)) + * + * Which ignores bytes_may_use. This is potentially dangerous, but our + * reservation system is generally pessimistic so is able to absorb this + * style of mistake. + */ + BTRFS_RESERVE_FLUSH_EMERGENCY, +}; + +enum btrfs_flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + FLUSH_DELALLOC_FULL = 7, + ALLOC_CHUNK = 8, + ALLOC_CHUNK_FORCE = 9, + RUN_DELAYED_IPUTS = 10, + COMMIT_TRANS = 11, +}; + struct btrfs_space_info { spinlock_t lock; @@ -159,5 +236,6 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 9a176af847d7..dd46b978ac2c 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> +#include "messages.h" #include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5942b9384088..93f52ee85f6f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,6 +26,7 @@ #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/btrfs.h> +#include "messages.h" #include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" @@ -34,7 +35,7 @@ #include "print-tree.h" #include "props.h" #include "xattr.h" -#include "volumes.h" +#include "bio.h" #include "export.h" #include "compression.h" #include "rcu-string.h" @@ -49,6 +50,14 @@ #include "discard.h" #include "qgroup.h" #include "raid56.h" +#include "fs.h" +#include "accessors.h" +#include "defrag.h" +#include "dir-item.h" +#include "ioctl.h" +#include "scrub.h" +#include "verity.h" +#include "super.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -67,328 +76,6 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); -#ifdef CONFIG_PRINTK - -#define STATE_STRING_PREFACE ": state " -#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) - -/* - * Characters to print to indicate error conditions or uncommon filesystem state. - * RO is not an error. - */ -static const char fs_state_chars[] = { - [BTRFS_FS_STATE_ERROR] = 'E', - [BTRFS_FS_STATE_REMOUNTING] = 'M', - [BTRFS_FS_STATE_RO] = 0, - [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', - [BTRFS_FS_STATE_DEV_REPLACING] = 'R', - [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, - [BTRFS_FS_STATE_NO_CSUMS] = 'C', - [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', -}; - -static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) -{ - unsigned int bit; - bool states_printed = false; - unsigned long fs_state = READ_ONCE(info->fs_state); - char *curr = buf; - - memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); - curr += sizeof(STATE_STRING_PREFACE) - 1; - - for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { - WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); - if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { - *curr++ = fs_state_chars[bit]; - states_printed = true; - } - } - - /* If no states were printed, reset the buffer */ - if (!states_printed) - curr = buf; - - *curr++ = 0; -} -#endif - -/* - * Generally the error codes correspond to their respective errors, but there - * are a few special cases. - * - * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for - * instance will return EUCLEAN if any of the blocks are corrupted in - * a way that is problematic. We want to reserve EUCLEAN for these - * sort of corruptions. - * - * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we - * need to use EROFS for this case. We will have no idea of the - * original failure, that will have been reported at the time we tripped - * over the error. Each subsequent error that doesn't have any context - * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. - */ -const char * __attribute_const__ btrfs_decode_error(int errno) -{ - char *errstr = "unknown"; - - switch (errno) { - case -ENOENT: /* -2 */ - errstr = "No such entry"; - break; - case -EIO: /* -5 */ - errstr = "IO failure"; - break; - case -ENOMEM: /* -12*/ - errstr = "Out of memory"; - break; - case -EEXIST: /* -17 */ - errstr = "Object already exists"; - break; - case -ENOSPC: /* -28 */ - errstr = "No space left"; - break; - case -EROFS: /* -30 */ - errstr = "Readonly filesystem"; - break; - case -EOPNOTSUPP: /* -95 */ - errstr = "Operation not supported"; - break; - case -EUCLEAN: /* -117 */ - errstr = "Filesystem corrupted"; - break; - case -EDQUOT: /* -122 */ - errstr = "Quota exceeded"; - break; - } - - return errstr; -} - -/* - * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the appropriate error response. - */ -__cold -void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; -#ifdef CONFIG_PRINTK - char statestr[STATE_STRING_BUF_LEN]; - const char *errstr; -#endif - - /* - * Special case: if the error is EROFS, and we're already - * under SB_RDONLY, then it is safe here. - */ - if (errno == -EROFS && sb_rdonly(sb)) - return; - -#ifdef CONFIG_PRINTK - errstr = btrfs_decode_error(errno); - btrfs_state_to_string(fs_info, statestr); - if (fmt) { - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - - pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, statestr, function, line, errno, errstr, &vaf); - va_end(args); - } else { - pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", - sb->s_id, statestr, function, line, errno, errstr); - } -#endif - - /* - * Today we only save the error info to memory. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); - - /* Don't go through full error handling during mount */ - if (!(sb->s_flags & SB_BORN)) - return; - - if (sb_rdonly(sb)) - return; - - btrfs_discard_stop(fs_info); - - /* btrfs handle error by forcing the filesystem readonly */ - btrfs_set_sb_rdonly(sb); - btrfs_info(fs_info, "forced readonly"); - /* - * Note that a running device replace operation is not canceled here - * although there is no way to update the progress. It would add the - * risk of a deadlock, therefore the canceling is omitted. The only - * penalty is that some I/O remains active until the procedure - * completes. The next time when the filesystem is mounted writable - * again, the device replace operation continues. - */ -} - -#ifdef CONFIG_PRINTK -static const char * const logtypes[] = { - "emergency", - "alert", - "critical", - "error", - "warning", - "notice", - "info", - "debug", -}; - - -/* - * Use one ratelimit state per log level so that a flood of less important - * messages doesn't cause more important ones to be dropped. - */ -static struct ratelimit_state printk_limits[] = { - RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), -}; - -void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ - char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; - struct va_format vaf; - va_list args; - int kern_level; - const char *type = logtypes[4]; - struct ratelimit_state *ratelimit = &printk_limits[4]; - - va_start(args, fmt); - - while ((kern_level = printk_get_level(fmt)) != 0) { - size_t size = printk_skip_level(fmt) - fmt; - - if (kern_level >= '0' && kern_level <= '7') { - memcpy(lvl, fmt, size); - lvl[size] = '\0'; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } - fmt += size; - } - - vaf.fmt = fmt; - vaf.va = &args; - - if (__ratelimit(ratelimit)) { - if (fs_info) { - char statestr[STATE_STRING_BUF_LEN]; - - btrfs_state_to_string(fs_info, statestr); - _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, - fs_info->sb->s_id, statestr, &vaf); - } else { - _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); - } - } - - va_end(args); -} -#endif - -#if BITS_PER_LONG == 32 -void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) -{ - if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { - btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); - btrfs_warn(fs_info, -"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", - BTRFS_32BIT_MAX_FILE_SIZE >> 40); - btrfs_warn(fs_info, - "please consider upgrading to 64bit kernel/hardware"); - } -} - -void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) -{ - if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { - btrfs_err(fs_info, "reached 32bit limit for logical addresses"); - btrfs_err(fs_info, -"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", - BTRFS_32BIT_MAX_FILE_SIZE >> 40); - btrfs_err(fs_info, - "please consider upgrading to 64bit kernel/hardware"); - } -} -#endif - -/* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} -/* - * __btrfs_panic decodes unexpected, fatal errors from the caller, - * issues an alert, and either panics or BUGs, depending on mount options. - */ -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - char *s_id = "<unknown>"; - const char *errstr; - struct va_format vaf = { .fmt = fmt }; - va_list args; - - if (fs_info) - s_id = fs_info->sb->s_id; - - va_start(args, fmt); - vaf.va = &args; - - errstr = btrfs_decode_error(errno); - if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) - panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); - - btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", - function, line, &vaf, errno, errstr); - va_end(args); - /* Caller calls BUG() */ -} - static void btrfs_put_super(struct super_block *sb) { close_ctree(btrfs_sb(sb)); @@ -918,12 +605,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, ret = -EINVAL; goto out; } + btrfs_clear_opt(info->mount_opt, NODISCARD); break; case Opt_nodiscard: btrfs_clear_and_info(info, DISCARD_SYNC, "turning off discard"); btrfs_clear_and_info(info, DISCARD_ASYNC, "turning off async discard"); + btrfs_set_opt(info->mount_opt, NODISCARD); break; case Opt_space_cache: case Opt_space_cache_version: @@ -1394,6 +1083,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_key location; + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; path = btrfs_alloc_path(); @@ -1406,7 +1096,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec * to mount. */ dir_id = btrfs_super_root_dir(fs_info->super_copy); - di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { btrfs_free_path(path); return PTR_ERR(di); @@ -1507,7 +1197,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) * Exit unless we have some pending changes * that need to go through commit */ - if (fs_info->pending_changes == 0) + if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT, + &fs_info->flags)) return 0; /* * A non-blocking test if the fs is frozen. We must not @@ -2645,7 +2336,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * the end of RCU grace period. */ rcu_read_lock(); - seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\"); + seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\"); rcu_read_unlock(); return 0; @@ -2694,7 +2385,7 @@ static __cold void btrfs_interface_exit(void) misc_deregister(&btrfs_misc); } -static void __init btrfs_print_mod_info(void) +static int __init btrfs_print_mod_info(void) { static const char options[] = "" #ifdef CONFIG_BTRFS_DEBUG @@ -2721,122 +2412,125 @@ static void __init btrfs_print_mod_info(void) #endif ; pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); + return 0; } -static int __init init_btrfs_fs(void) +static int register_btrfs(void) { - int err; - - btrfs_props_init(); - - err = btrfs_init_sysfs(); - if (err) - return err; - - btrfs_init_compress(); - - err = btrfs_init_cachep(); - if (err) - goto free_compress; - - err = extent_state_init_cachep(); - if (err) - goto free_cachep; - - err = extent_buffer_init_cachep(); - if (err) - goto free_extent_cachep; - - err = btrfs_bioset_init(); - if (err) - goto free_eb_cachep; - - err = extent_map_init(); - if (err) - goto free_bioset; - - err = ordered_data_init(); - if (err) - goto free_extent_map; - - err = btrfs_delayed_inode_init(); - if (err) - goto free_ordered_data; - - err = btrfs_auto_defrag_init(); - if (err) - goto free_delayed_inode; - - err = btrfs_delayed_ref_init(); - if (err) - goto free_auto_defrag; - - err = btrfs_prelim_ref_init(); - if (err) - goto free_delayed_ref; - - err = btrfs_interface_init(); - if (err) - goto free_prelim_ref; + return register_filesystem(&btrfs_fs_type); +} - btrfs_print_mod_info(); +static void unregister_btrfs(void) +{ + unregister_filesystem(&btrfs_fs_type); +} - err = btrfs_run_sanity_tests(); - if (err) - goto unregister_ioctl; +/* Helper structure for long init/exit functions. */ +struct init_sequence { + int (*init_func)(void); + /* Can be NULL if the init_func doesn't need cleanup. */ + void (*exit_func)(void); +}; - err = register_filesystem(&btrfs_fs_type); - if (err) - goto unregister_ioctl; +static const struct init_sequence mod_init_seq[] = { + { + .init_func = btrfs_props_init, + .exit_func = NULL, + }, { + .init_func = btrfs_init_sysfs, + .exit_func = btrfs_exit_sysfs, + }, { + .init_func = btrfs_init_compress, + .exit_func = btrfs_exit_compress, + }, { + .init_func = btrfs_init_cachep, + .exit_func = btrfs_destroy_cachep, + }, { + .init_func = btrfs_transaction_init, + .exit_func = btrfs_transaction_exit, + }, { + .init_func = btrfs_ctree_init, + .exit_func = btrfs_ctree_exit, + }, { + .init_func = btrfs_free_space_init, + .exit_func = btrfs_free_space_exit, + }, { + .init_func = extent_state_init_cachep, + .exit_func = extent_state_free_cachep, + }, { + .init_func = extent_buffer_init_cachep, + .exit_func = extent_buffer_free_cachep, + }, { + .init_func = btrfs_bioset_init, + .exit_func = btrfs_bioset_exit, + }, { + .init_func = extent_map_init, + .exit_func = extent_map_exit, + }, { + .init_func = ordered_data_init, + .exit_func = ordered_data_exit, + }, { + .init_func = btrfs_delayed_inode_init, + .exit_func = btrfs_delayed_inode_exit, + }, { + .init_func = btrfs_auto_defrag_init, + .exit_func = btrfs_auto_defrag_exit, + }, { + .init_func = btrfs_delayed_ref_init, + .exit_func = btrfs_delayed_ref_exit, + }, { + .init_func = btrfs_prelim_ref_init, + .exit_func = btrfs_prelim_ref_exit, + }, { + .init_func = btrfs_interface_init, + .exit_func = btrfs_interface_exit, + }, { + .init_func = btrfs_print_mod_info, + .exit_func = NULL, + }, { + .init_func = btrfs_run_sanity_tests, + .exit_func = NULL, + }, { + .init_func = register_btrfs, + .exit_func = unregister_btrfs, + } +}; - return 0; +static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; -unregister_ioctl: - btrfs_interface_exit(); -free_prelim_ref: - btrfs_prelim_ref_exit(); -free_delayed_ref: - btrfs_delayed_ref_exit(); -free_auto_defrag: - btrfs_auto_defrag_exit(); -free_delayed_inode: - btrfs_delayed_inode_exit(); -free_ordered_data: - ordered_data_exit(); -free_extent_map: - extent_map_exit(); -free_bioset: - btrfs_bioset_exit(); -free_eb_cachep: - extent_buffer_free_cachep(); -free_extent_cachep: - extent_state_free_cachep(); -free_cachep: - btrfs_destroy_cachep(); -free_compress: - btrfs_exit_compress(); - btrfs_exit_sysfs(); +static __always_inline void btrfs_exit_btrfs_fs(void) +{ + int i; - return err; + for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { + if (!mod_init_result[i]) + continue; + if (mod_init_seq[i].exit_func) + mod_init_seq[i].exit_func(); + mod_init_result[i] = false; + } } static void __exit exit_btrfs_fs(void) { - btrfs_destroy_cachep(); - btrfs_delayed_ref_exit(); - btrfs_auto_defrag_exit(); - btrfs_delayed_inode_exit(); - btrfs_prelim_ref_exit(); - ordered_data_exit(); - extent_map_exit(); - btrfs_bioset_exit(); - extent_state_free_cachep(); - extent_buffer_free_cachep(); - btrfs_interface_exit(); - unregister_filesystem(&btrfs_fs_type); - btrfs_exit_sysfs(); - btrfs_cleanup_fs_uuids(); - btrfs_exit_compress(); + btrfs_exit_btrfs_fs(); +} + +static int __init init_btrfs_fs(void) +{ + int ret; + int i; + + for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { + ASSERT(!mod_init_result[i]); + ret = mod_init_seq[i].init_func(); + if (ret < 0) { + btrfs_exit_btrfs_fs(); + return ret; + } + mod_init_result[i] = true; + } + return 0; } late_initcall(init_btrfs_fs); diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h new file mode 100644 index 000000000000..8dbb909b364f --- /dev/null +++ b/fs/btrfs/super.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SUPER_H +#define BTRFS_SUPER_H + +int btrfs_parse_options(struct btrfs_fs_info *info, char *options, + unsigned long new_flags); +int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); + +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline void btrfs_set_sb_rdonly(struct super_block *sb) +{ + sb->s_flags |= SB_RDONLY; + set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +static inline void btrfs_clear_sb_rdonly(struct super_block *sb) +{ + sb->s_flags &= ~SB_RDONLY; + clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +#endif diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 74fef1f49c35..45615ce36498 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -10,7 +10,7 @@ #include <linux/completion.h> #include <linux/bug.h> #include <crypto/hash.h> - +#include "messages.h" #include "ctree.h" #include "discard.h" #include "disk-io.h" @@ -22,6 +22,8 @@ #include "block-group.h" #include "qgroup.h" #include "misc.h" +#include "fs.h" +#include "accessors.h" /* * Structure name Path @@ -248,7 +250,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, /* * We don't want to do full transaction commit from inside sysfs */ - btrfs_set_pending(fs_info, COMMIT); + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return count; @@ -762,7 +764,7 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj, val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); /* Limit stripe size to 10% of available space. */ - val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val); + val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val); /* Must be multiple of 256M. */ val &= ~((u64)SZ_256M - 1); @@ -959,7 +961,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, /* * We don't want to do full transaction commit from inside sysfs */ - btrfs_set_pending(fs_info, COMMIT); + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return len; @@ -1160,16 +1162,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (fs_devices->read_policy == i) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]", + ret += sysfs_emit_at(buf, ret, "%s[%s]", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); else - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + ret += sysfs_emit_at(buf, ret, "%s%s", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d43cb5242fec..181469fc0bb3 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -16,6 +16,7 @@ #include "../disk-io.h" #include "../qgroup.h" #include "../block-group.h" +#include "../fs.h" static struct vfsmount *test_mnt = NULL; @@ -101,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(NULL, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index b7d181a08eab..5ef0b90e25c3 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -8,6 +8,7 @@ #include "../ctree.h" #include "../extent_io.h" #include "../disk-io.h" +#include "../accessors.h" static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 350da449db08..dfc5c7fa6038 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -132,7 +132,7 @@ static int test_find_delalloc(u32 sectorsize) * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); /* * First go through and create and mark all of our pages dirty, we pin @@ -489,7 +489,7 @@ static int test_find_first_clear_extent_bit(void) test_msg("running find_first_clear_extent_bit test"); - extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); /* Test correct handling of empty tree */ find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 13734ed43bfc..b61972046feb 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -10,6 +10,7 @@ #include "../free-space-tree.h" #include "../transaction.h" #include "../block-group.h" +#include "../accessors.h" struct free_space_extent { u64 start; @@ -470,7 +471,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, } cache->bitmap_low_thresh = 0; cache->bitmap_high_thresh = (u32)-1; - cache->needs_free_space = 1; + set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); cache->fs_info = root->fs_info; btrfs_init_dummy_trans(&trans, root->fs_info); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 625f7d398368..05b03f5eab83 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -11,6 +11,7 @@ #include "../extent_io.h" #include "../volumes.h" #include "../compression.h" +#include "../accessors.h" static void insert_extent(struct btrfs_root *root, u64 start, u64 len, u64 ram_bytes, u64 offset, u64 disk_bytenr, @@ -72,8 +73,8 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] - * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split] + * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [inline][hole but no extent][ hole ][ regular ][regular1 split] * * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] @@ -90,19 +91,12 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) u64 disk_bytenr = SZ_1M; u64 offset = 0; - /* First we want a hole */ - insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, - slot); - slot++; - offset += 5; - /* - * Now we want an inline extent, I don't think this is possible but hey - * why not? Also keep in mind if we have an inline extent it counts as - * the whole first page. If we were to expand it we would have to cow - * and we wouldn't have an inline extent anymore. + * Tree-checker has strict limits on inline extents that they can only + * exist at file offset 0, thus we can only have one inline file extent + * at most. */ - insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, + insert_extent(root, offset, 6, 6, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, slot); slot++; offset = sectorsize; @@ -281,37 +275,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); - goto out; - } - if (em->start != 0 || em->len != 5) { - test_err( - "unexpected extent wanted start 0 len 5, got start %llu len %llu", - em->start, em->len); - goto out; - } - if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); - goto out; - } - offset = em->start + em->len; - free_extent_map(em); - - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); - if (IS_ERR(em)) { - test_err("got an error when we shouldn't have"); - goto out; - } if (em->block_start != EXTENT_MAP_INLINE) { test_err("expected an inline, got %llu", em->block_start); goto out; } - if (em->start != offset || em->len != (sectorsize - 5)) { + /* + * For inline extent, we always round up the em to sectorsize, as + * they are either: + * + * a) a hidden hole + * The range will be zeroed at inline extent read time. + * + * b) a file extent with unaligned bytenr + * Tree checker will reject it. + */ + if (em->start != 0 || em->len != sectorsize) { test_err( - "unexpected extent wanted start %llu len 1, got start %llu len %llu", - offset, em->start, em->len); + "unexpected extent wanted start 0 len %u, got start %llu len %llu", + sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 63676ea19f29..3fc8dc3fd980 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -10,6 +10,8 @@ #include "../disk-io.h" #include "../qgroup.h" #include "../backref.h" +#include "../fs.h" +#include "../accessors.h" static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) @@ -203,6 +205,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, static int test_no_shared_qgroup(struct btrfs_root *root, u32 sectorsize, u32 nodesize) { + struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; struct ulist *old_roots = NULL; @@ -218,16 +221,22 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + /* * Since the test trans doesn't have the complicated delayed refs, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); @@ -236,12 +245,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -260,11 +271,13 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = remove_extent_item(root, nodesize, nodesize); if (ret) { @@ -272,12 +285,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -302,6 +317,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, static int test_multiple_refs(struct btrfs_root *root, u32 sectorsize, u32 nodesize) { + struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; struct ulist *old_roots = NULL; @@ -322,11 +338,17 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); @@ -335,12 +357,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -355,11 +379,13 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = add_tree_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); @@ -368,12 +394,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -394,11 +422,13 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = remove_extent_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); @@ -407,12 +437,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d1f1da6820fb..b8c52e89688c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -6,6 +6,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/blkdev.h> @@ -23,6 +24,18 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "defrag.h" +#include "dir-item.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "relocation.h" +#include "scrub.h" + +static struct kmem_cache *btrfs_trans_handle_cachep; #define BTRFS_ROOT_TRANS_TAG 0 @@ -365,9 +378,9 @@ loop: spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES, NULL); + IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, - IO_TREE_FS_PINNED_EXTENTS, NULL); + IO_TREE_FS_PINNED_EXTENTS); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -936,7 +949,7 @@ static bool should_end_transaction(struct btrfs_trans_handle *trans) if (btrfs_check_space_for_delayed_refs(fs_info)) return true; - return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); + return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50); } bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) @@ -1607,10 +1620,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode; + struct inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; struct timespec64 cur_time; @@ -1619,6 +1631,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 index = 0; u64 objectid; u64 root_flags; + unsigned int nofs_flags; + struct fscrypt_name fname; ASSERT(pending->path); path = pending->path; @@ -1626,9 +1640,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ASSERT(pending->root_item); new_root_item = pending->root_item; + /* + * We're inside a transaction and must make sure that any potential + * allocations with GFP_KERNEL in fscrypt won't recurse back to + * filesystem. + */ + nofs_flags = memalloc_nofs_save(); + pending->error = fscrypt_setup_filename(parent_inode, + &pending->dentry->d_name, 0, + &fname); + memalloc_nofs_restore(nofs_flags); + if (pending->error) + goto free_pending; + pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) - goto no_free_objectid; + goto free_fname; /* * Make qgroup to skip current new snapshot's qgroupid, as it is @@ -1657,8 +1684,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - dentry = pending->dentry; - parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) @@ -1674,8 +1699,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - dentry->d_name.name, - dentry->d_name.len, 0); + &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1770,7 +1794,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - dentry->d_name.name, dentry->d_name.len); + &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1802,9 +1826,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, dentry->d_name.name, - dentry->d_name.len, BTRFS_I(parent_inode), - &key, BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, &fname.disk_name, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { @@ -1813,7 +1837,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + - dentry->d_name.len * 2); + fname.disk_name.len * 2); parent_inode->i_mtime = current_time(parent_inode); parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); @@ -1845,7 +1869,9 @@ dir_item_existed: trans->bytes_reserved = 0; clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); -no_free_objectid: +free_fname: + fscrypt_free_filename(&fname); +free_pending: kfree(new_root_item); pending->root_item = NULL; btrfs_free_path(path); @@ -2101,6 +2127,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ASSERT(refcount_read(&trans->use_count) == 1); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); + /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; @@ -2354,12 +2382,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto unlock_reloc; - /* - * Since the transaction is done, we can apply the pending changes - * before the next transaction. - */ - btrfs_apply_pending_changes(fs_info); - /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ @@ -2582,21 +2604,17 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 0 : 1; } -void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) +int __init btrfs_transaction_init(void) { - unsigned long prev; - unsigned long bit; - - prev = xchg(&fs_info->pending_changes, 0); - if (!prev) - return; - - bit = 1 << BTRFS_PENDING_COMMIT; - if (prev & bit) - btrfs_debug(fs_info, "pending commit done"); - prev &= ~bit; + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", + sizeof(struct btrfs_trans_handle), 0, + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + if (!btrfs_trans_handle_cachep) + return -ENOMEM; + return 0; +} - if (prev) - btrfs_warn(fs_info, - "unknown pending changes left 0x%lx, ignoring", prev); +void __cold btrfs_transaction_exit(void) +{ + kmem_cache_destroy(btrfs_trans_handle_cachep); } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 970ff316069d..97f6c39f59c8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -10,6 +10,7 @@ #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" +#include "misc.h" enum btrfs_trans_state { TRANS_STATE_RUNNING, @@ -98,14 +99,15 @@ struct btrfs_transaction { struct list_head releasing_ebs; }; -#define __TRANS_FREEZABLE (1U << 0) - -#define __TRANS_START (1U << 9) -#define __TRANS_ATTACH (1U << 10) -#define __TRANS_JOIN (1U << 11) -#define __TRANS_JOIN_NOLOCK (1U << 12) -#define __TRANS_DUMMY (1U << 13) -#define __TRANS_JOIN_NOSTART (1U << 14) +enum { + ENUM_BIT(__TRANS_FREEZABLE), + ENUM_BIT(__TRANS_START), + ENUM_BIT(__TRANS_ATTACH), + ENUM_BIT(__TRANS_JOIN), + ENUM_BIT(__TRANS_JOIN_NOLOCK), + ENUM_BIT(__TRANS_DUMMY), + ENUM_BIT(__TRANS_JOIN_NOSTART), +}; #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) #define TRANS_ATTACH (__TRANS_ATTACH) @@ -231,9 +233,11 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); -void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +int __init btrfs_transaction_init(void); +void __cold btrfs_transaction_exit(void); + #endif diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43f905ab0a18..baad1ed7e111 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -18,6 +18,7 @@ #include <linux/types.h> #include <linux/stddef.h> #include <linux/error-injection.h> +#include "messages.h" #include "ctree.h" #include "tree-checker.h" #include "disk-io.h" @@ -25,6 +26,9 @@ #include "volumes.h" #include "misc.h" #include "btrfs_inode.h" +#include "fs.h" +#include "accessors.h" +#include "file-item.h" /* * Error message should follow the following format: @@ -528,7 +532,7 @@ static int check_dir_item(struct extent_buffer *leaf, } /* dir type check */ - dir_type = btrfs_dir_type(leaf, di); + dir_type = btrfs_dir_ftype(leaf, di); if (unlikely(dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, "invalid dir item type, have %u expect [0, %u)", @@ -1780,10 +1784,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) /* Also check if the item pointer overlaps with btrfs item. */ if (unlikely(btrfs_item_ptr_offset(leaf, slot) < - btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) { + btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item))) { generic_err(leaf, slot, "slot overlaps with its data, item end %lu data start %lu", - btrfs_item_nr_offset(slot) + + btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item), btrfs_item_ptr_offset(leaf, slot)); return -EUCLEAN; diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index ece497e26558..bfb5efa4e01f 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -6,8 +6,39 @@ #ifndef BTRFS_TREE_CHECKER_H #define BTRFS_TREE_CHECKER_H -#include "ctree.h" -#include "extent_io.h" +#include <uapi/linux/btrfs_tree.h> + +struct extent_buffer; +struct btrfs_chunk; + +/* All the extra info needed to verify the parentness of a tree block. */ +struct btrfs_tree_parent_check { + /* + * The owner check against the tree block. + * + * Can be 0 to skip the owner check. + */ + u64 owner_root; + + /* + * Expected transid, can be 0 to skip the check, but such skip + * should only be utlized for backref walk related code. + */ + u64 transid; + + /* + * The expected first key. + * + * This check can be skipped if @has_first_key is false, such skip + * can happen for case where we don't have the parent node key, + * e.g. reading the tree root, doing backref walk. + */ + struct btrfs_key first_key; + bool has_first_key; + + /* The expected level. Should always be set. */ + u8 level; +}; /* * Comprehensive leaf checker. diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c deleted file mode 100644 index b6cf39f4e7e4..000000000000 --- a/fs/btrfs/tree-defrag.c +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Oracle. All rights reserved. - */ - -#include <linux/sched.h> -#include "ctree.h" -#include "disk-io.h" -#include "print-tree.h" -#include "transaction.h" -#include "locking.h" - -/* - * Defrag all the leaves in a given btree. - * Read all the leaves and try to get key order to - * better reflect disk order - */ - -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_path *path = NULL; - struct btrfs_key key; - int ret = 0; - int wret; - int level; - int next_key_ret = 0; - u64 last_ret = 0; - - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - goto out; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - level = btrfs_header_level(root->node); - - if (level == 0) - goto out; - - if (root->defrag_progress.objectid == 0) { - struct extent_buffer *root_node; - u32 nritems; - - root_node = btrfs_lock_root_node(root); - nritems = btrfs_header_nritems(root_node); - root->defrag_max.objectid = 0; - /* from above we know this is not a leaf */ - btrfs_node_key_to_cpu(root_node, &root->defrag_max, - nritems - 1); - btrfs_tree_unlock(root_node); - free_extent_buffer(root_node); - memset(&key, 0, sizeof(key)); - } else { - memcpy(&key, &root->defrag_progress, sizeof(key)); - } - - path->keep_locks = 1; - - ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); - if (ret < 0) - goto out; - if (ret > 0) { - ret = 0; - goto out; - } - btrfs_release_path(path); - /* - * We don't need a lock on a leaf. btrfs_realloc_node() will lock all - * leafs from path->nodes[1], so set lowest_level to 1 to avoid later - * a deadlock (attempting to write lock an already write locked leaf). - */ - path->lowest_level = 1; - wret = btrfs_search_slot(trans, root, &key, path, 0, 1); - - if (wret < 0) { - ret = wret; - goto out; - } - if (!path->nodes[1]) { - ret = 0; - goto out; - } - /* - * The node at level 1 must always be locked when our path has - * keep_locks set and lowest_level is 1, regardless of the value of - * path->slots[1]. - */ - BUG_ON(path->locks[1] == 0); - ret = btrfs_realloc_node(trans, root, - path->nodes[1], 0, - &last_ret, - &root->defrag_progress); - if (ret) { - WARN_ON(ret == -EAGAIN); - goto out; - } - /* - * Now that we reallocated the node we can find the next key. Note that - * btrfs_find_next_key() can release our path and do another search - * without COWing, this is because even with path->keep_locks = 1, - * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a - * node when path->slots[node_level - 1] does not point to the last - * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore - * we search for the next key after reallocating our node. - */ - path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, - BTRFS_OLDEST_GENERATION); - if (next_key_ret == 0) { - memcpy(&root->defrag_progress, &key, sizeof(key)); - ret = -EAGAIN; - } -out: - btrfs_free_path(path); - if (ret == -EAGAIN) { - if (root->defrag_max.objectid > root->defrag_progress.objectid) - goto done; - if (root->defrag_max.type > root->defrag_progress.type) - goto done; - if (root->defrag_max.offset > root->defrag_progress.offset) - goto done; - ret = 0; - } -done: - if (ret != -EAGAIN) - memset(&root->defrag_progress, 0, - sizeof(root->defrag_progress)); - - return ret; -} diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c3cf3dabe0b1..a3c43f0b1c95 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -21,6 +21,15 @@ #include "space-info.h" #include "zoned.h" #include "inode-item.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "root-tree.h" +#include "dir-item.h" +#include "file-item.h" +#include "file.h" +#include "orphan.h" +#include "tree-checker.h" #define MAX_CONFLICT_INODES 10 @@ -333,7 +342,12 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. */ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_extent_buffer(eb, gen, level, NULL); + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen + }; + + ret = btrfs_read_extent_buffer(eb, &check); if (ret) return ret; } @@ -351,11 +365,25 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } -static int do_overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) { int ret; u32 item_size; @@ -363,31 +391,24 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; - int overwrite_root = 0; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - overwrite_root = 1; + /* + * This is only used during log replay, so the root is always from a + * fs/subvolume tree. In case we ever need to support a log root, then + * we'll have to clone the leaf in the path, release the path and use + * the leaf before writing into the log tree. See the comments at + * copy_items() for more details. + */ + ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); - /* Our caller must have done a search for the key for us. */ - ASSERT(path->nodes[0] != NULL); - - /* - * And the slot must point to the exact key or the slot where the key - * should be at (the first item with a key greater than 'key') - */ - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { - struct btrfs_key found_key; - - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - ret = btrfs_comp_cpu_keys(&found_key, key); - ASSERT(ret >= 0); - } else { - ret = 1; - } + /* Look for the key in the destination tree. */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; if (ret == 0) { char *src_copy; @@ -532,8 +553,7 @@ insert: goto no_copy; } - if (overwrite_root && - S_ISDIR(btrfs_inode_mode(eb, src_item)) && + if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { save_old_i_size = 1; saved_i_size = btrfs_inode_size(path->nodes[0], @@ -565,34 +585,19 @@ no_copy: return 0; } -/* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. - * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). - * - * If the key is already in the destination tree the existing item is - * overwritten. If the existing item isn't big enough, it is extended. - * If it is too large, it is truncated. - * - * If the key isn't in the destination yet, a new item is inserted. - */ -static int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, + struct fscrypt_str *name) { - int ret; + char *buf; - /* Look for the key in the destination tree. */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; + buf = kmalloc(len, GFP_NOFS); + if (!buf) + return -ENOMEM; - return do_overwrite_item(trans, root, path, eb, slot, key); + read_extent_buffer(eb, buf, (unsigned long)start, len); + name->name = buf; + name->len = len; + return 0; } /* @@ -747,8 +752,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_qgroup_trace_extent(trans, btrfs_file_extent_disk_bytenr(eb, item), - btrfs_file_extent_disk_num_bytes(eb, item), - GFP_NOFS); + btrfs_file_extent_disk_num_bytes(eb, item)); if (ret < 0) goto out; @@ -799,7 +803,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, btrfs_file_extent_num_bytes(eb, item); } - ret = btrfs_lookup_csums_range(root->log_root, + ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, &ordered_sums, 0, false); if (ret) @@ -901,12 +905,11 @@ out: static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, - int name_len) + const struct fscrypt_str *name) { int ret; - ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); + ret = btrfs_unlink_inode(trans, dir, inode, name); if (ret) return ret; /* @@ -933,8 +936,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct inode *inode; - char *name; - int name_len; + struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; int ret; @@ -942,12 +944,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &location); - name_len = btrfs_dir_name_len(leaf, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) + ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); + if (ret) return -ENOMEM; - read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); btrfs_release_path(path); inode = read_one_inode(root, location.objectid); @@ -960,10 +960,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, - name_len); + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); out: - kfree(name); + kfree(name.name); iput(inode); return ret; } @@ -978,14 +977,14 @@ out: static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 objectid, u64 index, - const char *name, int name_len) + struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key location; int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, - index, name, name_len, 0); + index, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; @@ -998,7 +997,7 @@ static noinline int inode_in_dir(struct btrfs_root *root, } btrfs_release_path(path); - di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; @@ -1025,7 +1024,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - const char *name, int namelen) + const struct fscrypt_str *name) { struct btrfs_path *path; int ret; @@ -1045,12 +1044,10 @@ static noinline int backref_in_log(struct btrfs_root *log, if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, - name, namelen); + ref_objectid, name); else ret = !!btrfs_find_name_in_backref(path->nodes[0], - path->slots[0], - name, namelen); + path->slots[0], name); out: btrfs_free_path(path); return ret; @@ -1063,11 +1060,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, - u64 ref_index, char *name, int namelen) + u64 ref_index, struct fscrypt_str *name) { int ret; - char *victim_name; - int victim_name_len; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key search_key; @@ -1099,43 +1094,40 @@ again: ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { - victim_ref = (struct btrfs_inode_ref *)ptr; - victim_name_len = btrfs_inode_ref_name_len(leaf, - victim_ref); - victim_name = kmalloc(victim_name_len, GFP_NOFS); - if (!victim_name) - return -ENOMEM; + struct fscrypt_str victim_name; - read_extent_buffer(leaf, victim_name, - (unsigned long)(victim_ref + 1), - victim_name_len); + victim_ref = (struct btrfs_inode_ref *)ptr; + ret = read_alloc_one_name(leaf, (victim_ref + 1), + btrfs_inode_ref_name_len(leaf, victim_ref), + &victim_name); + if (ret) + return ret; ret = backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len); + parent_objectid, &victim_name); if (ret < 0) { - kfree(victim_name); + kfree(victim_name.name); return ret; } else if (!ret) { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, dir, inode, - victim_name, victim_name_len); - kfree(victim_name); + &victim_name); + kfree(victim_name.name); if (ret) return ret; goto again; } - kfree(victim_name); + kfree(victim_name.name); - ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + ptr = (unsigned long)(victim_ref + 1) + victim_name.len; } } btrfs_release_path(path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, + extref = btrfs_lookup_inode_extref(NULL, root, path, name, inode_objectid, parent_objectid, 0, 0); if (IS_ERR(extref)) { @@ -1152,29 +1144,28 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - extref = (struct btrfs_inode_extref *)(base + cur_offset); + struct fscrypt_str victim_name; - victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + extref = (struct btrfs_inode_extref *)(base + cur_offset); if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) goto next; - victim_name = kmalloc(victim_name_len, GFP_NOFS); - if (!victim_name) - return -ENOMEM; - read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, - victim_name_len); + ret = read_alloc_one_name(leaf, &extref->name, + btrfs_inode_extref_name_len(leaf, extref), + &victim_name); + if (ret) + return ret; search_key.objectid = inode_objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; search_key.offset = btrfs_extref_hash(parent_objectid, - victim_name, - victim_name_len); + victim_name.name, + victim_name.len); ret = backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len); + parent_objectid, &victim_name); if (ret < 0) { - kfree(victim_name); + kfree(victim_name.name); return ret; } else if (!ret) { ret = -ENOENT; @@ -1186,26 +1177,24 @@ again: ret = unlink_inode_for_log_replay(trans, BTRFS_I(victim_parent), - inode, - victim_name, - victim_name_len); + inode, &victim_name); } iput(victim_parent); - kfree(victim_name); + kfree(victim_name.name); if (ret) return ret; goto again; } - kfree(victim_name); + kfree(victim_name.name); next: - cur_offset += victim_name_len + sizeof(*extref); + cur_offset += victim_name.len + sizeof(*extref); } } btrfs_release_path(path); /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), - ref_index, name, namelen, 0); + ref_index, name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { @@ -1216,8 +1205,7 @@ next: btrfs_release_path(path); /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), - name, namelen, 0); + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { @@ -1231,20 +1219,18 @@ next: } static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index, + struct fscrypt_str *name, u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; + int ret; extref = (struct btrfs_inode_extref *)ref_ptr; - *namelen = btrfs_inode_extref_name_len(eb, extref); - *name = kmalloc(*namelen, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - - read_extent_buffer(eb, *name, (unsigned long)&extref->name, - *namelen); + ret = read_alloc_one_name(eb, &extref->name, + btrfs_inode_extref_name_len(eb, extref), name); + if (ret) + return ret; if (index) *index = btrfs_inode_extref_index(eb, extref); @@ -1255,18 +1241,17 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, } static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index) + struct fscrypt_str *name, u64 *index) { struct btrfs_inode_ref *ref; + int ret; ref = (struct btrfs_inode_ref *)ref_ptr; - *namelen = btrfs_inode_ref_name_len(eb, ref); - *name = kmalloc(*namelen, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - - read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref), + name); + if (ret) + return ret; if (index) *index = btrfs_inode_ref_index(eb, ref); @@ -1308,28 +1293,24 @@ again: ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); while (ref_ptr < ref_end) { - char *name = NULL; - int namelen; + struct fscrypt_str name; u64 parent_id; if (key->type == BTRFS_INODE_EXTREF_KEY) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); } else { parent_id = key->offset; - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - NULL); + ret = ref_get_fields(eb, ref_ptr, &name, NULL); } if (ret) goto out; if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, - parent_id, name, - namelen); + parent_id, &name); else - ret = !!btrfs_find_name_in_backref(log_eb, log_slot, - name, namelen); + ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { struct inode *dir; @@ -1338,20 +1319,20 @@ again: dir = read_one_inode(root, parent_id); if (!dir) { ret = -ENOENT; - kfree(name); + kfree(name.name); goto out; } ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, name, namelen); - kfree(name); + inode, &name); + kfree(name.name); iput(dir); if (ret) goto out; goto again; } - kfree(name); - ref_ptr += namelen; + kfree(name.name); + ref_ptr += name.len; if (key->type == BTRFS_INODE_EXTREF_KEY) ref_ptr += sizeof(struct btrfs_inode_extref); else @@ -1380,8 +1361,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - char *name = NULL; - int namelen; + struct fscrypt_str name; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1425,7 +1405,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + ret = extref_get_fields(eb, ref_ptr, &name, &ref_index, &parent_objectid); /* * parent object can change from one array @@ -1438,15 +1418,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index); + ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); } if (ret) goto out; ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen); + btrfs_ino(BTRFS_I(inode)), ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1460,7 +1438,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = __add_inode_ref(trans, root, path, log, BTRFS_I(dir), BTRFS_I(inode), inode_objectid, parent_objectid, - ref_index, name, namelen); + ref_index, &name); if (ret) { if (ret == 1) ret = 0; @@ -1469,7 +1447,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* insert our name */ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - name, namelen, 0, ref_index); + &name, 0, ref_index); if (ret) goto out; @@ -1479,9 +1457,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* Else, ret == 1, we already have a perfect match, we're done. */ - ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; - kfree(name); - name = NULL; + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; + kfree(name.name); + name.name = NULL; if (log_ref_ver) { iput(dir); dir = NULL; @@ -1505,7 +1483,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, root, path, eb, slot, key); out: btrfs_release_path(path); - kfree(name); + kfree(name.name); iput(dir); iput(inode); return ret; @@ -1777,7 +1755,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 index, - char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_key *location) { struct inode *inode; @@ -1795,7 +1773,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, } ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len, 1, index); + 1, index); /* FIXME, put inode into FIXUP list */ @@ -1809,7 +1787,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_dir_item *dst_di, const struct btrfs_key *log_key, - u8 log_type, + u8 log_flags, bool exists) { struct btrfs_key found_key; @@ -1819,7 +1797,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, if (found_key.objectid == log_key->objectid && found_key.type == log_key->type && found_key.offset == log_key->offset && - btrfs_dir_type(path->nodes[0], dst_di) == log_type) + btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) return 1; /* @@ -1855,8 +1833,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - char *name; - int name_len; + struct fscrypt_str name; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -1864,7 +1841,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_key log_key; struct btrfs_key search_key; struct inode *dir; - u8 log_type; + u8 log_flags; bool exists; int ret; bool update_size = true; @@ -1874,17 +1851,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!dir) return -EIO; - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) goto out; - } - - log_type = btrfs_dir_type(eb, di); - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); + log_flags = btrfs_dir_flags(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &log_key); ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); @@ -1894,14 +1865,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = 0; dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - name, name_len, 1); + &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); goto out; } else if (dir_dst_di) { ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, log_type, - exists); + dir_dst_di, &log_key, + log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1911,14 +1882,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, key->objectid, key->offset, - name, name_len, 1); + &name, 1); if (IS_ERR(index_dst_di)) { ret = PTR_ERR(index_dst_di); goto out; } else if (index_dst_di) { ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, index_dst_di, &log_key, - log_type, exists); + log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -1939,7 +1910,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); + ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { goto out; } else if (ret) { @@ -1952,8 +1923,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; search_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &search_key, key->objectid, name, - name_len); + ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); if (ret < 0) { goto out; } else if (ret) { @@ -1964,7 +1934,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, } btrfs_release_path(path); ret = insert_one_name(trans, root, key->objectid, key->offset, - name, name_len, &log_key); + &name, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) goto out; if (!ret) @@ -1974,10 +1944,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); } - kfree(name); + kfree(name.name); iput(dir); if (!ret && name_added) ret = 1; @@ -2028,7 +1998,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, * to ever delete the parent directory has it would result in stale * dentries that can never be deleted. */ - if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) { struct btrfs_path *fixup_path; struct btrfs_key di_key; @@ -2143,8 +2113,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - int name_len; - char *name; + struct fscrypt_str name; struct inode *inode = NULL; struct btrfs_key location; @@ -2159,22 +2128,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, eb = path->nodes[0]; slot = path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) goto out; - } - - read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); if (log) { struct btrfs_dir_item *log_di; log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, - dir_key->offset, - name, name_len, 0); + dir_key->offset, &name, 0); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); goto out; @@ -2200,7 +2163,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, inc_nlink(inode); ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - name, name_len); + &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2209,7 +2172,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_release_path(log_path); - kfree(name); + kfree(name.name); iput(inode); return ret; } @@ -2430,13 +2393,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { int nritems; + struct btrfs_tree_parent_check check = { + .transid = gen, + .level = level + }; struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; int i; int ret; - ret = btrfs_read_extent_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, &check); if (ret) return ret; @@ -2616,7 +2583,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, int ret = 0; while (*level > 0) { - struct btrfs_key first_key; + struct btrfs_tree_parent_check check = { 0 }; cur = path->nodes[*level]; @@ -2628,7 +2595,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); + check.transid = ptr_gen; + check.level = *level - 1; + check.has_first_key = true; + btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); blocksize = fs_info->nodesize; next = btrfs_find_create_tree_block(fs_info, bytenr, @@ -2647,8 +2617,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_extent_buffer(next, ptr_gen, - *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; @@ -2676,7 +2645,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; @@ -3448,7 +3417,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dir_ino, - const char *name, int name_len, + const struct fscrypt_str *name, u64 index) { struct btrfs_dir_item *di; @@ -3458,7 +3427,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * for dir item keys. */ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, - index, name, name_len, -1); + index, name, -1); if (IS_ERR(di)) return PTR_ERR(di); else if (!di) @@ -3495,7 +3464,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, */ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { struct btrfs_path *path; @@ -3522,7 +3491,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), - name, name_len, index); + name, index); btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); @@ -3534,7 +3503,7 @@ out_unlock: /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; @@ -3555,7 +3524,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), + ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, &index); mutex_unlock(&inode->log_mutex); if (ret < 0 && ret != -ENOENT) @@ -4451,9 +4420,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); disk_bytenr += extent_offset; - ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + extent_num_bytes - 1, - &ordered_sums, 0, false); + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + disk_bytenr + extent_num_bytes - 1, + &ordered_sums, 0, false); if (ret) goto out; @@ -4646,10 +4615,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, /* block start is already adjusted for the file extent offset. */ csum_root = btrfs_csum_root(trans->fs_info, em->block_start); - ret = btrfs_lookup_csums_range(csum_root, - em->block_start + csum_offset, - em->block_start + csum_offset + - csum_len - 1, &ordered_sums, 0, false); + ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0, false); if (ret) return ret; @@ -5269,6 +5237,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u32 this_len; unsigned long name_ptr; struct btrfs_dir_item *di; + struct fscrypt_str name_str; if (key->type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -5302,8 +5271,11 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, } read_extent_buffer(eb, name, name_ptr, this_name_len); + + name_str.name = name; + name_str.len = this_name_len; di = btrfs_lookup_dir_item(NULL, inode->root, search_path, - parent, name, this_name_len, 0); + parent, &name_str, 0); if (di && !IS_ERR(di)) { struct btrfs_key di_key; @@ -5410,7 +5382,7 @@ struct btrfs_dir_list { * has a size that doesn't match the sum of the lengths of all the logged * names - this is ok, not a problem, because at log replay time we set the * directory's i_size to the correct value (see replay_one_name() and - * do_overwrite_item()). + * overwrite_item()). */ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_inode *start_inode, @@ -5473,7 +5445,7 @@ again: } di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); - type = btrfs_dir_type(leaf, di); + type = btrfs_dir_ftype(leaf, di); if (btrfs_dir_transid(leaf, di) < trans->transid) continue; btrfs_dir_item_key_to_cpu(leaf, di, &di_key); @@ -5488,7 +5460,7 @@ again: } if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); break; } @@ -5497,7 +5469,7 @@ again: log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5691,11 +5663,11 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * so that the log ends up with the new name and without the old name. */ if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return 0; } - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5770,7 +5742,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) break; continue; @@ -5787,7 +5759,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * that, we can avoid doing it again. */ if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); continue; } @@ -5799,7 +5771,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * log with the new name before we unpin it. */ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) break; } @@ -6309,11 +6281,11 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, } if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); continue; } - if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR) + if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; @@ -6322,7 +6294,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); if (ret) break; @@ -6783,7 +6755,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, } if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(dir_inode); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); continue; } @@ -6793,7 +6765,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(dir_inode); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); if (ret) goto out; } @@ -6838,7 +6810,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, need_log_inode(trans, BTRFS_I(inode))) ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) return ret; @@ -7400,7 +7372,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, mutex_unlock(&dir->log_mutex); } -/** +/* * Update the log after adding a new name for an inode. * * @trans: Transaction handle. @@ -7467,9 +7439,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, if (old_dir && old_dir->logged_trans == trans->transid) { struct btrfs_root *log = old_dir->root->log_root; struct btrfs_path *path; + struct fscrypt_name fname; ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + ret = fscrypt_setup_filename(&old_dir->vfs_inode, + &old_dentry->d_name, 0, &fname); + if (ret) + goto out; /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7489,6 +7466,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + fscrypt_free_filename(&fname); goto out; } @@ -7504,8 +7482,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - old_dentry->d_name.name, - old_dentry->d_name.len, old_dir_index); + &fname.disk_name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its @@ -7519,6 +7496,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, mutex_unlock(&old_dir->log_mutex); btrfs_free_path(path); + fscrypt_free_filename(&fname); if (ret < 0) goto out; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index aed1e05e9879..85b43075ac58 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,6 +6,7 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include "messages.h" #include "ctree.h" #include "transaction.h" @@ -86,11 +87,11 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 8a3a14686d3e..a555baa0143a 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -1,7 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "tree-mod-log.h" #include "disk-io.h" +#include "fs.h" +#include "accessors.h" +#include "tree-checker.h" struct tree_mod_root { u64 logical; @@ -197,12 +201,11 @@ static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, - gfp_t flags) + enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) return NULL; @@ -220,7 +223,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, } int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, gfp_t flags) + enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; int ret; @@ -228,7 +231,7 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, if (!tree_mod_need_log(eb->fs_info, eb)) return 0; - tm = alloc_tree_mod_elem(eb, slot, op, flags); + tm = alloc_tree_mod_elem(eb, slot, op); if (!tm) return -ENOMEM; @@ -276,7 +279,7 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -364,7 +367,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -502,14 +505,14 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, - BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE); if (!tm_list_rem[i]) { ret = -ENOMEM; goto free_tms; } tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, - BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + BTRFS_MOD_LOG_KEY_ADD); if (!tm_list_add[i]) { ret = -ENOMEM; goto free_tms; @@ -564,7 +567,7 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -694,8 +697,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, n--; break; case BTRFS_MOD_LOG_MOVE_KEYS: - o_dst = btrfs_node_key_ptr_offset(tm->slot); - o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + o_dst = btrfs_node_key_ptr_offset(eb, tm->slot); + o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot); memmove_extent_buffer(eb, o_dst, o_src, tm->move.nr_items * p_size); break; @@ -819,10 +822,15 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) tm = tree_mod_log_search(fs_info, logical, time_seq); if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + struct btrfs_tree_parent_check check = { 0 }; + btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, root->root_key.objectid, - 0, level, NULL); + + check.level = level; + check.owner_root = root->root_key.objectid; + + old = read_tree_block(fs_info, logical, &check); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { if (!IS_ERR(old)) free_extent_buffer(old); diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 12605d19621b..94f10afeee97 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_TREE_MOD_LOG_H #define BTRFS_TREE_MOD_LOG_H @@ -32,7 +32,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, gfp_t flags); + enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 3374c9e9be67..33606025513d 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,6 +5,7 @@ */ #include <linux/slab.h> +#include "messages.h" #include "ulist.h" #include "ctree.h" @@ -37,8 +38,9 @@ * loop would be similar to the above. */ -/** - * ulist_init - freshly initialize a ulist +/* + * Freshly initialize a ulist. + * * @ulist: the ulist to initialize * * Note: don't use this function to init an already used ulist, use @@ -51,8 +53,9 @@ void ulist_init(struct ulist *ulist) ulist->nnodes = 0; } -/** - * ulist_release - free up additionally allocated memory for the ulist +/* + * Free up additionally allocated memory for the ulist. + * * @ulist: the ulist from which to free the additional memory * * This is useful in cases where the base 'struct ulist' has been statically @@ -70,8 +73,9 @@ void ulist_release(struct ulist *ulist) INIT_LIST_HEAD(&ulist->nodes); } -/** - * ulist_reinit - prepare a ulist for reuse +/* + * Prepare a ulist for reuse. + * * @ulist: ulist to be reused * * Free up all additional memory allocated for the list elements and reinit @@ -83,8 +87,9 @@ void ulist_reinit(struct ulist *ulist) ulist_init(ulist); } -/** - * ulist_alloc - dynamically allocate a ulist +/* + * Dynamically allocate a ulist. + * * @gfp_mask: allocation flags to for base allocation * * The allocated ulist will be returned in an initialized state. @@ -101,8 +106,9 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } -/** - * ulist_free - free dynamically allocated ulist +/* + * Free dynamically allocated ulist. + * * @ulist: ulist to free * * It is not necessary to call ulist_release before. @@ -163,8 +169,9 @@ static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) return 0; } -/** - * ulist_add - add an element to the ulist +/* + * Add an element to the ulist. + * * @ulist: ulist to add the element to * @val: value to add to ulist * @aux: auxiliary value to store along with val @@ -242,8 +249,9 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux) return 0; } -/** - * ulist_next - iterate ulist +/* + * Iterate ulist. + * * @ulist: ulist to iterate * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) * @@ -258,7 +266,7 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux) * It is allowed to call ulist_add during an enumeration. Newly added items * are guaranteed to show up in the running enumeration. */ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) +struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter) { struct ulist_node *node; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 02fda0a2d4ce..b2cef187ea8e 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -66,7 +66,7 @@ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, #endif } -struct ulist_node *ulist_next(struct ulist *ulist, +struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter); #define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index b458452a1aaf..7c7001f42b14 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -5,11 +5,14 @@ #include <linux/uuid.h> #include <asm/unaligned.h> +#include "messages.h" #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" - +#include "fs.h" +#include "accessors.h" +#include "uuid-tree.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) { diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h new file mode 100644 index 000000000000..5350c87fe2ca --- /dev/null +++ b/fs/btrfs/uuid-tree.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_UUID_TREE_H +#define BTRFS_UUID_TREE_H + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index ee00e33c309e..bf9eb693a6a7 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -10,11 +10,17 @@ #include <linux/iversion.h> #include <linux/fsverity.h> #include <linux/sched/mm.h> +#include "messages.h" #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" #include "disk-io.h" #include "locking.h" +#include "fs.h" +#include "accessors.h" +#include "ioctl.h" +#include "verity.h" +#include "orphan.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h new file mode 100644 index 000000000000..91c10f7d0a46 --- /dev/null +++ b/fs/btrfs/verity.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_VERITY_H +#define BTRFS_VERITY_H + +#ifdef CONFIG_FS_VERITY + +extern const struct fsverity_operations btrfs_verityops; + +int btrfs_drop_verity_items(struct btrfs_inode *inode); +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); + +#else + +static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + return 0; +} + +static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + return -EPERM; +} + +#endif + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 635f45f1a2ef..aa25fa335d3e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5,12 +5,9 @@ #include <linux/sched.h> #include <linux/sched/mm.h> -#include <linux/bio.h> #include <linux/slab.h> -#include <linux/blkdev.h> #include <linux/ratelimit.h> #include <linux/kthread.h> -#include <linux/raid/pq.h> #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> @@ -23,8 +20,6 @@ #include "print-tree.h" #include "volumes.h" #include "raid56.h" -#include "async-thread.h" -#include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" @@ -33,8 +28,13 @@ #include "block-group.h" #include "discard.h" #include "zoned.h" - -static struct bio_set btrfs_bioset; +#include "fs.h" +#include "accessors.h" +#include "uuid-tree.h" +#include "ioctl.h" +#include "relocation.h" +#include "scrub.h" +#include "super.h" #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ @@ -248,11 +248,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, - int *mirror_num_ret, int need_raid_map); /* * Device locking @@ -529,14 +524,14 @@ error: return ret; } -/** - * Search and remove all stale devices (which are not mounted). - * When both inputs are NULL, it will search and release all stale devices. +/* + * Search and remove all stale devices (which are not mounted). When both + * inputs are NULL, it will search and release all stale devices. * - * @devt: Optional. When provided will it release all unmounted devices - * matching this devt only. + * @devt: Optional. When provided will it release all unmounted devices + * matching this devt only. * @skip_device: Optional. Will skip this device when searching for the stale - * devices. + * devices. * * Return: 0 for success or if @devt is 0. * -EBUSY if @devt is a mounted device. @@ -641,6 +636,9 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (!bdev_nonrot(bdev)) fs_devices->rotating = true; + if (bdev_max_discard_sectors(bdev)) + fs_devices->discardable = true; + device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; @@ -835,26 +833,23 @@ static noinline struct btrfs_device *device_list_add(const char *path, } if (!device) { + unsigned int nofs_flag; + if (fs_devices->opened) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } + nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, - disk_super->dev_item.uuid); + disk_super->dev_item.uuid, path); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) { mutex_unlock(&fs_devices->device_list_mutex); /* we can safely leave the fs_devices entry around */ return device; } - name = rcu_string_strdup(path, GFP_NOFS); - if (!name) { - btrfs_free_device(device); - mutex_unlock(&fs_devices->device_list_mutex); - return ERR_PTR(-ENOMEM); - } - rcu_assign_pointer(device->name, name); device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); @@ -934,7 +929,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, } btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", - devid, rcu_str_deref(device->name), + devid, btrfs_dev_name(device), path, current->comm, task_pid_nr(current)); } @@ -987,30 +982,22 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) fs_devices->total_devices = orig->total_devices; list_for_each_entry(orig_dev, &orig->devices, dev_list) { - struct rcu_string *name; + const char *dev_path = NULL; + + /* + * This is ok to do without RCU read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + if (orig_dev->name) + dev_path = orig_dev->name->str; device = btrfs_alloc_device(NULL, &orig_dev->devid, - orig_dev->uuid); + orig_dev->uuid, dev_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } - /* - * This is ok to do without rcu read locked because we hold the - * uuid mutex so nothing we touch in here is going to disappear. - */ - if (orig_dev->name) { - name = rcu_string_strdup(orig_dev->name->str, - GFP_KERNEL); - if (!name) { - btrfs_free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); - } - if (orig_dev->zone_info) { struct btrfs_zoned_device_info *zone_info; @@ -1473,8 +1460,9 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, return changed; } -/** - * dev_extent_hole_check - check if specified hole is suitable for allocation +/* + * Check if specified hole is suitable for allocation. + * * @device: the device which we have the hole * @hole_start: starting position of the hole * @hole_size: the size of the hole @@ -1528,7 +1516,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, } /* - * find_free_dev_extent_start - find free space in the specified device + * Find free space in the specified device. + * * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @search_start: the position from which to begin the search @@ -1536,9 +1525,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * @len: the size of the free space. that we find, or the size * of the max free space if we don't find suitable free space * - * this uses a pretty simple search, the expectation is that it is - * called very infrequently and that a given device has a small number - * of extents + * This does a pretty simple search, the expectation is that it is called very + * infrequently and that a given device has a small number of extents. * * @start is used to store the start of the free space if we find. But if we * don't find suitable free space, it will be used to store the start position @@ -2101,7 +2089,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", - rcu_str_deref(device->name), device->devid); + btrfs_dev_name(device), device->devid); return -ETXTBSY; } @@ -2317,8 +2305,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_free_device(tgtdev); } -/** - * Populate args from device at path +/* + * Populate args from device at path. * * @fs_info: the filesystem * @args: the args to populate @@ -2593,7 +2581,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_device *device; struct block_device *bdev; struct super_block *sb = fs_info->sb; - struct rcu_string *name; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices; u64 orig_super_total_bytes; @@ -2634,20 +2621,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } rcu_read_unlock(); - device = btrfs_alloc_device(fs_info, NULL, NULL); + device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ ret = PTR_ERR(device); goto error; } - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - ret = -ENOMEM; - goto error_free_device; - } - rcu_assign_pointer(device->name, name); - device->fs_info = fs_info; device->bdev = bdev; ret = lookup_bdev(device_path, &device->devt); @@ -3603,16 +3583,14 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off if (bargs->usage_min == 0) user_thresh_min = 0; else - user_thresh_min = div_factor_fine(cache->length, - bargs->usage_min); + user_thresh_min = mult_perc(cache->length, bargs->usage_min); if (bargs->usage_max == 0) user_thresh_max = 1; else if (bargs->usage_max > 100) user_thresh_max = cache->length; else - user_thresh_max = div_factor_fine(cache->length, - bargs->usage_max); + user_thresh_max = mult_perc(cache->length, bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) ret = 0; @@ -3636,7 +3614,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, else if (bargs->usage > 100) user_thresh = cache->length; else - user_thresh = div_factor_fine(cache->length, bargs->usage); + user_thresh = mult_perc(cache->length, bargs->usage); if (chunk_used < user_thresh) ret = 0; @@ -4026,10 +4004,11 @@ error: return ret; } -/** - * alloc_profile_is_valid - see if a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile +/* + * See if a given profile is valid and reduced. + * + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile */ static int alloc_profile_is_valid(u64 flags, int extended) { @@ -5101,7 +5080,7 @@ static void init_alloc_chunk_ctl_policy_regular( ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ - ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; } @@ -5132,7 +5111,7 @@ static void init_alloc_chunk_ctl_policy_zoned( } /* We don't want a chunk larger than 10% of writable space */ - limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), + limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), zone_size), min_chunk_size); ctl->max_chunk_size = min(limit, ctl->max_chunk_size); @@ -5908,7 +5887,10 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ * and the stripes. */ sizeof(u64) * (total_stripes), - GFP_NOFS|__GFP_NOFAIL); + GFP_NOFS); + + if (!bioc) + return NULL; refcount_set(&bioc->refs, 1); @@ -6370,11 +6352,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * stripe_offset + stripe_nr * map->stripe_len; } -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, - int *mirror_num_ret, int need_raid_map) +int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -6657,266 +6639,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, NULL, NULL, 1); } -/* - * Initialize a btrfs_bio structure. This skips the embedded bio itself as it - * is already initialized by the block layer. - */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio, - btrfs_bio_end_io_t end_io, void *private) -{ - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); - bbio->end_io = end_io; - bbio->private = private; -} - -/* - * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. - * - * Just like the underlying bio_alloc_bioset it will not fail as it is backed by - * a mempool. - */ -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - btrfs_bio_end_io_t end_io, void *private) -{ - struct bio *bio; - - bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), end_io, private); - return bio; -} - -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private) -{ - struct bio *bio; - struct btrfs_bio *bbio; - - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); - - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, end_io, private); - - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; - return bio; -} - -static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) -{ - if (!dev || !dev->bdev) - return; - if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) - return; - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); -} - -static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, - struct bio *bio) -{ - if (bio->bi_opf & REQ_META) - return fs_info->endio_meta_workers; - return fs_info->endio_workers; -} - -static void btrfs_end_bio_work(struct work_struct *work) -{ - struct btrfs_bio *bbio = - container_of(work, struct btrfs_bio, end_io_work); - - bbio->end_io(bbio); -} - -static void btrfs_simple_end_io(struct bio *bio) -{ - struct btrfs_fs_info *fs_info = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(fs_info); - - if (bio->bi_status) - btrfs_log_dev_io_error(bio, bbio->device); - - if (bio_op(bio) == REQ_OP_READ) { - INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); - } else { - bbio->end_io(bbio); - } -} - -static void btrfs_raid56_end_io(struct bio *bio) -{ - struct btrfs_io_context *bioc = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(bioc->fs_info); - bbio->mirror_num = bioc->mirror_num; - bbio->end_io(bbio); - - btrfs_put_bioc(bioc); -} - -static void btrfs_orig_write_end_io(struct bio *bio) -{ - struct btrfs_io_stripe *stripe = bio->bi_private; - struct btrfs_io_context *bioc = stripe->bioc; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(bioc->fs_info); - - if (bio->bi_status) { - atomic_inc(&bioc->error); - btrfs_log_dev_io_error(bio, stripe->dev); - } - - /* - * Only send an error to the higher layers if it is beyond the tolerance - * threshold. - */ - if (atomic_read(&bioc->error) > bioc->max_errors) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_status = BLK_STS_OK; - - bbio->end_io(bbio); - btrfs_put_bioc(bioc); -} - -static void btrfs_clone_write_end_io(struct bio *bio) -{ - struct btrfs_io_stripe *stripe = bio->bi_private; - - if (bio->bi_status) { - atomic_inc(&stripe->bioc->error); - btrfs_log_dev_io_error(bio, stripe->dev); - } - - /* Pass on control to the original bio this one was cloned from */ - bio_endio(stripe->bioc->orig_bio); - bio_put(bio); -} - -static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) -{ - if (!dev || !dev->bdev || - test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (btrfs_op(bio) == BTRFS_MAP_WRITE && - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bio_io_error(bio); - return; - } - - bio_set_dev(bio, dev->bdev); - - /* - * For zone append writing, bi_sector must point the beginning of the - * zone - */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } - } - btrfs_debug_in_rcu(dev->fs_info, - "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, - (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), - dev->devid, bio->bi_iter.bi_size); - - btrfsic_check_bio(bio); - submit_bio(bio); -} - -static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) -{ - struct bio *orig_bio = bioc->orig_bio, *bio; - - ASSERT(bio_op(orig_bio) != REQ_OP_READ); - - /* Reuse the bio embedded into the btrfs_bio for the last mirror */ - if (dev_nr == bioc->num_stripes - 1) { - bio = orig_bio; - bio->bi_end_io = btrfs_orig_write_end_io; - } else { - bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); - bio_inc_remaining(orig_bio); - bio->bi_end_io = btrfs_clone_write_end_io; - } - - bio->bi_private = &bioc->stripes[dev_nr]; - bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; - bioc->stripes[dev_nr].bioc = bioc; - btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); -} - -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap; - int ret; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } - - if (!bioc) { - /* Single mirror read/write fast path */ - btrfs_bio(bio)->mirror_num = mirror_num; - btrfs_bio(bio)->device = smap.dev; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = fs_info; - bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); - } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ - bio->bi_private = bioc; - bio->bi_end_io = btrfs_raid56_end_io; - if (bio_op(bio) == REQ_OP_READ) - raid56_parity_recover(bio, bioc, mirror_num); - else - raid56_parity_write(bio, bioc); - } else { - /* Write to multiple mirrors */ - int total_devs = bioc->num_stripes; - int dev_nr; - - bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) - btrfs_submit_mirrored_bio(bioc, dev_nr); - } -} - static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, const struct btrfs_fs_devices *fs_devices) { @@ -6988,8 +6710,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * always do NOFS because we use it in a lot of other GFP_KERNEL safe * places. */ + nofs_flag = memalloc_nofs_save(); - device = btrfs_alloc_device(NULL, &devid, dev_uuid); + device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) return device; @@ -7004,22 +6727,24 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, return device; } -/** - * btrfs_alloc_device - allocate struct btrfs_device +/* + * Allocate new device struct, set up devid and UUID. + * * @fs_info: used only for generating a new devid, can be NULL if * devid is provided (i.e. @devid != NULL). * @devid: a pointer to devid for this device. If NULL a new devid * is generated. * @uuid: a pointer to UUID for this device. If NULL a new UUID * is generated. + * @path: a pointer to device path if available, NULL otherwise. * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() * on error. Returned struct is not linked onto any lists and must be * destroyed with btrfs_free_device. */ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, - const u64 *devid, - const u8 *uuid) + const u64 *devid, const u8 *uuid, + const char *path) { struct btrfs_device *dev; u64 tmp; @@ -7037,8 +6762,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - extent_io_tree_init(fs_info, &dev->alloc_state, - IO_TREE_DEVICE_ALLOC_STATE, NULL); + extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); if (devid) tmp = *devid; @@ -7058,6 +6782,17 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); + if (path) { + struct rcu_string *name; + + name = rcu_string_strdup(path, GFP_KERNEL); + if (!name) { + btrfs_free_device(dev); + return ERR_PTR(-ENOMEM); + } + rcu_assign_pointer(dev->name, name); + } + return dev; } @@ -7241,8 +6976,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->stripes[i].dev = handle_missing_device(fs_info, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { + ret = PTR_ERR(map->stripes[i].dev); free_extent_map(em); - return PTR_ERR(map->stripes[i].dev); + return ret; } } @@ -7904,7 +7640,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_warn_in_rcu(fs_info, "error %d while searching for dev_stats item for device %s", - ret, rcu_str_deref(device->name)); + ret, btrfs_dev_name(device)); goto out; } @@ -7915,7 +7651,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret != 0) { btrfs_warn_in_rcu(fs_info, "delete too small dev_stats item for device %s failed %d", - rcu_str_deref(device->name), ret); + btrfs_dev_name(device), ret); goto out; } ret = 1; @@ -7929,7 +7665,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_warn_in_rcu(fs_info, "insert dev_stats item for device %s failed %d", - rcu_str_deref(device->name), ret); + btrfs_dev_name(device), ret); goto out; } } @@ -7994,7 +7730,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) return; btrfs_err_rl_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", - rcu_str_deref(dev->name), + btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), @@ -8014,7 +7750,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) btrfs_info_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", - rcu_str_deref(dev->name), + btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), @@ -8433,17 +8169,3 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) return true; } - -int __init btrfs_bioset_init(void) -{ - if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; -} - -void __cold btrfs_bioset_exit(void) -{ - bioset_exit(&btrfs_bioset); -} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 099def5613b8..6b7a05f6cf82 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,10 +6,12 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H -#include <linux/bio.h> #include <linux/sort.h> #include <linux/btrfs.h> #include "async-thread.h" +#include "messages.h" +#include "tree-checker.h" +#include "rcu-string.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -354,6 +356,8 @@ struct btrfs_fs_devices { * nonrot flag set */ bool rotating; + /* Devices support TRIM/discard commands */ + bool discardable; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -368,8 +372,6 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; }; -#define BTRFS_BIO_INLINE_CSUM_SIZE 64 - #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -379,89 +381,6 @@ struct btrfs_fs_devices { - 2 * sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) -/* - * Maximum number of sectors for a single bio to limit the size of the - * checksum array. This matches the number of bio_vecs per bio and thus the - * I/O size for buffered I/O. - */ -#define BTRFS_MAX_BIO_SECTORS (256) - -typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); - -/* - * Additional info to pass along bio. - * - * Mostly for btrfs specific features like csum and mirror_num. - */ -struct btrfs_bio { - unsigned int mirror_num; - struct bvec_iter iter; - - /* for direct I/O */ - u64 file_offset; - - /* @device is for stripe IO submission. */ - struct btrfs_device *device; - u8 *csum; - u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - - /* End I/O information supplied to btrfs_bio_alloc */ - btrfs_bio_end_io_t end_io; - void *private; - - /* For read end I/O handling */ - struct work_struct end_io_work; - - /* - * This member must come last, bio_alloc_bioset will allocate enough - * bytes for entire btrfs_bio but relies on bio being last. - */ - struct bio bio; -}; - -static inline struct btrfs_bio *btrfs_bio(struct bio *bio) -{ - return container_of(bio, struct btrfs_bio, bio); -} - -int __init btrfs_bioset_init(void); -void __cold btrfs_bioset_exit(void); - -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private); - -static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - bbio->end_io(bbio); -} - -static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) -{ - if (bbio->csum != bbio->csum_inline) { - kfree(bbio->csum); - bbio->csum = NULL; - } -} - -/* - * Iterate through a btrfs_bio (@bbio) on a per-sector basis. - * - * bvl - struct bio_vec - * bbio - struct btrfs_bio - * iters - struct bvec_iter - * bio_offset - unsigned int - */ -#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ - for ((iter) = (bbio)->iter, (bio_offset) = 0; \ - (iter).bi_size && \ - (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ - (bio_offset) += fs_info->sectorsize, \ - bio_advance_iter_single(&(bbio)->bio, &(iter), \ - (fs_info)->sectorsize)) - struct btrfs_io_stripe { struct btrfs_device *dev; union { @@ -603,6 +522,13 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) } } +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + ASSERT(num_stripes); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -611,6 +537,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); +int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); @@ -622,7 +553,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, @@ -639,8 +569,8 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, const char *path); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, - const u64 *devid, - const u8 *uuid); + const u64 *devid, const u8 *uuid, + const char *path); void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, @@ -741,6 +671,14 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, atomic_inc(&dev->dev_stats_ccnt); } +static inline const char *btrfs_dev_name(const struct btrfs_device *device) +{ + if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + return "<missing disk>"; + else + return rcu_str_deref(device->name); +} + void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 5bb8d8c86311..0ed4b119a7ca 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -13,12 +13,16 @@ #include <linux/iversion.h> #include <linux/sched/mm.h> #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "btrfs_inode.h" #include "transaction.h" #include "xattr.h" #include "disk-io.h" #include "props.h" #include "locking.h" +#include "accessors.h" +#include "dir-item.h" int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b4f44662cda7..01a13de11832 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -155,8 +155,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, in_page = find_get_page(mapping, start >> PAGE_SHIFT); data_in = kmap_local_page(in_page); - memcpy(workspace->buf + i * PAGE_SIZE, - data_in, PAGE_SIZE); + copy_page(workspace->buf + i * PAGE_SIZE, + data_in); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; @@ -355,7 +355,7 @@ done: return ret; } -int zlib_decompress(struct list_head *ws, unsigned char *data_in, +int zlib_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c9e2b0c85309..a759668477bb 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -15,6 +15,8 @@ #include "transaction.h" #include "dev-replace.h" #include "space-info.h" +#include "fs.h" +#include "accessors.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -392,8 +394,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_sectors = bdev_zone_sectors(bdev); } - /* Check if it's power of 2 (see is_power_of_2) */ - ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); + ASSERT(is_power_of_two_u64(zone_sectors)); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; /* We reject devices with a zone size larger than 8GB */ @@ -1018,8 +1019,8 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } -/** - * btrfs_find_allocatable_zones - find allocatable zones within a given region +/* + * Find allocatable zones within a given region. * * @device: the device to allocate a region on * @hole_start: the position of the hole to allocate the region @@ -1436,7 +1437,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } if (num_sequential > 0) - cache->seq_zone = true; + set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); if (num_conventional > 0) { /* Zone capacity is always zone size in emulation */ @@ -1648,7 +1649,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!cache) return false; - ret = cache->seq_zone; + ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); btrfs_put_block_group(cache); return ret; @@ -1862,7 +1863,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, return device; } -/** +/* * Activate block group and underlying device zones * * @block_group: the block group to activate @@ -2153,7 +2154,8 @@ static void btrfs_zone_finish_endio_workfn(struct work_struct *work) void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb) { - if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) + if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) || + eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) return; if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 8bd16d40b7c6..f43990985d80 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/blkdev.h> +#include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 35a0224d4eb7..e34f1ab99d56 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -94,7 +94,7 @@ static inline struct workspace *list_to_workspace(struct list_head *list) void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_alloc_workspace(unsigned int level); -/** +/* * Timer callback to free unused workspaces. * * @t: timer @@ -616,7 +616,7 @@ done: return ret; } -int zstd_decompress(struct list_head *ws, unsigned char *data_in, +int zstd_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { |